// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * raid5.c : Multiple Devices driver for Linux
 *	   Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
 *	   Copyright (C) 1999, 2000 Ingo Molnar
 *	   Copyright (C) 2002, 2003 H. Peter Anvin
 *
 * RAID-4/5/6 management functions.
 * Thanks to Penguin Computing for making the RAID-6 development possible
 * by donating a test server!
 */

/*
 * BITMAP UNPLUGGING:
 *
 * The sequencing for updating the bitmap reliably is a little
 * subtle (and I got it wrong the first time) so it deserves some
 * explanation.
 *
 * We group bitmap updates into batches.  Each batch has a number.
 * We may write out several batches at once, but that isn't very important.
 * conf->seq_write is the number of the last batch successfully written.
 * conf->seq_flush is the number of the last batch that was closed to
 *    new additions.
 * When we discover that we will need to write to any block in a stripe
 * (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq
 * the number of the batch it will be in. This is seq_flush+1.
 * When we are ready to do a write, if that batch hasn't been written yet,
 *   we plug the array and queue the stripe for later.
 * When an unplug happens, we increment bm_flush, thus closing the current
 *   batch.
 * When we notice that bm_flush > bm_write, we write out all pending updates
 * to the bitmap, and advance bm_write to where bm_flush was.
 * This may occasionally write a bit out twice, but is sure never to
 * write two different bits, and never to miss any bits.
 */
#include <linux/blkdev.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/async_tx.h>
#include <linux/module.h>
#include <linux/async.h>
#include <linux/seq_file.h>
#include <linux/cpu.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/nodemask.h>

#include <trace/events/block.h>
#include <linux/list_sort.h>

#include "md.h"
#include "raid5.h"
#include "raid0.h"
#include "md-bitmap.h"
#include "raid5-log.h"

#define UNSUPPORTED_MDDEV_FLAGS	(1L << MD_FAILFAST_SUPPORTED)

#define cpu_to_group(cpu) cpu_to_node(cpu)
#define ANY_GROUP NUMA_NO_NODE

static bool devices_handle_discard_safely = false;
module_param(devices_handle_discard_safely, bool, 0644);
MODULE_PARM_DESC(devices_handle_discard_safely,
		 "Set to Y if all devices in each array reliably return zeroes on reads from discarded regions");
static struct workqueue_struct *raid5_wq;

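/*
 * Map a stripe's sector to its bucket in the stripe-head hash table.
 * The low bits of the stripe number (sector >> RAID5_STRIPE_SHIFT)
 * select the bucket; stripe_hash_locks_hash() picks the matching
 * per-bucket hash lock.
 */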
static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)
{
	int hash = (sect >> RAID5_STRIPE_SHIFT(conf)) & HASH_MASK;
	return &conf->stripe_hashtbl[hash];
}

static inline int stripe_hash_locks_hash(struct r5conf *conf, sector_t sect)
{
	return (sect >> RAID5_STRIPE_SHIFT(conf)) & STRIPE_HASH_LOCKS_MASK;
}

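/*
 * Helpers for taking a stripe hash lock together with device_lock.
 * device_lock nests inside the per-bucket hash locks; the *_all_* variants
 * take every hash lock (bucket 0 first, the rest nested) so that callers
 * can traverse or quiesce all buckets at once.
 */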
static inline void lock_device_hash_lock(struct r5conf *conf, int hash)
{
	spin_lock_irq(conf->hash_locks + hash);
	spin_lock(&conf->device_lock);
}

static inline void unlock_device_hash_lock(struct r5conf *conf, int hash)
{
	spin_unlock(&conf->device_lock);
	spin_unlock_irq(conf->hash_locks + hash);
}

static inline void lock_all_device_hash_locks_irq(struct r5conf *conf)
{
	int i;
	spin_lock_irq(conf->hash_locks);
	for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++)
		spin_lock_nest_lock(conf->hash_locks + i, conf->hash_locks);
	spin_lock(&conf->device_lock);
}

static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf)
{
	int i;
	spin_unlock(&conf->device_lock);
	for (i = NR_STRIPE_HASH_LOCKS - 1; i; i--)
		spin_unlock(conf->hash_locks + i);
	spin_unlock_irq(conf->hash_locks);
}

/* Find first data disk in a raid6 stripe */
static inline int raid6_d0(struct stripe_head *sh)
{
	if (sh->ddf_layout)
		/* ddf always start from first device */
		return 0;
	/* md starts just after Q block */
	if (sh->qd_idx == sh->disks - 1)
		return 0;
	else
		return sh->qd_idx + 1;
}
static inline int raid6_next_disk(int disk, int raid_disks)
{
	disk++;
	return (disk < raid_disks) ? disk : 0;
}

/* When walking through the disks in a raid5, starting at raid6_d0,
 * we need to map the disk index to the slot index for the syndrome
 * calculation: P lands in slot syndrome_disks, Q in slot
 * syndrome_disks + 1, and the data disks fill the remaining slots
 * in order.
 */
static int raid6_idx_to_slot(int idx, struct stripe_head *sh,
			     int *count, int syndrome_disks)
{
	int slot = *count;

	if (sh->ddf_layout)
		(*count)++;
	if (idx == sh->pd_idx)
		return syndrome_disks;
	if (idx == sh->qd_idx)
		return syndrome_disks + 1;
	if (!sh->ddf_layout)
		(*count)++;
	return slot;
}

static void print_raid5_conf(struct r5conf *conf);

static int stripe_operations_active(struct stripe_head *sh)
{
	return sh->check_state || sh->reconstruct_state ||
	       test_bit(STRIPE_BIOFILL_RUN, &sh->state) ||
	       test_bit(STRIPE_COMPUTE_RUN, &sh->state);
}

static bool stripe_is_lowprio(struct stripe_head *sh)
{
	return (test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state) ||
		test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) &&
	       !test_bit(STRIPE_R5C_CACHING, &sh->state);
}

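/*
 * Queue a stripe for handling by the worker group matching sh->cpu,
 * kicking extra workers when the group's backlog exceeds
 * MAX_STRIPE_BATCH.  Falls back to the main md thread when no worker
 * groups are configured.
 */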
static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
{
	struct r5conf *conf = sh->raid_conf;
	struct r5worker_group *group;
	int thread_cnt;
	int i, cpu = sh->cpu;

	if (!cpu_online(cpu)) {
		cpu = cpumask_any(cpu_online_mask);
		sh->cpu = cpu;
	}

	if (list_empty(&sh->lru)) {
		struct r5worker_group *group;
		group = conf->worker_groups + cpu_to_group(cpu);
		if (stripe_is_lowprio(sh))
			list_add_tail(&sh->lru, &group->loprio_list);
		else
			list_add_tail(&sh->lru, &group->handle_list);
		group->stripes_cnt++;
		sh->group = group;
	}

	if (conf->worker_cnt_per_group == 0) {
		md_wakeup_thread(conf->mddev->thread);
		return;
	}

	group = conf->worker_groups + cpu_to_group(sh->cpu);

	group->workers[0].working = true;
	/* at least one worker should run to avoid race */
	queue_work_on(sh->cpu, raid5_wq, &group->workers[0].work);

	thread_cnt = group->stripes_cnt / MAX_STRIPE_BATCH - 1;
	/* wakeup more workers */
	for (i = 1; i < conf->worker_cnt_per_group && thread_cnt > 0; i++) {
		if (group->workers[i].working == false) {
			group->workers[i].working = true;
			queue_work_on(sh->cpu, raid5_wq,
				      &group->workers[i].work);
			thread_cnt--;
		}
	}
}

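/*
 * Drop the last reference to a stripe: decide whether it still needs
 * handling (requeue it and wake the md thread or a worker) or whether it
 * can go back to an inactive or r5c cached list.  Caller must hold
 * conf->device_lock.
 */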
static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
			      struct list_head *temp_inactive_list)
{
	int i;
	int injournal = 0;	/* number of data pages with R5_InJournal */

	BUG_ON(!list_empty(&sh->lru));
	BUG_ON(atomic_read(&conf->active_stripes) == 0);

	if (r5c_is_writeback(conf->log))
		for (i = sh->disks; i--; )
			if (test_bit(R5_InJournal, &sh->dev[i].flags))
				injournal++;
	/*
	 * In the following cases, the stripe cannot be released to cached
	 * lists. Therefore, we make the stripe write out and set
	 * STRIPE_HANDLE:
	 *   1. when quiesce in r5c write back;
	 *   2. when resync is requested for the stripe.
	 */
	if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) ||
	    (conf->quiesce && r5c_is_writeback(conf->log) &&
	     !test_bit(STRIPE_HANDLE, &sh->state) && injournal != 0)) {
		if (test_bit(STRIPE_R5C_CACHING, &sh->state))
			r5c_make_stripe_write_out(sh);
		set_bit(STRIPE_HANDLE, &sh->state);
	}

	if (test_bit(STRIPE_HANDLE, &sh->state)) {
		if (test_bit(STRIPE_DELAYED, &sh->state) &&
		    !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
			list_add_tail(&sh->lru, &conf->delayed_list);
		else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
			 sh->bm_seq - conf->seq_write > 0)
			list_add_tail(&sh->lru, &conf->bitmap_list);
		else {
			clear_bit(STRIPE_DELAYED, &sh->state);
			clear_bit(STRIPE_BIT_DELAY, &sh->state);
			if (conf->worker_cnt_per_group == 0) {
				if (stripe_is_lowprio(sh))
					list_add_tail(&sh->lru,
							&conf->loprio_list);
				else
					list_add_tail(&sh->lru,
							&conf->handle_list);
			} else {
				raid5_wakeup_stripe_thread(sh);
				return;
			}
		}
		md_wakeup_thread(conf->mddev->thread);
	} else {
		BUG_ON(stripe_operations_active(sh));
		if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
			if (atomic_dec_return(&conf->preread_active_stripes)
			    < IO_THRESHOLD)
				md_wakeup_thread(conf->mddev->thread);
		atomic_dec(&conf->active_stripes);
		if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
			if (!r5c_is_writeback(conf->log))
				list_add_tail(&sh->lru, temp_inactive_list);
			else {
				WARN_ON(test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags));
				if (injournal == 0)
					list_add_tail(&sh->lru, temp_inactive_list);
				else if (injournal == conf->raid_disks - conf->max_degraded) {
					/* full stripe */
					if (!test_and_set_bit(STRIPE_R5C_FULL_STRIPE, &sh->state))
						atomic_inc(&conf->r5c_cached_full_stripes);
					if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state))
						atomic_dec(&conf->r5c_cached_partial_stripes);
					list_add_tail(&sh->lru, &conf->r5c_full_stripe_list);
					r5c_check_cached_full_stripe(conf);
				} else
					/*
					 * STRIPE_R5C_PARTIAL_STRIPE is set in
					 * r5c_try_caching_write(). No need to
					 * set it again.
					 */
					list_add_tail(&sh->lru, &conf->r5c_partial_stripe_list);
			}
		}
	}
}
296
297static void __release_stripe(struct r5conf *conf, struct stripe_head *sh,
298 struct list_head *temp_inactive_list)
299{
300 if (atomic_dec_and_test(&sh->count))
301 do_release_stripe(conf, sh, temp_inactive_list);
302}
303
304
305
306
307
308
309
310
311static void release_inactive_stripe_list(struct r5conf *conf,
312 struct list_head *temp_inactive_list,
313 int hash)
314{
315 int size;
316 bool do_wakeup = false;
317 unsigned long flags;
318
319 if (hash == NR_STRIPE_HASH_LOCKS) {
320 size = NR_STRIPE_HASH_LOCKS;
321 hash = NR_STRIPE_HASH_LOCKS - 1;
322 } else
323 size = 1;
324 while (size) {
325 struct list_head *list = &temp_inactive_list[size - 1];
326
327
328
329
330
331 if (!list_empty_careful(list)) {
332 spin_lock_irqsave(conf->hash_locks + hash, flags);
333 if (list_empty(conf->inactive_list + hash) &&
334 !list_empty(list))
335 atomic_dec(&conf->empty_inactive_list_nr);
336 list_splice_tail_init(list, conf->inactive_list + hash);
337 do_wakeup = true;
338 spin_unlock_irqrestore(conf->hash_locks + hash, flags);
339 }
340 size--;
341 hash--;
342 }
343
344 if (do_wakeup) {
345 wake_up(&conf->wait_for_stripe);
346 if (atomic_read(&conf->active_stripes) == 0)
347 wake_up(&conf->wait_for_quiescent);
348 if (conf->retry_read_aligned)
349 md_wakeup_thread(conf->mddev->thread);
350 }
351}
352
353
354static int release_stripe_list(struct r5conf *conf,
355 struct list_head *temp_inactive_list)
356{
357 struct stripe_head *sh, *t;
358 int count = 0;
359 struct llist_node *head;
360
361 head = llist_del_all(&conf->released_stripes);
362 head = llist_reverse_order(head);
363 llist_for_each_entry_safe(sh, t, head, release_list) {
364 int hash;
365
366
367 smp_mb();
368 clear_bit(STRIPE_ON_RELEASE_LIST, &sh->state);
369
370
371
372
373
374 hash = sh->hash_lock_index;
375 __release_stripe(conf, sh, &temp_inactive_list[hash]);
376 count++;
377 }
378
379 return count;
380}
381
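/*
 * Release a reference to a stripe.  The common case (last reference, md
 * thread alive) just chains the stripe onto the lockless released_stripes
 * llist for the md thread to process; otherwise fall back to taking
 * device_lock and releasing it directly.
 */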
void raid5_release_stripe(struct stripe_head *sh)
{
	struct r5conf *conf = sh->raid_conf;
	unsigned long flags;
	struct list_head list;
	int hash;
	bool wakeup;

	/* Avoid release_list until the last reference.
	 */
	if (atomic_add_unless(&sh->count, -1, 1))
		return;

	if (unlikely(!conf->mddev->thread) ||
	    test_and_set_bit(STRIPE_ON_RELEASE_LIST, &sh->state))
		goto slow_path;
	wakeup = llist_add(&sh->release_list, &conf->released_stripes);
	if (wakeup)
		md_wakeup_thread(conf->mddev->thread);
	return;
slow_path:
	/* we are ok here if STRIPE_ON_RELEASE_LIST is set or not */
	if (atomic_dec_and_lock_irqsave(&sh->count, &conf->device_lock, flags)) {
		INIT_LIST_HEAD(&list);
		hash = sh->hash_lock_index;
		do_release_stripe(conf, sh, &list);
		spin_unlock_irqrestore(&conf->device_lock, flags);
		release_inactive_stripe_list(conf, &list, hash);
	}
}

static inline void remove_hash(struct stripe_head *sh)
{
	pr_debug("remove_hash(), stripe %llu\n",
		 (unsigned long long)sh->sector);

	hlist_del_init(&sh->hash);
}

static inline void insert_hash(struct r5conf *conf, struct stripe_head *sh)
{
	struct hlist_head *hp = stripe_hash(conf, sh->sector);

	pr_debug("insert_hash(), stripe %llu\n",
		 (unsigned long long)sh->sector);

	hlist_add_head(&sh->hash, hp);
}

/* find an idle stripe, make sure it is unhashed, and return it. */
static struct stripe_head *get_free_stripe(struct r5conf *conf, int hash)
{
	struct stripe_head *sh = NULL;
	struct list_head *first;

	if (list_empty(conf->inactive_list + hash))
		goto out;
	first = (conf->inactive_list + hash)->next;
	sh = list_entry(first, struct stripe_head, lru);
	list_del_init(first);
	remove_hash(sh);
	atomic_inc(&conf->active_stripes);
	BUG_ON(hash != sh->hash_lock_index);
	if (list_empty(conf->inactive_list + hash))
		atomic_inc(&conf->empty_inactive_list_nr);
out:
	return sh;
}

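/*
 * Free (shrink) or allocate (grow) the per-device pages attached to a
 * stripe head.  pool_size covers one page per device in the array, so
 * both walk all of sh->dev[].
 */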
static void shrink_buffers(struct stripe_head *sh)
{
	struct page *p;
	int i;
	int num = sh->raid_conf->pool_size;

	for (i = 0; i < num; i++) {
		WARN_ON(sh->dev[i].page != sh->dev[i].orig_page);
		p = sh->dev[i].page;
		if (!p)
			continue;
		sh->dev[i].page = NULL;
		put_page(p);
	}
}

static int grow_buffers(struct stripe_head *sh, gfp_t gfp)
{
	int i;
	int num = sh->raid_conf->pool_size;

	for (i = 0; i < num; i++) {
		struct page *page;

		if (!(page = alloc_page(gfp)))
			return 1;
		sh->dev[i].page = page;
		sh->dev[i].orig_page = page;
	}

	return 0;
}

static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous,
			   struct stripe_head *sh);

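/*
 * (Re)initialise a stripe head for the given sector, picking up the
 * current array geometry under gen_lock and re-inserting the stripe into
 * the hash table.  The stripe must be idle: unhashed, with no pending
 * operations and a zero reference count.
 */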
static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
{
	struct r5conf *conf = sh->raid_conf;
	int i, seq;

	BUG_ON(atomic_read(&sh->count) != 0);
	BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
	BUG_ON(stripe_operations_active(sh));
	BUG_ON(sh->batch_head);

	pr_debug("init_stripe called, stripe %llu\n",
		 (unsigned long long)sector);
retry:
	seq = read_seqcount_begin(&conf->gen_lock);
	sh->generation = conf->generation - previous;
	sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks;
	sh->sector = sector;
	stripe_set_idx(sector, conf, previous, sh);
	sh->state = 0;

	for (i = sh->disks; i--; ) {
		struct r5dev *dev = &sh->dev[i];

		if (dev->toread || dev->read || dev->towrite || dev->written ||
		    test_bit(R5_LOCKED, &dev->flags)) {
			pr_err("sector=%llx i=%d %p %p %p %p %d\n",
			       (unsigned long long)sh->sector, i, dev->toread,
			       dev->read, dev->towrite, dev->written,
			       test_bit(R5_LOCKED, &dev->flags));
			WARN_ON(1);
		}
		dev->flags = 0;
		dev->sector = raid5_compute_blocknr(sh, i, previous);
	}
	if (read_seqcount_retry(&conf->gen_lock, seq))
		goto retry;
	sh->overwrite_disks = 0;
	insert_hash(conf, sh);
	sh->cpu = smp_processor_id();
	set_bit(STRIPE_BATCH_READY, &sh->state);
}

static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector,
					 short generation)
{
	struct stripe_head *sh;

	pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector);
	hlist_for_each_entry(sh, stripe_hash(conf, sector), hash)
		if (sh->sector == sector && sh->generation == generation)
			return sh;
	pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector);
	return NULL;
}

/*
 * Need to check if array has failed when deciding whether to:
 *  - start an array
 *  - remove non-faulty devices
 *  - add a spare
 *  - allow a reshape
 * This determination is simple when no reshape is happening.
 * However if there is a reshape, we need to carefully check
 * both the before and after sections.
 * This is because some failed devices may only affect one
 * of the two sections, and some non-in_sync devices may
 * be insync in the section most affected by failed devices.
 */
int raid5_calc_degraded(struct r5conf *conf)
{
	int degraded, degraded2;
	int i;

	rcu_read_lock();
	degraded = 0;
	for (i = 0; i < conf->previous_raid_disks; i++) {
		struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
		if (rdev && test_bit(Faulty, &rdev->flags))
			rdev = rcu_dereference(conf->disks[i].replacement);
		if (!rdev || test_bit(Faulty, &rdev->flags))
			degraded++;
		else if (test_bit(In_sync, &rdev->flags))
			;
		else
			/* not in-sync or faulty.
			 * If the reshape increases the number of devices,
			 * this is being recovered by the reshape, so
			 * this 'previous' section is not in_sync.
			 * If the number of devices is being reduced however,
			 * the device can only be part of the array if
			 * we are reverting a reshape, so this section will
			 * be in-sync.
			 */
			if (conf->raid_disks >= conf->previous_raid_disks)
				degraded++;
	}
	rcu_read_unlock();
	if (conf->raid_disks == conf->previous_raid_disks)
		return degraded;
	rcu_read_lock();
	degraded2 = 0;
	for (i = 0; i < conf->raid_disks; i++) {
		struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
		if (rdev && test_bit(Faulty, &rdev->flags))
			rdev = rcu_dereference(conf->disks[i].replacement);
		if (!rdev || test_bit(Faulty, &rdev->flags))
			degraded2++;
		else if (test_bit(In_sync, &rdev->flags))
			;
		else
			/* not in-sync or faulty.
			 * If reshape increases the number of devices, this
			 * section has already been recovered, else it
			 * almost certainly hasn't.
			 */
			if (conf->raid_disks <= conf->previous_raid_disks)
				degraded2++;
	}
	rcu_read_unlock();
	if (degraded2 > degraded)
		return degraded2;
	return degraded;
}

static int has_failed(struct r5conf *conf)
{
	int degraded;

	if (conf->mddev->reshape_position == MaxSector)
		return conf->mddev->degraded > conf->max_degraded;

	degraded = raid5_calc_degraded(conf);
	if (degraded > conf->max_degraded)
		return 1;
	return 0;
}

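/*
 * Look up (or allocate) the stripe head covering @sector and take a
 * reference to it.  May sleep waiting for the array to unquiesce or for
 * an inactive stripe to become available, unless @noblock is set, in
 * which case it can return NULL.
 */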
struct stripe_head *
raid5_get_active_stripe(struct r5conf *conf, sector_t sector,
			int previous, int noblock, int noquiesce)
{
	struct stripe_head *sh;
	int hash = stripe_hash_locks_hash(conf, sector);
	int inc_empty_inactive_list_flag;

	pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector);

	spin_lock_irq(conf->hash_locks + hash);

	do {
		wait_event_lock_irq(conf->wait_for_quiescent,
				    conf->quiesce == 0 || noquiesce,
				    *(conf->hash_locks + hash));
		sh = __find_stripe(conf, sector, conf->generation - previous);
		if (!sh) {
			if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) {
				sh = get_free_stripe(conf, hash);
				if (!sh && !test_bit(R5_DID_ALLOC,
						     &conf->cache_state))
					set_bit(R5_ALLOC_MORE,
						&conf->cache_state);
			}
			if (noblock && sh == NULL)
				break;

			r5c_check_stripe_cache_usage(conf);
			if (!sh) {
				set_bit(R5_INACTIVE_BLOCKED,
					&conf->cache_state);
				r5l_wake_reclaim(conf->log, 0);
				wait_event_lock_irq(
					conf->wait_for_stripe,
					!list_empty(conf->inactive_list + hash) &&
					(atomic_read(&conf->active_stripes)
					 < (conf->max_nr_stripes * 3 / 4)
					 || !test_bit(R5_INACTIVE_BLOCKED,
						      &conf->cache_state)),
					*(conf->hash_locks + hash));
				clear_bit(R5_INACTIVE_BLOCKED,
					  &conf->cache_state);
			} else {
				init_stripe(sh, sector, previous);
				atomic_inc(&sh->count);
			}
		} else if (!atomic_inc_not_zero(&sh->count)) {
			spin_lock(&conf->device_lock);
			if (!atomic_read(&sh->count)) {
				if (!test_bit(STRIPE_HANDLE, &sh->state))
					atomic_inc(&conf->active_stripes);
				BUG_ON(list_empty(&sh->lru) &&
				       !test_bit(STRIPE_EXPANDING, &sh->state));
				inc_empty_inactive_list_flag = 0;
				if (!list_empty(conf->inactive_list + hash))
					inc_empty_inactive_list_flag = 1;
				list_del_init(&sh->lru);
				if (list_empty(conf->inactive_list + hash) && inc_empty_inactive_list_flag)
					atomic_inc(&conf->empty_inactive_list_nr);
				if (sh->group) {
					sh->group->stripes_cnt--;
					sh->group = NULL;
				}
			}
			atomic_inc(&sh->count);
			spin_unlock(&conf->device_lock);
		}
	} while (sh == NULL);

	spin_unlock_irq(conf->hash_locks + hash);
	return sh;
}

static bool is_full_stripe_write(struct stripe_head *sh)
{
	BUG_ON(sh->overwrite_disks > (sh->disks - sh->raid_conf->max_degraded));
	return sh->overwrite_disks == (sh->disks - sh->raid_conf->max_degraded);
}

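/*
 * Lock two stripes in a fixed (pointer) order so that concurrent callers
 * locking the same pair cannot deadlock.
 */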
static void lock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
		__acquires(&sh1->stripe_lock)
		__acquires(&sh2->stripe_lock)
{
	if (sh1 > sh2) {
		spin_lock_irq(&sh2->stripe_lock);
		spin_lock_nested(&sh1->stripe_lock, 1);
	} else {
		spin_lock_irq(&sh1->stripe_lock);
		spin_lock_nested(&sh2->stripe_lock, 1);
	}
}

static void unlock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
		__releases(&sh1->stripe_lock)
		__releases(&sh2->stripe_lock)
{
	spin_unlock(&sh1->stripe_lock);
	spin_unlock_irq(&sh2->stripe_lock);
}

/* Only freshly new full stripe normal write stripe can be added to a batch list */
static bool stripe_can_batch(struct stripe_head *sh)
{
	struct r5conf *conf = sh->raid_conf;

	if (raid5_has_log(conf) || raid5_has_ppl(conf))
		return false;
	return test_bit(STRIPE_BATCH_READY, &sh->state) &&
		!test_bit(STRIPE_BITMAP_PENDING, &sh->state) &&
		is_full_stripe_write(sh);
}

/* we only do back search */
static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh)
{
	struct stripe_head *head;
	sector_t head_sector, tmp_sec;
	int hash;
	int dd_idx;
	int inc_empty_inactive_list_flag;

	/* Don't cross chunks, so stripe pd_idx/qd_idx is the same */
	tmp_sec = sh->sector;
	if (!sector_div(tmp_sec, conf->chunk_sectors))
		return;
	head_sector = sh->sector - RAID5_STRIPE_SECTORS(conf);

	hash = stripe_hash_locks_hash(conf, head_sector);
	spin_lock_irq(conf->hash_locks + hash);
	head = __find_stripe(conf, head_sector, conf->generation);
	if (head && !atomic_inc_not_zero(&head->count)) {
		spin_lock(&conf->device_lock);
		if (!atomic_read(&head->count)) {
			if (!test_bit(STRIPE_HANDLE, &head->state))
				atomic_inc(&conf->active_stripes);
			BUG_ON(list_empty(&head->lru) &&
			       !test_bit(STRIPE_EXPANDING, &head->state));
			inc_empty_inactive_list_flag = 0;
			if (!list_empty(conf->inactive_list + hash))
				inc_empty_inactive_list_flag = 1;
			list_del_init(&head->lru);
			if (list_empty(conf->inactive_list + hash) && inc_empty_inactive_list_flag)
				atomic_inc(&conf->empty_inactive_list_nr);
			if (head->group) {
				head->group->stripes_cnt--;
				head->group = NULL;
			}
		}
		atomic_inc(&head->count);
		spin_unlock(&conf->device_lock);
	}
	spin_unlock_irq(conf->hash_locks + hash);

	if (!head)
		return;
	if (!stripe_can_batch(head))
		goto out;

	lock_two_stripes(head, sh);
	/* clear_batch_ready clear the flag */
	if (!stripe_can_batch(head) || !stripe_can_batch(sh))
		goto unlock_out;

	if (sh->batch_head)
		goto unlock_out;

	dd_idx = 0;
	while (dd_idx == sh->pd_idx || dd_idx == sh->qd_idx)
		dd_idx++;
	if (head->dev[dd_idx].towrite->bi_opf != sh->dev[dd_idx].towrite->bi_opf ||
	    bio_op(head->dev[dd_idx].towrite) != bio_op(sh->dev[dd_idx].towrite))
		goto unlock_out;

	if (head->batch_head) {
		spin_lock(&head->batch_head->batch_lock);
		/* This batch list is already running */
		if (!stripe_can_batch(head)) {
			spin_unlock(&head->batch_head->batch_lock);
			goto unlock_out;
		}
		/*
		 * We must assign batch_head of this stripe within the
		 * batch_lock, otherwise clear_batch_ready of batch head
		 * stripe could clear BATCH_READY bit of this stripe if
		 * this stripe is added to a batch list before we check
		 * BATCH_READY bit of batch head.
		 */
		sh->batch_head = head->batch_head;

		/*
		 * at this point, head's BATCH_READY could be cleared, but we
		 * can still add the stripe to batch list
		 */
		list_add(&sh->batch_list, &head->batch_list);
		spin_unlock(&head->batch_head->batch_lock);
	} else {
		head->batch_head = head;
		sh->batch_head = head->batch_head;
		spin_lock(&head->batch_lock);
		list_add_tail(&sh->batch_list, &head->batch_list);
		spin_unlock(&head->batch_lock);
	}

	if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
		if (atomic_dec_return(&conf->preread_active_stripes)
		    < IO_THRESHOLD)
			md_wakeup_thread(conf->mddev->thread);

	if (test_and_clear_bit(STRIPE_BIT_DELAY, &sh->state)) {
		int seq = sh->bm_seq;
		if (test_bit(STRIPE_BIT_DELAY, &sh->batch_head->state) &&
		    sh->batch_head->bm_seq > seq)
			seq = sh->batch_head->bm_seq;
		set_bit(STRIPE_BIT_DELAY, &sh->batch_head->state);
		sh->batch_head->bm_seq = seq;
	}

	atomic_inc(&sh->count);
unlock_out:
	unlock_two_stripes(head, sh);
out:
	raid5_release_stripe(head);
}

/* Determine if 'data_offset' or 'new_data_offset' should be used
 * in this stripe_head.
 */
static int use_new_offset(struct r5conf *conf, struct stripe_head *sh)
{
	sector_t progress = conf->reshape_progress;
	/* Need a memory barrier to make sure we see the value
	 * of conf->generation, or ->data_offset that was set before
	 * reshape_progress was updated.
	 */
	smp_rmb();
	if (progress == MaxSector)
		return 0;
	if (sh->generation == conf->generation - 1)
		return 0;
	/* We are in the middle of a reshape and this stripe was created
	 * for the current generation, so the new data offset applies.
	 */
	return 1;
}

static void dispatch_bio_list(struct bio_list *tmp)
{
	struct bio *bio;

	while ((bio = bio_list_pop(tmp)))
		submit_bio_noacct(bio);
}

static int cmp_stripe(void *priv, struct list_head *a, struct list_head *b)
{
	const struct r5pending_data *da = list_entry(a,
				struct r5pending_data, sibling);
	const struct r5pending_data *db = list_entry(b,
				struct r5pending_data, sibling);
	if (da->sector > db->sector)
		return 1;
	if (da->sector < db->sector)
		return -1;
	return 0;
}

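/*
 * Move up to @target deferred stripes' bios onto @list, scanning the
 * pending list in sector order and resuming after the entry where the
 * previous call stopped.  Called with pending_bios_lock held.
 */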
static void dispatch_defer_bios(struct r5conf *conf, int target,
				struct bio_list *list)
{
	struct r5pending_data *data;
	struct list_head *first, *next = NULL;
	int cnt = 0;

	if (conf->pending_data_cnt == 0)
		return;

	list_sort(NULL, &conf->pending_list, cmp_stripe);

	first = conf->pending_list.next;

	/* temporarily move the head */
	if (conf->next_pending_data)
		list_move_tail(&conf->pending_list,
			       &conf->next_pending_data->sibling);

	while (!list_empty(&conf->pending_list)) {
		data = list_first_entry(&conf->pending_list,
					struct r5pending_data, sibling);
		if (&data->sibling == first)
			first = data->sibling.next;
		next = data->sibling.next;

		bio_list_merge(list, &data->bios);
		list_move(&data->sibling, &conf->free_list);
		cnt++;
		if (cnt >= target)
			break;
	}
	conf->pending_data_cnt -= cnt;
	BUG_ON(conf->pending_data_cnt < 0 || cnt < target);

	if (next != &conf->pending_list)
		conf->next_pending_data = list_entry(next,
				struct r5pending_data, sibling);
	else
		conf->next_pending_data = NULL;
	/* list isn't empty */
	if (first != &conf->pending_list)
		list_move_tail(&conf->pending_list, first);
}

static void flush_deferred_bios(struct r5conf *conf)
{
	struct bio_list tmp = BIO_EMPTY_LIST;

	if (conf->pending_data_cnt == 0)
		return;

	spin_lock(&conf->pending_bios_lock);
	dispatch_defer_bios(conf, conf->pending_data_cnt, &tmp);
	BUG_ON(conf->pending_data_cnt != 0);
	spin_unlock(&conf->pending_bios_lock);

	dispatch_bio_list(&tmp);
}

static void defer_issue_bios(struct r5conf *conf, sector_t sector,
			     struct bio_list *bios)
{
	struct bio_list tmp = BIO_EMPTY_LIST;
	struct r5pending_data *ent;

	spin_lock(&conf->pending_bios_lock);
	ent = list_first_entry(&conf->free_list, struct r5pending_data,
			       sibling);
	list_move_tail(&ent->sibling, &conf->pending_list);
	ent->sector = sector;
	bio_list_init(&ent->bios);
	bio_list_merge(&ent->bios, bios);
	conf->pending_data_cnt++;
	if (conf->pending_data_cnt >= PENDING_IO_MAX)
		dispatch_defer_bios(conf, PENDING_IO_ONE_FLUSH, &tmp);

	spin_unlock(&conf->pending_bios_lock);

	dispatch_bio_list(&tmp);
}

static void
raid5_end_read_request(struct bio *bi);
static void
raid5_end_write_request(struct bio *bi);

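/*
 * Issue the reads and writes that have been scheduled on each device of
 * the stripe (and of every stripe in its batch), choosing between the
 * rdev and its replacement, honouring bad-block records, and optionally
 * deferring writes for batched dispatch.
 */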
static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
{
	struct r5conf *conf = sh->raid_conf;
	int i, disks = sh->disks;
	struct stripe_head *head_sh = sh;
	struct bio_list pending_bios = BIO_EMPTY_LIST;
	bool should_defer;

	might_sleep();

	if (log_stripe(sh, s) == 0)
		return;

	should_defer = conf->batch_bio_dispatch && conf->group_cnt;

	for (i = disks; i--; ) {
		int op, op_flags = 0;
		int replace_only = 0;
		struct bio *bi, *rbi;
		struct md_rdev *rdev, *rrdev = NULL;

		sh = head_sh;
		if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
			op = REQ_OP_WRITE;
			if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags))
				op_flags = REQ_FUA;
			if (test_bit(R5_Discard, &sh->dev[i].flags))
				op = REQ_OP_DISCARD;
		} else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
			op = REQ_OP_READ;
		else if (test_and_clear_bit(R5_WantReplace,
					    &sh->dev[i].flags)) {
			op = REQ_OP_WRITE;
			replace_only = 1;
		} else
			continue;
		if (test_and_clear_bit(R5_SyncIO, &sh->dev[i].flags))
			op_flags |= REQ_SYNC;

again:
		bi = &sh->dev[i].req;
		rbi = &sh->dev[i].rreq; /* For writing to replacement */

		rcu_read_lock();
		rrdev = rcu_dereference(conf->disks[i].replacement);
		smp_mb(); /* Ensure that if rrdev is NULL, rdev won't be */
		rdev = rcu_dereference(conf->disks[i].rdev);
		if (!rdev) {
			rdev = rrdev;
			rrdev = NULL;
		}
		if (op_is_write(op)) {
			if (replace_only)
				rdev = NULL;
			if (rdev == rrdev)
				/* We raced and saw duplicates */
				rrdev = NULL;
		} else {
			if (test_bit(R5_ReadRepl, &head_sh->dev[i].flags) && rrdev)
				rdev = rrdev;
			rrdev = NULL;
		}

		if (rdev && test_bit(Faulty, &rdev->flags))
			rdev = NULL;
		if (rdev)
			atomic_inc(&rdev->nr_pending);
		if (rrdev && test_bit(Faulty, &rrdev->flags))
			rrdev = NULL;
		if (rrdev)
			atomic_inc(&rrdev->nr_pending);
		rcu_read_unlock();

		/* We have already checked bad blocks for reads.  Now
		 * need to check for writes.  We never accept write errors
		 * on the replacement, so we don't need to check rrdev.
		 */
		while (op_is_write(op) && rdev &&
		       test_bit(WriteErrorSeen, &rdev->flags)) {
			sector_t first_bad;
			int bad_sectors;
			int bad = is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf),
					      &first_bad, &bad_sectors);
			if (!bad)
				break;

			if (bad < 0) {
				set_bit(BlockedBadBlocks, &rdev->flags);
				if (!conf->mddev->external &&
				    conf->mddev->sb_flags) {
					/* It is very unlikely, but we might
					 * still need to write out the
					 * bad block log - better give it
					 * a chance */
					md_check_recovery(conf->mddev);
				}
				/*
				 * Because md_wait_for_blocked_rdev
				 * will dec nr_pending, we must
				 * increment it first.
				 */
				atomic_inc(&rdev->nr_pending);
				md_wait_for_blocked_rdev(rdev, conf->mddev);
			} else {
				/* Acknowledged bad block - skip the write */
				rdev_dec_pending(rdev, conf->mddev);
				rdev = NULL;
			}
		}

		if (rdev) {
			if (s->syncing || s->expanding || s->expanded
			    || s->replacing)
				md_sync_acct(rdev->bdev, RAID5_STRIPE_SECTORS(conf));

			set_bit(STRIPE_IO_STARTED, &sh->state);

			bio_set_dev(bi, rdev->bdev);
			bio_set_op_attrs(bi, op, op_flags);
			bi->bi_end_io = op_is_write(op)
				? raid5_end_write_request
				: raid5_end_read_request;
			bi->bi_private = sh;

			pr_debug("%s: for %llu schedule op %d on disc %d\n",
				 __func__, (unsigned long long)sh->sector,
				 bi->bi_opf, i);
			atomic_inc(&sh->count);
			if (sh != head_sh)
				atomic_inc(&head_sh->count);
			if (use_new_offset(conf, sh))
				bi->bi_iter.bi_sector = (sh->sector
						 + rdev->new_data_offset);
			else
				bi->bi_iter.bi_sector = (sh->sector
						 + rdev->data_offset);
			if (test_bit(R5_ReadNoMerge, &head_sh->dev[i].flags))
				bi->bi_opf |= REQ_NOMERGE;

			if (test_bit(R5_SkipCopy, &sh->dev[i].flags))
				WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));

			if (!op_is_write(op) &&
			    test_bit(R5_InJournal, &sh->dev[i].flags))
				/*
				 * issuing read for a page in journal, this
				 * must be preparing for prexor in rmw; read
				 * the data into orig_page
				 */
				sh->dev[i].vec.bv_page = sh->dev[i].orig_page;
			else
				sh->dev[i].vec.bv_page = sh->dev[i].page;
			bi->bi_vcnt = 1;
			bi->bi_io_vec[0].bv_len = RAID5_STRIPE_SIZE(conf);
			bi->bi_io_vec[0].bv_offset = 0;
			bi->bi_iter.bi_size = RAID5_STRIPE_SIZE(conf);
			bi->bi_write_hint = sh->dev[i].write_hint;
			if (!rrdev)
				sh->dev[i].write_hint = RWH_WRITE_LIFE_NOT_SET;
			/*
			 * If this is discard request, set bi_vcnt 0. We don't
			 * want to confuse SCSI because SCSI will replace payload
			 */
			if (op == REQ_OP_DISCARD)
				bi->bi_vcnt = 0;
			if (rrdev)
				set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags);

			if (conf->mddev->gendisk)
				trace_block_bio_remap(bi->bi_disk->queue,
						      bi, disk_devt(conf->mddev->gendisk),
						      sh->dev[i].sector);
			if (should_defer && op_is_write(op))
				bio_list_add(&pending_bios, bi);
			else
				submit_bio_noacct(bi);
		}
		if (rrdev) {
			if (s->syncing || s->expanding || s->expanded
			    || s->replacing)
				md_sync_acct(rrdev->bdev, RAID5_STRIPE_SECTORS(conf));

			set_bit(STRIPE_IO_STARTED, &sh->state);

			bio_set_dev(rbi, rrdev->bdev);
			bio_set_op_attrs(rbi, op, op_flags);
			BUG_ON(!op_is_write(op));
			rbi->bi_end_io = raid5_end_write_request;
			rbi->bi_private = sh;

			pr_debug("%s: for %llu schedule op %d on "
				 "replacement disc %d\n",
				 __func__, (unsigned long long)sh->sector,
				 rbi->bi_opf, i);
			atomic_inc(&sh->count);
			if (sh != head_sh)
				atomic_inc(&head_sh->count);
			if (use_new_offset(conf, sh))
				rbi->bi_iter.bi_sector = (sh->sector
						  + rrdev->new_data_offset);
			else
				rbi->bi_iter.bi_sector = (sh->sector
						  + rrdev->data_offset);
			if (test_bit(R5_SkipCopy, &sh->dev[i].flags))
				WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
			sh->dev[i].rvec.bv_page = sh->dev[i].page;
			rbi->bi_vcnt = 1;
			rbi->bi_io_vec[0].bv_len = RAID5_STRIPE_SIZE(conf);
			rbi->bi_io_vec[0].bv_offset = 0;
			rbi->bi_iter.bi_size = RAID5_STRIPE_SIZE(conf);
			rbi->bi_write_hint = sh->dev[i].write_hint;
			sh->dev[i].write_hint = RWH_WRITE_LIFE_NOT_SET;
			/*
			 * If this is discard request, set bi_vcnt 0. We don't
			 * want to confuse SCSI because SCSI will replace payload
			 */
			if (op == REQ_OP_DISCARD)
				rbi->bi_vcnt = 0;
			if (conf->mddev->gendisk)
				trace_block_bio_remap(rbi->bi_disk->queue,
						      rbi, disk_devt(conf->mddev->gendisk),
						      sh->dev[i].sector);
			if (should_defer && op_is_write(op))
				bio_list_add(&pending_bios, rbi);
			else
				submit_bio_noacct(rbi);
		}
		if (!rdev && !rrdev) {
			if (op_is_write(op))
				set_bit(STRIPE_DEGRADED, &sh->state);
			pr_debug("skip op %d on disc %d for sector %llu\n",
				 bi->bi_opf, i, (unsigned long long)sh->sector);
			clear_bit(R5_LOCKED, &sh->dev[i].flags);
			set_bit(STRIPE_HANDLE, &sh->state);
		}

		if (!head_sh->batch_head)
			continue;
		sh = list_first_entry(&sh->batch_list, struct stripe_head,
				      batch_list);
		if (sh != head_sh)
			goto again;
	}

	if (should_defer && !bio_list_empty(&pending_bios))
		defer_issue_bios(conf, head_sh->sector, &pending_bios);
}

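/*
 * Copy data between the pages of a bio and a stripe page, in either
 * direction, using the async_tx API so the copies can be offloaded and
 * chained behind @tx.  With skip_copy enabled a full-page write may
 * simply steal the bio's page instead of copying it.
 */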
static struct dma_async_tx_descriptor *
async_copy_data(int frombio, struct bio *bio, struct page **page,
	sector_t sector, struct dma_async_tx_descriptor *tx,
	struct stripe_head *sh, int no_skipcopy)
{
	struct bio_vec bvl;
	struct bvec_iter iter;
	struct page *bio_page;
	int page_offset;
	struct async_submit_ctl submit;
	enum async_tx_flags flags = 0;
	struct r5conf *conf = sh->raid_conf;

	if (bio->bi_iter.bi_sector >= sector)
		page_offset = (signed)(bio->bi_iter.bi_sector - sector) * 512;
	else
		page_offset = (signed)(sector - bio->bi_iter.bi_sector) * -512;

	if (frombio)
		flags |= ASYNC_TX_FENCE;
	init_async_submit(&submit, flags, tx, NULL, NULL, NULL);

	bio_for_each_segment(bvl, bio, iter) {
		int len = bvl.bv_len;
		int clen;
		int b_offset = 0;

		if (page_offset < 0) {
			b_offset = -page_offset;
			page_offset += b_offset;
			len -= b_offset;
		}

		if (len > 0 && page_offset + len > RAID5_STRIPE_SIZE(conf))
			clen = RAID5_STRIPE_SIZE(conf) - page_offset;
		else
			clen = len;

		if (clen > 0) {
			b_offset += bvl.bv_offset;
			bio_page = bvl.bv_page;
			if (frombio) {
				if (conf->skip_copy &&
				    b_offset == 0 && page_offset == 0 &&
				    clen == RAID5_STRIPE_SIZE(conf) &&
				    !no_skipcopy)
					*page = bio_page;
				else
					tx = async_memcpy(*page, bio_page, page_offset,
							  b_offset, clen, &submit);
			} else
				tx = async_memcpy(bio_page, *page, b_offset,
						  page_offset, clen, &submit);
		}
		/* chain the operations */
		submit.depend_tx = tx;

		if (clen < len) /* hit end of page */
			break;
		page_offset += len;
	}

	return tx;
}

static void ops_complete_biofill(void *stripe_head_ref)
{
	struct stripe_head *sh = stripe_head_ref;
	int i;
	struct r5conf *conf = sh->raid_conf;

	pr_debug("%s: stripe %llu\n", __func__,
		 (unsigned long long)sh->sector);

	/* clear completed biofills */
	for (i = sh->disks; i--; ) {
		struct r5dev *dev = &sh->dev[i];

		/* acknowledge completion of a biofill operation */
		/* and check if we need to reply to a read request,
		 * new R5_Wantfill requests are held off until
		 * !STRIPE_BIOFILL_RUN
		 */
		if (test_and_clear_bit(R5_Wantfill, &dev->flags)) {
			struct bio *rbi, *rbi2;

			BUG_ON(!dev->read);
			rbi = dev->read;
			dev->read = NULL;
			while (rbi && rbi->bi_iter.bi_sector <
				dev->sector + RAID5_STRIPE_SECTORS(conf)) {
				rbi2 = r5_next_bio(conf, rbi, dev->sector);
				bio_endio(rbi);
				rbi = rbi2;
			}
		}
	}
	clear_bit(STRIPE_BIOFILL_RUN, &sh->state);

	set_bit(STRIPE_HANDLE, &sh->state);
	raid5_release_stripe(sh);
}

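/*
 * Kick off async copies that satisfy queued read requests (R5_Wantfill)
 * from the stripe's pages into the waiting bios, completing via
 * ops_complete_biofill().
 */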
static void ops_run_biofill(struct stripe_head *sh)
{
	struct dma_async_tx_descriptor *tx = NULL;
	struct async_submit_ctl submit;
	int i;
	struct r5conf *conf = sh->raid_conf;

	BUG_ON(sh->batch_head);
	pr_debug("%s: stripe %llu\n", __func__,
		 (unsigned long long)sh->sector);

	for (i = sh->disks; i--; ) {
		struct r5dev *dev = &sh->dev[i];
		if (test_bit(R5_Wantfill, &dev->flags)) {
			struct bio *rbi;
			spin_lock_irq(&sh->stripe_lock);
			dev->read = rbi = dev->toread;
			dev->toread = NULL;
			spin_unlock_irq(&sh->stripe_lock);
			while (rbi && rbi->bi_iter.bi_sector <
				dev->sector + RAID5_STRIPE_SECTORS(conf)) {
				tx = async_copy_data(0, rbi, &dev->page,
						     dev->sector, tx, sh, 0);
				rbi = r5_next_bio(conf, rbi, dev->sector);
			}
		}
	}

	atomic_inc(&sh->count);
	init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_biofill, sh, NULL);
	async_trigger_callback(&submit);
}

static void mark_target_uptodate(struct stripe_head *sh, int target)
{
	struct r5dev *tgt;

	if (target < 0)
		return;

	tgt = &sh->dev[target];
	set_bit(R5_UPTODATE, &tgt->flags);
	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
	clear_bit(R5_Wantcompute, &tgt->flags);
}

static void ops_complete_compute(void *stripe_head_ref)
{
	struct stripe_head *sh = stripe_head_ref;

	pr_debug("%s: stripe %llu\n", __func__,
		 (unsigned long long)sh->sector);

	/* mark the computed target(s) as uptodate */
	mark_target_uptodate(sh, sh->ops.target);
	mark_target_uptodate(sh, sh->ops.target2);

	clear_bit(STRIPE_COMPUTE_RUN, &sh->state);
	if (sh->check_state == check_state_compute_run)
		sh->check_state = check_state_compute_result;
	set_bit(STRIPE_HANDLE, &sh->state);
	raid5_release_stripe(sh);
}

/* return a pointer to the address conversion region of the scribble buffer */
static struct page **to_addr_page(struct raid5_percpu *percpu, int i)
{
	return percpu->scribble + i * percpu->scribble_obj_size;
}

/* return a pointer to the address conversion region of the scribble buffer */
static addr_conv_t *to_addr_conv(struct stripe_head *sh,
				 struct raid5_percpu *percpu, int i)
{
	return (void *) (to_addr_page(percpu, i) + sh->disks + 2);
}

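/*
 * Rebuild a single missing block for RAID5 by XOR-ing the remaining
 * blocks of the stripe into the target page.
 */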
static struct dma_async_tx_descriptor *
ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
{
	int disks = sh->disks;
	struct page **xor_srcs = to_addr_page(percpu, 0);
	int target = sh->ops.target;
	struct r5dev *tgt = &sh->dev[target];
	struct page *xor_dest = tgt->page;
	int count = 0;
	struct dma_async_tx_descriptor *tx;
	struct async_submit_ctl submit;
	int i;

	BUG_ON(sh->batch_head);

	pr_debug("%s: stripe %llu block: %d\n",
		 __func__, (unsigned long long)sh->sector, target);
	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));

	for (i = disks; i--; )
		if (i != target)
			xor_srcs[count++] = sh->dev[i].page;

	atomic_inc(&sh->count);

	init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL,
			  ops_complete_compute, sh, to_addr_conv(sh, percpu, 0));
	if (unlikely(count == 1))
		tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0,
				RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
	else
		tx = async_xor(xor_dest, xor_srcs, 0, count,
				RAID5_STRIPE_SIZE(sh->raid_conf), &submit);

	return tx;
}

/* set_syndrome_sources - populate source buffers for gen_syndrome
 * @srcs - (struct page *) array of size sh->disks
 * @sh - stripe_head to parse
 * @srctype - which subset of sources to populate
 *
 * Populates srcs in proper layout order for the stripe and returns the
 * 'count' of sources to be used in a call to async_gen_syndrome.  The P
 * destination buffer is recorded in srcs[count] and the Q destination
 * is recorded in srcs[count+1].
 */
static int set_syndrome_sources(struct page **srcs,
				struct stripe_head *sh,
				int srctype)
{
	int disks = sh->disks;
	int syndrome_disks = sh->ddf_layout ? disks : (disks - 2);
	int d0_idx = raid6_d0(sh);
	int count;
	int i;

	for (i = 0; i < disks; i++)
		srcs[i] = NULL;

	count = 0;
	i = d0_idx;
	do {
		int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
		struct r5dev *dev = &sh->dev[i];

		if (i == sh->qd_idx || i == sh->pd_idx ||
		    (srctype == SYNDROME_SRC_ALL) ||
		    (srctype == SYNDROME_SRC_WANT_DRAIN &&
		     (test_bit(R5_Wantdrain, &dev->flags) ||
		      test_bit(R5_InJournal, &dev->flags))) ||
		    (srctype == SYNDROME_SRC_WRITTEN &&
		     (dev->written ||
		      test_bit(R5_InJournal, &dev->flags)))) {
			if (test_bit(R5_InJournal, &dev->flags))
				srcs[slot] = sh->dev[i].orig_page;
			else
				srcs[slot] = sh->dev[i].page;
		}
		i = raid6_next_disk(i, disks);
	} while (i != d0_idx);

	return syndrome_disks;
}

static struct dma_async_tx_descriptor *
ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
{
	int disks = sh->disks;
	struct page **blocks = to_addr_page(percpu, 0);
	int target;
	int qd_idx = sh->qd_idx;
	struct dma_async_tx_descriptor *tx;
	struct async_submit_ctl submit;
	struct r5dev *tgt;
	struct page *dest;
	int i;
	int count;

	BUG_ON(sh->batch_head);
	if (sh->ops.target < 0)
		target = sh->ops.target2;
	else if (sh->ops.target2 < 0)
		target = sh->ops.target;
	else
		/* we should only have one valid target */
		BUG();
	BUG_ON(target < 0);
	pr_debug("%s: stripe %llu block: %d\n",
		 __func__, (unsigned long long)sh->sector, target);

	tgt = &sh->dev[target];
	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
	dest = tgt->page;

	atomic_inc(&sh->count);

	if (target == qd_idx) {
		count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL);
		blocks[count] = NULL; /* regenerating p is not necessary */
		BUG_ON(blocks[count+1] != dest); /* q should already be set */
		init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
				  ops_complete_compute, sh,
				  to_addr_conv(sh, percpu, 0));
		tx = async_gen_syndrome(blocks, 0, count+2,
				RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
	} else {
		/* Compute any data- or p-drive using XOR */
		count = 0;
		for (i = disks; i-- ; ) {
			if (i == target || i == qd_idx)
				continue;
			blocks[count++] = sh->dev[i].page;
		}

		init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
				  NULL, ops_complete_compute, sh,
				  to_addr_conv(sh, percpu, 0));
		tx = async_xor(dest, blocks, 0, count,
				RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
	}

	return tx;
}

static struct dma_async_tx_descriptor *
ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
{
	int i, count, disks = sh->disks;
	int syndrome_disks = sh->ddf_layout ? disks : disks-2;
	int d0_idx = raid6_d0(sh);
	int faila = -1, failb = -1;
	int target = sh->ops.target;
	int target2 = sh->ops.target2;
	struct r5dev *tgt = &sh->dev[target];
	struct r5dev *tgt2 = &sh->dev[target2];
	struct dma_async_tx_descriptor *tx;
	struct page **blocks = to_addr_page(percpu, 0);
	struct async_submit_ctl submit;

	BUG_ON(sh->batch_head);
	pr_debug("%s: stripe %llu block1: %d block2: %d\n",
		 __func__, (unsigned long long)sh->sector, target, target2);
	BUG_ON(target < 0 || target2 < 0);
	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
	BUG_ON(!test_bit(R5_Wantcompute, &tgt2->flags));

	/* we need to open-code set_syndrome_sources to handle the
	 * slot number conversion for 'faila' and 'failb'
	 */
	for (i = 0; i < disks ; i++)
		blocks[i] = NULL;
	count = 0;
	i = d0_idx;
	do {
		int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);

		blocks[slot] = sh->dev[i].page;

		if (i == target)
			faila = slot;
		if (i == target2)
			failb = slot;
		i = raid6_next_disk(i, disks);
	} while (i != d0_idx);

	BUG_ON(faila == failb);
	if (failb < faila)
		swap(faila, failb);
	pr_debug("%s: stripe: %llu faila: %d failb: %d\n",
		 __func__, (unsigned long long)sh->sector, faila, failb);

	atomic_inc(&sh->count);

	if (failb == syndrome_disks+1) {
		/* Q disk is one of the missing disks */
		if (faila == syndrome_disks) {
			/* Missing P+Q, just recompute */
			init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
					  ops_complete_compute, sh,
					  to_addr_conv(sh, percpu, 0));
			return async_gen_syndrome(blocks, 0, syndrome_disks+2,
						  RAID5_STRIPE_SIZE(sh->raid_conf),
						  &submit);
		} else {
			struct page *dest;
			int data_target;
			int qd_idx = sh->qd_idx;

			/* Missing D+Q: recompute D from P, then recompute Q */
			if (target == qd_idx)
				data_target = target2;
			else
				data_target = target;

			count = 0;
			for (i = disks; i-- ; ) {
				if (i == data_target || i == qd_idx)
					continue;
				blocks[count++] = sh->dev[i].page;
			}
			dest = sh->dev[data_target].page;
			init_async_submit(&submit,
					  ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
					  NULL, NULL, NULL,
					  to_addr_conv(sh, percpu, 0));
			tx = async_xor(dest, blocks, 0, count,
				       RAID5_STRIPE_SIZE(sh->raid_conf),
				       &submit);

			count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL);
			init_async_submit(&submit, ASYNC_TX_FENCE, tx,
					  ops_complete_compute, sh,
					  to_addr_conv(sh, percpu, 0));
			return async_gen_syndrome(blocks, 0, count+2,
						  RAID5_STRIPE_SIZE(sh->raid_conf),
						  &submit);
		}
	} else {
		init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
				  ops_complete_compute, sh,
				  to_addr_conv(sh, percpu, 0));
		if (failb == syndrome_disks) {
			/* We're missing D+P. */
			return async_raid6_datap_recov(syndrome_disks+2,
						RAID5_STRIPE_SIZE(sh->raid_conf),
						faila,
						blocks, &submit);
		} else {
			/* We're missing D+D. */
			return async_raid6_2data_recov(syndrome_disks+2,
						RAID5_STRIPE_SIZE(sh->raid_conf),
						faila, failb,
						blocks, &submit);
		}
	}
}

static void ops_complete_prexor(void *stripe_head_ref)
{
	struct stripe_head *sh = stripe_head_ref;

	pr_debug("%s: stripe %llu\n", __func__,
		 (unsigned long long)sh->sector);

	if (r5c_is_writeback(sh->raid_conf->log))
		/*
		 * raid5-cache write back uses orig_page during prexor.
		 * After prexor, it is time to free orig_page
		 */
		r5c_release_extra_page(sh);
}

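/*
 * Prexor: XOR the old contents of the to-be-overwritten blocks (and the
 * old parity) into the parity page, so that the subsequent drain and
 * reconstruct steps can complete a read-modify-write cycle.
 */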
static struct dma_async_tx_descriptor *
ops_run_prexor5(struct stripe_head *sh, struct raid5_percpu *percpu,
		struct dma_async_tx_descriptor *tx)
{
	int disks = sh->disks;
	struct page **xor_srcs = to_addr_page(percpu, 0);
	int count = 0, pd_idx = sh->pd_idx, i;
	struct async_submit_ctl submit;

	/* existing parity data subtracted */
	struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;

	BUG_ON(sh->batch_head);
	pr_debug("%s: stripe %llu\n", __func__,
		 (unsigned long long)sh->sector);

	for (i = disks; i--; ) {
		struct r5dev *dev = &sh->dev[i];
		/* Only process blocks that are known to be uptodate */
		if (test_bit(R5_InJournal, &dev->flags))
			xor_srcs[count++] = dev->orig_page;
		else if (test_bit(R5_Wantdrain, &dev->flags))
			xor_srcs[count++] = dev->page;
	}

	init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
			  ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0));
	tx = async_xor(xor_dest, xor_srcs, 0, count,
			RAID5_STRIPE_SIZE(sh->raid_conf), &submit);

	return tx;
}

static struct dma_async_tx_descriptor *
ops_run_prexor6(struct stripe_head *sh, struct raid5_percpu *percpu,
		struct dma_async_tx_descriptor *tx)
{
	struct page **blocks = to_addr_page(percpu, 0);
	int count;
	struct async_submit_ctl submit;

	pr_debug("%s: stripe %llu\n", __func__,
		 (unsigned long long)sh->sector);

	count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_WANT_DRAIN);

	init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_PQ_XOR_DST, tx,
			  ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0));
	tx = async_gen_syndrome(blocks, 0, count+2,
			RAID5_STRIPE_SIZE(sh->raid_conf), &submit);

	return tx;
}

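/*
 * Drain queued write bios into the stripe pages: for every device with
 * R5_Wantdrain set, detach dev->towrite, copy (or skip-copy) its data
 * into dev->page, and repeat for each stripe in the batch.
 */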
static struct dma_async_tx_descriptor *
ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
{
	struct r5conf *conf = sh->raid_conf;
	int disks = sh->disks;
	int i;
	struct stripe_head *head_sh = sh;

	pr_debug("%s: stripe %llu\n", __func__,
		 (unsigned long long)sh->sector);

	for (i = disks; i--; ) {
		struct r5dev *dev;
		struct bio *chosen;

		sh = head_sh;
		if (test_and_clear_bit(R5_Wantdrain, &head_sh->dev[i].flags)) {
			struct bio *wbi;

again:
			dev = &sh->dev[i];
			/*
			 * clear R5_InJournal, so when rewriting a page in
			 * journal, it is not skipped by r5l_log_stripe()
			 */
			clear_bit(R5_InJournal, &dev->flags);
			spin_lock_irq(&sh->stripe_lock);
			chosen = dev->towrite;
			dev->towrite = NULL;
			sh->overwrite_disks = 0;
			BUG_ON(dev->written);
			wbi = dev->written = chosen;
			spin_unlock_irq(&sh->stripe_lock);
			WARN_ON(dev->page != dev->orig_page);

			while (wbi && wbi->bi_iter.bi_sector <
				dev->sector + RAID5_STRIPE_SECTORS(conf)) {
				if (wbi->bi_opf & REQ_FUA)
					set_bit(R5_WantFUA, &dev->flags);
				if (wbi->bi_opf & REQ_SYNC)
					set_bit(R5_SyncIO, &dev->flags);
				if (bio_op(wbi) == REQ_OP_DISCARD)
					set_bit(R5_Discard, &dev->flags);
				else {
					tx = async_copy_data(1, wbi, &dev->page,
							     dev->sector, tx, sh,
							     r5c_is_writeback(conf->log));
					if (dev->page != dev->orig_page &&
					    !r5c_is_writeback(conf->log)) {
						set_bit(R5_SkipCopy, &dev->flags);
						clear_bit(R5_UPTODATE, &dev->flags);
						clear_bit(R5_OVERWRITE, &dev->flags);
					}
				}
				wbi = r5_next_bio(conf, wbi, dev->sector);
			}

			if (head_sh->batch_head) {
				sh = list_first_entry(&sh->batch_list,
						      struct stripe_head,
						      batch_list);
				if (sh == head_sh)
					continue;
				goto again;
			}
		}
	}

	return tx;
}

static void ops_complete_reconstruct(void *stripe_head_ref)
{
	struct stripe_head *sh = stripe_head_ref;
	int disks = sh->disks;
	int pd_idx = sh->pd_idx;
	int qd_idx = sh->qd_idx;
	int i;
	bool fua = false, sync = false, discard = false;

	pr_debug("%s: stripe %llu\n", __func__,
		 (unsigned long long)sh->sector);

	for (i = disks; i--; ) {
		fua |= test_bit(R5_WantFUA, &sh->dev[i].flags);
		sync |= test_bit(R5_SyncIO, &sh->dev[i].flags);
		discard |= test_bit(R5_Discard, &sh->dev[i].flags);
	}

	for (i = disks; i--; ) {
		struct r5dev *dev = &sh->dev[i];

		if (dev->written || i == pd_idx || i == qd_idx) {
			if (!discard && !test_bit(R5_SkipCopy, &dev->flags)) {
				set_bit(R5_UPTODATE, &dev->flags);
				if (test_bit(STRIPE_EXPAND_READY, &sh->state))
					set_bit(R5_Expanded, &dev->flags);
			}
			if (fua)
				set_bit(R5_WantFUA, &dev->flags);
			if (sync)
				set_bit(R5_SyncIO, &dev->flags);
		}
	}

	if (sh->reconstruct_state == reconstruct_state_drain_run)
		sh->reconstruct_state = reconstruct_state_drain_result;
	else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run)
		sh->reconstruct_state = reconstruct_state_prexor_drain_result;
	else {
		BUG_ON(sh->reconstruct_state != reconstruct_state_run);
		sh->reconstruct_state = reconstruct_state_result;
	}

	set_bit(STRIPE_HANDLE, &sh->state);
	raid5_release_stripe(sh);
}

static void
ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu,
		     struct dma_async_tx_descriptor *tx)
{
	int disks = sh->disks;
	struct page **xor_srcs;
	struct async_submit_ctl submit;
	int count, pd_idx = sh->pd_idx, i;
	struct page *xor_dest;
	int prexor = 0;
	unsigned long flags;
	int j = 0;
	struct stripe_head *head_sh = sh;
	int last_stripe;

	pr_debug("%s: stripe %llu\n", __func__,
		 (unsigned long long)sh->sector);

	for (i = 0; i < sh->disks; i++) {
		if (pd_idx == i)
			continue;
		if (!test_bit(R5_Discard, &sh->dev[i].flags))
			break;
	}
	if (i >= sh->disks) {
		atomic_inc(&sh->count);
		set_bit(R5_Discard, &sh->dev[pd_idx].flags);
		ops_complete_reconstruct(sh);
		return;
	}
again:
	count = 0;
	xor_srcs = to_addr_page(percpu, j);
	/* check if prexor is active which means only process blocks
	 * that are part of a read-modify-write (written)
	 */
	if (head_sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
		prexor = 1;
		xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
		for (i = disks; i--; ) {
			struct r5dev *dev = &sh->dev[i];
			if (head_sh->dev[i].written ||
			    test_bit(R5_InJournal, &head_sh->dev[i].flags))
				xor_srcs[count++] = dev->page;
		}
	} else {
		xor_dest = sh->dev[pd_idx].page;
		for (i = disks; i--; ) {
			struct r5dev *dev = &sh->dev[i];
			if (i != pd_idx)
				xor_srcs[count++] = dev->page;
		}
	}

	/* 1/ if we prexor'd then the dest is reused as a source
	 * 2/ if we did not prexor then we are redoing the parity
	 * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST
	 * for the synchronous xor case
	 */
	last_stripe = !head_sh->batch_head ||
		list_first_entry(&sh->batch_list,
				 struct stripe_head, batch_list) == head_sh;
	if (last_stripe) {
		flags = ASYNC_TX_ACK |
			(prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST);

		atomic_inc(&head_sh->count);
		init_async_submit(&submit, flags, tx, ops_complete_reconstruct, head_sh,
				  to_addr_conv(sh, percpu, j));
	} else {
		flags = prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST;
		init_async_submit(&submit, flags, tx, NULL, NULL,
				  to_addr_conv(sh, percpu, j));
	}

	if (unlikely(count == 1))
		tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0,
				RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
	else
		tx = async_xor(xor_dest, xor_srcs, 0, count,
				RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
	if (!last_stripe) {
		j++;
		sh = list_first_entry(&sh->batch_list, struct stripe_head,
				      batch_list);
		goto again;
	}
}

static void
ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
		     struct dma_async_tx_descriptor *tx)
{
	struct async_submit_ctl submit;
	struct page **blocks;
	int count, i, j = 0;
	struct stripe_head *head_sh = sh;
	int last_stripe;
	int synflags;
	unsigned long txflags;

	pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);

	for (i = 0; i < sh->disks; i++) {
		if (sh->pd_idx == i || sh->qd_idx == i)
			continue;
		if (!test_bit(R5_Discard, &sh->dev[i].flags))
			break;
	}
	if (i >= sh->disks) {
		atomic_inc(&sh->count);
		set_bit(R5_Discard, &sh->dev[sh->pd_idx].flags);
		set_bit(R5_Discard, &sh->dev[sh->qd_idx].flags);
		ops_complete_reconstruct(sh);
		return;
	}

again:
	blocks = to_addr_page(percpu, j);

	if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
		synflags = SYNDROME_SRC_WRITTEN;
		txflags = ASYNC_TX_ACK | ASYNC_TX_PQ_XOR_DST;
	} else {
		synflags = SYNDROME_SRC_ALL;
		txflags = ASYNC_TX_ACK;
	}

	count = set_syndrome_sources(blocks, sh, synflags);
	last_stripe = !head_sh->batch_head ||
		list_first_entry(&sh->batch_list,
				 struct stripe_head, batch_list) == head_sh;

	if (last_stripe) {
		atomic_inc(&head_sh->count);
		init_async_submit(&submit, txflags, tx, ops_complete_reconstruct,
				  head_sh, to_addr_conv(sh, percpu, j));
	} else
		init_async_submit(&submit, 0, tx, NULL, NULL,
				  to_addr_conv(sh, percpu, j));
	tx = async_gen_syndrome(blocks, 0, count+2,
			RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
	if (!last_stripe) {
		j++;
		sh = list_first_entry(&sh->batch_list, struct stripe_head,
				      batch_list);
		goto again;
	}
}

static void ops_complete_check(void *stripe_head_ref)
{
	struct stripe_head *sh = stripe_head_ref;

	pr_debug("%s: stripe %llu\n", __func__,
		 (unsigned long long)sh->sector);

	sh->check_state = check_state_check_result;
	set_bit(STRIPE_HANDLE, &sh->state);
	raid5_release_stripe(sh);
}

static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu)
{
	int disks = sh->disks;
	int pd_idx = sh->pd_idx;
	int qd_idx = sh->qd_idx;
	struct page *xor_dest;
	struct page **xor_srcs = to_addr_page(percpu, 0);
	struct dma_async_tx_descriptor *tx;
	struct async_submit_ctl submit;
	int count;
	int i;

	pr_debug("%s: stripe %llu\n", __func__,
		 (unsigned long long)sh->sector);

	BUG_ON(sh->batch_head);
	count = 0;
	xor_dest = sh->dev[pd_idx].page;
	xor_srcs[count++] = xor_dest;
	for (i = disks; i--; ) {
		if (i == pd_idx || i == qd_idx)
			continue;
		xor_srcs[count++] = sh->dev[i].page;
	}

	init_async_submit(&submit, 0, NULL, NULL, NULL,
			  to_addr_conv(sh, percpu, 0));
	tx = async_xor_val(xor_dest, xor_srcs, 0, count,
			   RAID5_STRIPE_SIZE(sh->raid_conf),
			   &sh->ops.zero_sum_result, &submit);

	atomic_inc(&sh->count);
	init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_check, sh, NULL);
	tx = async_trigger_callback(&submit);
}

static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp)
{
	struct page **srcs = to_addr_page(percpu, 0);
	struct async_submit_ctl submit;
	int count;

	pr_debug("%s: stripe %llu checkp: %d\n", __func__,
		 (unsigned long long)sh->sector, checkp);

	BUG_ON(sh->batch_head);
	count = set_syndrome_sources(srcs, sh, SYNDROME_SRC_ALL);
	if (!checkp)
		srcs[count] = NULL;

	atomic_inc(&sh->count);
	init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check,
			  sh, to_addr_conv(sh, percpu, 0));
	async_syndrome_val(srcs, 0, count+2,
			   RAID5_STRIPE_SIZE(sh->raid_conf),
			   &sh->ops.zero_sum_result, percpu->spare_page, &submit);
}

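/*
 * Run the set of asynchronous stripe operations requested in
 * @ops_request (biofill, compute, prexor, drain, reconstruct, check),
 * chaining them through the per-cpu scribble buffers, then clear any
 * R5_Overlap waiters.
 */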
2071static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
2072{
2073 int overlap_clear = 0, i, disks = sh->disks;
2074 struct dma_async_tx_descriptor *tx = NULL;
2075 struct r5conf *conf = sh->raid_conf;
2076 int level = conf->level;
2077 struct raid5_percpu *percpu;
2078 unsigned long cpu;
2079
2080 cpu = get_cpu();
2081 percpu = per_cpu_ptr(conf->percpu, cpu);
2082 if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
2083 ops_run_biofill(sh);
2084 overlap_clear++;
2085 }
2086
2087 if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) {
2088 if (level < 6)
2089 tx = ops_run_compute5(sh, percpu);
2090 else {
2091 if (sh->ops.target2 < 0 || sh->ops.target < 0)
2092 tx = ops_run_compute6_1(sh, percpu);
2093 else
2094 tx = ops_run_compute6_2(sh, percpu);
2095 }
2096
2097 if (tx && !test_bit(STRIPE_OP_RECONSTRUCT, &ops_request))
2098 async_tx_ack(tx);
2099 }
2100
2101 if (test_bit(STRIPE_OP_PREXOR, &ops_request)) {
2102 if (level < 6)
2103 tx = ops_run_prexor5(sh, percpu, tx);
2104 else
2105 tx = ops_run_prexor6(sh, percpu, tx);
2106 }
2107
2108 if (test_bit(STRIPE_OP_PARTIAL_PARITY, &ops_request))
2109 tx = ops_run_partial_parity(sh, percpu, tx);
2110
2111 if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) {
2112 tx = ops_run_biodrain(sh, tx);
2113 overlap_clear++;
2114 }
2115
2116 if (test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) {
2117 if (level < 6)
2118 ops_run_reconstruct5(sh, percpu, tx);
2119 else
2120 ops_run_reconstruct6(sh, percpu, tx);
2121 }
2122
2123 if (test_bit(STRIPE_OP_CHECK, &ops_request)) {
2124 if (sh->check_state == check_state_run)
2125 ops_run_check_p(sh, percpu);
2126 else if (sh->check_state == check_state_run_q)
2127 ops_run_check_pq(sh, percpu, 0);
2128 else if (sh->check_state == check_state_run_pq)
2129 ops_run_check_pq(sh, percpu, 1);
2130 else
2131 BUG();
2132 }
2133
2134 if (overlap_clear && !sh->batch_head)
2135 for (i = disks; i--; ) {
2136 struct r5dev *dev = &sh->dev[i];
2137 if (test_and_clear_bit(R5_Overlap, &dev->flags))
2138 wake_up(&sh->raid_conf->wait_for_overlap);
2139 }
2140 put_cpu();
2141}
2142
2143static void free_stripe(struct kmem_cache *sc, struct stripe_head *sh)
2144{
2145 if (sh->ppl_page)
2146 __free_page(sh->ppl_page);
2147 kmem_cache_free(sc, sh);
2148}
2149
2150static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp,
2151 int disks, struct r5conf *conf)
2152{
2153 struct stripe_head *sh;
2154 int i;
2155
2156 sh = kmem_cache_zalloc(sc, gfp);
2157 if (sh) {
2158 spin_lock_init(&sh->stripe_lock);
2159 spin_lock_init(&sh->batch_lock);
2160 INIT_LIST_HEAD(&sh->batch_list);
2161 INIT_LIST_HEAD(&sh->lru);
2162 INIT_LIST_HEAD(&sh->r5c);
2163 INIT_LIST_HEAD(&sh->log_list);
2164 atomic_set(&sh->count, 1);
2165 sh->raid_conf = conf;
2166 sh->log_start = MaxSector;
2167 for (i = 0; i < disks; i++) {
2168 struct r5dev *dev = &sh->dev[i];
2169
2170 bio_init(&dev->req, &dev->vec, 1);
2171 bio_init(&dev->rreq, &dev->rvec, 1);
2172 }
2173
2174 if (raid5_has_ppl(conf)) {
2175 sh->ppl_page = alloc_page(gfp);
2176 if (!sh->ppl_page) {
2177 free_stripe(sc, sh);
2178 sh = NULL;
2179 }
2180 }
2181 }
2182 return sh;
2183}
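
/*
 * Allocate one fully-buffered stripe_head, place it on the inactive lists
 * via raid5_release_stripe() and grow conf->max_nr_stripes accordingly.
 * Returns 1 on success, 0 on allocation failure.
 */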
2184static int grow_one_stripe(struct r5conf *conf, gfp_t gfp)
2185{
2186 struct stripe_head *sh;
2187
2188 sh = alloc_stripe(conf->slab_cache, gfp, conf->pool_size, conf);
2189 if (!sh)
2190 return 0;
2191
2192 if (grow_buffers(sh, gfp)) {
2193 shrink_buffers(sh);
2194 free_stripe(conf->slab_cache, sh);
2195 return 0;
2196 }
2197 sh->hash_lock_index =
2198 conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS;
2199
2200 atomic_inc(&conf->active_stripes);
2201
2202 raid5_release_stripe(sh);
2203 conf->max_nr_stripes++;
2204 return 1;
2205}
2206
2207static int grow_stripes(struct r5conf *conf, int num)
2208{
2209 struct kmem_cache *sc;
2210 size_t namelen = sizeof(conf->cache_name[0]);
2211 int devs = max(conf->raid_disks, conf->previous_raid_disks);
2212
2213 if (conf->mddev->gendisk)
2214 snprintf(conf->cache_name[0], namelen,
2215 "raid%d-%s", conf->level, mdname(conf->mddev));
2216 else
2217 snprintf(conf->cache_name[0], namelen,
2218 "raid%d-%p", conf->level, conf->mddev);
2219 snprintf(conf->cache_name[1], namelen, "%.27s-alt", conf->cache_name[0]);
2220
2221 conf->active_name = 0;
2222 sc = kmem_cache_create(conf->cache_name[conf->active_name],
2223 sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev),
2224 0, 0, NULL);
2225 if (!sc)
2226 return 1;
2227 conf->slab_cache = sc;
2228 conf->pool_size = devs;
2229 while (num--)
2230 if (!grow_one_stripe(conf, GFP_KERNEL))
2231 return 1;
2232
2233 return 0;
2234}
2235
/*
 * scribble_alloc - allocate percpu scribble buffer for required size
 *		    of the scribble region
 * @percpu: from for_each_present_cpu() of the caller
 * @num: total number of disks in the array
 * @cnt: scribble objs count for required size of the scribble region
 *
 * The scribble buffer size must be enough to contain:
 * 1/ a struct page pointer for each device in the array +2
 * 2/ room to convert each entry in (1) to its corresponding dma
 *    (dma_map_page()) or page (page_address()) address.
 *
 * Note: the +2 is for the destination buffers of the ddf/raid6 case where we
 * calculate over all devices (not just the data blocks), using zeros in place
 * of the P and Q blocks.
 */
2252static int scribble_alloc(struct raid5_percpu *percpu,
2253 int num, int cnt)
2254{
2255 size_t obj_size =
2256 sizeof(struct page *) * (num+2) +
2257 sizeof(addr_conv_t) * (num+2);
2258 void *scribble;
2259
 /*
  * If here is in raid array suspend context, it is in memalloc noio
  * context as well, so there is no potential recursive memory reclaim
  * I/O with the GFP_KERNEL flag.
  */
2265 scribble = kvmalloc_array(cnt, obj_size, GFP_KERNEL);
2266 if (!scribble)
2267 return -ENOMEM;
2268
2269 kvfree(percpu->scribble);
2270
2271 percpu->scribble = scribble;
2272 percpu->scribble_obj_size = obj_size;
2273 return 0;
2274}
2275
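/*
 * Make sure the per-cpu scribble buffers are large enough for @new_disks
 * devices and @new_sectors-wide chunks; an existing allocation is never
 * shrunk. The array is suspended while the buffers are swapped.
 */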
2276static int resize_chunks(struct r5conf *conf, int new_disks, int new_sectors)
2277{
2278 unsigned long cpu;
2279 int err = 0;
2280
 /*
  * Never shrink. And mddev_suspend() could deadlock if this is called
  * from raid5d. In that case, scribble_disks and scribble_sectors
  * should equal to new_disks and new_sectors
  */
2286 if (conf->scribble_disks >= new_disks &&
2287 conf->scribble_sectors >= new_sectors)
2288 return 0;
2289 mddev_suspend(conf->mddev);
2290 get_online_cpus();
2291
2292 for_each_present_cpu(cpu) {
2293 struct raid5_percpu *percpu;
2294
2295 percpu = per_cpu_ptr(conf->percpu, cpu);
2296 err = scribble_alloc(percpu, new_disks,
2297 new_sectors / RAID5_STRIPE_SECTORS(conf));
2298 if (err)
2299 break;
2300 }
2301
2302 put_online_cpus();
2303 mddev_resume(conf->mddev);
2304 if (!err) {
2305 conf->scribble_disks = new_disks;
2306 conf->scribble_sectors = new_sectors;
2307 }
2308 return err;
2309}
2310
2311static int resize_stripes(struct r5conf *conf, int newsize)
2312{
 /* Make all the stripes able to hold 'newsize' devices.
  * New slots in each stripe get 'page' set to a new page.
  *
  * This happens in stages:
  * 1/ create a new kmem_cache and allocate the required number of new
  *    stripe_heads.
  * 2/ gather all the old stripe_heads and transfer the pages across
  *    to the new stripe_heads.  This will have the side effect of
  *    freeing the old stripe_heads, but for the moment we don't worry
  *    if the new stripe_heads are a bit bigger than the old.
  * 3/ allocate new pages for the new slots in the new stripe_heads.
  *    If this fails, we don't bother trying to shrink the
  *    stripe_heads down again, we just leave them as they are.
  *    As each stripe_head is processed the new one is released into
  *    active service.
  *
  * Once step2 is started, we cannot afford to wait for a write,
  * so we use GFP_NOIO allocations.
  */
2336 struct stripe_head *osh, *nsh;
2337 LIST_HEAD(newstripes);
2338 struct disk_info *ndisks;
2339 int err = 0;
2340 struct kmem_cache *sc;
2341 int i;
2342 int hash, cnt;
2343
2344 md_allow_write(conf->mddev);

 /* Step 1 */
2347 sc = kmem_cache_create(conf->cache_name[1-conf->active_name],
2348 sizeof(struct stripe_head)+(newsize-1)*sizeof(struct r5dev),
2349 0, 0, NULL);
2350 if (!sc)
2351 return -ENOMEM;

 /* Need to ensure auto-resizing doesn't interfere */
2354 mutex_lock(&conf->cache_size_mutex);
2355
2356 for (i = conf->max_nr_stripes; i; i--) {
2357 nsh = alloc_stripe(sc, GFP_KERNEL, newsize, conf);
2358 if (!nsh)
2359 break;
2360
2361 list_add(&nsh->lru, &newstripes);
2362 }
2363 if (i) {
 /* didn't get enough, give up */
2365 while (!list_empty(&newstripes)) {
2366 nsh = list_entry(newstripes.next, struct stripe_head, lru);
2367 list_del(&nsh->lru);
2368 free_stripe(sc, nsh);
2369 }
2370 kmem_cache_destroy(sc);
2371 mutex_unlock(&conf->cache_size_mutex);
2372 return -ENOMEM;
2373 }

 /* Step 2 - Must use GFP_NOIO now.
  * OK, we have enough stripes, start collecting inactive
  * stripes and copying them over
  */
2378 hash = 0;
2379 cnt = 0;
2380 list_for_each_entry(nsh, &newstripes, lru) {
2381 lock_device_hash_lock(conf, hash);
2382 wait_event_cmd(conf->wait_for_stripe,
2383 !list_empty(conf->inactive_list + hash),
2384 unlock_device_hash_lock(conf, hash),
2385 lock_device_hash_lock(conf, hash));
2386 osh = get_free_stripe(conf, hash);
2387 unlock_device_hash_lock(conf, hash);
2388
2389 for(i=0; i<conf->pool_size; i++) {
2390 nsh->dev[i].page = osh->dev[i].page;
2391 nsh->dev[i].orig_page = osh->dev[i].page;
2392 }
2393 nsh->hash_lock_index = hash;
2394 free_stripe(conf->slab_cache, osh);
2395 cnt++;
2396 if (cnt >= conf->max_nr_stripes / NR_STRIPE_HASH_LOCKS +
2397 !!((conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS) > hash)) {
2398 hash++;
2399 cnt = 0;
2400 }
2401 }
2402 kmem_cache_destroy(conf->slab_cache);

 /* Step 3.
  * At this point, we are holding all the stripes so the array
  * cannot shrink.
  * Step 4, resize other data structures, specifically
  * conf->disks and the scribble region
  */
2409 ndisks = kcalloc(newsize, sizeof(struct disk_info), GFP_NOIO);
2410 if (ndisks) {
2411 for (i = 0; i < conf->pool_size; i++)
2412 ndisks[i] = conf->disks[i];
2413
2414 for (i = conf->pool_size; i < newsize; i++) {
2415 ndisks[i].extra_page = alloc_page(GFP_NOIO);
2416 if (!ndisks[i].extra_page)
2417 err = -ENOMEM;
2418 }
2419
2420 if (err) {
2421 for (i = conf->pool_size; i < newsize; i++)
2422 if (ndisks[i].extra_page)
2423 put_page(ndisks[i].extra_page);
2424 kfree(ndisks);
2425 } else {
2426 kfree(conf->disks);
2427 conf->disks = ndisks;
2428 }
2429 } else
2430 err = -ENOMEM;
2431
2432 mutex_unlock(&conf->cache_size_mutex);
2433
2434 conf->slab_cache = sc;
2435 conf->active_name = 1-conf->active_name;

 /* Step 4, return new stripes to service */
2438 while(!list_empty(&newstripes)) {
2439 nsh = list_entry(newstripes.next, struct stripe_head, lru);
2440 list_del_init(&nsh->lru);
2441
2442 for (i=conf->raid_disks; i < newsize; i++)
2443 if (nsh->dev[i].page == NULL) {
2444 struct page *p = alloc_page(GFP_NOIO);
2445 nsh->dev[i].page = p;
2446 nsh->dev[i].orig_page = p;
2447 if (!p)
2448 err = -ENOMEM;
2449 }
2450 raid5_release_stripe(nsh);
2451 }

 /* critical section pass, GFP_NOIO no longer needed */
2454 if (!err)
2455 conf->pool_size = newsize;
2456 return err;
2457}
2458
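/*
 * Opposite of grow_one_stripe(): take one stripe off an inactive list,
 * free its buffers and shrink conf->max_nr_stripes. Returns 0 if no
 * inactive stripe was available.
 */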
2459static int drop_one_stripe(struct r5conf *conf)
2460{
2461 struct stripe_head *sh;
2462 int hash = (conf->max_nr_stripes - 1) & STRIPE_HASH_LOCKS_MASK;
2463
2464 spin_lock_irq(conf->hash_locks + hash);
2465 sh = get_free_stripe(conf, hash);
2466 spin_unlock_irq(conf->hash_locks + hash);
2467 if (!sh)
2468 return 0;
2469 BUG_ON(atomic_read(&sh->count));
2470 shrink_buffers(sh);
2471 free_stripe(conf->slab_cache, sh);
2472 atomic_dec(&conf->active_stripes);
2473 conf->max_nr_stripes--;
2474 return 1;
2475}
2476
2477static void shrink_stripes(struct r5conf *conf)
2478{
2479 while (conf->max_nr_stripes &&
2480 drop_one_stripe(conf))
2481 ;
2482
2483 kmem_cache_destroy(conf->slab_cache);
2484 conf->slab_cache = NULL;
2485}
2486
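/*
 * Per-device read completion. On success this may finish a pending
 * read-error correction; on failure it decides between retrying the
 * read, recording bad blocks, or failing the device outright.
 */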
2487static void raid5_end_read_request(struct bio * bi)
2488{
2489 struct stripe_head *sh = bi->bi_private;
2490 struct r5conf *conf = sh->raid_conf;
2491 int disks = sh->disks, i;
2492 char b[BDEVNAME_SIZE];
2493 struct md_rdev *rdev = NULL;
2494 sector_t s;
2495
2496 for (i=0 ; i<disks; i++)
2497 if (bi == &sh->dev[i].req)
2498 break;
2499
2500 pr_debug("end_read_request %llu/%d, count: %d, error %d.\n",
2501 (unsigned long long)sh->sector, i, atomic_read(&sh->count),
2502 bi->bi_status);
2503 if (i == disks) {
2504 bio_reset(bi);
2505 BUG();
2506 return;
2507 }
2508 if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
 /* If replacement finished while this request was outstanding,
  * 'replacement' might be NULL already.
  * In that case it moved down to 'rdev'.
  * rdev is not removed until all requests are finished.
  */
2514 rdev = conf->disks[i].replacement;
2515 if (!rdev)
2516 rdev = conf->disks[i].rdev;
2517
2518 if (use_new_offset(conf, sh))
2519 s = sh->sector + rdev->new_data_offset;
2520 else
2521 s = sh->sector + rdev->data_offset;
2522 if (!bi->bi_status) {
2523 set_bit(R5_UPTODATE, &sh->dev[i].flags);
2524 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
 /* Note that this cannot happen on a
  * replacement device.  We just fail those on
  * any error
  */
2529 pr_info_ratelimited(
2530 "md/raid:%s: read error corrected (%lu sectors at %llu on %s)\n",
2531 mdname(conf->mddev), RAID5_STRIPE_SECTORS(conf),
2532 (unsigned long long)s,
2533 bdevname(rdev->bdev, b));
2534 atomic_add(RAID5_STRIPE_SECTORS(conf), &rdev->corrected_errors);
2535 clear_bit(R5_ReadError, &sh->dev[i].flags);
2536 clear_bit(R5_ReWrite, &sh->dev[i].flags);
2537 } else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
2538 clear_bit(R5_ReadNoMerge, &sh->dev[i].flags);
2539
2540 if (test_bit(R5_InJournal, &sh->dev[i].flags))
 /*
  * end read for a page in journal, this
  * must be preparing for prexor in rmw
  */
2545 set_bit(R5_OrigPageUPTDODATE, &sh->dev[i].flags);
2546
2547 if (atomic_read(&rdev->read_errors))
2548 atomic_set(&rdev->read_errors, 0);
2549 } else {
2550 const char *bdn = bdevname(rdev->bdev, b);
2551 int retry = 0;
2552 int set_bad = 0;
2553
2554 clear_bit(R5_UPTODATE, &sh->dev[i].flags);
2555 if (!(bi->bi_status == BLK_STS_PROTECTION))
2556 atomic_inc(&rdev->read_errors);
2557 if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
2558 pr_warn_ratelimited(
2559 "md/raid:%s: read error on replacement device (sector %llu on %s).\n",
2560 mdname(conf->mddev),
2561 (unsigned long long)s,
2562 bdn);
2563 else if (conf->mddev->degraded >= conf->max_degraded) {
2564 set_bad = 1;
2565 pr_warn_ratelimited(
2566 "md/raid:%s: read error not correctable (sector %llu on %s).\n",
2567 mdname(conf->mddev),
2568 (unsigned long long)s,
2569 bdn);
2570 } else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) {
 /* the earlier re-write also failed - give up on this device */
2572 set_bad = 1;
2573 pr_warn_ratelimited(
2574 "md/raid:%s: read error NOT corrected!! (sector %llu on %s).\n",
2575 mdname(conf->mddev),
2576 (unsigned long long)s,
2577 bdn);
2578 } else if (atomic_read(&rdev->read_errors)
2579 > conf->max_nr_stripes) {
2580 if (!test_bit(Faulty, &rdev->flags)) {
2581 pr_warn("md/raid:%s: %d read_errors > %d stripes\n",
2582 mdname(conf->mddev),
2583 atomic_read(&rdev->read_errors),
2584 conf->max_nr_stripes);
2585 pr_warn("md/raid:%s: Too many read errors, failing device %s.\n",
2586 mdname(conf->mddev), bdn);
2587 }
2588 } else
2589 retry = 1;
2590 if (set_bad && test_bit(In_sync, &rdev->flags)
2591 && !test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
2592 retry = 1;
2593 if (retry)
2594 if (sh->qd_idx >= 0 && sh->pd_idx == i)
2595 set_bit(R5_ReadError, &sh->dev[i].flags);
2596 else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) {
2597 set_bit(R5_ReadError, &sh->dev[i].flags);
2598 clear_bit(R5_ReadNoMerge, &sh->dev[i].flags);
2599 } else
2600 set_bit(R5_ReadNoMerge, &sh->dev[i].flags);
2601 else {
2602 clear_bit(R5_ReadError, &sh->dev[i].flags);
2603 clear_bit(R5_ReWrite, &sh->dev[i].flags);
2604 if (!(set_bad
2605 && test_bit(In_sync, &rdev->flags)
2606 && rdev_set_badblocks(
2607 rdev, sh->sector, RAID5_STRIPE_SECTORS(conf), 0)))
2608 md_error(conf->mddev, rdev);
2609 }
2610 }
2611 rdev_dec_pending(rdev, conf->mddev);
2612 bio_reset(bi);
2613 clear_bit(R5_LOCKED, &sh->dev[i].flags);
2614 set_bit(STRIPE_HANDLE, &sh->state);
2615 raid5_release_stripe(sh);
2616}
2617
2618static void raid5_end_write_request(struct bio *bi)
2619{
2620 struct stripe_head *sh = bi->bi_private;
2621 struct r5conf *conf = sh->raid_conf;
2622 int disks = sh->disks, i;
2623 struct md_rdev *rdev;
2624 sector_t first_bad;
2625 int bad_sectors;
2626 int replacement = 0;
2627
2628 for (i = 0 ; i < disks; i++) {
2629 if (bi == &sh->dev[i].req) {
2630 rdev = conf->disks[i].rdev;
2631 break;
2632 }
2633 if (bi == &sh->dev[i].rreq) {
2634 rdev = conf->disks[i].replacement;
2635 if (rdev)
2636 replacement = 1;
2637 else
 /* rdev was removed and 'replacement'
  * replaced it.  rdev is not removed
  * until all requests are finished.
  */
2642 rdev = conf->disks[i].rdev;
2643 break;
2644 }
2645 }
2646 pr_debug("end_write_request %llu/%d, count %d, error: %d.\n",
2647 (unsigned long long)sh->sector, i, atomic_read(&sh->count),
2648 bi->bi_status);
2649 if (i == disks) {
2650 bio_reset(bi);
2651 BUG();
2652 return;
2653 }
2654
2655 if (replacement) {
2656 if (bi->bi_status)
2657 md_error(conf->mddev, rdev);
2658 else if (is_badblock(rdev, sh->sector,
2659 RAID5_STRIPE_SECTORS(conf),
2660 &first_bad, &bad_sectors))
2661 set_bit(R5_MadeGoodRepl, &sh->dev[i].flags);
2662 } else {
2663 if (bi->bi_status) {
2664 set_bit(STRIPE_DEGRADED, &sh->state);
2665 set_bit(WriteErrorSeen, &rdev->flags);
2666 set_bit(R5_WriteError, &sh->dev[i].flags);
2667 if (!test_and_set_bit(WantReplacement, &rdev->flags))
2668 set_bit(MD_RECOVERY_NEEDED,
2669 &rdev->mddev->recovery);
2670 } else if (is_badblock(rdev, sh->sector,
2671 RAID5_STRIPE_SECTORS(conf),
2672 &first_bad, &bad_sectors)) {
2673 set_bit(R5_MadeGood, &sh->dev[i].flags);
2674 if (test_bit(R5_ReadError, &sh->dev[i].flags))
 /* That was a successful write so make
  * sure it looks like we already did
  * a re-write.
  */
2679 set_bit(R5_ReWrite, &sh->dev[i].flags);
2680 }
2681 }
2682 rdev_dec_pending(rdev, conf->mddev);
2683
2684 if (sh->batch_head && bi->bi_status && !replacement)
2685 set_bit(STRIPE_BATCH_ERR, &sh->batch_head->state);
2686
2687 bio_reset(bi);
2688 if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags))
2689 clear_bit(R5_LOCKED, &sh->dev[i].flags);
2690 set_bit(STRIPE_HANDLE, &sh->state);
2691 raid5_release_stripe(sh);
2692
2693 if (sh->batch_head && sh != sh->batch_head)
2694 raid5_release_stripe(sh->batch_head);
2695}
2696
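/*
 * Error handler called via md_error(). Refuses to fail an In_sync device
 * when that would push the array past max_degraded; otherwise marks the
 * device Faulty, recomputes the degraded count and requests a superblock
 * update.
 */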
2697static void raid5_error(struct mddev *mddev, struct md_rdev *rdev)
2698{
2699 char b[BDEVNAME_SIZE];
2700 struct r5conf *conf = mddev->private;
2701 unsigned long flags;
2702 pr_debug("raid456: error called\n");
2703
2704 spin_lock_irqsave(&conf->device_lock, flags);
2705
2706 if (test_bit(In_sync, &rdev->flags) &&
2707 mddev->degraded == conf->max_degraded) {
 /*
  * Don't allow to achieve failed state
  * Don't try to recover this device
  */
2712 conf->recovery_disabled = mddev->recovery_disabled;
2713 spin_unlock_irqrestore(&conf->device_lock, flags);
2714 return;
2715 }
2716
2717 set_bit(Faulty, &rdev->flags);
2718 clear_bit(In_sync, &rdev->flags);
2719 mddev->degraded = raid5_calc_degraded(conf);
2720 spin_unlock_irqrestore(&conf->device_lock, flags);
2721 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
2722
2723 set_bit(Blocked, &rdev->flags);
2724 set_mask_bits(&mddev->sb_flags, 0,
2725 BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
2726 pr_crit("md/raid:%s: Disk failure on %s, disabling device.\n"
2727 "md/raid:%s: Operation continuing on %d devices.\n",
2728 mdname(mddev),
2729 bdevname(rdev->bdev, b),
2730 mdname(mddev),
2731 conf->raid_disks - mddev->degraded);
2732 r5c_update_on_rdev_error(mddev, rdev);
2733}
2734
/*
 * Input: a 'big' sector number,
 * Output: index of the data and parity disk, and the sector # in them.
 */
2739sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector,
2740 int previous, int *dd_idx,
2741 struct stripe_head *sh)
2742{
2743 sector_t stripe, stripe2;
2744 sector_t chunk_number;
2745 unsigned int chunk_offset;
2746 int pd_idx, qd_idx;
2747 int ddf_layout = 0;
2748 sector_t new_sector;
2749 int algorithm = previous ? conf->prev_algo
2750 : conf->algorithm;
2751 int sectors_per_chunk = previous ? conf->prev_chunk_sectors
2752 : conf->chunk_sectors;
2753 int raid_disks = previous ? conf->previous_raid_disks
2754 : conf->raid_disks;
2755 int data_disks = raid_disks - conf->max_degraded;

 /* First compute the information on this sector */

 /*
  * Compute the chunk number and the sector offset inside the chunk
  */
2762 chunk_offset = sector_div(r_sector, sectors_per_chunk);
2763 chunk_number = r_sector;

 /*
  * Compute the stripe number
  */
2768 stripe = chunk_number;
2769 *dd_idx = sector_div(stripe, data_disks);
2770 stripe2 = stripe;

 /*
  * Select the parity disk based on the user selected algorithm.
  */
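 /*
  * For example, with the default LEFT_SYMMETRIC layout on a 4-disk
  * RAID5, parity rotates one disk left per stripe and the data chunks
  * wrap around just after it:
  *
  *	stripe 0:  D0  D1  D2  P
  *	stripe 1:  D4  D5  P   D3
  *	stripe 2:  D8  P   D6  D7
  *
  * pd_idx picks the parity column and *dd_idx is rotated past it,
  * exactly as the case 5 code below computes.
  */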
2774 pd_idx = qd_idx = -1;
2775 switch(conf->level) {
2776 case 4:
2777 pd_idx = data_disks;
2778 break;
2779 case 5:
2780 switch (algorithm) {
2781 case ALGORITHM_LEFT_ASYMMETRIC:
2782 pd_idx = data_disks - sector_div(stripe2, raid_disks);
2783 if (*dd_idx >= pd_idx)
2784 (*dd_idx)++;
2785 break;
2786 case ALGORITHM_RIGHT_ASYMMETRIC:
2787 pd_idx = sector_div(stripe2, raid_disks);
2788 if (*dd_idx >= pd_idx)
2789 (*dd_idx)++;
2790 break;
2791 case ALGORITHM_LEFT_SYMMETRIC:
2792 pd_idx = data_disks - sector_div(stripe2, raid_disks);
2793 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
2794 break;
2795 case ALGORITHM_RIGHT_SYMMETRIC:
2796 pd_idx = sector_div(stripe2, raid_disks);
2797 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
2798 break;
2799 case ALGORITHM_PARITY_0:
2800 pd_idx = 0;
2801 (*dd_idx)++;
2802 break;
2803 case ALGORITHM_PARITY_N:
2804 pd_idx = data_disks;
2805 break;
2806 default:
2807 BUG();
2808 }
2809 break;
2810 case 6:
2811
2812 switch (algorithm) {
2813 case ALGORITHM_LEFT_ASYMMETRIC:
2814 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
2815 qd_idx = pd_idx + 1;
2816 if (pd_idx == raid_disks-1) {
2817 (*dd_idx)++;
2818 qd_idx = 0;
2819 } else if (*dd_idx >= pd_idx)
2820 (*dd_idx) += 2;
2821 break;
2822 case ALGORITHM_RIGHT_ASYMMETRIC:
2823 pd_idx = sector_div(stripe2, raid_disks);
2824 qd_idx = pd_idx + 1;
2825 if (pd_idx == raid_disks-1) {
2826 (*dd_idx)++;
2827 qd_idx = 0;
2828 } else if (*dd_idx >= pd_idx)
2829 (*dd_idx) += 2;
2830 break;
2831 case ALGORITHM_LEFT_SYMMETRIC:
2832 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
2833 qd_idx = (pd_idx + 1) % raid_disks;
2834 *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks;
2835 break;
2836 case ALGORITHM_RIGHT_SYMMETRIC:
2837 pd_idx = sector_div(stripe2, raid_disks);
2838 qd_idx = (pd_idx + 1) % raid_disks;
2839 *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks;
2840 break;
2841
2842 case ALGORITHM_PARITY_0:
2843 pd_idx = 0;
2844 qd_idx = 1;
2845 (*dd_idx) += 2;
2846 break;
2847 case ALGORITHM_PARITY_N:
2848 pd_idx = data_disks;
2849 qd_idx = data_disks + 1;
2850 break;
2851
2852 case ALGORITHM_ROTATING_ZERO_RESTART:
 /* Exactly the same as RIGHT_ASYMMETRIC, but the
  * order of blocks for computing Q is different.
  */
2856 pd_idx = sector_div(stripe2, raid_disks);
2857 qd_idx = pd_idx + 1;
2858 if (pd_idx == raid_disks-1) {
2859 (*dd_idx)++;
2860 qd_idx = 0;
2861 } else if (*dd_idx >= pd_idx)
2862 (*dd_idx) += 2;
2863 ddf_layout = 1;
2864 break;
2865
2866 case ALGORITHM_ROTATING_N_RESTART:
 /* Same as left_asymmetric, except the first
  * stripe is D D D P Q rather than Q D D D P
  */
2871 stripe2 += 1;
2872 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
2873 qd_idx = pd_idx + 1;
2874 if (pd_idx == raid_disks-1) {
2875 (*dd_idx)++;
2876 qd_idx = 0;
2877 } else if (*dd_idx >= pd_idx)
2878 (*dd_idx) += 2;
2879 ddf_layout = 1;
2880 break;
2881
2882 case ALGORITHM_ROTATING_N_CONTINUE:
 /* Same as left_symmetric but Q is before P */
2884 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
2885 qd_idx = (pd_idx + raid_disks - 1) % raid_disks;
2886 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
2887 ddf_layout = 1;
2888 break;
2889
2890 case ALGORITHM_LEFT_ASYMMETRIC_6:
 /* RAID5 left_asymmetric, with Q on last device */
2892 pd_idx = data_disks - sector_div(stripe2, raid_disks-1);
2893 if (*dd_idx >= pd_idx)
2894 (*dd_idx)++;
2895 qd_idx = raid_disks - 1;
2896 break;
2897
2898 case ALGORITHM_RIGHT_ASYMMETRIC_6:
2899 pd_idx = sector_div(stripe2, raid_disks-1);
2900 if (*dd_idx >= pd_idx)
2901 (*dd_idx)++;
2902 qd_idx = raid_disks - 1;
2903 break;
2904
2905 case ALGORITHM_LEFT_SYMMETRIC_6:
2906 pd_idx = data_disks - sector_div(stripe2, raid_disks-1);
2907 *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1);
2908 qd_idx = raid_disks - 1;
2909 break;
2910
2911 case ALGORITHM_RIGHT_SYMMETRIC_6:
2912 pd_idx = sector_div(stripe2, raid_disks-1);
2913 *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1);
2914 qd_idx = raid_disks - 1;
2915 break;
2916
2917 case ALGORITHM_PARITY_0_6:
2918 pd_idx = 0;
2919 (*dd_idx)++;
2920 qd_idx = raid_disks - 1;
2921 break;
2922
2923 default:
2924 BUG();
2925 }
2926 break;
2927 }
2928
2929 if (sh) {
2930 sh->pd_idx = pd_idx;
2931 sh->qd_idx = qd_idx;
2932 sh->ddf_layout = ddf_layout;
2933 }
2934
 /*
  * Finally, compute the new sector number
  */
2937 new_sector = (sector_t)stripe * sectors_per_chunk + chunk_offset;
2938 return new_sector;
2939}
2940
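/*
 * Inverse of raid5_compute_sector(): given a stripe and a device index,
 * recover the array (logical) sector stored there. The result is checked
 * by mapping it forward again; a mismatch indicates a broken mapping and
 * 0 is returned.
 */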
2941sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous)
2942{
2943 struct r5conf *conf = sh->raid_conf;
2944 int raid_disks = sh->disks;
2945 int data_disks = raid_disks - conf->max_degraded;
2946 sector_t new_sector = sh->sector, check;
2947 int sectors_per_chunk = previous ? conf->prev_chunk_sectors
2948 : conf->chunk_sectors;
2949 int algorithm = previous ? conf->prev_algo
2950 : conf->algorithm;
2951 sector_t stripe;
2952 int chunk_offset;
2953 sector_t chunk_number;
2954 int dummy1, dd_idx = i;
2955 sector_t r_sector;
2956 struct stripe_head sh2;
2957
2958 chunk_offset = sector_div(new_sector, sectors_per_chunk);
2959 stripe = new_sector;
2960
2961 if (i == sh->pd_idx)
2962 return 0;
2963 switch(conf->level) {
2964 case 4: break;
2965 case 5:
2966 switch (algorithm) {
2967 case ALGORITHM_LEFT_ASYMMETRIC:
2968 case ALGORITHM_RIGHT_ASYMMETRIC:
2969 if (i > sh->pd_idx)
2970 i--;
2971 break;
2972 case ALGORITHM_LEFT_SYMMETRIC:
2973 case ALGORITHM_RIGHT_SYMMETRIC:
2974 if (i < sh->pd_idx)
2975 i += raid_disks;
2976 i -= (sh->pd_idx + 1);
2977 break;
2978 case ALGORITHM_PARITY_0:
2979 i -= 1;
2980 break;
2981 case ALGORITHM_PARITY_N:
2982 break;
2983 default:
2984 BUG();
2985 }
2986 break;
2987 case 6:
2988 if (i == sh->qd_idx)
2989 return 0;
2990 switch (algorithm) {
2991 case ALGORITHM_LEFT_ASYMMETRIC:
2992 case ALGORITHM_RIGHT_ASYMMETRIC:
2993 case ALGORITHM_ROTATING_ZERO_RESTART:
2994 case ALGORITHM_ROTATING_N_RESTART:
2995 if (sh->pd_idx == raid_disks-1)
2996 i--;
2997 else if (i > sh->pd_idx)
2998 i -= 2;
2999 break;
3000 case ALGORITHM_LEFT_SYMMETRIC:
3001 case ALGORITHM_RIGHT_SYMMETRIC:
3002 if (sh->pd_idx == raid_disks-1)
3003 i--;
3004 else {
 /* D D P Q D */
3006 if (i < sh->pd_idx)
3007 i += raid_disks;
3008 i -= (sh->pd_idx + 2);
3009 }
3010 break;
3011 case ALGORITHM_PARITY_0:
3012 i -= 2;
3013 break;
3014 case ALGORITHM_PARITY_N:
3015 break;
3016 case ALGORITHM_ROTATING_N_CONTINUE:
 /* Like left_symmetric, but P is before Q */
3018 if (sh->pd_idx == 0)
3019 i--;
3020 else {
 /* D D Q P D */
3022 if (i < sh->pd_idx)
3023 i += raid_disks;
3024 i -= (sh->pd_idx + 1);
3025 }
3026 break;
3027 case ALGORITHM_LEFT_ASYMMETRIC_6:
3028 case ALGORITHM_RIGHT_ASYMMETRIC_6:
3029 if (i > sh->pd_idx)
3030 i--;
3031 break;
3032 case ALGORITHM_LEFT_SYMMETRIC_6:
3033 case ALGORITHM_RIGHT_SYMMETRIC_6:
3034 if (i < sh->pd_idx)
3035 i += data_disks + 1;
3036 i -= (sh->pd_idx + 1);
3037 break;
3038 case ALGORITHM_PARITY_0_6:
3039 i -= 1;
3040 break;
3041 default:
3042 BUG();
3043 }
3044 break;
3045 }
3046
3047 chunk_number = stripe * data_disks + i;
3048 r_sector = chunk_number * sectors_per_chunk + chunk_offset;
3049
3050 check = raid5_compute_sector(conf, r_sector,
3051 previous, &dummy1, &sh2);
3052 if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx
3053 || sh2.qd_idx != sh->qd_idx) {
3054 pr_warn("md/raid:%s: compute_blocknr: map not correct\n",
3055 mdname(conf->mddev));
3056 return 0;
3057 }
3058 return r_sector;
3059}
3060
/*
 * There are cases where we want handle_stripe_dirtying() and
 * schedule_reconstruction() to delay towrite to some dev of a stripe.
 *
 * This function checks whether we want to delay the towrite. Specifically,
 * we delay the towrite when:
 *
 *   1. degraded stripe has a non-overwrite to the missing dev, AND this
 *      stripe has data in journal (for other devices).
 *
 *      In this case, when reading data for the non-overwrite dev, it is
 *      necessary to handle complex rmw of write back cache (prexor with
 *      orig_page, and xor with page). To keep read path simple, we would
 *      like to flush data in journal to RAID disks first, so complex rmw
 *      is handled in the write path (handle_stripe_dirtying).
 *
 *   2. when journal space is critical (R5C_LOG_CRITICAL=1)
 *
 *      It is important to be able to flush all stripes in raid5-cache.
 *      Therefore, we need reserve some space on the journal device for
 *      these flushes. If flush operation includes pending writes to the
 *      stripe, we need to reserve (conf->raid_disk + 1) pages per stripe
 *      for the flush out. If we exclude these pending writes from flush
 *      operation, we only need (conf->max_degraded + 1) pages per stripe.
 *      Therefore, excluding pending writes in these cases enables more
 *      efficient use of the journal device.
 *
 *      Note: To make sure the stripe makes progress, we only delay
 *      towrite for stripes with data already in journal (injournal > 0).
 *      When LOG_CRITICAL, stripes with injournal == 0 will be handled
 *      via the no_space_stripes list.
 *
 *   3. during journal failure
 *      In journal failure, we try to flush all cached data to raid disks
 *      based on data in stripe cache. The array is read-only to upper
 *      layers, so we would skip all pending writes.
 */
3099static inline bool delay_towrite(struct r5conf *conf,
3100 struct r5dev *dev,
3101 struct stripe_head_state *s)
3102{
 /* case 1 above */
3104 if (!test_bit(R5_OVERWRITE, &dev->flags) &&
3105 !test_bit(R5_Insync, &dev->flags) && s->injournal)
3106 return true;
 /* case 2 above */
3108 if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) &&
3109 s->injournal > 0)
3110 return true;
 /* case 3 above */
3112 if (s->log_failed && s->injournal)
3113 return true;
3114 return false;
3115}
3116
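/*
 * Set up the stripe ops for a parity update. With @rcw the new data is
 * drained into the stripe cache and parity is recomputed from all blocks
 * (reconstruct-write); otherwise old data and parity are prexor'd out
 * first and the new data xor'd in (read-modify-write). Every block that
 * will change is locked and counted in s->locked.
 */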
3117static void
3118schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
3119 int rcw, int expand)
3120{
3121 int i, pd_idx = sh->pd_idx, qd_idx = sh->qd_idx, disks = sh->disks;
3122 struct r5conf *conf = sh->raid_conf;
3123 int level = conf->level;
3124
3125 if (rcw) {
 /*
  * In some cases, handle_stripe_dirtying initially decided to
  * run rmw and allocates extra page for prexor. However, rcw is
  * cheaper later on. We need to free the extra page now,
  * because we won't be able to do that in ops_complete_prexor().
  */
3132 r5c_release_extra_page(sh);
3133
3134 for (i = disks; i--; ) {
3135 struct r5dev *dev = &sh->dev[i];
3136
3137 if (dev->towrite && !delay_towrite(conf, dev, s)) {
3138 set_bit(R5_LOCKED, &dev->flags);
3139 set_bit(R5_Wantdrain, &dev->flags);
3140 if (!expand)
3141 clear_bit(R5_UPTODATE, &dev->flags);
3142 s->locked++;
3143 } else if (test_bit(R5_InJournal, &dev->flags)) {
3144 set_bit(R5_LOCKED, &dev->flags);
3145 s->locked++;
3146 }
3147 }
3148
 /* if we are not expanding this is a proper write request, and
  * there will be bios with new data to be drained into the
  * stripe cache
  */
3152 if (!expand) {
3153 if (!s->locked)
 /* False alarm, nothing to do */
3155 return;
3156 sh->reconstruct_state = reconstruct_state_drain_run;
3157 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
3158 } else
3159 sh->reconstruct_state = reconstruct_state_run;
3160
3161 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
3162
3163 if (s->locked + conf->max_degraded == disks)
3164 if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
3165 atomic_inc(&conf->pending_full_writes);
3166 } else {
3167 BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) ||
3168 test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags)));
3169 BUG_ON(level == 6 &&
3170 (!(test_bit(R5_UPTODATE, &sh->dev[qd_idx].flags) ||
3171 test_bit(R5_Wantcompute, &sh->dev[qd_idx].flags))));
3172
3173 for (i = disks; i--; ) {
3174 struct r5dev *dev = &sh->dev[i];
3175 if (i == pd_idx || i == qd_idx)
3176 continue;
3177
3178 if (dev->towrite &&
3179 (test_bit(R5_UPTODATE, &dev->flags) ||
3180 test_bit(R5_Wantcompute, &dev->flags))) {
3181 set_bit(R5_Wantdrain, &dev->flags);
3182 set_bit(R5_LOCKED, &dev->flags);
3183 clear_bit(R5_UPTODATE, &dev->flags);
3184 s->locked++;
3185 } else if (test_bit(R5_InJournal, &dev->flags)) {
3186 set_bit(R5_LOCKED, &dev->flags);
3187 s->locked++;
3188 }
3189 }
3190 if (!s->locked)
 /* False alarm - nothing to do */
3192 return;
3193 sh->reconstruct_state = reconstruct_state_prexor_drain_run;
3194 set_bit(STRIPE_OP_PREXOR, &s->ops_request);
3195 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
3196 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
3197 }
3198
 /* keep the parity disk(s) locked while asynchronous operations
  * are in flight
  */
3202 set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
3203 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
3204 s->locked++;
3205
3206 if (level == 6) {
3207 int qd_idx = sh->qd_idx;
3208 struct r5dev *dev = &sh->dev[qd_idx];
3209
3210 set_bit(R5_LOCKED, &dev->flags);
3211 clear_bit(R5_UPTODATE, &dev->flags);
3212 s->locked++;
3213 }
3214
3215 if (raid5_has_ppl(sh->raid_conf) && sh->ppl_page &&
3216 test_bit(STRIPE_OP_BIODRAIN, &s->ops_request) &&
3217 !test_bit(STRIPE_FULL_WRITE, &sh->state) &&
3218 test_bit(R5_Insync, &sh->dev[pd_idx].flags))
3219 set_bit(STRIPE_OP_PARTIAL_PARITY, &s->ops_request);
3220
3221 pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n",
3222 __func__, (unsigned long long)sh->sector,
3223 s->locked, s->ops_request);
3224}
3225
/*
 * Each stripe/dev can have one or more bios attached.
 * toread/towrite point to the first in a chain.
 * The bi_next chain must be in order.
 */
3231static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
3232 int forwrite, int previous)
3233{
3234 struct bio **bip;
3235 struct r5conf *conf = sh->raid_conf;
3236 int firstwrite=0;
3237
3238 pr_debug("adding bi b#%llu to stripe s#%llu\n",
3239 (unsigned long long)bi->bi_iter.bi_sector,
3240 (unsigned long long)sh->sector);
3241
3242 spin_lock_irq(&sh->stripe_lock);
3243 sh->dev[dd_idx].write_hint = bi->bi_write_hint;
3244
3245 if (sh->batch_head)
3246 goto overlap;
3247 if (forwrite) {
3248 bip = &sh->dev[dd_idx].towrite;
3249 if (*bip == NULL)
3250 firstwrite = 1;
3251 } else
3252 bip = &sh->dev[dd_idx].toread;
3253 while (*bip && (*bip)->bi_iter.bi_sector < bi->bi_iter.bi_sector) {
3254 if (bio_end_sector(*bip) > bi->bi_iter.bi_sector)
3255 goto overlap;
3256 bip = & (*bip)->bi_next;
3257 }
3258 if (*bip && (*bip)->bi_iter.bi_sector < bio_end_sector(bi))
3259 goto overlap;
3260
3261 if (forwrite && raid5_has_ppl(conf)) {
 /*
  * With PPL only writes to consecutive data chunks within a
  * stripe are allowed because for a single stripe_head we can
  * only have one PPL entry at a time, which describes one data
  * range. Not really an overlap, but wait_for_overlap can be
  * used to handle this.
  */
3269 sector_t sector;
3270 sector_t first = 0;
3271 sector_t last = 0;
3272 int count = 0;
3273 int i;
3274
3275 for (i = 0; i < sh->disks; i++) {
3276 if (i != sh->pd_idx &&
3277 (i == dd_idx || sh->dev[i].towrite)) {
3278 sector = sh->dev[i].sector;
3279 if (count == 0 || sector < first)
3280 first = sector;
3281 if (sector > last)
3282 last = sector;
3283 count++;
3284 }
3285 }
3286
3287 if (first + conf->chunk_sectors * (count - 1) != last)
3288 goto overlap;
3289 }
3290
3291 if (!forwrite || previous)
3292 clear_bit(STRIPE_BATCH_READY, &sh->state);
3293
3294 BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next);
3295 if (*bip)
3296 bi->bi_next = *bip;
3297 *bip = bi;
3298 bio_inc_remaining(bi);
3299 md_write_inc(conf->mddev, bi);
3300
3301 if (forwrite) {
 /* check if page is covered */
3303 sector_t sector = sh->dev[dd_idx].sector;
3304 for (bi=sh->dev[dd_idx].towrite;
3305 sector < sh->dev[dd_idx].sector + RAID5_STRIPE_SECTORS(conf) &&
3306 bi && bi->bi_iter.bi_sector <= sector;
3307 bi = r5_next_bio(conf, bi, sh->dev[dd_idx].sector)) {
3308 if (bio_end_sector(bi) >= sector)
3309 sector = bio_end_sector(bi);
3310 }
3311 if (sector >= sh->dev[dd_idx].sector + RAID5_STRIPE_SECTORS(conf))
3312 if (!test_and_set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags))
3313 sh->overwrite_disks++;
3314 }
3315
3316 pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",
3317 (unsigned long long)(*bip)->bi_iter.bi_sector,
3318 (unsigned long long)sh->sector, dd_idx);
3319
3320 if (conf->mddev->bitmap && firstwrite) {
 /* Cannot hold spinlock over bitmap_startwrite,
  * but must ensure this isn't added to a batch until
  * we have added to the bitmap and set bm_seq.
  * So set STRIPE_BITMAP_PENDING to prevent
  * batching.
  * If multiple add_stripe_bio() calls race here they
  * must all set STRIPE_BITMAP_PENDING.  So only the first one
  * to complete "bitmap_startwrite" gets to set
  * STRIPE_BIT_DELAY.  This is important as once a stripe
  * is added to a batch, STRIPE_BIT_DELAY cannot be changed
  * any more.
  */
3333 set_bit(STRIPE_BITMAP_PENDING, &sh->state);
3334 spin_unlock_irq(&sh->stripe_lock);
3335 md_bitmap_startwrite(conf->mddev->bitmap, sh->sector,
3336 RAID5_STRIPE_SECTORS(conf), 0);
3337 spin_lock_irq(&sh->stripe_lock);
3338 clear_bit(STRIPE_BITMAP_PENDING, &sh->state);
3339 if (!sh->batch_head) {
3340 sh->bm_seq = conf->seq_flush+1;
3341 set_bit(STRIPE_BIT_DELAY, &sh->state);
3342 }
3343 }
3344 spin_unlock_irq(&sh->stripe_lock);
3345
3346 if (stripe_can_batch(sh))
3347 stripe_add_to_batch_list(conf, sh);
3348 return 1;
3349
3350 overlap:
3351 set_bit(R5_Overlap, &sh->dev[dd_idx].flags);
3352 spin_unlock_irq(&sh->stripe_lock);
3353 return 0;
3354}
3355
3356static void end_reshape(struct r5conf *conf);
3357
3358static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous,
3359 struct stripe_head *sh)
3360{
3361 int sectors_per_chunk =
3362 previous ? conf->prev_chunk_sectors : conf->chunk_sectors;
3363 int dd_idx;
3364 int chunk_offset = sector_div(stripe, sectors_per_chunk);
3365 int disks = previous ? conf->previous_raid_disks : conf->raid_disks;
3366
3367 raid5_compute_sector(conf,
3368 stripe * (disks - conf->max_degraded)
3369 *sectors_per_chunk + chunk_offset,
3370 previous,
3371 &dd_idx, sh);
3372}
3373
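/*
 * The stripe has failed beyond what parity can repair: return errors for
 * every pending read and write, record bad blocks on any device with a
 * read error, and drop the stripe's bitmap and journal bookkeeping.
 */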
3374static void
3375handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
3376 struct stripe_head_state *s, int disks)
3377{
3378 int i;
3379 BUG_ON(sh->batch_head);
3380 for (i = disks; i--; ) {
3381 struct bio *bi;
3382 int bitmap_end = 0;
3383
3384 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
3385 struct md_rdev *rdev;
3386 rcu_read_lock();
3387 rdev = rcu_dereference(conf->disks[i].rdev);
3388 if (rdev && test_bit(In_sync, &rdev->flags) &&
3389 !test_bit(Faulty, &rdev->flags))
3390 atomic_inc(&rdev->nr_pending);
3391 else
3392 rdev = NULL;
3393 rcu_read_unlock();
3394 if (rdev) {
3395 if (!rdev_set_badblocks(
3396 rdev,
3397 sh->sector,
3398 RAID5_STRIPE_SECTORS(conf), 0))
3399 md_error(conf->mddev, rdev);
3400 rdev_dec_pending(rdev, conf->mddev);
3401 }
3402 }
3403 spin_lock_irq(&sh->stripe_lock);
 /* fail all writes first */
3405 bi = sh->dev[i].towrite;
3406 sh->dev[i].towrite = NULL;
3407 sh->overwrite_disks = 0;
3408 spin_unlock_irq(&sh->stripe_lock);
3409 if (bi)
3410 bitmap_end = 1;
3411
3412 log_stripe_write_finished(sh);
3413
3414 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
3415 wake_up(&conf->wait_for_overlap);
3416
3417 while (bi && bi->bi_iter.bi_sector <
3418 sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) {
3419 struct bio *nextbi = r5_next_bio(conf, bi, sh->dev[i].sector);
3420
3421 md_write_end(conf->mddev);
3422 bio_io_error(bi);
3423 bi = nextbi;
3424 }
3425 if (bitmap_end)
3426 md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
3427 RAID5_STRIPE_SECTORS(conf), 0, 0);
3428 bitmap_end = 0;
3429
3430 bi = sh->dev[i].written;
3431 sh->dev[i].written = NULL;
3432 if (test_and_clear_bit(R5_SkipCopy, &sh->dev[i].flags)) {
3433 WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
3434 sh->dev[i].page = sh->dev[i].orig_page;
3435 }
3436
3437 if (bi) bitmap_end = 1;
3438 while (bi && bi->bi_iter.bi_sector <
3439 sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) {
3440 struct bio *bi2 = r5_next_bio(conf, bi, sh->dev[i].sector);
3441
3442 md_write_end(conf->mddev);
3443 bio_io_error(bi);
3444 bi = bi2;
3445 }
3446
 /* fail any reads if this device is non-operational and
  * the data has not reached the cache yet.
  */
3450 if (!test_bit(R5_Wantfill, &sh->dev[i].flags) &&
3451 s->failed > conf->max_degraded &&
3452 (!test_bit(R5_Insync, &sh->dev[i].flags) ||
3453 test_bit(R5_ReadError, &sh->dev[i].flags))) {
3454 spin_lock_irq(&sh->stripe_lock);
3455 bi = sh->dev[i].toread;
3456 sh->dev[i].toread = NULL;
3457 spin_unlock_irq(&sh->stripe_lock);
3458 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
3459 wake_up(&conf->wait_for_overlap);
3460 if (bi)
3461 s->to_read--;
3462 while (bi && bi->bi_iter.bi_sector <
3463 sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) {
3464 struct bio *nextbi =
3465 r5_next_bio(conf, bi, sh->dev[i].sector);
3466
3467 bio_io_error(bi);
3468 bi = nextbi;
3469 }
3470 }
3471 if (bitmap_end)
3472 md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
3473 RAID5_STRIPE_SECTORS(conf), 0, 0);
3474
 /* If we were in the middle of a write the parity block might
  * still be locked - so just clear all R5_LOCKED flags
  */
3477 clear_bit(R5_LOCKED, &sh->dev[i].flags);
3478 }
3479 s->to_write = 0;
3480 s->written = 0;
3481
3482 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
3483 if (atomic_dec_and_test(&conf->pending_full_writes))
3484 md_wakeup_thread(conf->mddev->thread);
3485}
3486
3487static void
3488handle_failed_sync(struct r5conf *conf, struct stripe_head *sh,
3489 struct stripe_head_state *s)
3490{
3491 int abort = 0;
3492 int i;
3493
3494 BUG_ON(sh->batch_head);
3495 clear_bit(STRIPE_SYNCING, &sh->state);
3496 if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags))
3497 wake_up(&conf->wait_for_overlap);
3498 s->syncing = 0;
3499 s->replacing = 0;
3500
 /* There is nothing more to do for sync/check/repair.
  * Don't even need to abort as that is handled elsewhere
  * if needed, and not always wanted e.g. if there is a known
  * bad block here.
  * For recover/replace we need to record a bad block on all
  * non-sync devices, or abort the recovery
  */
3507 if (test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery)) {
 /* During recovery devices cannot be removed, so
  * locking and refcounting of rdevs is not needed
  */
3511 rcu_read_lock();
3512 for (i = 0; i < conf->raid_disks; i++) {
3513 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
3514 if (rdev
3515 && !test_bit(Faulty, &rdev->flags)
3516 && !test_bit(In_sync, &rdev->flags)
3517 && !rdev_set_badblocks(rdev, sh->sector,
3518 RAID5_STRIPE_SECTORS(conf), 0))
3519 abort = 1;
3520 rdev = rcu_dereference(conf->disks[i].replacement);
3521 if (rdev
3522 && !test_bit(Faulty, &rdev->flags)
3523 && !test_bit(In_sync, &rdev->flags)
3524 && !rdev_set_badblocks(rdev, sh->sector,
3525 RAID5_STRIPE_SECTORS(conf), 0))
3526 abort = 1;
3527 }
3528 rcu_read_unlock();
3529 if (abort)
3530 conf->recovery_disabled =
3531 conf->mddev->recovery_disabled;
3532 }
3533 md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), !abort);
3534}
3535
3536static int want_replace(struct stripe_head *sh, int disk_idx)
3537{
3538 struct md_rdev *rdev;
3539 int rv = 0;
3540
3541 rcu_read_lock();
3542 rdev = rcu_dereference(sh->raid_conf->disks[disk_idx].replacement);
3543 if (rdev
3544 && !test_bit(Faulty, &rdev->flags)
3545 && !test_bit(In_sync, &rdev->flags)
3546 && (rdev->recovery_offset <= sh->sector
3547 || rdev->mddev->recovery_cp <= sh->sector))
3548 rv = 1;
3549 rcu_read_unlock();
3550 return rv;
3551}
3552
3553static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s,
3554 int disk_idx, int disks)
3555{
3556 struct r5dev *dev = &sh->dev[disk_idx];
3557 struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]],
3558 &sh->dev[s->failed_num[1]] };
3559 int i;
3560 bool force_rcw = (sh->raid_conf->rmw_level == PARITY_DISABLE_RMW);

 /* is the data in this block needed, and can we get it? */
3563 if (test_bit(R5_LOCKED, &dev->flags) ||
3564 test_bit(R5_UPTODATE, &dev->flags))
 /* No point reading this as we already have it or have
  * decided to get it.
  */
3568 return 0;
3569
3570 if (dev->toread ||
3571 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)))
 /* We need this block to directly satisfy a request */
3573 return 1;
3574
3575 if (s->syncing || s->expanding ||
3576 (s->replacing && want_replace(sh, disk_idx)))
 /* When syncing or expanding we get every block.
  * When replacing, we need the replaced block.
  */
3580 return 1;
3581
3582 if ((s->failed >= 1 && fdev[0]->toread) ||
3583 (s->failed >= 2 && fdev[1]->toread))
 /* If we want to read from a failed device, then
  * we need to actually read every other device.
  */
3587 return 1;
3588
 /* Sometimes neither read-modify-write nor reconstruct-write
  * cycles can work.  In those cases we read every block we
  * can.  Then the parity-update is certain to have enough to
  * work with.
  * This can only be a problem when we need to write something,
  * and some device has failed.  If either of those tests
  * fail we need look no further.
  */
3597 if (!s->failed || !s->to_write)
3598 return 0;
3599
3600 if (test_bit(R5_Insync, &dev->flags) &&
3601 !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
 /* Pre-reads are not permitted until after a short delay
  * to gather multiple requests.  However if this
  * device is not Insync, the block could only be computed
  * and there is no need to delay that.
  */
3607 return 0;
3608
3609 for (i = 0; i < s->failed && i < 2; i++) {
3610 if (fdev[i]->towrite &&
3611 !test_bit(R5_UPTODATE, &fdev[i]->flags) &&
3612 !test_bit(R5_OVERWRITE, &fdev[i]->flags))
 /* If we have a partial write to a failed
  * device, then we will need to reconstruct
  * the content of that device, so all other
  * devices must be read.
  */
3618 return 1;
3619
3620 if (s->failed >= 2 &&
3621 (fdev[i]->towrite ||
3622 s->failed_num[i] == sh->pd_idx ||
3623 s->failed_num[i] == sh->qd_idx) &&
3624 !test_bit(R5_UPTODATE, &fdev[i]->flags))
 /* In max degraded raid6, if the failed disk is P, Q,
  * or we need to read the failed disk, we need to do
  * reconstruct-write.
  */
3629 force_rcw = true;
3630 }
3631
 /* If we are forced to do a reconstruct-write, because parity
  * cannot be trusted and we are currently recovering it, there
  * is extra need to be careful.
  * If one of the devices that we would need to read, because
  * it is not being overwritten (and maybe not written at all)
  * is missing/faulty, then we need to read everything we can.
  */
3639 if (!force_rcw &&
3640 sh->sector < sh->raid_conf->mddev->recovery_cp)
 /* reconstruct-write isn't being forced */
3642 return 0;
3643 for (i = 0; i < s->failed && i < 2; i++) {
3644 if (s->failed_num[i] != sh->pd_idx &&
3645 s->failed_num[i] != sh->qd_idx &&
3646 !test_bit(R5_UPTODATE, &fdev[i]->flags) &&
3647 !test_bit(R5_OVERWRITE, &fdev[i]->flags))
3648 return 1;
3649 }
3650
3651 return 0;
3652}
3653
/* fetch_block - checks the given member device to see if its data needs
 * to be read or computed to satisfy a request.
 *
 * Returns 1 when no more member devices need to be checked, otherwise returns
 * 0 to tell the loop in handle_stripe_fill to continue
 */
3660static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s,
3661 int disk_idx, int disks)
3662{
3663 struct r5dev *dev = &sh->dev[disk_idx];

 /* is the data in this block needed, and can we get it? */
3666 if (need_this_block(sh, s, disk_idx, disks)) {
 /* we would like to get this block, possibly by computing it,
  * but we might not be able to.
  */
3670 BUG_ON(test_bit(R5_Wantcompute, &dev->flags));
3671 BUG_ON(test_bit(R5_Wantread, &dev->flags));
3672 BUG_ON(sh->batch_head);
3673
 /*
  * In the raid6 case if the only non-uptodate disk is P
  * then we already trusted P to compute the other failed
  * drives. It is safe to compute rather than re-read P.
  * In other cases we only compute blocks from failed
  * devices, otherwise check/repair might fail to detect
  * a real inconsistency.
  */
3683 if ((s->uptodate == disks - 1) &&
3684 ((sh->qd_idx >= 0 && sh->pd_idx == disk_idx) ||
3685 (s->failed && (disk_idx == s->failed_num[0] ||
3686 disk_idx == s->failed_num[1])))) {
 /* have disk failed, and we're requested to fetch it;
  * do compute it
  */
3690 pr_debug("Computing stripe %llu block %d\n",
3691 (unsigned long long)sh->sector, disk_idx);
3692 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
3693 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
3694 set_bit(R5_Wantcompute, &dev->flags);
3695 sh->ops.target = disk_idx;
3696 sh->ops.target2 = -1;
3697 s->req_compute = 1;
 /* Careful: from this point on 'uptodate' is in the eye
  * of raid_run_ops which services 'compute' operations
  * before writes. R5_Wantcompute flags a block that will
  * be R5_UPTODATE by the time it is needed for a
  * subsequent operation.
  */
3704 s->uptodate++;
3705 return 1;
3706 } else if (s->uptodate == disks-2 && s->failed >= 2) {
 /* two blocks are missing; locate the second one and
  * schedule a compute of both from the surviving devices
  */
3710 int other;
3711 for (other = disks; other--; ) {
3712 if (other == disk_idx)
3713 continue;
3714 if (!test_bit(R5_UPTODATE,
3715 &sh->dev[other].flags))
3716 break;
3717 }
3718 BUG_ON(other < 0);
3719 pr_debug("Computing stripe %llu blocks %d,%d\n",
3720 (unsigned long long)sh->sector,
3721 disk_idx, other);
3722 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
3723 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
3724 set_bit(R5_Wantcompute, &sh->dev[disk_idx].flags);
3725 set_bit(R5_Wantcompute, &sh->dev[other].flags);
3726 sh->ops.target = disk_idx;
3727 sh->ops.target2 = other;
3728 s->uptodate += 2;
3729 s->req_compute = 1;
3730 return 1;
3731 } else if (test_bit(R5_Insync, &dev->flags)) {
3732 set_bit(R5_LOCKED, &dev->flags);
3733 set_bit(R5_Wantread, &dev->flags);
3734 s->locked++;
3735 pr_debug("Reading block %d (sync=%d)\n",
3736 disk_idx, s->syncing);
3737 }
3738 }
3739
3740 return 0;
3741}
3742
/*
 * handle_stripe_fill - read or compute data to satisfy pending requests.
 */
3746static void handle_stripe_fill(struct stripe_head *sh,
3747 struct stripe_head_state *s,
3748 int disks)
3749{
3750 int i;
3751
 /* look for blocks to read/compute, skip this if a compute
  * is already in flight, or if the stripe contents are in the
  * midst of changing due to a write
  */
3756 if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state &&
3757 !sh->reconstruct_state) {

 /*
  * For degraded stripe with data in journal, do not handle
  * read requests yet, instead, flush the stripe to raid
  * disks first, this avoids handling complex rmw of write
  * back cache (prexor with orig_page, and then xor with
  * page) in the read path
  */
3766 if (s->injournal && s->failed) {
3767 if (test_bit(STRIPE_R5C_CACHING, &sh->state))
3768 r5c_make_stripe_write_out(sh);
3769 goto out;
3770 }
3771
3772 for (i = disks; i--; )
3773 if (fetch_block(sh, s, i, disks))
3774 break;
3775 }
3776out:
3777 set_bit(STRIPE_HANDLE, &sh->state);
3778}
3779
3780static void break_stripe_batch_list(struct stripe_head *head_sh,
3781 unsigned long handle_flags);
3782
/* handle_stripe_clean_event
 * any written block on an uptodate or failed drive can be returned.
 * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but
 * never LOCKED, so we don't need to test 'failed' directly.
 */
3787static void handle_stripe_clean_event(struct r5conf *conf,
3788 struct stripe_head *sh, int disks)
3789{
3790 int i;
3791 struct r5dev *dev;
3792 int discard_pending = 0;
3793 struct stripe_head *head_sh = sh;
3794 bool do_endio = false;
3795
3796 for (i = disks; i--; )
3797 if (sh->dev[i].written) {
3798 dev = &sh->dev[i];
3799 if (!test_bit(R5_LOCKED, &dev->flags) &&
3800 (test_bit(R5_UPTODATE, &dev->flags) ||
3801 test_bit(R5_Discard, &dev->flags) ||
3802 test_bit(R5_SkipCopy, &dev->flags))) {
 /* We can return any write requests */
3804 struct bio *wbi, *wbi2;
3805 pr_debug("Return write for disc %d\n", i);
3806 if (test_and_clear_bit(R5_Discard, &dev->flags))
3807 clear_bit(R5_UPTODATE, &dev->flags);
3808 if (test_and_clear_bit(R5_SkipCopy, &dev->flags)) {
3809 WARN_ON(test_bit(R5_UPTODATE, &dev->flags));
3810 }
3811 do_endio = true;
3812
3813returnbi:
3814 dev->page = dev->orig_page;
3815 wbi = dev->written;
3816 dev->written = NULL;
3817 while (wbi && wbi->bi_iter.bi_sector <
3818 dev->sector + RAID5_STRIPE_SECTORS(conf)) {
3819 wbi2 = r5_next_bio(conf, wbi, dev->sector);
3820 md_write_end(conf->mddev);
3821 bio_endio(wbi);
3822 wbi = wbi2;
3823 }
3824 md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
3825 RAID5_STRIPE_SECTORS(conf),
3826 !test_bit(STRIPE_DEGRADED, &sh->state),
3827 0);
3828 if (head_sh->batch_head) {
3829 sh = list_first_entry(&sh->batch_list,
3830 struct stripe_head,
3831 batch_list);
3832 if (sh != head_sh) {
3833 dev = &sh->dev[i];
3834 goto returnbi;
3835 }
3836 }
3837 sh = head_sh;
3838 dev = &sh->dev[i];
3839 } else if (test_bit(R5_Discard, &dev->flags))
3840 discard_pending = 1;
3841 }
3842
3843 log_stripe_write_finished(sh);
3844
3845 if (!discard_pending &&
3846 test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) {
3847 int hash;
3848 clear_bit(R5_Discard, &sh->dev[sh->pd_idx].flags);
3849 clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags);
3850 if (sh->qd_idx >= 0) {
3851 clear_bit(R5_Discard, &sh->dev[sh->qd_idx].flags);
3852 clear_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags);
3853 }
3854
3855 clear_bit(STRIPE_DISCARD, &sh->state);
 /*
  * SCSI discard will change some bio fields and the stripe has
  * no updated data, so remove it from hash list and the stripe
  * will be reinitialized
  */
3861unhash:
3862 hash = sh->hash_lock_index;
3863 spin_lock_irq(conf->hash_locks + hash);
3864 remove_hash(sh);
3865 spin_unlock_irq(conf->hash_locks + hash);
3866 if (head_sh->batch_head) {
3867 sh = list_first_entry(&sh->batch_list,
3868 struct stripe_head, batch_list);
3869 if (sh != head_sh)
3870 goto unhash;
3871 }
3872 sh = head_sh;
3873
3874 if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state))
3875 set_bit(STRIPE_HANDLE, &sh->state);
3876
3877 }
3878
3879 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
3880 if (atomic_dec_and_test(&conf->pending_full_writes))
3881 md_wakeup_thread(conf->mddev->thread);
3882
3883 if (head_sh->batch_head && do_endio)
3884 break_stripe_batch_list(head_sh, STRIPE_EXPAND_SYNC_FLAGS);
3885}
3886
/*
 * For RMW in write back cache, we need an extra page in prexor to store the
 * old data. This page is stored in dev->orig_page.
 *
 * This function checks whether we have data for prexor. The exact logic
 * is:
 *       R5_UPTODATE && (!R5_InJournal || R5_OrigPageUPTDODATE)
 */
3895static inline bool uptodate_for_rmw(struct r5dev *dev)
3896{
3897 return (test_bit(R5_UPTODATE, &dev->flags)) &&
3898 (!test_bit(R5_InJournal, &dev->flags) ||
3899 test_bit(R5_OrigPageUPTDODATE, &dev->flags));
3900}
3901
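/*
 * Decide between read-modify-write and reconstruct-write for this stripe
 * by counting how many device reads each would need. Roughly: rmw must
 * read every to-be-written block plus parity that isn't already cached,
 * while rcw must read every block that is *not* being fully overwritten.
 * For example, on a 5-disk RAID5 writing one full chunk, rmw needs two
 * reads (old data + old parity) while rcw needs three (the other data
 * chunks), so rmw wins; for a near-full-stripe write the balance flips.
 */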
3902static int handle_stripe_dirtying(struct r5conf *conf,
3903 struct stripe_head *sh,
3904 struct stripe_head_state *s,
3905 int disks)
3906{
3907 int rmw = 0, rcw = 0, i;
3908 sector_t recovery_cp = conf->mddev->recovery_cp;

 /*
  * Check whether resync is now happening or should start.
  * If yes, then the array is dirty (after unclean shutdown or
  * initial creation), so parity in some stripes might be inconsistent.
  * In this case, we need to always do reconstruct-write, to ensure
  * that in case of drive failure or read-error correction, we
  * generate correct data from the parity.
  */
3917 if (conf->rmw_level == PARITY_DISABLE_RMW ||
3918 (recovery_cp < MaxSector && sh->sector >= recovery_cp &&
3919 s->failed == 0)) {
 /* Calculate the real rcw later - for now make it
  * look like rcw is cheaper
  */
3923 rcw = 1; rmw = 2;
3924 pr_debug("force RCW rmw_level=%u, recovery_cp=%llu sh->sector=%llu\n",
3925 conf->rmw_level, (unsigned long long)recovery_cp,
3926 (unsigned long long)sh->sector);
3927 } else for (i = disks; i--; ) {
 /* would I have to read this buffer for read_modify_write */
3929 struct r5dev *dev = &sh->dev[i];
3930 if (((dev->towrite && !delay_towrite(conf, dev, s)) ||
3931 i == sh->pd_idx || i == sh->qd_idx ||
3932 test_bit(R5_InJournal, &dev->flags)) &&
3933 !test_bit(R5_LOCKED, &dev->flags) &&
3934 !(uptodate_for_rmw(dev) ||
3935 test_bit(R5_Wantcompute, &dev->flags))) {
3936 if (test_bit(R5_Insync, &dev->flags))
3937 rmw++;
3938 else
3939 rmw += 2*disks;
3940 }

 /* Would I have to read this buffer for reconstruct_write */
3942 if (!test_bit(R5_OVERWRITE, &dev->flags) &&
3943 i != sh->pd_idx && i != sh->qd_idx &&
3944 !test_bit(R5_LOCKED, &dev->flags) &&
3945 !(test_bit(R5_UPTODATE, &dev->flags) ||
3946 test_bit(R5_Wantcompute, &dev->flags))) {
3947 if (test_bit(R5_Insync, &dev->flags))
3948 rcw++;
3949 else
3950 rcw += 2*disks;
3951 }
3952 }
3953
3954 pr_debug("for sector %llu state 0x%lx, rmw=%d rcw=%d\n",
3955 (unsigned long long)sh->sector, sh->state, rmw, rcw);
3956 set_bit(STRIPE_HANDLE, &sh->state);
3957 if ((rmw < rcw || (rmw == rcw && conf->rmw_level == PARITY_PREFER_RMW)) && rmw > 0) {
 /* prefer read-modify-write, but need to get some data */
3959 if (conf->mddev->queue)
3960 blk_add_trace_msg(conf->mddev->queue,
3961 "raid5 rmw %llu %d",
3962 (unsigned long long)sh->sector, rmw);
3963 for (i = disks; i--; ) {
3964 struct r5dev *dev = &sh->dev[i];
3965 if (test_bit(R5_InJournal, &dev->flags) &&
3966 dev->page == dev->orig_page &&
3967 !test_bit(R5_LOCKED, &sh->dev[sh->pd_idx].flags)) {
 /* alloc page for prexor */
3969 struct page *p = alloc_page(GFP_NOIO);
3970
3971 if (p) {
3972 dev->orig_page = p;
3973 continue;
3974 }

 /*
  * alloc_page() failed, try to use
  * disk_info->extra_page
  */
3979
3980 if (!test_and_set_bit(R5C_EXTRA_PAGE_IN_USE,
3981 &conf->cache_state)) {
3982 r5c_use_extra_page(sh);
3983 break;
3984 }

 /* extra_page in use, add to delayed_list */
3987 set_bit(STRIPE_DELAYED, &sh->state);
3988 s->waiting_extra_page = 1;
3989 return -EAGAIN;
3990 }
3991 }
3992
3993 for (i = disks; i--; ) {
3994 struct r5dev *dev = &sh->dev[i];
3995 if (((dev->towrite && !delay_towrite(conf, dev, s)) ||
3996 i == sh->pd_idx || i == sh->qd_idx ||
3997 test_bit(R5_InJournal, &dev->flags)) &&
3998 !test_bit(R5_LOCKED, &dev->flags) &&
3999 !(uptodate_for_rmw(dev) ||
4000 test_bit(R5_Wantcompute, &dev->flags)) &&
4001 test_bit(R5_Insync, &dev->flags)) {
4002 if (test_bit(STRIPE_PREREAD_ACTIVE,
4003 &sh->state)) {
4004 pr_debug("Read_old block %d for r-m-w\n",
4005 i);
4006 set_bit(R5_LOCKED, &dev->flags);
4007 set_bit(R5_Wantread, &dev->flags);
4008 s->locked++;
4009 } else
4010 set_bit(STRIPE_DELAYED, &sh->state);
4011 }
4012 }
4013 }
4014 if ((rcw < rmw || (rcw == rmw && conf->rmw_level != PARITY_PREFER_RMW)) && rcw > 0) {
 /* want reconstruct write, but need to get some data */
 int qread = 0;
4017 rcw = 0;
4018 for (i = disks; i--; ) {
4019 struct r5dev *dev = &sh->dev[i];
4020 if (!test_bit(R5_OVERWRITE, &dev->flags) &&
4021 i != sh->pd_idx && i != sh->qd_idx &&
4022 !test_bit(R5_LOCKED, &dev->flags) &&
4023 !(test_bit(R5_UPTODATE, &dev->flags) ||
4024 test_bit(R5_Wantcompute, &dev->flags))) {
4025 rcw++;
4026 if (test_bit(R5_Insync, &dev->flags) &&
4027 test_bit(STRIPE_PREREAD_ACTIVE,
4028 &sh->state)) {
4029 pr_debug("Read_old block "
4030 "%d for Reconstruct\n", i);
4031 set_bit(R5_LOCKED, &dev->flags);
4032 set_bit(R5_Wantread, &dev->flags);
4033 s->locked++;
4034 qread++;
4035 } else
4036 set_bit(STRIPE_DELAYED, &sh->state);
4037 }
4038 }
4039 if (rcw && conf->mddev->queue)
4040 blk_add_trace_msg(conf->mddev->queue, "raid5 rcw %llu %d %d %d",
4041 (unsigned long long)sh->sector,
4042 rcw, qread, test_bit(STRIPE_DELAYED, &sh->state));
4043 }
4044
4045 if (rcw > disks && rmw > disks &&
4046 !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
4047 set_bit(STRIPE_DELAYED, &sh->state);
4048

 /* now if nothing is locked, and if we have enough data,
  * we can start a write request
  */
 /* since handle_stripe can be called at any time we need to handle the
  * case where a compute block operation has been submitted and then a
  * subsequent call wants to start a write request.  raid_run_ops only
  * handles the case where compute block and reconstruct are requested
  * simultaneously.  If both operations are already in flight then the
  * write is deferred until the compute completes.
  */
4059 if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) &&
4060 (s->locked == 0 && (rcw == 0 || rmw == 0) &&
4061 !test_bit(STRIPE_BIT_DELAY, &sh->state)))
4062 schedule_reconstruction(sh, s, rcw == 0, 0);
4063 return 0;
4064}
4065
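/*
 * State machine for checking and, when check/repair asks for it, rewriting
 * the P parity of a raid4/5 stripe: idle -> run (async xor zero-sum) ->
 * check_result -> optionally compute_run to regenerate P -> write it out.
 */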
4066static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh,
4067 struct stripe_head_state *s, int disks)
4068{
4069 struct r5dev *dev = NULL;
4070
4071 BUG_ON(sh->batch_head);
4072 set_bit(STRIPE_HANDLE, &sh->state);
4073
4074 switch (sh->check_state) {
4075 case check_state_idle:
 /* start a new check operation if there are no failures */
4077 if (s->failed == 0) {
4078 BUG_ON(s->uptodate != disks);
4079 sh->check_state = check_state_run;
4080 set_bit(STRIPE_OP_CHECK, &s->ops_request);
4081 clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags);
4082 s->uptodate--;
4083 break;
4084 }
4085 dev = &sh->dev[s->failed_num[0]];
4086 fallthrough;
4087 case check_state_compute_result:
4088 sh->check_state = check_state_idle;
4089 if (!dev)
4090 dev = &sh->dev[sh->pd_idx];

 /* check that a write has not made the stripe insync */
4093 if (test_bit(STRIPE_INSYNC, &sh->state))
4094 break;

 /* either failed parity check, or recovery is happening */
4097 BUG_ON(!test_bit(R5_UPTODATE, &dev->flags));
4098 BUG_ON(s->uptodate != disks);
4099
4100 set_bit(R5_LOCKED, &dev->flags);
4101 s->locked++;
4102 set_bit(R5_Wantwrite, &dev->flags);
4103
4104 clear_bit(STRIPE_DEGRADED, &sh->state);
4105 set_bit(STRIPE_INSYNC, &sh->state);
4106 break;
4107 case check_state_run:
4108 break;
4109 case check_state_check_result:
4110 sh->check_state = check_state_idle;

 /* if a failure occurred during the check operation, leave
  * STRIPE_INSYNC not set and let the stripe be handled again
  */
4115 if (s->failed)
4116 break;
4117
 /* handle a successful check operation, if parity is correct
  * we are done.  Otherwise update the mismatch count and repair
  * parity if !MD_RECOVERY_CHECK
  */
4122 if ((sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) == 0)
 /* parity is correct (on disc,
  * not in buffer any more)
  */
4126 set_bit(STRIPE_INSYNC, &sh->state);
4127 else {
4128 atomic64_add(RAID5_STRIPE_SECTORS(conf), &conf->mddev->resync_mismatches);
4129 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) {
 /* don't try to repair!! */
4131 set_bit(STRIPE_INSYNC, &sh->state);
4132 pr_warn_ratelimited("%s: mismatch sector in range "
4133 "%llu-%llu\n", mdname(conf->mddev),
4134 (unsigned long long) sh->sector,
4135 (unsigned long long) sh->sector +
4136 RAID5_STRIPE_SECTORS(conf));
4137 } else {
4138 sh->check_state = check_state_compute_run;
4139 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
4140 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
4141 set_bit(R5_Wantcompute,
4142 &sh->dev[sh->pd_idx].flags);
4143 sh->ops.target = sh->pd_idx;
4144 sh->ops.target2 = -1;
4145 s->uptodate++;
4146 }
4147 }
4148 break;
4149 case check_state_compute_run:
4150 break;
4151 default:
4152 pr_err("%s: unknown check_state: %d sector: %llu\n",
4153 __func__, sh->check_state,
4154 (unsigned long long) sh->sector);
4155 BUG();
4156 }
4157}
4158
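/*
 * raid6 counterpart of the above: check P and/or Q depending on which
 * (if any) devices have failed, and schedule recomputation and writeback
 * of whichever parity blocks turn out to be stale.
 */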
4159static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh,
4160 struct stripe_head_state *s,
4161 int disks)
4162{
4163 int pd_idx = sh->pd_idx;
4164 int qd_idx = sh->qd_idx;
4165 struct r5dev *dev;
4166
4167 BUG_ON(sh->batch_head);
4168 set_bit(STRIPE_HANDLE, &sh->state);
4169
4170 BUG_ON(s->failed > 2);
4171
 /* Want to check and possibly repair P and Q.
  * However there could be one 'failed' device, in which
  * case we can only check one of them, possibly using the
  * other to generate missing data
  */
4178 switch (sh->check_state) {
4179 case check_state_idle:
4180
4181 if (s->failed == s->q_failed) {
 /* The only possible failed device holds Q, so it
  * makes sense to check P (If anything else were failed,
  * we would have used P to recreate it).
  */
4186 sh->check_state = check_state_run;
4187 }
4188 if (!s->q_failed && s->failed < 2) {
 /* Q is not failed, and we didn't use it to generate
  * anything, so it makes sense to check it
  */
4192 if (sh->check_state == check_state_run)
4193 sh->check_state = check_state_run_pq;
4194 else
4195 sh->check_state = check_state_run_q;
4196 }

 /* discard potentially stale zero_sum_result */
4199 sh->ops.zero_sum_result = 0;
4200
4201 if (sh->check_state == check_state_run) {
 /* async_xor_zero_sum destroys the contents of P */
4203 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
4204 s->uptodate--;
4205 }
4206 if (sh->check_state >= check_state_run &&
4207 sh->check_state <= check_state_run_pq) {
 /* async_syndrome_zero_sum preserves P and Q, so
  * no need to mark them !uptodate here
  */
4211 set_bit(STRIPE_OP_CHECK, &s->ops_request);
4212 break;
4213 }

 /* we have 2-disk failure */
4216 BUG_ON(s->failed != 2);
4217 fallthrough;
4218 case check_state_compute_result:
4219 sh->check_state = check_state_idle;

 /* check that a write has not made the stripe insync */
4222 if (test_bit(STRIPE_INSYNC, &sh->state))
4223 break;
4224
 /* now write out any block on a failed drive,
  * or P or Q if they were recomputed
  */
4228 dev = NULL;
4229 if (s->failed == 2) {
4230 dev = &sh->dev[s->failed_num[1]];
4231 s->locked++;
4232 set_bit(R5_LOCKED, &dev->flags);
4233 set_bit(R5_Wantwrite, &dev->flags);
4234 }
4235 if (s->failed >= 1) {
4236 dev = &sh->dev[s->failed_num[0]];
4237 s->locked++;
4238 set_bit(R5_LOCKED, &dev->flags);
4239 set_bit(R5_Wantwrite, &dev->flags);
4240 }
4241 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) {
4242 dev = &sh->dev[pd_idx];
4243 s->locked++;
4244 set_bit(R5_LOCKED, &dev->flags);
4245 set_bit(R5_Wantwrite, &dev->flags);
4246 }
4247 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) {
4248 dev = &sh->dev[qd_idx];
4249 s->locked++;
4250 set_bit(R5_LOCKED, &dev->flags);
4251 set_bit(R5_Wantwrite, &dev->flags);
4252 }
4253 if (WARN_ONCE(dev && !test_bit(R5_UPTODATE, &dev->flags),
4254 "%s: disk%td not up to date\n",
4255 mdname(conf->mddev),
4256 dev - (struct r5dev *) &sh->dev)) {
4257 clear_bit(R5_LOCKED, &dev->flags);
4258 clear_bit(R5_Wantwrite, &dev->flags);
4259 s->locked--;
4260 }
4261 clear_bit(STRIPE_DEGRADED, &sh->state);
4262
4263 set_bit(STRIPE_INSYNC, &sh->state);
4264 break;
4265 case check_state_run:
4266 case check_state_run_q:
4267 case check_state_run_pq:
4268 break;
4269 case check_state_check_result:
4270 sh->check_state = check_state_idle;

 /* handle a successful check operation, if parity is correct
  * we are done.  Otherwise update the mismatch count and repair
  * parity if !MD_RECOVERY_CHECK
  */
4276 if (sh->ops.zero_sum_result == 0) {
 /* both parities are correct */
4278 if (!s->failed)
4279 set_bit(STRIPE_INSYNC, &sh->state);
4280 else {
 /* in contrast to the raid5 case we can validate
  * parity, but still have a failure to write
  * back
  */
4285 sh->check_state = check_state_compute_result;
 /* Returning at this point means that we may go
  * off and bring p and/or q uptodate again so
  * we make sure to check zero_sum_result again
  * to verify if p or q need writeback
  */
4291 }
4292 } else {
4293 atomic64_add(RAID5_STRIPE_SECTORS(conf), &conf->mddev->resync_mismatches);
4294 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) {
 /* don't try to repair!! */
4296 set_bit(STRIPE_INSYNC, &sh->state);
4297 pr_warn_ratelimited("%s: mismatch sector in range "
4298 "%llu-%llu\n", mdname(conf->mddev),
4299 (unsigned long long) sh->sector,
4300 (unsigned long long) sh->sector +
4301 RAID5_STRIPE_SECTORS(conf));
4302 } else {
4303 int *target = &sh->ops.target;
4304
4305 sh->ops.target = -1;
4306 sh->ops.target2 = -1;
4307 sh->check_state = check_state_compute_run;
4308 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
4309 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
4310 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) {
4311 set_bit(R5_Wantcompute,
4312 &sh->dev[pd_idx].flags);
4313 *target = pd_idx;
4314 target = &sh->ops.target2;
4315 s->uptodate++;
4316 }
4317 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) {
4318 set_bit(R5_Wantcompute,
4319 &sh->dev[qd_idx].flags);
4320 *target = qd_idx;
4321 s->uptodate++;
4322 }
4323 }
4324 }
4325 break;
4326 case check_state_compute_run:
4327 break;
4328 default:
4329 pr_warn("%s: unknown check_state: %d sector: %llu\n",
4330 __func__, sh->check_state,
4331 (unsigned long long) sh->sector);
4332 BUG();
4333 }
4334}
4335
4336static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh)
4337{
4338 int i;

 /* We have read all the blocks in this stripe and now need to
  * see if we can insert these blocks into other
  * stripes for expansion
  */
4343 struct dma_async_tx_descriptor *tx = NULL;
4344 BUG_ON(sh->batch_head);
4345 clear_bit(STRIPE_EXPAND_SOURCE, &sh->state);
4346 for (i = 0; i < sh->disks; i++)
4347 if (i != sh->pd_idx && i != sh->qd_idx) {
4348 int dd_idx, j;
4349 struct stripe_head *sh2;
4350 struct async_submit_ctl submit;
4351
4352 sector_t bn = raid5_compute_blocknr(sh, i, 1);
4353 sector_t s = raid5_compute_sector(conf, bn, 0,
4354 &dd_idx, NULL);
4355 sh2 = raid5_get_active_stripe(conf, s, 0, 1, 1);
4356 if (sh2 == NULL)
 /* so far only the early blocks of this stripe
  * have been requested.  When later blocks
  * get requested, we will try again
  */
4361 continue;
4362 if (!test_bit(STRIPE_EXPANDING, &sh2->state) ||
4363 test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) {
 /* must have already done this block */
4365 raid5_release_stripe(sh2);
4366 continue;
4367 }

 /* place all the copies on one channel */
4370 init_async_submit(&submit, 0, tx, NULL, NULL, NULL);
4371 tx = async_memcpy(sh2->dev[dd_idx].page,
4372 sh->dev[i].page, 0, 0, RAID5_STRIPE_SIZE(conf),
4373 &submit);
4374
4375 set_bit(R5_Expanded, &sh2->dev[dd_idx].flags);
4376 set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags);
4377 for (j = 0; j < conf->raid_disks; j++)
4378 if (j != sh2->pd_idx &&
4379 j != sh2->qd_idx &&
4380 !test_bit(R5_Expanded, &sh2->dev[j].flags))
4381 break;
4382 if (j == conf->raid_disks) {
4383 set_bit(STRIPE_EXPAND_READY, &sh2->state);
4384 set_bit(STRIPE_HANDLE, &sh2->state);
4385 }
4386 raid5_release_stripe(sh2);
4387
4388 }
4389
4390 async_tx_quiesce(&tx);
4391}
4392
/*
 * handle_stripe - do things to a stripe.
 *
 * We lock the stripe by setting STRIPE_ACTIVE and then examine the
 * state of various bits to see what needs to be done.
 * Possible results:
 *    return some read requests which now have data
 *    return some write requests which are safely on storage
 *    schedule a read on some buffers
 *    schedule a write of some buffers
 *    return confirmation of parity correctness
 *
 */
4407static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
4408{
4409 struct r5conf *conf = sh->raid_conf;
4410 int disks = sh->disks;
4411 struct r5dev *dev;
4412 int i;
4413 int do_recovery = 0;
4414
4415 memset(s, 0, sizeof(*s));
4416
4417 s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state) && !sh->batch_head;
4418 s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state) && !sh->batch_head;
4419 s->failed_num[0] = -1;
4420 s->failed_num[1] = -1;
4421 s->log_failed = r5l_log_disk_error(conf);
4422
4423 /* Now to look around and see what can be done */
4424 rcu_read_lock();
4425 for (i=disks; i--; ) {
4426 struct md_rdev *rdev;
4427 sector_t first_bad;
4428 int bad_sectors;
4429 int is_bad = 0;
4430
4431 dev = &sh->dev[i];
4432
4433 pr_debug("check %d: state 0x%lx read %p write %p written %p\n",
4434 i, dev->flags,
4435 dev->toread, dev->towrite, dev->written);
4436
4437 /* maybe we can reply to a read
4438  * new wantfill requests are only permitted while
4439  * ops_complete_biofill is guaranteed to be inactive
4440  */
4441 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread &&
4442 !test_bit(STRIPE_BIOFILL_RUN, &sh->state))
4443 set_bit(R5_Wantfill, &dev->flags);
4444
4445
4446 if (test_bit(R5_LOCKED, &dev->flags))
4447 s->locked++;
4448 if (test_bit(R5_UPTODATE, &dev->flags))
4449 s->uptodate++;
4450 if (test_bit(R5_Wantcompute, &dev->flags)) {
4451 s->compute++;
4452 BUG_ON(s->compute > 2);
4453 }
4454
4455 if (test_bit(R5_Wantfill, &dev->flags))
4456 s->to_fill++;
4457 else if (dev->toread)
4458 s->to_read++;
4459 if (dev->towrite) {
4460 s->to_write++;
4461 if (!test_bit(R5_OVERWRITE, &dev->flags))
4462 s->non_overwrite++;
4463 }
4464 if (dev->written)
4465 s->written++;
4466 /* Prefer to use the replacement for reads, but only
4467  * if it is recovered enough and has no bad blocks.
4468  */
4469 rdev = rcu_dereference(conf->disks[i].replacement);
4470 if (rdev && !test_bit(Faulty, &rdev->flags) &&
4471 rdev->recovery_offset >= sh->sector + RAID5_STRIPE_SECTORS(conf) &&
4472 !is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf),
4473 &first_bad, &bad_sectors))
4474 set_bit(R5_ReadRepl, &dev->flags);
4475 else {
4476 if (rdev && !test_bit(Faulty, &rdev->flags))
4477 set_bit(R5_NeedReplace, &dev->flags);
4478 else
4479 clear_bit(R5_NeedReplace, &dev->flags);
4480 rdev = rcu_dereference(conf->disks[i].rdev);
4481 clear_bit(R5_ReadRepl, &dev->flags);
4482 }
4483 if (rdev && test_bit(Faulty, &rdev->flags))
4484 rdev = NULL;
4485 if (rdev) {
4486 is_bad = is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf),
4487 &first_bad, &bad_sectors);
4488 if (s->blocked_rdev == NULL
4489 && (test_bit(Blocked, &rdev->flags)
4490 || is_bad < 0)) {
4491 if (is_bad < 0)
4492 set_bit(BlockedBadBlocks,
4493 &rdev->flags);
4494 s->blocked_rdev = rdev;
4495 atomic_inc(&rdev->nr_pending);
4496 }
4497 }
4498 clear_bit(R5_Insync, &dev->flags);
4499 if (!rdev)
4500 /* Not in-sync */;
4501 else if (is_bad) {
4502 /* also not in-sync */
4503 if (!test_bit(WriteErrorSeen, &rdev->flags) &&
4504 test_bit(R5_UPTODATE, &dev->flags)) {
4505 /* treat as in-sync, but with a read error
4506  * which we can now try to correct
4507  */
4508 set_bit(R5_Insync, &dev->flags);
4509 set_bit(R5_ReadError, &dev->flags);
4510 }
4511 } else if (test_bit(In_sync, &rdev->flags))
4512 set_bit(R5_Insync, &dev->flags);
4513 else if (sh->sector + RAID5_STRIPE_SECTORS(conf) <= rdev->recovery_offset)
4514 /* in sync if before recovery_offset */
4515 set_bit(R5_Insync, &dev->flags);
4516 else if (test_bit(R5_UPTODATE, &dev->flags) &&
4517 test_bit(R5_Expanded, &dev->flags))
4518 /* If we've reshaped into here, we assume it is Insync.
4519  * We will shortly update recovery_offset to make
4520  * it official.
4521  */
4522 set_bit(R5_Insync, &dev->flags);
4523
4524 if (test_bit(R5_WriteError, &dev->flags)) {
4525 /* This flag does not apply to '.replacement'
4526  * only to .rdev, so make sure to check that */
4527 struct md_rdev *rdev2 = rcu_dereference(
4528 conf->disks[i].rdev);
4529 if (rdev2 == rdev)
4530 clear_bit(R5_Insync, &dev->flags);
4531 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
4532 s->handle_bad_blocks = 1;
4533 atomic_inc(&rdev2->nr_pending);
4534 } else
4535 clear_bit(R5_WriteError, &dev->flags);
4536 }
4537 if (test_bit(R5_MadeGood, &dev->flags)) {
4538 /* This flag does not apply to '.replacement'
4539  * only to .rdev, so make sure to check that */
4540 struct md_rdev *rdev2 = rcu_dereference(
4541 conf->disks[i].rdev);
4542 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
4543 s->handle_bad_blocks = 1;
4544 atomic_inc(&rdev2->nr_pending);
4545 } else
4546 clear_bit(R5_MadeGood, &dev->flags);
4547 }
4548 if (test_bit(R5_MadeGoodRepl, &dev->flags)) {
4549 struct md_rdev *rdev2 = rcu_dereference(
4550 conf->disks[i].replacement);
4551 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
4552 s->handle_bad_blocks = 1;
4553 atomic_inc(&rdev2->nr_pending);
4554 } else
4555 clear_bit(R5_MadeGoodRepl, &dev->flags);
4556 }
4557 if (!test_bit(R5_Insync, &dev->flags)) {
4558 /* The ReadError flag will just be confusing now */
4559 clear_bit(R5_ReadError, &dev->flags);
4560 clear_bit(R5_ReWrite, &dev->flags);
4561 }
4562 if (test_bit(R5_ReadError, &dev->flags))
4563 clear_bit(R5_Insync, &dev->flags);
4564 if (!test_bit(R5_Insync, &dev->flags)) {
4565 if (s->failed < 2)
4566 s->failed_num[s->failed] = i;
4567 s->failed++;
4568 if (rdev && !test_bit(Faulty, &rdev->flags))
4569 do_recovery = 1;
4570 else if (!rdev) {
4571 rdev = rcu_dereference(
4572 conf->disks[i].replacement);
4573 if (rdev && !test_bit(Faulty, &rdev->flags))
4574 do_recovery = 1;
4575 }
4576 }
4577
4578 if (test_bit(R5_InJournal, &dev->flags))
4579 s->injournal++;
4580 if (test_bit(R5_InJournal, &dev->flags) && dev->written)
4581 s->just_cached++;
4582 }
4583 if (test_bit(STRIPE_SYNCING, &sh->state)) {
4584 /* If there is a failed device being replaced,
4585  *     we must be recovering.
4586  * else if we are after recovery_cp, we must be syncing
4587  * else if MD_RECOVERY_REQUESTED is set, we also are syncing.
4588  * else we can only be replacing
4589  * sync and recovery both need to read all devices, and so
4590  * use the same flag.
4591  */
4592 if (do_recovery ||
4593 sh->sector >= conf->mddev->recovery_cp ||
4594 test_bit(MD_RECOVERY_REQUESTED, &(conf->mddev->recovery)))
4595 s->syncing = 1;
4596 else
4597 s->replacing = 1;
4598 }
4599 rcu_read_unlock();
4600}
4601
4602 /*
4603  * Return '1' if this is a member of batch, or '0' if it is a lone stripe
4604  * or a head which can now be handled.
4605  */
4606static int clear_batch_ready(struct stripe_head *sh)
4607{
4608 struct stripe_head *tmp;
4609 if (!test_and_clear_bit(STRIPE_BATCH_READY, &sh->state))
4610 return (sh->batch_head && sh->batch_head != sh);
4611 spin_lock(&sh->stripe_lock);
4612 if (!sh->batch_head) {
4613 spin_unlock(&sh->stripe_lock);
4614 return 0;
4615 }
4616
4617 /*
4618  * this stripe could be added to a batch list before we check
4619  * BATCH_READY, skip it
4620  */
4621 if (sh->batch_head != sh) {
4622 spin_unlock(&sh->stripe_lock);
4623 return 1;
4624 }
4625 spin_lock(&sh->batch_lock);
4626 list_for_each_entry(tmp, &sh->batch_list, batch_list)
4627 clear_bit(STRIPE_BATCH_READY, &tmp->state);
4628 spin_unlock(&sh->batch_lock);
4629 spin_unlock(&sh->stripe_lock);
4630
4631 /*
4632  * BATCH_READY is cleared, no new stripes can be added.
4633  * batch_list can be accessed without lock
4634  */
4635 return 0;
4636}
4637
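/*
 * break_stripe_batch_list() detaches every stripe from head_sh's batch
 * list, copies the head's check/reconstruct state and relevant device
 * flags to each member, and queues members matching handle_flags (all
 * of them when handle_flags is 0) for individual handling.
 */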
4638static void break_stripe_batch_list(struct stripe_head *head_sh,
4639 unsigned long handle_flags)
4640{
4641 struct stripe_head *sh, *next;
4642 int i;
4643 int do_wakeup = 0;
4644
4645 list_for_each_entry_safe(sh, next, &head_sh->batch_list, batch_list) {
4646
4647 list_del_init(&sh->batch_list);
4648
4649 WARN_ONCE(sh->state & ((1 << STRIPE_ACTIVE) |
4650 (1 << STRIPE_SYNCING) |
4651 (1 << STRIPE_REPLACED) |
4652 (1 << STRIPE_DELAYED) |
4653 (1 << STRIPE_BIT_DELAY) |
4654 (1 << STRIPE_FULL_WRITE) |
4655 (1 << STRIPE_BIOFILL_RUN) |
4656 (1 << STRIPE_COMPUTE_RUN) |
4657 (1 << STRIPE_DISCARD) |
4658 (1 << STRIPE_BATCH_READY) |
4659 (1 << STRIPE_BATCH_ERR) |
4660 (1 << STRIPE_BITMAP_PENDING)),
4661 "stripe state: %lx\n", sh->state);
4662 WARN_ONCE(head_sh->state & ((1 << STRIPE_DISCARD) |
4663 (1 << STRIPE_REPLACED)),
4664 "head stripe state: %lx\n", head_sh->state);
4665
4666 set_mask_bits(&sh->state, ~(STRIPE_EXPAND_SYNC_FLAGS |
4667 (1 << STRIPE_PREREAD_ACTIVE) |
4668 (1 << STRIPE_DEGRADED) |
4669 (1 << STRIPE_ON_UNPLUG_LIST)),
4670 head_sh->state & (1 << STRIPE_INSYNC));
4671
4672 sh->check_state = head_sh->check_state;
4673 sh->reconstruct_state = head_sh->reconstruct_state;
4674 spin_lock_irq(&sh->stripe_lock);
4675 sh->batch_head = NULL;
4676 spin_unlock_irq(&sh->stripe_lock);
4677 for (i = 0; i < sh->disks; i++) {
4678 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
4679 do_wakeup = 1;
4680 sh->dev[i].flags = head_sh->dev[i].flags &
4681 (~((1 << R5_WriteError) | (1 << R5_Overlap)));
4682 }
4683 if (handle_flags == 0 ||
4684 sh->state & handle_flags)
4685 set_bit(STRIPE_HANDLE, &sh->state);
4686 raid5_release_stripe(sh);
4687 }
4688 spin_lock_irq(&head_sh->stripe_lock);
4689 head_sh->batch_head = NULL;
4690 spin_unlock_irq(&head_sh->stripe_lock);
4691 for (i = 0; i < head_sh->disks; i++)
4692 if (test_and_clear_bit(R5_Overlap, &head_sh->dev[i].flags))
4693 do_wakeup = 1;
4694 if (head_sh->state & handle_flags)
4695 set_bit(STRIPE_HANDLE, &head_sh->state);
4696
4697 if (do_wakeup)
4698 wake_up(&head_sh->raid_conf->wait_for_overlap);
4699}
4700
4701static void handle_stripe(struct stripe_head *sh)
4702{
4703 struct stripe_head_state s;
4704 struct r5conf *conf = sh->raid_conf;
4705 int i;
4706 int prexor;
4707 int disks = sh->disks;
4708 struct r5dev *pdev, *qdev;
4709
4710 clear_bit(STRIPE_HANDLE, &sh->state);
4711
4712 /*
4713  * handle_stripe must not continue to handle a batched stripe; only
4714  * the head of the batch list or a lone stripe may continue.
4715  * Otherwise break_stripe_batch_list could warn about STRIPE_ACTIVE,
4716  * and STRIPE_HANDLE may be claimed by the handler of those stripes.
4717  */
4718 if (clear_batch_ready(sh))
4719 return;
4720
4721 if (test_and_set_bit_lock(STRIPE_ACTIVE, &sh->state)) {
4722 /* already being handled, ensure it gets handled
4723  * again when current action finishes */
4724 set_bit(STRIPE_HANDLE, &sh->state);
4725 return;
4726 }
4727
4728 if (test_and_clear_bit(STRIPE_BATCH_ERR, &sh->state))
4729 break_stripe_batch_list(sh, 0);
4730
4731 if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) && !sh->batch_head) {
4732 spin_lock(&sh->stripe_lock);
4733 /*
4734  * Cannot process 'sync' concurrently with 'discard'.
4735  * Flush data in r5cache before 'sync'.
4736  */
4737 if (!test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state) &&
4738 !test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state) &&
4739 !test_bit(STRIPE_DISCARD, &sh->state) &&
4740 test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
4741 set_bit(STRIPE_SYNCING, &sh->state);
4742 clear_bit(STRIPE_INSYNC, &sh->state);
4743 clear_bit(STRIPE_REPLACED, &sh->state);
4744 }
4745 spin_unlock(&sh->stripe_lock);
4746 }
4747 clear_bit(STRIPE_DELAYED, &sh->state);
4748
4749 pr_debug("handling stripe %llu, state=%#lx cnt=%d, "
4750 "pd_idx=%d, qd_idx=%d, check:%d, reconstruct:%d\n",
4751 (unsigned long long)sh->sector, sh->state,
4752 atomic_read(&sh->count), sh->pd_idx, sh->qd_idx,
4753 sh->check_state, sh->reconstruct_state);
4754
4755 analyse_stripe(sh, &s);
4756
4757 if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
4758 goto finish;
4759
4760 if (s.handle_bad_blocks ||
4761 test_bit(MD_SB_CHANGE_PENDING, &conf->mddev->sb_flags)) {
4762 set_bit(STRIPE_HANDLE, &sh->state);
4763 goto finish;
4764 }
4765
4766 if (unlikely(s.blocked_rdev)) {
4767 if (s.syncing || s.expanding || s.expanded ||
4768 s.replacing || s.to_write || s.written) {
4769 set_bit(STRIPE_HANDLE, &sh->state);
4770 goto finish;
4771 }
4772 /* There is nothing for the blocked_rdev to block */
4773 rdev_dec_pending(s.blocked_rdev, conf->mddev);
4774 s.blocked_rdev = NULL;
4775 }
4776
4777 if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) {
4778 set_bit(STRIPE_OP_BIOFILL, &s.ops_request);
4779 set_bit(STRIPE_BIOFILL_RUN, &sh->state);
4780 }
4781
4782 pr_debug("locked=%d uptodate=%d to_read=%d"
4783 " to_write=%d failed=%d failed_num=%d,%d\n",
4784 s.locked, s.uptodate, s.to_read, s.to_write, s.failed,
4785 s.failed_num[0], s.failed_num[1]);
4786
4787 /*
4788  * check if the array has lost more than max_degraded devices and,
4789  * if so, some requests might need to be failed.
4790  */
4793 if (s.failed > conf->max_degraded ||
4794 (s.log_failed && s.injournal == 0)) {
4795 sh->check_state = 0;
4796 sh->reconstruct_state = 0;
4797 break_stripe_batch_list(sh, 0);
4798 if (s.to_read+s.to_write+s.written)
4799 handle_failed_stripe(conf, sh, &s, disks);
4800 if (s.syncing + s.replacing)
4801 handle_failed_sync(conf, sh, &s);
4802 }
4803
4804 /* Now we check to see if any write operations have recently
4805  * completed
4806  */
4807 prexor = 0;
4808 if (sh->reconstruct_state == reconstruct_state_prexor_drain_result)
4809 prexor = 1;
4810 if (sh->reconstruct_state == reconstruct_state_drain_result ||
4811 sh->reconstruct_state == reconstruct_state_prexor_drain_result) {
4812 sh->reconstruct_state = reconstruct_state_idle;
4813
4814 /* All the 'written' buffers and the parity block are ready to
4815  * be written back to disk
4816  */
4817 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags) &&
4818 !test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags));
4819 BUG_ON(sh->qd_idx >= 0 &&
4820 !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags) &&
4821 !test_bit(R5_Discard, &sh->dev[sh->qd_idx].flags));
4822 for (i = disks; i--; ) {
4823 struct r5dev *dev = &sh->dev[i];
4824 if (test_bit(R5_LOCKED, &dev->flags) &&
4825 (i == sh->pd_idx || i == sh->qd_idx ||
4826 dev->written || test_bit(R5_InJournal,
4827 &dev->flags))) {
4828 pr_debug("Writing block %d\n", i);
4829 set_bit(R5_Wantwrite, &dev->flags);
4830 if (prexor)
4831 continue;
4832 if (s.failed > 1)
4833 continue;
4834 if (!test_bit(R5_Insync, &dev->flags) ||
4835 ((i == sh->pd_idx || i == sh->qd_idx) &&
4836 s.failed == 0))
4837 set_bit(STRIPE_INSYNC, &sh->state);
4838 }
4839 }
4840 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
4841 s.dec_preread_active = 1;
4842 }
4843
4844 /*
4845  * might be able to return some write requests if the parity blocks
4846  * are safe, or on a failed drive
4847  */
4848 pdev = &sh->dev[sh->pd_idx];
4849 s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx)
4850 || (s.failed >= 2 && s.failed_num[1] == sh->pd_idx);
4851 qdev = &sh->dev[sh->qd_idx];
4852 s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx)
4853 || (s.failed >= 2 && s.failed_num[1] == sh->qd_idx)
4854 || conf->level < 6;
4855
4856 if (s.written &&
4857 (s.p_failed || ((test_bit(R5_Insync, &pdev->flags)
4858 && !test_bit(R5_LOCKED, &pdev->flags)
4859 && (test_bit(R5_UPTODATE, &pdev->flags) ||
4860 test_bit(R5_Discard, &pdev->flags))))) &&
4861 (s.q_failed || ((test_bit(R5_Insync, &qdev->flags)
4862 && !test_bit(R5_LOCKED, &qdev->flags)
4863 && (test_bit(R5_UPTODATE, &qdev->flags) ||
4864 test_bit(R5_Discard, &qdev->flags))))))
4865 handle_stripe_clean_event(conf, sh, disks);
4866
4867 if (s.just_cached)
4868 r5c_handle_cached_data_endio(conf, sh, disks);
4869 log_stripe_write_finished(sh);
4870
4871 /* Now we might consider reading some blocks, either to check/generate
4872  * parity, or to satisfy requests
4873  * or to load a block that is being partially written.
4874  */
4875 if (s.to_read || s.non_overwrite
4876 || (s.to_write && s.failed)
4877 || (s.syncing && (s.uptodate + s.compute < disks))
4878 || s.replacing
4879 || s.expanding)
4880 handle_stripe_fill(sh, &s, disks);
4881
4882 /*
4883  * When the stripe finishes full journal write cycle (write to journal
4884  * and raid disk), this is the clean up procedure so it is ready for
4885  * next operation.
4886  */
4887 r5c_finish_stripe_write_out(conf, sh, &s);
4888
4889 /*
4890  * Now to consider new write requests, cache write back and what else,
4891  * if anything should be read.  We do not handle new writes when:
4892  * 1/ A 'write' operation (copy+xor) is already in flight.
4893  * 2/ A 'check' operation is in flight, as it may clobber the parity
4894  *    block.
4895  * 3/ A r5cache is enabled and the region is in r5cache.
4896  */
4897
4898 if (!sh->reconstruct_state && !sh->check_state && !sh->log_io) {
4899 if (!r5c_is_writeback(conf->log)) {
4900 if (s.to_write)
4901 handle_stripe_dirtying(conf, sh, &s, disks);
4902 } else {
4903 int ret = 0;
4904
4905 /* First, try handle writes in caching phase */
4906 if (s.to_write)
4907 ret = r5c_try_caching_write(conf, sh, &s,
4908 disks);
4909
4910 /*
4911  * If caching phase failed: ret == -EAGAIN
4912  *    OR
4913  * stripe under reclaim: !caching && injournal
4914  * fall back to handle_stripe_dirtying()
4915  */
4916 if (ret == -EAGAIN ||
4917 /* stripe under reclaim: !caching && injournal */
4918 (!test_bit(STRIPE_R5C_CACHING, &sh->state) &&
4919 s.injournal > 0)) {
4920 ret = handle_stripe_dirtying(conf, sh, &s,
4921 disks);
4922 if (ret == -EAGAIN)
4923 goto finish;
4924 }
4925 }
4926 }
4927
4928 /* maybe we need to check and possibly fix the parity for this stripe
4929  * Any reads will already have been scheduled, so we just see if enough
4930  * data is available.  The parity check is held off while parity
4931  * dependent operations are in flight.
4932  */
4933 if (sh->check_state ||
4934 (s.syncing && s.locked == 0 &&
4935 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
4936 !test_bit(STRIPE_INSYNC, &sh->state))) {
4937 if (conf->level == 6)
4938 handle_parity_checks6(conf, sh, &s, disks);
4939 else
4940 handle_parity_checks5(conf, sh, &s, disks);
4941 }
4942
4943 if ((s.replacing || s.syncing) && s.locked == 0
4944 && !test_bit(STRIPE_COMPUTE_RUN, &sh->state)
4945 && !test_bit(STRIPE_REPLACED, &sh->state)) {
4946 /* Write out to replacement devices where possible */
4947 for (i = 0; i < conf->raid_disks; i++)
4948 if (test_bit(R5_NeedReplace, &sh->dev[i].flags)) {
4949 WARN_ON(!test_bit(R5_UPTODATE, &sh->dev[i].flags));
4950 set_bit(R5_WantReplace, &sh->dev[i].flags);
4951 set_bit(R5_LOCKED, &sh->dev[i].flags);
4952 s.locked++;
4953 }
4954 if (s.replacing)
4955 set_bit(STRIPE_INSYNC, &sh->state);
4956 set_bit(STRIPE_REPLACED, &sh->state);
4957 }
4958 if ((s.syncing || s.replacing) && s.locked == 0 &&
4959 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
4960 test_bit(STRIPE_INSYNC, &sh->state)) {
4961 md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), 1);
4962 clear_bit(STRIPE_SYNCING, &sh->state);
4963 if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags))
4964 wake_up(&conf->wait_for_overlap);
4965 }
4966
4967 /* If the failed drives are just a ReadError, then we might need
4968  * to progress the repair/check process
4969  */
4970 if (s.failed <= conf->max_degraded && !conf->mddev->ro)
4971 for (i = 0; i < s.failed; i++) {
4972 struct r5dev *dev = &sh->dev[s.failed_num[i]];
4973 if (test_bit(R5_ReadError, &dev->flags)
4974 && !test_bit(R5_LOCKED, &dev->flags)
4975 && test_bit(R5_UPTODATE, &dev->flags)
4976 ) {
4977 if (!test_bit(R5_ReWrite, &dev->flags)) {
4978 set_bit(R5_Wantwrite, &dev->flags);
4979 set_bit(R5_ReWrite, &dev->flags);
4980 } else
4981 /* let's read it back */
4982 set_bit(R5_Wantread, &dev->flags);
4983 set_bit(R5_LOCKED, &dev->flags);
4984 s.locked++;
4985 }
4986 }
4987
4988 /* Finish reconstruct operations initiated by the expansion process */
4989 if (sh->reconstruct_state == reconstruct_state_result) {
4990 struct stripe_head *sh_src
4991 = raid5_get_active_stripe(conf, sh->sector, 1, 1, 1);
4992 if (sh_src && test_bit(STRIPE_EXPAND_SOURCE, &sh_src->state)) {
4993 /* sh cannot be written until sh_src has been read.
4994  * so arrange for sh to be delayed a little
4995  */
4996 set_bit(STRIPE_DELAYED, &sh->state);
4997 set_bit(STRIPE_HANDLE, &sh->state);
4998 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE,
4999 &sh_src->state))
5000 atomic_inc(&conf->preread_active_stripes);
5001 raid5_release_stripe(sh_src);
5002 goto finish;
5003 }
5004 if (sh_src)
5005 raid5_release_stripe(sh_src);
5006
5007 sh->reconstruct_state = reconstruct_state_idle;
5008 clear_bit(STRIPE_EXPANDING, &sh->state);
5009 for (i = conf->raid_disks; i--; ) {
5010 set_bit(R5_Wantwrite, &sh->dev[i].flags);
5011 set_bit(R5_LOCKED, &sh->dev[i].flags);
5012 s.locked++;
5013 }
5014 }
5015
5016 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) &&
5017 !sh->reconstruct_state) {
5018 /* Need to write out all blocks after computing parity */
5019 sh->disks = conf->raid_disks;
5020 stripe_set_idx(sh->sector, conf, 0, sh);
5021 schedule_reconstruction(sh, &s, 1, 1);
5022 } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) {
5023 clear_bit(STRIPE_EXPAND_READY, &sh->state);
5024 atomic_dec(&conf->reshape_stripes);
5025 wake_up(&conf->wait_for_overlap);
5026 md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), 1);
5027 }
5028
5029 if (s.expanding && s.locked == 0 &&
5030 !test_bit(STRIPE_COMPUTE_RUN, &sh->state))
5031 handle_stripe_expansion(conf, sh);
5032
5033finish:
5034 /* wait for this device to become unblocked */
5035 if (unlikely(s.blocked_rdev)) {
5036 if (conf->mddev->external)
5037 md_wait_for_blocked_rdev(s.blocked_rdev,
5038 conf->mddev);
5039 else
5040 /* Internal metadata will immediately
5041  * be written by raid5d, so we don't
5042  * need to wait here.
5043  */
5044 rdev_dec_pending(s.blocked_rdev,
5045 conf->mddev);
5046 }
5047
5048 if (s.handle_bad_blocks)
5049 for (i = disks; i--; ) {
5050 struct md_rdev *rdev;
5051 struct r5dev *dev = &sh->dev[i];
5052 if (test_and_clear_bit(R5_WriteError, &dev->flags)) {
5053 /* We own a safe reference to the rdev */
5054 rdev = conf->disks[i].rdev;
5055 if (!rdev_set_badblocks(rdev, sh->sector,
5056 RAID5_STRIPE_SECTORS(conf), 0))
5057 md_error(conf->mddev, rdev);
5058 rdev_dec_pending(rdev, conf->mddev);
5059 }
5060 if (test_and_clear_bit(R5_MadeGood, &dev->flags)) {
5061 rdev = conf->disks[i].rdev;
5062 rdev_clear_badblocks(rdev, sh->sector,
5063 RAID5_STRIPE_SECTORS(conf), 0);
5064 rdev_dec_pending(rdev, conf->mddev);
5065 }
5066 if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) {
5067 rdev = conf->disks[i].replacement;
5068 if (!rdev)
5069 /* rdev has been moved down */
5070 rdev = conf->disks[i].rdev;
5071 rdev_clear_badblocks(rdev, sh->sector,
5072 RAID5_STRIPE_SECTORS(conf), 0);
5073 rdev_dec_pending(rdev, conf->mddev);
5074 }
5075 }
5076
5077 if (s.ops_request)
5078 raid_run_ops(sh, s.ops_request);
5079
5080 ops_run_io(sh, &s);
5081
5082 if (s.dec_preread_active) {
5083 /* We delay this until after ops_run_io so that if make_request
5084  * is waiting on a flush, it won't continue until the writes
5085  * have actually been submitted.
5086  */
5087 atomic_dec(&conf->preread_active_stripes);
5088 if (atomic_read(&conf->preread_active_stripes) <
5089 IO_THRESHOLD)
5090 md_wakeup_thread(conf->mddev->thread);
5091 }
5092
5093 clear_bit_unlock(STRIPE_ACTIVE, &sh->state);
5094}
5095
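/*
 * Once pre-read activity has dropped below IO_THRESHOLD, move stripes
 * that were delayed (in the hope of growing into full-stripe writes)
 * onto the hold list so they make forward progress.
 */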
5096static void raid5_activate_delayed(struct r5conf *conf)
5097{
5098 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) {
5099 while (!list_empty(&conf->delayed_list)) {
5100 struct list_head *l = conf->delayed_list.next;
5101 struct stripe_head *sh;
5102 sh = list_entry(l, struct stripe_head, lru);
5103 list_del_init(l);
5104 clear_bit(STRIPE_DELAYED, &sh->state);
5105 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
5106 atomic_inc(&conf->preread_active_stripes);
5107 list_add_tail(&sh->lru, &conf->hold_list);
5108 raid5_wakeup_stripe_thread(sh);
5109 }
5110 }
5111}
5112
5113static void activate_bit_delay(struct r5conf *conf,
5114 struct list_head *temp_inactive_list)
5115{
5116 /* device_lock is held */
5117 struct list_head head;
5118 list_add(&head, &conf->bitmap_list);
5119 list_del_init(&conf->bitmap_list);
5120 while (!list_empty(&head)) {
5121 struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru);
5122 int hash;
5123 list_del_init(&sh->lru);
5124 atomic_inc(&sh->count);
5125 hash = sh->hash_lock_index;
5126 __release_stripe(conf, sh, &temp_inactive_list[hash]);
5127 }
5128}
5129
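/*
 * Return true if the bio fits entirely inside one chunk on one device,
 * judged against the smaller of the old and new chunk size while a
 * reshape is pending.  For example, with 128-sector chunks a 16-sector
 * bio at sector 120 covers sectors 120-135, and (120 & 127) + 16 = 136
 * exceeds 128, so it straddles a boundary and the test fails.
 */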
5130static int in_chunk_boundary(struct mddev *mddev, struct bio *bio)
5131{
5132 struct r5conf *conf = mddev->private;
5133 sector_t sector = bio->bi_iter.bi_sector;
5134 unsigned int chunk_sectors;
5135 unsigned int bio_sectors = bio_sectors(bio);
5136
5137 WARN_ON_ONCE(bio->bi_partno);
5138
5139 chunk_sectors = min(conf->chunk_sectors, conf->prev_chunk_sectors);
5140 return chunk_sectors >=
5141 ((sector & (chunk_sectors - 1)) + bio_sectors);
5142}
5143
5144 /*
5145  * add bio to the retry LIFO ( in O(1) ... we are in interrupt )
5146  * later sampled by raid5d
5147  */
5148 static void add_bio_to_retry(struct bio *bi, struct r5conf *conf)
5149{
5150 unsigned long flags;
5151
5152 spin_lock_irqsave(&conf->device_lock, flags);
5153
5154 bi->bi_next = conf->retry_read_aligned_list;
5155 conf->retry_read_aligned_list = bi;
5156
5157 spin_unlock_irqrestore(&conf->device_lock, flags);
5158 md_wakeup_thread(conf->mddev->thread);
5159}
5160
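/*
 * Take the next read to retry off the list.  A partially processed bio
 * parked in ->retry_read_aligned (with its resume offset) takes
 * priority over the LIFO of untouched bios.
 */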
5161static struct bio *remove_bio_from_retry(struct r5conf *conf,
5162 unsigned int *offset)
5163{
5164 struct bio *bi;
5165
5166 bi = conf->retry_read_aligned;
5167 if (bi) {
5168 *offset = conf->retry_read_offset;
5169 conf->retry_read_aligned = NULL;
5170 return bi;
5171 }
5172 bi = conf->retry_read_aligned_list;
5173 if (bi) {
5174 conf->retry_read_aligned_list = bi->bi_next;
5175 bi->bi_next = NULL;
5176 *offset = 0;
5177 }
5178
5179 return bi;
5180}
5181
5182
5183 /*
5184  * The "raid5_align_endio" should check if the read succeeded and if it
5185  * did, call bio_endio on the original bio (having bio_put the new bio
5186  * first).  If the read failed, the bio is queued for retry.
5187  */
5188static void raid5_align_endio(struct bio *bi)
5189{
5190 struct bio* raid_bi = bi->bi_private;
5191 struct mddev *mddev;
5192 struct r5conf *conf;
5193 struct md_rdev *rdev;
5194 blk_status_t error = bi->bi_status;
5195
5196 bio_put(bi);
5197
5198 rdev = (void*)raid_bi->bi_next;
5199 raid_bi->bi_next = NULL;
5200 mddev = rdev->mddev;
5201 conf = mddev->private;
5202
5203 rdev_dec_pending(rdev, conf->mddev);
5204
5205 if (!error) {
5206 bio_endio(raid_bi);
5207 if (atomic_dec_and_test(&conf->active_aligned_reads))
5208 wake_up(&conf->wait_for_quiescent);
5209 return;
5210 }
5211
5212 pr_debug("raid5_align_endio : io error...handing IO for a retry\n");
5213
5214 add_bio_to_retry(raid_bi, conf);
5215}
5216
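/*
 * Try to service a chunk-aligned read directly from the chosen member
 * device, bypassing the stripe cache.  Returns 1 if the cloned bio was
 * submitted, 0 if the caller must fall back to the stripe path.
 */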
5217static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
5218{
5219 struct r5conf *conf = mddev->private;
5220 int dd_idx;
5221 struct bio* align_bi;
5222 struct md_rdev *rdev;
5223 sector_t end_sector;
5224
5225 if (!in_chunk_boundary(mddev, raid_bio)) {
5226 pr_debug("%s: non aligned\n", __func__);
5227 return 0;
5228 }
5229
5230
5231 /* use bio_clone_fast to make a copy of the bio */
5232 align_bi = bio_clone_fast(raid_bio, GFP_NOIO, &mddev->bio_set);
5233 if (!align_bi)
5234 return 0;
5235
5236 /* set bi_end_io to a new function, and set bi_private to the
5237  * original bio.
5238  */
5239 align_bi->bi_end_io = raid5_align_endio;
5240 align_bi->bi_private = raid_bio;
5241
5242
5243 /* compute position */
5244 align_bi->bi_iter.bi_sector =
5245 raid5_compute_sector(conf, raid_bio->bi_iter.bi_sector,
5246 0, &dd_idx, NULL);
5247
5248 end_sector = bio_end_sector(align_bi);
5249 rcu_read_lock();
5250 rdev = rcu_dereference(conf->disks[dd_idx].replacement);
5251 if (!rdev || test_bit(Faulty, &rdev->flags) ||
5252 rdev->recovery_offset < end_sector) {
5253 rdev = rcu_dereference(conf->disks[dd_idx].rdev);
5254 if (rdev &&
5255 (test_bit(Faulty, &rdev->flags) ||
5256 !(test_bit(In_sync, &rdev->flags) ||
5257 rdev->recovery_offset >= end_sector)))
5258 rdev = NULL;
5259 }
5260
5261 if (r5c_big_stripe_cached(conf, align_bi->bi_iter.bi_sector)) {
5262 rcu_read_unlock();
5263 bio_put(align_bi);
5264 return 0;
5265 }
5266
5267 if (rdev) {
5268 sector_t first_bad;
5269 int bad_sectors;
5270
5271 atomic_inc(&rdev->nr_pending);
5272 rcu_read_unlock();
5273 raid_bio->bi_next = (void*)rdev;
5274 bio_set_dev(align_bi, rdev->bdev);
5275
5276 if (is_badblock(rdev, align_bi->bi_iter.bi_sector,
5277 bio_sectors(align_bi),
5278 &first_bad, &bad_sectors)) {
5279 bio_put(align_bi);
5280 rdev_dec_pending(rdev, mddev);
5281 return 0;
5282 }
5283
5284 /* No reshape active, so we can trust rdev->data_offset */
5285 align_bi->bi_iter.bi_sector += rdev->data_offset;
5286
5287 spin_lock_irq(&conf->device_lock);
5288 wait_event_lock_irq(conf->wait_for_quiescent,
5289 conf->quiesce == 0,
5290 conf->device_lock);
5291 atomic_inc(&conf->active_aligned_reads);
5292 spin_unlock_irq(&conf->device_lock);
5293
5294 if (mddev->gendisk)
5295 trace_block_bio_remap(align_bi->bi_disk->queue,
5296 align_bi, disk_devt(mddev->gendisk),
5297 raid_bio->bi_iter.bi_sector);
5298 submit_bio_noacct(align_bi);
5299 return 1;
5300 } else {
5301 rcu_read_unlock();
5302 bio_put(align_bi);
5303 return 0;
5304 }
5305}
5306
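/*
 * If the bio crosses a chunk boundary, split it at the boundary and
 * resubmit the tail, then try the aligned fast path on the head.
 * Returns NULL when the read was dispatched, or the bio that still
 * needs the normal stripe path.
 */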
5307static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio)
5308{
5309 struct bio *split;
5310 sector_t sector = raid_bio->bi_iter.bi_sector;
5311 unsigned chunk_sects = mddev->chunk_sectors;
5312 unsigned sectors = chunk_sects - (sector & (chunk_sects-1));
5313
5314 if (sectors < bio_sectors(raid_bio)) {
5315 struct r5conf *conf = mddev->private;
5316 split = bio_split(raid_bio, sectors, GFP_NOIO, &conf->bio_split);
5317 bio_chain(split, raid_bio);
5318 submit_bio_noacct(raid_bio);
5319 raid_bio = split;
5320 }
5321
5322 if (!raid5_read_one_chunk(mddev, raid_bio))
5323 return raid_bio;
5324
5325 return NULL;
5326}
5327
5328
5329 /* __get_priority_stripe - get the next stripe to process
5330  *
5331  * Full stripe writes are allowed to pass preread active stripes up until
5332  * the bypass_threshold is exceeded. In general the bypass_count
5333  * increments when the handle_list is handled before the hold_list;
5334  * however, it will not be incremented when STRIPE_IO_STARTED is sampled
5335  * set, signifying a stripe with in-flight i/o. The bypass_count will be
5336  * reset when the hold_list is handled before the handle_list.
5337  */
5338static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group)
5339{
5340 struct stripe_head *sh, *tmp;
5341 struct list_head *handle_list = NULL;
5342 struct r5worker_group *wg;
5343 bool second_try = !r5c_is_writeback(conf->log) &&
5344 !r5l_log_disk_error(conf);
5345 bool try_loprio = test_bit(R5C_LOG_TIGHT, &conf->cache_state) ||
5346 r5l_log_disk_error(conf);
5347
5348again:
5349 wg = NULL;
5350 sh = NULL;
5351 if (conf->worker_cnt_per_group == 0) {
5352 handle_list = try_loprio ? &conf->loprio_list :
5353 &conf->handle_list;
5354 } else if (group != ANY_GROUP) {
5355 handle_list = try_loprio ? &conf->worker_groups[group].loprio_list :
5356 &conf->worker_groups[group].handle_list;
5357 wg = &conf->worker_groups[group];
5358 } else {
5359 int i;
5360 for (i = 0; i < conf->group_cnt; i++) {
5361 handle_list = try_loprio ? &conf->worker_groups[i].loprio_list :
5362 &conf->worker_groups[i].handle_list;
5363 wg = &conf->worker_groups[i];
5364 if (!list_empty(handle_list))
5365 break;
5366 }
5367 }
5368
5369 pr_debug("%s: handle: %s hold: %s full_writes: %d bypass_count: %d\n",
5370 __func__,
5371 list_empty(handle_list) ? "empty" : "busy",
5372 list_empty(&conf->hold_list) ? "empty" : "busy",
5373 atomic_read(&conf->pending_full_writes), conf->bypass_count);
5374
5375 if (!list_empty(handle_list)) {
5376 sh = list_entry(handle_list->next, typeof(*sh), lru);
5377
5378 if (list_empty(&conf->hold_list))
5379 conf->bypass_count = 0;
5380 else if (!test_bit(STRIPE_IO_STARTED, &sh->state)) {
5381 if (conf->hold_list.next == conf->last_hold)
5382 conf->bypass_count++;
5383 else {
5384 conf->last_hold = conf->hold_list.next;
5385 conf->bypass_count -= conf->bypass_threshold;
5386 if (conf->bypass_count < 0)
5387 conf->bypass_count = 0;
5388 }
5389 }
5390 } else if (!list_empty(&conf->hold_list) &&
5391 ((conf->bypass_threshold &&
5392 conf->bypass_count > conf->bypass_threshold) ||
5393 atomic_read(&conf->pending_full_writes) == 0)) {
5394
5395 list_for_each_entry(tmp, &conf->hold_list, lru) {
5396 if (conf->worker_cnt_per_group == 0 ||
5397 group == ANY_GROUP ||
5398 !cpu_online(tmp->cpu) ||
5399 cpu_to_group(tmp->cpu) == group) {
5400 sh = tmp;
5401 break;
5402 }
5403 }
5404
5405 if (sh) {
5406 conf->bypass_count -= conf->bypass_threshold;
5407 if (conf->bypass_count < 0)
5408 conf->bypass_count = 0;
5409 }
5410 wg = NULL;
5411 }
5412
5413 if (!sh) {
5414 if (second_try)
5415 return NULL;
5416 second_try = true;
5417 try_loprio = !try_loprio;
5418 goto again;
5419 }
5420
5421 if (wg) {
5422 wg->stripes_cnt--;
5423 sh->group = NULL;
5424 }
5425 list_del_init(&sh->lru);
5426 BUG_ON(atomic_inc_return(&sh->count) != 1);
5427 return sh;
5428}
5429
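/*
 * Per-plug context: stripes released while a blk_plug is active are
 * parked on ->list and handed back to the conf lists in one batch by
 * raid5_unplug() when the plug is flushed.
 */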
5430struct raid5_plug_cb {
5431 struct blk_plug_cb cb;
5432 struct list_head list;
5433 struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS];
5434};
5435
5436static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)
5437{
5438 struct raid5_plug_cb *cb = container_of(
5439 blk_cb, struct raid5_plug_cb, cb);
5440 struct stripe_head *sh;
5441 struct mddev *mddev = cb->cb.data;
5442 struct r5conf *conf = mddev->private;
5443 int cnt = 0;
5444 int hash;
5445
5446 if (cb->list.next && !list_empty(&cb->list)) {
5447 spin_lock_irq(&conf->device_lock);
5448 while (!list_empty(&cb->list)) {
5449 sh = list_first_entry(&cb->list, struct stripe_head, lru);
5450 list_del_init(&sh->lru);
5451 /*
5452  * avoid the race where release_stripe_plug() sees
5453  * STRIPE_ON_UNPLUG_LIST clear but the stripe
5454  * is still in our list
5455  */
5456 smp_mb__before_atomic();
5457 clear_bit(STRIPE_ON_UNPLUG_LIST, &sh->state);
5458 /*
5459  * STRIPE_ON_RELEASE_LIST could be set here. In that
5460  * case, the count is always > 1 here
5461  */
5462 hash = sh->hash_lock_index;
5463 __release_stripe(conf, sh, &cb->temp_inactive_list[hash]);
5464 cnt++;
5465 }
5466 spin_unlock_irq(&conf->device_lock);
5467 }
5468 release_inactive_stripe_list(conf, cb->temp_inactive_list,
5469 NR_STRIPE_HASH_LOCKS);
5470 if (mddev->queue)
5471 trace_block_unplug(mddev->queue, cnt, !from_schedule);
5472 kfree(cb);
5473}
5474
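/*
 * Release a stripe through the current plug when one is active, so a
 * burst of releases reaches raid5_unplug() as a single batch;
 * otherwise release it immediately.
 */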
5475static void release_stripe_plug(struct mddev *mddev,
5476 struct stripe_head *sh)
5477{
5478 struct blk_plug_cb *blk_cb = blk_check_plugged(
5479 raid5_unplug, mddev,
5480 sizeof(struct raid5_plug_cb));
5481 struct raid5_plug_cb *cb;
5482
5483 if (!blk_cb) {
5484 raid5_release_stripe(sh);
5485 return;
5486 }
5487
5488 cb = container_of(blk_cb, struct raid5_plug_cb, cb);
5489
5490 if (cb->list.next == NULL) {
5491 int i;
5492 INIT_LIST_HEAD(&cb->list);
5493 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
5494 INIT_LIST_HEAD(cb->temp_inactive_list + i);
5495 }
5496
5497 if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state))
5498 list_add_tail(&sh->lru, &cb->list);
5499 else
5500 raid5_release_stripe(sh);
5501}
5502
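/*
 * Handle a REQ_OP_DISCARD bio: shrink the range to whole stripes,
 * attach the bio as a full overwrite of every data block in each
 * stripe, and let normal stripe handling turn that into discards on
 * the member devices.
 */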
5503static void make_discard_request(struct mddev *mddev, struct bio *bi)
5504{
5505 struct r5conf *conf = mddev->private;
5506 sector_t logical_sector, last_sector;
5507 struct stripe_head *sh;
5508 int stripe_sectors;
5509
5510 if (mddev->reshape_position != MaxSector)
5511 /* skip the discard request while a reshape is in progress */
5512 return;
5513
5514 logical_sector = bi->bi_iter.bi_sector & ~((sector_t)RAID5_STRIPE_SECTORS(conf)-1);
5515 last_sector = bio_end_sector(bi);
5516
5517 bi->bi_next = NULL;
5518
5519 stripe_sectors = conf->chunk_sectors *
5520 (conf->raid_disks - conf->max_degraded);
5521 logical_sector = DIV_ROUND_UP_SECTOR_T(logical_sector,
5522 stripe_sectors);
5523 sector_div(last_sector, stripe_sectors);
5524
5525 logical_sector *= conf->chunk_sectors;
5526 last_sector *= conf->chunk_sectors;
5527
5528 for (; logical_sector < last_sector;
5529 logical_sector += RAID5_STRIPE_SECTORS(conf)) {
5530 DEFINE_WAIT(w);
5531 int d;
5532 again:
5533 sh = raid5_get_active_stripe(conf, logical_sector, 0, 0, 0);
5534 prepare_to_wait(&conf->wait_for_overlap, &w,
5535 TASK_UNINTERRUPTIBLE);
5536 set_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags);
5537 if (test_bit(STRIPE_SYNCING, &sh->state)) {
5538 raid5_release_stripe(sh);
5539 schedule();
5540 goto again;
5541 }
5542 clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags);
5543 spin_lock_irq(&sh->stripe_lock);
5544 for (d = 0; d < conf->raid_disks; d++) {
5545 if (d == sh->pd_idx || d == sh->qd_idx)
5546 continue;
5547 if (sh->dev[d].towrite || sh->dev[d].toread) {
5548 set_bit(R5_Overlap, &sh->dev[d].flags);
5549 spin_unlock_irq(&sh->stripe_lock);
5550 raid5_release_stripe(sh);
5551 schedule();
5552 goto again;
5553 }
5554 }
5555 set_bit(STRIPE_DISCARD, &sh->state);
5556 finish_wait(&conf->wait_for_overlap, &w);
5557 sh->overwrite_disks = 0;
5558 for (d = 0; d < conf->raid_disks; d++) {
5559 if (d == sh->pd_idx || d == sh->qd_idx)
5560 continue;
5561 sh->dev[d].towrite = bi;
5562 set_bit(R5_OVERWRITE, &sh->dev[d].flags);
5563 bio_inc_remaining(bi);
5564 md_write_inc(mddev, bi);
5565 sh->overwrite_disks++;
5566 }
5567 spin_unlock_irq(&sh->stripe_lock);
5568 if (conf->mddev->bitmap) {
5569 for (d = 0;
5570 d < conf->raid_disks - conf->max_degraded;
5571 d++)
5572 md_bitmap_startwrite(mddev->bitmap,
5573 sh->sector,
5574 RAID5_STRIPE_SECTORS(conf),
5575 0);
5576 sh->bm_seq = conf->seq_flush + 1;
5577 set_bit(STRIPE_BIT_DELAY, &sh->state);
5578 }
5579
5580 set_bit(STRIPE_HANDLE, &sh->state);
5581 clear_bit(STRIPE_DELAYED, &sh->state);
5582 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
5583 atomic_inc(&conf->preread_active_stripes);
5584 release_stripe_plug(mddev, sh);
5585 }
5586
5587 bio_endio(bi);
5588}
5589
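/*
 * Main entry point for reads and writes: the bio is processed in
 * RAID5_STRIPE_SECTORS-sized pieces, each piece is attached to the
 * stripe_head that maps it, and the stripes are queued for handling.
 * Returning false tells md to retry the bio later.
 */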
5590static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
5591{
5592 struct r5conf *conf = mddev->private;
5593 int dd_idx;
5594 sector_t new_sector;
5595 sector_t logical_sector, last_sector;
5596 struct stripe_head *sh;
5597 const int rw = bio_data_dir(bi);
5598 DEFINE_WAIT(w);
5599 bool do_prepare;
5600 bool do_flush = false;
5601
5602 if (unlikely(bi->bi_opf & REQ_PREFLUSH)) {
5603 int ret = log_handle_flush_request(conf, bi);
5604
5605 if (ret == 0)
5606 return true;
5607 if (ret == -ENODEV) {
5608 if (md_flush_request(mddev, bi))
5609 return true;
5610 }
5611 /* ret == -EAGAIN, fallback */
5612 /*
5613  * if r5l_handle_flush_request() didn't clear REQ_PREFLUSH,
5614  * we need to flush journal device
5615  */
5616 do_flush = bi->bi_opf & REQ_PREFLUSH;
5617 }
5618
5619 if (!md_write_start(mddev, bi))
5620 return false;
5621
5622 /* If array is degraded, better not do chunk aligned read because
5623  * later we might have to read it again in order to reconstruct
5624  * data on failed drives.
5625  */
5626 if (rw == READ && mddev->degraded == 0 &&
5627 mddev->reshape_position == MaxSector) {
5628 bi = chunk_aligned_read(mddev, bi);
5629 if (!bi)
5630 return true;
5631 }
5632
5633 if (unlikely(bio_op(bi) == REQ_OP_DISCARD)) {
5634 make_discard_request(mddev, bi);
5635 md_write_end(mddev);
5636 return true;
5637 }
5638
5639 logical_sector = bi->bi_iter.bi_sector & ~((sector_t)RAID5_STRIPE_SECTORS(conf)-1);
5640 last_sector = bio_end_sector(bi);
5641 bi->bi_next = NULL;
5642
5643 prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
5644 for (; logical_sector < last_sector; logical_sector += RAID5_STRIPE_SECTORS(conf)) {
5645 int previous;
5646 int seq;
5647
5648 do_prepare = false;
5649 retry:
5650 seq = read_seqcount_begin(&conf->gen_lock);
5651 previous = 0;
5652 if (do_prepare)
5653 prepare_to_wait(&conf->wait_for_overlap, &w,
5654 TASK_UNINTERRUPTIBLE);
5655 if (unlikely(conf->reshape_progress != MaxSector)) {
5656 /* spinlock is needed as reshape_progress may be
5657  * 64bit on a 32bit platform, and so it might be
5658  * possible to see a half-updated value
5659  * Of course reshape_progress could change after
5660  * the lock is dropped, so once we get a reference
5661  * to the stripe that we think it is, we will have
5662  * to check again.
5663  */
5664 spin_lock_irq(&conf->device_lock);
5665 if (mddev->reshape_backwards
5666 ? logical_sector < conf->reshape_progress
5667 : logical_sector >= conf->reshape_progress) {
5668 previous = 1;
5669 } else {
5670 if (mddev->reshape_backwards
5671 ? logical_sector < conf->reshape_safe
5672 : logical_sector >= conf->reshape_safe) {
5673 spin_unlock_irq(&conf->device_lock);
5674 schedule();
5675 do_prepare = true;
5676 goto retry;
5677 }
5678 }
5679 spin_unlock_irq(&conf->device_lock);
5680 }
5681
5682 new_sector = raid5_compute_sector(conf, logical_sector,
5683 previous,
5684 &dd_idx, NULL);
5685 pr_debug("raid456: raid5_make_request, sector %llu logical %llu\n",
5686 (unsigned long long)new_sector,
5687 (unsigned long long)logical_sector);
5688
5689 sh = raid5_get_active_stripe(conf, new_sector, previous,
5690 (bi->bi_opf & REQ_RAHEAD), 0);
5691 if (sh) {
5692 if (unlikely(previous)) {
5693 /* expansion might have moved on while waiting for a
5694  * stripe, so we must do the range check again.
5695  * Expansion could still move past after this
5696  * test, but as we are holding a reference to
5697  * 'sh', we know that if that happens,
5698  * STRIPE_EXPANDING will get set and the expansion
5699  * won't proceed until we finish with the stripe.
5700  */
5701 int must_retry = 0;
5702 spin_lock_irq(&conf->device_lock);
5703 if (mddev->reshape_backwards
5704 ? logical_sector >= conf->reshape_progress
5705 : logical_sector < conf->reshape_progress)
5706 /* mismatch, need to try again */
5707 must_retry = 1;
5708 spin_unlock_irq(&conf->device_lock);
5709 if (must_retry) {
5710 raid5_release_stripe(sh);
5711 schedule();
5712 do_prepare = true;
5713 goto retry;
5714 }
5715 }
5716 if (read_seqcount_retry(&conf->gen_lock, seq)) {
5717 /* Might have got the wrong stripe_head
5718  * by accident
5719  */
5720 raid5_release_stripe(sh);
5721 goto retry;
5722 }
5723
5724 if (test_bit(STRIPE_EXPANDING, &sh->state) ||
5725 !add_stripe_bio(sh, bi, dd_idx, rw, previous)) {
5726 /* Stripe is busy expanding or
5727  * add failed due to overlap.  Flush everything
5728  * and wait a while
5729  */
5730 md_wakeup_thread(mddev->thread);
5731 raid5_release_stripe(sh);
5732 schedule();
5733 do_prepare = true;
5734 goto retry;
5735 }
5736 if (do_flush) {
5737 set_bit(STRIPE_R5C_PREFLUSH, &sh->state);
5738 /* we only need flush for one stripe */
5739 do_flush = false;
5740 }
5741
5742 set_bit(STRIPE_HANDLE, &sh->state);
5743 clear_bit(STRIPE_DELAYED, &sh->state);
5744 if ((!sh->batch_head || sh == sh->batch_head) &&
5745 (bi->bi_opf & REQ_SYNC) &&
5746 !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
5747 atomic_inc(&conf->preread_active_stripes);
5748 release_stripe_plug(mddev, sh);
5749 } else {
5750 /* cannot get stripe for read-ahead, just give up */
5751 bi->bi_status = BLK_STS_IOERR;
5752 break;
5753 }
5754 }
5755 finish_wait(&conf->wait_for_overlap, &w);
5756
5757 if (rw == WRITE)
5758 md_write_end(mddev);
5759 bio_endio(bi);
5760 return true;
5761}
5762
5763static sector_t raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks);
5764
5765static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *skipped)
5766{
5767 /* reshaping is quite different to recovery/resync so it is
5768  * handled quite separately ... here.
5769  *
5770  * On each call to sync_request, we gather one chunk worth of
5771  * destination stripes and flag them as expanding.
5772  * Source stripes are determined by mapping the first and last
5773  * block on the destination stripes.
5774  */
5775
5776 struct r5conf *conf = mddev->private;
5777 struct stripe_head *sh;
5778 struct md_rdev *rdev;
5779 sector_t first_sector, last_sector;
5780 int raid_disks = conf->previous_raid_disks;
5781 int data_disks = raid_disks - conf->max_degraded;
5782 int new_data_disks = conf->raid_disks - conf->max_degraded;
5783 int i;
5784 int dd_idx;
5785 sector_t writepos, readpos, safepos;
5786 sector_t stripe_addr;
5787 int reshape_sectors;
5788 struct list_head stripes;
5789 sector_t retn;
5790
5791 if (sector_nr == 0) {
5792 /* If restarting in the middle, skip the initial sectors */
5793 if (mddev->reshape_backwards &&
5794 conf->reshape_progress < raid5_size(mddev, 0, 0)) {
5795 sector_nr = raid5_size(mddev, 0, 0)
5796 - conf->reshape_progress;
5797 } else if (mddev->reshape_backwards &&
5798 conf->reshape_progress == MaxSector) {
5799 /* shouldn't happen, but just in case, finish up. */
5800 sector_nr = MaxSector;
5801 } else if (!mddev->reshape_backwards &&
5802 conf->reshape_progress > 0)
5803 sector_nr = conf->reshape_progress;
5804 sector_div(sector_nr, new_data_disks);
5805 if (sector_nr) {
5806 mddev->curr_resync_completed = sector_nr;
5807 sysfs_notify_dirent_safe(mddev->sysfs_completed);
5808 *skipped = 1;
5809 retn = sector_nr;
5810 goto finish;
5811 }
5812 }
5813
5814 /* We need to process a full chunk at a time.
5815  * If old and new chunk sizes differ, we need to process the
5816  * largest of these
5817  */
5818
5819 reshape_sectors = max(conf->chunk_sectors, conf->prev_chunk_sectors);
5820
5821 /* We update the metadata at least every 10 seconds, or when
5822  * the data about to be copied would over-write the source of
5823  * the data at the front of the range.  i.e. one new_stripe
5824  * along from reshape_progress new_maps to after where
5825  * reshape_safe old_maps to
5826  */
5827 writepos = conf->reshape_progress;
5828 sector_div(writepos, new_data_disks);
5829 readpos = conf->reshape_progress;
5830 sector_div(readpos, data_disks);
5831 safepos = conf->reshape_safe;
5832 sector_div(safepos, data_disks);
5833 if (mddev->reshape_backwards) {
5834 BUG_ON(writepos < reshape_sectors);
5835 writepos -= reshape_sectors;
5836 readpos += reshape_sectors;
5837 safepos += reshape_sectors;
5838 } else {
5839 writepos += reshape_sectors;
5840 /* readpos and safepos are worst-case calculations.
5841  * A negative number is overly pessimistic, and causes
5842  * obvious problems for unsigned storage.  So clip to 0.
5843  */
5844 readpos -= min_t(sector_t, reshape_sectors, readpos);
5845 safepos -= min_t(sector_t, reshape_sectors, safepos);
5846 }
5847
5848 /* Having calculated the 'writepos' possibly use it
5849  * to set 'stripe_addr' which is where we will write to.
5850  */
5851 if (mddev->reshape_backwards) {
5852 BUG_ON(conf->reshape_progress == 0);
5853 stripe_addr = writepos;
5854 BUG_ON((mddev->dev_sectors &
5855 ~((sector_t)reshape_sectors - 1))
5856 - reshape_sectors - stripe_addr
5857 != sector_nr);
5858 } else {
5859 BUG_ON(writepos != sector_nr + reshape_sectors);
5860 stripe_addr = sector_nr;
5861 }
5862
5863
5864 /*
5865  * 'writepos' is the most advanced device address we might write.
5866  * 'readpos' is the least advanced device address we might read.
5867  * 'safepos' is the least address recorded in the metadata as having
5868  *     been reshaped.
5869  * If there is a min_offset_diff, these are adjusted either by
5870  * increasing the safepos/readpos if diff is negative, or
5871  * increasing writepos if diff is positive.
5872  * If 'readpos' is then behind 'writepos', there is no way that we can
5873  * ensure safety in the face of a crash - that must be done by userspace
5874  * making a backup of the data.  So in that case there is no particular
5875  * need to update the metadata.
5876  */
5877
5883 if (conf->min_offset_diff < 0) {
5884 safepos += -conf->min_offset_diff;
5885 readpos += -conf->min_offset_diff;
5886 } else
5887 writepos += conf->min_offset_diff;
5888
5889 if ((mddev->reshape_backwards
5890 ? (safepos > writepos && readpos < writepos)
5891 : (safepos < writepos && readpos > writepos)) ||
5892 time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
5893 /* Cannot proceed until we've updated the superblock... */
5894 wait_event(conf->wait_for_overlap,
5895 atomic_read(&conf->reshape_stripes)==0
5896 || test_bit(MD_RECOVERY_INTR, &mddev->recovery));
5897 if (atomic_read(&conf->reshape_stripes) != 0)
5898 return 0;
5899 mddev->reshape_position = conf->reshape_progress;
5900 mddev->curr_resync_completed = sector_nr;
5901 if (!mddev->reshape_backwards)
5902 /* Can update recovery_offset */
5903 rdev_for_each(rdev, mddev)
5904 if (rdev->raid_disk >= 0 &&
5905 !test_bit(Journal, &rdev->flags) &&
5906 !test_bit(In_sync, &rdev->flags) &&
5907 rdev->recovery_offset < sector_nr)
5908 rdev->recovery_offset = sector_nr;
5909
5910 conf->reshape_checkpoint = jiffies;
5911 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
5912 md_wakeup_thread(mddev->thread);
5913 wait_event(mddev->sb_wait, mddev->sb_flags == 0 ||
5914 test_bit(MD_RECOVERY_INTR, &mddev->recovery));
5915 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
5916 return 0;
5917 spin_lock_irq(&conf->device_lock);
5918 conf->reshape_safe = mddev->reshape_position;
5919 spin_unlock_irq(&conf->device_lock);
5920 wake_up(&conf->wait_for_overlap);
5921 sysfs_notify_dirent_safe(mddev->sysfs_completed);
5922 }
5923
5924 INIT_LIST_HEAD(&stripes);
5925 for (i = 0; i < reshape_sectors; i += RAID5_STRIPE_SECTORS(conf)) {
5926 int j;
5927 int skipped_disk = 0;
5928 sh = raid5_get_active_stripe(conf, stripe_addr+i, 0, 0, 1);
5929 set_bit(STRIPE_EXPANDING, &sh->state);
5930 atomic_inc(&conf->reshape_stripes);
5931 /* If any of this stripe is beyond the end of the old
5932  * array, then we need to zero those blocks
5933  */
5934 for (j=sh->disks; j--;) {
5935 sector_t s;
5936 if (j == sh->pd_idx)
5937 continue;
5938 if (conf->level == 6 &&
5939 j == sh->qd_idx)
5940 continue;
5941 s = raid5_compute_blocknr(sh, j, 0);
5942 if (s < raid5_size(mddev, 0, 0)) {
5943 skipped_disk = 1;
5944 continue;
5945 }
5946 memset(page_address(sh->dev[j].page), 0, RAID5_STRIPE_SIZE(conf));
5947 set_bit(R5_Expanded, &sh->dev[j].flags);
5948 set_bit(R5_UPTODATE, &sh->dev[j].flags);
5949 }
5950 if (!skipped_disk) {
5951 set_bit(STRIPE_EXPAND_READY, &sh->state);
5952 set_bit(STRIPE_HANDLE, &sh->state);
5953 }
5954 list_add(&sh->lru, &stripes);
5955 }
5956 spin_lock_irq(&conf->device_lock);
5957 if (mddev->reshape_backwards)
5958 conf->reshape_progress -= reshape_sectors * new_data_disks;
5959 else
5960 conf->reshape_progress += reshape_sectors * new_data_disks;
5961 spin_unlock_irq(&conf->device_lock);
5962 /* Ok, those stripes are ready. We can start scheduling
5963  * reads on the source stripes.
5964  * The source stripes are determined by mapping the first and last
5965  * block on the destination stripes.
5966  */
5967 first_sector =
5968 raid5_compute_sector(conf, stripe_addr*(new_data_disks),
5969 1, &dd_idx, NULL);
5970 last_sector =
5971 raid5_compute_sector(conf, ((stripe_addr+reshape_sectors)
5972 * new_data_disks - 1),
5973 1, &dd_idx, NULL);
5974 if (last_sector >= mddev->dev_sectors)
5975 last_sector = mddev->dev_sectors - 1;
5976 while (first_sector <= last_sector) {
5977 sh = raid5_get_active_stripe(conf, first_sector, 1, 0, 1);
5978 set_bit(STRIPE_EXPAND_SOURCE, &sh->state);
5979 set_bit(STRIPE_HANDLE, &sh->state);
5980 raid5_release_stripe(sh);
5981 first_sector += RAID5_STRIPE_SECTORS(conf);
5982 }
5983 /* Now that the sources are clearly marked, we can release
5984  * the destination stripes
5985  */
5986 while (!list_empty(&stripes)) {
5987 sh = list_entry(stripes.next, struct stripe_head, lru);
5988 list_del_init(&sh->lru);
5989 raid5_release_stripe(sh);
5990 }
5991
5992 /* If this takes us to the resync_max point where we have to pause,
5993  * then we need to write out the superblock. */
5994 sector_nr += reshape_sectors;
5995 retn = reshape_sectors;
5996finish:
5997 if (mddev->curr_resync_completed > mddev->resync_max ||
5998 (sector_nr - mddev->curr_resync_completed) * 2
5999 >= mddev->resync_max - mddev->curr_resync_completed) {
6000 /* Cannot proceed until we've updated the superblock... */
6001 wait_event(conf->wait_for_overlap,
6002 atomic_read(&conf->reshape_stripes) == 0
6003 || test_bit(MD_RECOVERY_INTR, &mddev->recovery));
6004 if (atomic_read(&conf->reshape_stripes) != 0)
6005 goto ret;
6006 mddev->reshape_position = conf->reshape_progress;
6007 mddev->curr_resync_completed = sector_nr;
6008 if (!mddev->reshape_backwards)
6009 /* Can update recovery_offset */
6010 rdev_for_each(rdev, mddev)
6011 if (rdev->raid_disk >= 0 &&
6012 !test_bit(Journal, &rdev->flags) &&
6013 !test_bit(In_sync, &rdev->flags) &&
6014 rdev->recovery_offset < sector_nr)
6015 rdev->recovery_offset = sector_nr;
6016 conf->reshape_checkpoint = jiffies;
6017 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
6018 md_wakeup_thread(mddev->thread);
6019 wait_event(mddev->sb_wait,
6020 !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)
6021 || test_bit(MD_RECOVERY_INTR, &mddev->recovery));
6022 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
6023 goto ret;
6024 spin_lock_irq(&conf->device_lock);
6025 conf->reshape_safe = mddev->reshape_position;
6026 spin_unlock_irq(&conf->device_lock);
6027 wake_up(&conf->wait_for_overlap);
6028 sysfs_notify_dirent_safe(mddev->sysfs_completed);
6029 }
6030ret:
6031 return retn;
6032}
6033
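/*
 * One unit of resync/recovery/check work: defer to reshape_request()
 * while a reshape is active, otherwise schedule handling of the stripe
 * at sector_nr and return how many sectors were handled or (via
 * *skipped) can be skipped outright.
 */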
6034static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_nr,
6035 int *skipped)
6036{
6037 struct r5conf *conf = mddev->private;
6038 struct stripe_head *sh;
6039 sector_t max_sector = mddev->dev_sectors;
6040 sector_t sync_blocks;
6041 int still_degraded = 0;
6042 int i;
6043
6044 if (sector_nr >= max_sector) {
6045 /* just being told to finish up .. nothing much to do */
6046
6047 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
6048 end_reshape(conf);
6049 return 0;
6050 }
6051
6052 if (mddev->curr_resync < max_sector) /* aborted */
6053 md_bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
6054 &sync_blocks, 1);
6055 else /* completed sync */
6056 conf->fullsync = 0;
6057 md_bitmap_close_sync(mddev->bitmap);
6058
6059 return 0;
6060 }
6061
6062 /* Allow raid5_quiesce to complete */
6063 wait_event(conf->wait_for_overlap, conf->quiesce != 2);
6064
6065 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
6066 return reshape_request(mddev, sector_nr, skipped);
6067
6068 /* No need to check resync_max as we never do more than one
6069  * stripe, and as resync_max will always be on a chunk boundary,
6070  * if the check in md_do_sync didn't fire, there is no chance
6071  * of overstepping resync_max here
6072  */
6073
6074 /* if there are too many failed drives and we are trying
6075  * to resync, then assert that we are finished, because there is
6076  * nothing we can do.
6077  */
6078 if (mddev->degraded >= conf->max_degraded &&
6079 test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
6080 sector_t rv = mddev->dev_sectors - sector_nr;
6081 *skipped = 1;
6082 return rv;
6083 }
6084 if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
6085 !conf->fullsync &&
6086 !md_bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
6087 sync_blocks >= RAID5_STRIPE_SECTORS(conf)) {
6088 /* we can skip this block, and probably more */
6089 do_div(sync_blocks, RAID5_STRIPE_SECTORS(conf));
6090 *skipped = 1;
6091 /* keep things rounded to whole stripes */
6092 return sync_blocks * RAID5_STRIPE_SECTORS(conf);
6093 }
6094
6095 md_bitmap_cond_end_sync(mddev->bitmap, sector_nr, false);
6096
6097 sh = raid5_get_active_stripe(conf, sector_nr, 0, 1, 0);
6098 if (sh == NULL) {
6099 sh = raid5_get_active_stripe(conf, sector_nr, 0, 0, 0);
6100 /* make sure we don't swamp the stripe cache if someone else
6101  * is trying to get access
6102  */
6103 schedule_timeout_uninterruptible(1);
6104 }
6105
6106 /* Need to check if array will still be degraded after recovery/resync
6107  * Note in case of > 1 drive failures it's possible we're rebuilding
6108  * one drive while leaving another faulty drive in array. */
6109 rcu_read_lock();
6110 for (i = 0; i < conf->raid_disks; i++) {
6111 struct md_rdev *rdev = READ_ONCE(conf->disks[i].rdev);
6112
6113 if (rdev == NULL || test_bit(Faulty, &rdev->flags))
6114 still_degraded = 1;
6115 }
6116 rcu_read_unlock();
6117
6118 md_bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded);
6119
6120 set_bit(STRIPE_SYNC_REQUESTED, &sh->state);
6121 set_bit(STRIPE_HANDLE, &sh->state);
6122
6123 raid5_release_stripe(sh);
6124
6125 return RAID5_STRIPE_SECTORS(conf);
6126}
6127
6128static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio,
6129 unsigned int offset)
6130{
6131 /* We may not be able to submit a whole bio at once as there
6132  * may not be enough stripe_heads available.
6133  * We cannot pre-allocate enough stripe_heads as we may need
6134  * more than exist in the cache (if we allow ever large chunks).
6135  * So we do one stripe head at a time and record the progress in
6136  * conf->retry_read_offset so the read can be resumed later.
6137  *
6138  * We *know* that this entire raid_bio is in one chunk, so
6139  * it will only be one 'dd_idx' and only need one call to
6140  * raid5_compute_sector. */
6141 struct stripe_head *sh;
6142 int dd_idx;
6143 sector_t sector, logical_sector, last_sector;
6144 int scnt = 0;
6145 int handled = 0;
6146
6147 logical_sector = raid_bio->bi_iter.bi_sector &
6148 ~((sector_t)RAID5_STRIPE_SECTORS(conf)-1);
6149 sector = raid5_compute_sector(conf, logical_sector,
6150 0, &dd_idx, NULL);
6151 last_sector = bio_end_sector(raid_bio);
6152
6153 for (; logical_sector < last_sector;
6154 logical_sector += RAID5_STRIPE_SECTORS(conf),
6155 sector += RAID5_STRIPE_SECTORS(conf),
6156 scnt++) {
6157
6158 if (scnt < offset)
6159 /* already done this stripe */
6160 continue;
6161
6162 sh = raid5_get_active_stripe(conf, sector, 0, 1, 1);
6163
6164 if (!sh) {
6165 /* failed to get a stripe - must wait */
6166 conf->retry_read_aligned = raid_bio;
6167 conf->retry_read_offset = scnt;
6168 return handled;
6169 }
6170
6171 if (!add_stripe_bio(sh, raid_bio, dd_idx, 0, 0)) {
6172 raid5_release_stripe(sh);
6173 conf->retry_read_aligned = raid_bio;
6174 conf->retry_read_offset = scnt;
6175 return handled;
6176 }
6177
6178 set_bit(R5_ReadNoMerge, &sh->dev[dd_idx].flags);
6179 handle_stripe(sh);
6180 raid5_release_stripe(sh);
6181 handled++;
6182 }
6183
6184 bio_endio(raid_bio);
6185
6186 if (atomic_dec_and_test(&conf->active_aligned_reads))
6187 wake_up(&conf->wait_for_quiescent);
6188 return handled;
6189}
6190
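/*
 * Pull up to MAX_STRIPE_BATCH stripes off the priority lists and
 * handle them; returns the number processed.  Called with device_lock
 * held; the lock is dropped while the batch is handled and re-taken
 * before returning, as the sparse annotations below record.
 */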
6191static int handle_active_stripes(struct r5conf *conf, int group,
6192 struct r5worker *worker,
6193 struct list_head *temp_inactive_list)
6194 __releases(&conf->device_lock)
6195 __acquires(&conf->device_lock)
6196{
6197 struct stripe_head *batch[MAX_STRIPE_BATCH], *sh;
6198 int i, batch_size = 0, hash;
6199 bool release_inactive = false;
6200
6201 while (batch_size < MAX_STRIPE_BATCH &&
6202 (sh = __get_priority_stripe(conf, group)) != NULL)
6203 batch[batch_size++] = sh;
6204
6205 if (batch_size == 0) {
6206 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
6207 if (!list_empty(temp_inactive_list + i))
6208 break;
6209 if (i == NR_STRIPE_HASH_LOCKS) {
6210 spin_unlock_irq(&conf->device_lock);
6211 log_flush_stripe_to_raid(conf);
6212 spin_lock_irq(&conf->device_lock);
6213 return batch_size;
6214 }
6215 release_inactive = true;
6216 }
6217 spin_unlock_irq(&conf->device_lock);
6218
6219 release_inactive_stripe_list(conf, temp_inactive_list,
6220 NR_STRIPE_HASH_LOCKS);
6221
6222 r5l_flush_stripe_to_raid(conf->log);
6223 if (release_inactive) {
6224 spin_lock_irq(&conf->device_lock);
6225 return 0;
6226 }
6227
6228 for (i = 0; i < batch_size; i++)
6229 handle_stripe(batch[i]);
6230 log_write_stripe_run(conf);
6231
6232 cond_resched();
6233
6234 spin_lock_irq(&conf->device_lock);
6235 for (i = 0; i < batch_size; i++) {
6236 hash = batch[i]->hash_lock_index;
6237 __release_stripe(conf, batch[i], &temp_inactive_list[hash]);
6238 }
6239 return batch_size;
6240}
6241
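/*
 * Work function of an r5worker: repeatedly drain the released-stripes
 * list and handle batches of stripes for this worker's group until no
 * work remains.
 */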
6242static void raid5_do_work(struct work_struct *work)
6243{
6244 struct r5worker *worker = container_of(work, struct r5worker, work);
6245 struct r5worker_group *group = worker->group;
6246 struct r5conf *conf = group->conf;
6247 struct mddev *mddev = conf->mddev;
6248 int group_id = group - conf->worker_groups;
6249 int handled;
6250 struct blk_plug plug;
6251
6252 pr_debug("+++ raid5worker active\n");
6253
6254 blk_start_plug(&plug);
6255 handled = 0;
6256 spin_lock_irq(&conf->device_lock);
6257 while (1) {
6258 int batch_size, released;
6259
6260 released = release_stripe_list(conf, worker->temp_inactive_list);
6261
6262 batch_size = handle_active_stripes(conf, group_id, worker,
6263 worker->temp_inactive_list);
6264 worker->working = false;
6265 if (!batch_size && !released)
6266 break;
6267 handled += batch_size;
6268 wait_event_lock_irq(mddev->sb_wait,
6269 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags),
6270 conf->device_lock);
6271 }
6272 pr_debug("%d stripes handled\n", handled);
6273
6274 spin_unlock_irq(&conf->device_lock);
6275
6276 flush_deferred_bios(conf);
6277
6278 r5l_flush_stripe_to_raid(conf->log);
6279
6280 async_tx_issue_pending_all();
6281 blk_finish_plug(&plug);
6282
6283 pr_debug("--- raid5worker inactive\n");
6284}
6285
6286 /*
6287  * This is our raid5 kernel thread.
6288  *
6289  * We scan the hash table for stripes which can be handled now.
6290  * During the scan, completed stripes are saved for us by the interrupt
6291  * handler, so that they will not have to wait for our next wakeup.
6292  */
6293static void raid5d(struct md_thread *thread)
6294{
6295 struct mddev *mddev = thread->mddev;
6296 struct r5conf *conf = mddev->private;
6297 int handled;
6298 struct blk_plug plug;
6299
6300 pr_debug("+++ raid5d active\n");
6301
6302 md_check_recovery(mddev);
6303
6304 blk_start_plug(&plug);
6305 handled = 0;
6306 spin_lock_irq(&conf->device_lock);
6307 while (1) {
6308 struct bio *bio;
6309 int batch_size, released;
6310 unsigned int offset;
6311
6312 released = release_stripe_list(conf, conf->temp_inactive_list);
6313 if (released)
6314 clear_bit(R5_DID_ALLOC, &conf->cache_state);
6315
6316 if (!list_empty(&conf->bitmap_list)) {
6317 /* Now is a good time to flush some bitmap updates */
6319 conf->seq_flush++;
6320 spin_unlock_irq(&conf->device_lock);
6321 md_bitmap_unplug(mddev->bitmap);
6322 spin_lock_irq(&conf->device_lock);
6323 conf->seq_write = conf->seq_flush;
6324 activate_bit_delay(conf, conf->temp_inactive_list);
6325 }
6326 raid5_activate_delayed(conf);
6327
6328 while ((bio = remove_bio_from_retry(conf, &offset))) {
6329 int ok;
6330 spin_unlock_irq(&conf->device_lock);
6331 ok = retry_aligned_read(conf, bio, offset);
6332 spin_lock_irq(&conf->device_lock);
6333 if (!ok)
6334 break;
6335 handled++;
6336 }
6337
6338 batch_size = handle_active_stripes(conf, ANY_GROUP, NULL,
6339 conf->temp_inactive_list);
6340 if (!batch_size && !released)
6341 break;
6342 handled += batch_size;
6343
6344 if (mddev->sb_flags & ~(1 << MD_SB_CHANGE_PENDING)) {
6345 spin_unlock_irq(&conf->device_lock);
6346 md_check_recovery(mddev);
6347 spin_lock_irq(&conf->device_lock);
6348 }
6349 }
6350 pr_debug("%d stripes handled\n", handled);
6351
6352 spin_unlock_irq(&conf->device_lock);
6353 if (test_and_clear_bit(R5_ALLOC_MORE, &conf->cache_state) &&
6354 mutex_trylock(&conf->cache_size_mutex)) {
6355 grow_one_stripe(conf, __GFP_NOWARN);
6356 /* Set flag even if allocation failed.  This helps
6357  * slow down allocation requests when mem is short
6358  */
6359 set_bit(R5_DID_ALLOC, &conf->cache_state);
6360 mutex_unlock(&conf->cache_size_mutex);
6361 }
6362
6363 flush_deferred_bios(conf);
6364
6365 r5l_flush_stripe_to_raid(conf->log);
6366
6367 async_tx_issue_pending_all();
6368 blk_finish_plug(&plug);
6369
6370 pr_debug("--- raid5d inactive\n");
6371}
6372
6373static ssize_t
6374raid5_show_stripe_cache_size(struct mddev *mddev, char *page)
6375{
6376 struct r5conf *conf;
6377 int ret = 0;
6378 spin_lock(&mddev->lock);
6379 conf = mddev->private;
6380 if (conf)
6381 ret = sprintf(page, "%d\n", conf->min_nr_stripes);
6382 spin_unlock(&mddev->lock);
6383 return ret;
6384}
6385
6386int
6387raid5_set_cache_size(struct mddev *mddev, int size)
6388{
6389 int result = 0;
6390 struct r5conf *conf = mddev->private;
6391
6392 if (size <= 16 || size > 32768)
6393 return -EINVAL;
6394
6395 conf->min_nr_stripes = size;
6396 mutex_lock(&conf->cache_size_mutex);
6397 while (size < conf->max_nr_stripes &&
6398 drop_one_stripe(conf))
6399 ;
6400 mutex_unlock(&conf->cache_size_mutex);
6401
6402 md_allow_write(mddev);
6403
6404 mutex_lock(&conf->cache_size_mutex);
6405 while (size > conf->max_nr_stripes)
6406 if (!grow_one_stripe(conf, GFP_KERNEL)) {
6407 conf->min_nr_stripes = conf->max_nr_stripes;
6408 result = -ENOMEM;
6409 break;
6410 }
6411 mutex_unlock(&conf->cache_size_mutex);
6412
6413 return result;
6414}
6415EXPORT_SYMBOL(raid5_set_cache_size);
6416
6417static ssize_t
6418raid5_store_stripe_cache_size(struct mddev *mddev, const char *page, size_t len)
6419{
6420 struct r5conf *conf;
6421 unsigned long new;
6422 int err;
6423
6424 if (len >= PAGE_SIZE)
6425 return -EINVAL;
6426 if (kstrtoul(page, 10, &new))
6427 return -EINVAL;
6428 err = mddev_lock(mddev);
6429 if (err)
6430 return err;
6431 conf = mddev->private;
6432 if (!conf)
6433 err = -ENODEV;
6434 else
6435 err = raid5_set_cache_size(mddev, new);
6436 mddev_unlock(mddev);
6437
6438 return err ?: len;
6439}
6440
6441static struct md_sysfs_entry
6442raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR,
6443 raid5_show_stripe_cache_size,
6444 raid5_store_stripe_cache_size);
6445
6446static ssize_t
6447raid5_show_rmw_level(struct mddev *mddev, char *page)
6448{
6449 struct r5conf *conf = mddev->private;
6450 if (conf)
6451 return sprintf(page, "%d\n", conf->rmw_level);
6452 else
6453 return 0;
6454}
6455
6456static ssize_t
6457raid5_store_rmw_level(struct mddev *mddev, const char *page, size_t len)
6458{
6459 struct r5conf *conf = mddev->private;
6460 unsigned long new;
6461
6462 if (!conf)
6463 return -ENODEV;
6464
6465 if (len >= PAGE_SIZE)
6466 return -EINVAL;
6467
6468 if (kstrtoul(page, 10, &new))
6469 return -EINVAL;
6470
6471 if (new != PARITY_DISABLE_RMW && !raid6_call.xor_syndrome)
6472 return -EINVAL;
6473
6474 if (new != PARITY_DISABLE_RMW &&
6475 new != PARITY_ENABLE_RMW &&
6476 new != PARITY_PREFER_RMW)
6477 return -EINVAL;
6478
6479 conf->rmw_level = new;
6480 return len;
6481}
6482
6483static struct md_sysfs_entry
6484raid5_rmw_level = __ATTR(rmw_level, S_IRUGO | S_IWUSR,
6485 raid5_show_rmw_level,
6486 raid5_store_rmw_level);
6487
6488static ssize_t
6489raid5_show_stripe_size(struct mddev *mddev, char *page)
6490{
6491 struct r5conf *conf;
6492 int ret = 0;
6493
6494 spin_lock(&mddev->lock);
6495 conf = mddev->private;
6496 if (conf)
6497 ret = sprintf(page, "%lu\n", RAID5_STRIPE_SIZE(conf));
6498 spin_unlock(&mddev->lock);
6499 return ret;
6500}
6501
6502#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
6503static ssize_t
6504raid5_store_stripe_size(struct mddev *mddev, const char *page, size_t len)
6505{
6506 struct r5conf *conf;
6507 unsigned long new;
6508 int err;
6509
6510 if (len >= PAGE_SIZE)
6511 return -EINVAL;
6512 if (kstrtoul(page, 10, &new))
6513 return -EINVAL;
6514
6515 /*
6516  * The value must not be bigger than PAGE_SIZE. It is required to
6517  * be a multiple of DEFAULT_STRIPE_SIZE and a power of two.
6518  */
6520 if (new % DEFAULT_STRIPE_SIZE != 0 ||
6521 new > PAGE_SIZE || new == 0 ||
6522 new != roundup_pow_of_two(new))
6523 return -EINVAL;
6524
6525 err = mddev_lock(mddev);
6526 if (err)
6527 return err;
6528
6529 conf = mddev->private;
6530 if (!conf) {
6531 err = -ENODEV;
6532 goto out_unlock;
6533 }
6534
6535 if (new == conf->stripe_size)
6536 goto out_unlock;
6537
6538 pr_debug("md/raid: change stripe_size from %lu to %lu\n",
6539 conf->stripe_size, new);
6540
6541 mddev_suspend(mddev);
6542 conf->stripe_size = new;
6543 conf->stripe_shift = ilog2(new) - 9;
6544 conf->stripe_sectors = new >> 9;
6545 mddev_resume(mddev);
6546
6547out_unlock:
6548 mddev_unlock(mddev);
6549 return err ?: len;
6550}
6551
6552static struct md_sysfs_entry
6553raid5_stripe_size = __ATTR(stripe_size, 0644,
6554 raid5_show_stripe_size,
6555 raid5_store_stripe_size);
6556#else
6557static struct md_sysfs_entry
6558raid5_stripe_size = __ATTR(stripe_size, 0444,
6559 raid5_show_stripe_size,
6560 NULL);
6561#endif
6562
6563static ssize_t
6564raid5_show_preread_threshold(struct mddev *mddev, char *page)
6565{
6566 struct r5conf *conf;
6567 int ret = 0;
6568 spin_lock(&mddev->lock);
6569 conf = mddev->private;
6570 if (conf)
6571 ret = sprintf(page, "%d\n", conf->bypass_threshold);
6572 spin_unlock(&mddev->lock);
6573 return ret;
6574}
6575
6576static ssize_t
6577raid5_store_preread_threshold(struct mddev *mddev, const char *page, size_t len)
6578{
6579 struct r5conf *conf;
6580 unsigned long new;
6581 int err;
6582
6583 if (len >= PAGE_SIZE)
6584 return -EINVAL;
6585 if (kstrtoul(page, 10, &new))
6586 return -EINVAL;
6587
6588 err = mddev_lock(mddev);
6589 if (err)
6590 return err;
6591 conf = mddev->private;
6592 if (!conf)
6593 err = -ENODEV;
6594 else if (new > conf->min_nr_stripes)
6595 err = -EINVAL;
6596 else
6597 conf->bypass_threshold = new;
6598 mddev_unlock(mddev);
6599 return err ?: len;
6600}
6601
6602static struct md_sysfs_entry
6603raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold,
6604 S_IRUGO | S_IWUSR,
6605 raid5_show_preread_threshold,
6606 raid5_store_preread_threshold);
6607
6608static ssize_t
6609raid5_show_skip_copy(struct mddev *mddev, char *page)
6610{
6611 struct r5conf *conf;
6612 int ret = 0;
6613 spin_lock(&mddev->lock);
6614 conf = mddev->private;
6615 if (conf)
6616 ret = sprintf(page, "%d\n", conf->skip_copy);
6617 spin_unlock(&mddev->lock);
6618 return ret;
6619}
6620
6621static ssize_t
6622raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len)
6623{
6624 struct r5conf *conf;
6625 unsigned long new;
6626 int err;
6627
6628 if (len >= PAGE_SIZE)
6629 return -EINVAL;
6630 if (kstrtoul(page, 10, &new))
6631 return -EINVAL;
6632 new = !!new;
6633
6634 err = mddev_lock(mddev);
6635 if (err)
6636 return err;
6637 conf = mddev->private;
6638 if (!conf)
6639 err = -ENODEV;
6640 else if (new != conf->skip_copy) {
6641 mddev_suspend(mddev);
6642 conf->skip_copy = new;
6643 if (new)
6644 mddev->queue->backing_dev_info->capabilities |=
6645 BDI_CAP_STABLE_WRITES;
6646 else
6647 mddev->queue->backing_dev_info->capabilities &=
6648 ~BDI_CAP_STABLE_WRITES;
6649 mddev_resume(mddev);
6650 }
6651 mddev_unlock(mddev);
6652 return err ?: len;
6653}
6654
6655static struct md_sysfs_entry
6656raid5_skip_copy = __ATTR(skip_copy, S_IRUGO | S_IWUSR,
6657 raid5_show_skip_copy,
6658 raid5_store_skip_copy);
6659
6660static ssize_t
6661stripe_cache_active_show(struct mddev *mddev, char *page)
6662{
6663 struct r5conf *conf = mddev->private;
6664 if (conf)
6665 return sprintf(page, "%d\n", atomic_read(&conf->active_stripes));
6666 else
6667 return 0;
6668}
6669
6670static struct md_sysfs_entry
6671raid5_stripecache_active = __ATTR_RO(stripe_cache_active);
6672
6673static ssize_t
6674raid5_show_group_thread_cnt(struct mddev *mddev, char *page)
6675{
6676 struct r5conf *conf;
6677 int ret = 0;
6678 spin_lock(&mddev->lock);
6679 conf = mddev->private;
6680 if (conf)
6681 ret = sprintf(page, "%d\n", conf->worker_cnt_per_group);
6682 spin_unlock(&mddev->lock);
6683 return ret;
6684}
6685
6686static int alloc_thread_groups(struct r5conf *conf, int cnt,
6687 int *group_cnt,
6688 struct r5worker_group **worker_groups);
6689static ssize_t
6690raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len)
6691{
6692 struct r5conf *conf;
6693 unsigned int new;
6694 int err;
6695 struct r5worker_group *new_groups, *old_groups;
6696 int group_cnt;
6697
6698 if (len >= PAGE_SIZE)
6699 return -EINVAL;
6700 if (kstrtouint(page, 10, &new))
6701 return -EINVAL;
6702
6703 if (new > 8192)
6704 return -EINVAL;
6705
6706 err = mddev_lock(mddev);
6707 if (err)
6708 return err;
6709 conf = mddev->private;
6710 if (!conf)
6711 err = -ENODEV;
6712 else if (new != conf->worker_cnt_per_group) {
6713 mddev_suspend(mddev);
6714
6715 old_groups = conf->worker_groups;
6716 if (old_groups)
6717 flush_workqueue(raid5_wq);
6718
6719 err = alloc_thread_groups(conf, new, &group_cnt, &new_groups);
6720 if (!err) {
6721 spin_lock_irq(&conf->device_lock);
6722 conf->group_cnt = group_cnt;
6723 conf->worker_cnt_per_group = new;
6724 conf->worker_groups = new_groups;
6725 spin_unlock_irq(&conf->device_lock);
6726
6727 if (old_groups)
6728 kfree(old_groups[0].workers);
6729 kfree(old_groups);
6730 }
6731 mddev_resume(mddev);
6732 }
6733 mddev_unlock(mddev);
6734
6735 return err ?: len;
6736}
6737
6738static struct md_sysfs_entry
6739raid5_group_thread_cnt = __ATTR(group_thread_cnt, S_IRUGO | S_IWUSR,
6740 raid5_show_group_thread_cnt,
6741 raid5_store_group_thread_cnt);
6742
6743static struct attribute *raid5_attrs[] = {
6744 &raid5_stripecache_size.attr,
6745 &raid5_stripecache_active.attr,
6746 &raid5_preread_bypass_threshold.attr,
6747 &raid5_group_thread_cnt.attr,
6748 &raid5_skip_copy.attr,
6749 &raid5_rmw_level.attr,
6750 &raid5_stripe_size.attr,
6751 &r5c_journal_mode.attr,
6752 &ppl_write_hint.attr,
6753 NULL,
6754};
6755static struct attribute_group raid5_attrs_group = {
6756 .name = NULL,
6757 .attrs = raid5_attrs,
6758};
6759
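/*
 * Allocate one worker group per possible NUMA node, each holding 'cnt'
 * workers. All workers live in a single flat array so that
 * free_thread_groups() can release them through worker_groups[0].workers.
 */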
6760static int alloc_thread_groups(struct r5conf *conf, int cnt, int *group_cnt,
6761 struct r5worker_group **worker_groups)
6762{
6763 int i, j, k;
6764 ssize_t size;
6765 struct r5worker *workers;
6766
6767 if (cnt == 0) {
6768 *group_cnt = 0;
6769 *worker_groups = NULL;
6770 return 0;
6771 }
6772 *group_cnt = num_possible_nodes();
6773 size = sizeof(struct r5worker) * cnt;
6774 workers = kcalloc(size, *group_cnt, GFP_NOIO);
6775 *worker_groups = kcalloc(*group_cnt, sizeof(struct r5worker_group),
6776 GFP_NOIO);
6777 if (!*worker_groups || !workers) {
6778 kfree(workers);
6779 kfree(*worker_groups);
6780 return -ENOMEM;
6781 }
6782
6783 for (i = 0; i < *group_cnt; i++) {
6784 struct r5worker_group *group;
6785
6786 group = &(*worker_groups)[i];
6787 INIT_LIST_HEAD(&group->handle_list);
6788 INIT_LIST_HEAD(&group->loprio_list);
6789 group->conf = conf;
6790 group->workers = workers + i * cnt;
6791
6792 for (j = 0; j < cnt; j++) {
6793 struct r5worker *worker = group->workers + j;
6794 worker->group = group;
6795 INIT_WORK(&worker->work, raid5_do_work);
6796
6797 for (k = 0; k < NR_STRIPE_HASH_LOCKS; k++)
6798 INIT_LIST_HEAD(worker->temp_inactive_list + k);
6799 }
6800 }
6801
6802 return 0;
6803}
6804
6805static void free_thread_groups(struct r5conf *conf)
6806{
6807 if (conf->worker_groups)
6808 kfree(conf->worker_groups[0].workers);
6809 kfree(conf->worker_groups);
6810 conf->worker_groups = NULL;
6811}
6812
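/*
 * Report the array capacity: per-device sectors, rounded down to both
 * the old and the new chunk size, times the number of data disks.
 */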
6813static sector_t
6814raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks)
6815{
6816 struct r5conf *conf = mddev->private;
6817
6818 if (!sectors)
6819 sectors = mddev->dev_sectors;
6820 if (!raid_disks)
		/* size is defined by the smallest of previous and new size */
6822 raid_disks = min(conf->raid_disks, conf->previous_raid_disks);
6823
6824 sectors &= ~((sector_t)conf->chunk_sectors - 1);
6825 sectors &= ~((sector_t)conf->prev_chunk_sectors - 1);
6826 return sectors * (raid_disks - conf->max_degraded);
6827}
6828
6829static void free_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu)
6830{
6831 safe_put_page(percpu->spare_page);
6832 percpu->spare_page = NULL;
6833 kvfree(percpu->scribble);
6834 percpu->scribble = NULL;
6835}
6836
6837static int alloc_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu)
6838{
6839 if (conf->level == 6 && !percpu->spare_page) {
6840 percpu->spare_page = alloc_page(GFP_KERNEL);
6841 if (!percpu->spare_page)
6842 return -ENOMEM;
6843 }
6844
6845 if (scribble_alloc(percpu,
6846 max(conf->raid_disks,
6847 conf->previous_raid_disks),
6848 max(conf->chunk_sectors,
6849 conf->prev_chunk_sectors)
6850 / RAID5_STRIPE_SECTORS(conf))) {
6851 free_scratch_buffer(conf, percpu);
6852 return -ENOMEM;
6853 }
6854
6855 return 0;
6856}
6857
6858static int raid456_cpu_dead(unsigned int cpu, struct hlist_node *node)
6859{
6860 struct r5conf *conf = hlist_entry_safe(node, struct r5conf, node);
6861
6862 free_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu));
6863 return 0;
6864}
6865
6866static void raid5_free_percpu(struct r5conf *conf)
6867{
6868 if (!conf->percpu)
6869 return;
6870
6871 cpuhp_state_remove_instance(CPUHP_MD_RAID5_PREPARE, &conf->node);
6872 free_percpu(conf->percpu);
6873}
6874
6875static void free_conf(struct r5conf *conf)
6876{
6877 int i;
6878
6879 log_exit(conf);
6880
6881 unregister_shrinker(&conf->shrinker);
6882 free_thread_groups(conf);
6883 shrink_stripes(conf);
6884 raid5_free_percpu(conf);
6885 for (i = 0; i < conf->pool_size; i++)
6886 if (conf->disks[i].extra_page)
6887 put_page(conf->disks[i].extra_page);
6888 kfree(conf->disks);
6889 bioset_exit(&conf->bio_split);
6890 kfree(conf->stripe_hashtbl);
6891 kfree(conf->pending_data);
6892 kfree(conf);
6893}
6894
6895static int raid456_cpu_up_prepare(unsigned int cpu, struct hlist_node *node)
6896{
6897 struct r5conf *conf = hlist_entry_safe(node, struct r5conf, node);
6898 struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu);
6899
6900 if (alloc_scratch_buffer(conf, percpu)) {
6901 pr_warn("%s: failed memory allocation for cpu%u\n",
6902 __func__, cpu);
6903 return -ENOMEM;
6904 }
6905 return 0;
6906}
6907
6908static int raid5_alloc_percpu(struct r5conf *conf)
6909{
6910 int err = 0;
6911
6912 conf->percpu = alloc_percpu(struct raid5_percpu);
6913 if (!conf->percpu)
6914 return -ENOMEM;
6915
6916 err = cpuhp_state_add_instance(CPUHP_MD_RAID5_PREPARE, &conf->node);
6917 if (!err) {
6918 conf->scribble_disks = max(conf->raid_disks,
6919 conf->previous_raid_disks);
6920 conf->scribble_sectors = max(conf->chunk_sectors,
6921 conf->prev_chunk_sectors);
6922 }
6923 return err;
6924}
6925
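/*
 * Shrinker callback: drop inactive stripe heads until min_nr_stripes is
 * reached. mutex_trylock() is used so the shrinker cannot deadlock with
 * raid5_set_cache_size(), which runs under cache_size_mutex.
 */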
6926static unsigned long raid5_cache_scan(struct shrinker *shrink,
6927 struct shrink_control *sc)
6928{
6929 struct r5conf *conf = container_of(shrink, struct r5conf, shrinker);
6930 unsigned long ret = SHRINK_STOP;
6931
6932 if (mutex_trylock(&conf->cache_size_mutex)) {
		ret = 0;
6934 while (ret < sc->nr_to_scan &&
6935 conf->max_nr_stripes > conf->min_nr_stripes) {
6936 if (drop_one_stripe(conf) == 0) {
6937 ret = SHRINK_STOP;
6938 break;
6939 }
6940 ret++;
6941 }
6942 mutex_unlock(&conf->cache_size_mutex);
6943 }
6944 return ret;
6945}
6946
6947static unsigned long raid5_cache_count(struct shrinker *shrink,
6948 struct shrink_control *sc)
6949{
6950 struct r5conf *conf = container_of(shrink, struct r5conf, shrinker);
6951
	if (conf->max_nr_stripes < conf->min_nr_stripes)
		/* unlikely, but not impossible */
		return 0;
6955 return conf->max_nr_stripes - conf->min_nr_stripes;
6956}
6957
6958static struct r5conf *setup_conf(struct mddev *mddev)
6959{
6960 struct r5conf *conf;
6961 int raid_disk, memory, max_disks;
6962 struct md_rdev *rdev;
6963 struct disk_info *disk;
6964 char pers_name[6];
6965 int i;
6966 int group_cnt;
6967 struct r5worker_group *new_group;
6968 int ret;
6969
6970 if (mddev->new_level != 5
6971 && mddev->new_level != 4
6972 && mddev->new_level != 6) {
6973 pr_warn("md/raid:%s: raid level not set to 4/5/6 (%d)\n",
6974 mdname(mddev), mddev->new_level);
6975 return ERR_PTR(-EIO);
6976 }
6977 if ((mddev->new_level == 5
6978 && !algorithm_valid_raid5(mddev->new_layout)) ||
6979 (mddev->new_level == 6
6980 && !algorithm_valid_raid6(mddev->new_layout))) {
6981 pr_warn("md/raid:%s: layout %d not supported\n",
6982 mdname(mddev), mddev->new_layout);
6983 return ERR_PTR(-EIO);
6984 }
6985 if (mddev->new_level == 6 && mddev->raid_disks < 4) {
6986 pr_warn("md/raid:%s: not enough configured devices (%d, minimum 4)\n",
6987 mdname(mddev), mddev->raid_disks);
6988 return ERR_PTR(-EINVAL);
6989 }
6990
6991 if (!mddev->new_chunk_sectors ||
6992 (mddev->new_chunk_sectors << 9) % PAGE_SIZE ||
6993 !is_power_of_2(mddev->new_chunk_sectors)) {
6994 pr_warn("md/raid:%s: invalid chunk size %d\n",
6995 mdname(mddev), mddev->new_chunk_sectors << 9);
6996 return ERR_PTR(-EINVAL);
6997 }
6998
6999 conf = kzalloc(sizeof(struct r5conf), GFP_KERNEL);
7000 if (conf == NULL)
7001 goto abort;
7002
7003#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
7004 conf->stripe_size = DEFAULT_STRIPE_SIZE;
7005 conf->stripe_shift = ilog2(DEFAULT_STRIPE_SIZE) - 9;
7006 conf->stripe_sectors = DEFAULT_STRIPE_SIZE >> 9;
7007#endif
7008 INIT_LIST_HEAD(&conf->free_list);
7009 INIT_LIST_HEAD(&conf->pending_list);
7010 conf->pending_data = kcalloc(PENDING_IO_MAX,
7011 sizeof(struct r5pending_data),
7012 GFP_KERNEL);
7013 if (!conf->pending_data)
7014 goto abort;
7015 for (i = 0; i < PENDING_IO_MAX; i++)
7016 list_add(&conf->pending_data[i].sibling, &conf->free_list);
7017
7018 if (!alloc_thread_groups(conf, 0, &group_cnt, &new_group)) {
7019 conf->group_cnt = group_cnt;
7020 conf->worker_cnt_per_group = 0;
7021 conf->worker_groups = new_group;
7022 } else
7023 goto abort;
7024 spin_lock_init(&conf->device_lock);
7025 seqcount_spinlock_init(&conf->gen_lock, &conf->device_lock);
7026 mutex_init(&conf->cache_size_mutex);
7027 init_waitqueue_head(&conf->wait_for_quiescent);
7028 init_waitqueue_head(&conf->wait_for_stripe);
7029 init_waitqueue_head(&conf->wait_for_overlap);
7030 INIT_LIST_HEAD(&conf->handle_list);
7031 INIT_LIST_HEAD(&conf->loprio_list);
7032 INIT_LIST_HEAD(&conf->hold_list);
7033 INIT_LIST_HEAD(&conf->delayed_list);
7034 INIT_LIST_HEAD(&conf->bitmap_list);
7035 init_llist_head(&conf->released_stripes);
7036 atomic_set(&conf->active_stripes, 0);
7037 atomic_set(&conf->preread_active_stripes, 0);
7038 atomic_set(&conf->active_aligned_reads, 0);
7039 spin_lock_init(&conf->pending_bios_lock);
7040 conf->batch_bio_dispatch = true;
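	/*
	 * Batching bio dispatch only pays off on rotational devices, so
	 * turn it off as soon as one non-rotational member is seen.
	 */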
7041 rdev_for_each(rdev, mddev) {
7042 if (test_bit(Journal, &rdev->flags))
7043 continue;
7044 if (blk_queue_nonrot(bdev_get_queue(rdev->bdev))) {
7045 conf->batch_bio_dispatch = false;
7046 break;
7047 }
7048 }
7049
7050 conf->bypass_threshold = BYPASS_THRESHOLD;
7051 conf->recovery_disabled = mddev->recovery_disabled - 1;
7052
7053 conf->raid_disks = mddev->raid_disks;
7054 if (mddev->reshape_position == MaxSector)
7055 conf->previous_raid_disks = mddev->raid_disks;
7056 else
7057 conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks;
7058 max_disks = max(conf->raid_disks, conf->previous_raid_disks);
7059
7060 conf->disks = kcalloc(max_disks, sizeof(struct disk_info),
7061 GFP_KERNEL);
7062
7063 if (!conf->disks)
7064 goto abort;
7065
7066 for (i = 0; i < max_disks; i++) {
7067 conf->disks[i].extra_page = alloc_page(GFP_KERNEL);
7068 if (!conf->disks[i].extra_page)
7069 goto abort;
7070 }
7071
7072 ret = bioset_init(&conf->bio_split, BIO_POOL_SIZE, 0, 0);
7073 if (ret)
7074 goto abort;
7075 conf->mddev = mddev;
7076
7077 if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
7078 goto abort;
7079
	/* We init hash_locks[0] separately so that it can be used
	 * as the reference lock in the spin_lock_nest_lock() call
	 * in lock_all_device_hash_locks_irq in order to convince
	 * lockdep that we know what we are doing.
	 */
7085 spin_lock_init(conf->hash_locks);
7086 for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++)
7087 spin_lock_init(conf->hash_locks + i);
7088
7089 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
7090 INIT_LIST_HEAD(conf->inactive_list + i);
7091
7092 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
7093 INIT_LIST_HEAD(conf->temp_inactive_list + i);
7094
7095 atomic_set(&conf->r5c_cached_full_stripes, 0);
7096 INIT_LIST_HEAD(&conf->r5c_full_stripe_list);
7097 atomic_set(&conf->r5c_cached_partial_stripes, 0);
7098 INIT_LIST_HEAD(&conf->r5c_partial_stripe_list);
7099 atomic_set(&conf->r5c_flushing_full_stripes, 0);
7100 atomic_set(&conf->r5c_flushing_partial_stripes, 0);
7101
7102 conf->level = mddev->new_level;
7103 conf->chunk_sectors = mddev->new_chunk_sectors;
7104 if (raid5_alloc_percpu(conf) != 0)
7105 goto abort;
7106
7107 pr_debug("raid456: run(%s) called.\n", mdname(mddev));
7108
7109 rdev_for_each(rdev, mddev) {
7110 raid_disk = rdev->raid_disk;
7111 if (raid_disk >= max_disks
7112 || raid_disk < 0 || test_bit(Journal, &rdev->flags))
7113 continue;
7114 disk = conf->disks + raid_disk;
7115
7116 if (test_bit(Replacement, &rdev->flags)) {
7117 if (disk->replacement)
7118 goto abort;
7119 disk->replacement = rdev;
7120 } else {
7121 if (disk->rdev)
7122 goto abort;
7123 disk->rdev = rdev;
7124 }
7125
7126 if (test_bit(In_sync, &rdev->flags)) {
7127 char b[BDEVNAME_SIZE];
7128 pr_info("md/raid:%s: device %s operational as raid disk %d\n",
7129 mdname(mddev), bdevname(rdev->bdev, b), raid_disk);
7130 } else if (rdev->saved_raid_disk != raid_disk)
			/* Cannot rely on bitmap to complete recovery */
7132 conf->fullsync = 1;
7133 }
7134
7135 conf->level = mddev->new_level;
7136 if (conf->level == 6) {
7137 conf->max_degraded = 2;
7138 if (raid6_call.xor_syndrome)
7139 conf->rmw_level = PARITY_ENABLE_RMW;
7140 else
7141 conf->rmw_level = PARITY_DISABLE_RMW;
7142 } else {
7143 conf->max_degraded = 1;
7144 conf->rmw_level = PARITY_ENABLE_RMW;
7145 }
7146 conf->algorithm = mddev->new_layout;
7147 conf->reshape_progress = mddev->reshape_position;
7148 if (conf->reshape_progress != MaxSector) {
7149 conf->prev_chunk_sectors = mddev->chunk_sectors;
7150 conf->prev_algo = mddev->layout;
7151 } else {
7152 conf->prev_chunk_sectors = conf->chunk_sectors;
7153 conf->prev_algo = conf->algorithm;
7154 }
7155
7156 conf->min_nr_stripes = NR_STRIPES;
7157 if (mddev->reshape_position != MaxSector) {
7158 int stripes = max_t(int,
7159 ((mddev->chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4,
7160 ((mddev->new_chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4);
7161 conf->min_nr_stripes = max(NR_STRIPES, stripes);
7162 if (conf->min_nr_stripes != NR_STRIPES)
7163 pr_info("md/raid:%s: force stripe size %d for reshape\n",
7164 mdname(mddev), conf->min_nr_stripes);
7165 }
7166 memory = conf->min_nr_stripes * (sizeof(struct stripe_head) +
7167 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
7168 atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS);
7169 if (grow_stripes(conf, conf->min_nr_stripes)) {
7170 pr_warn("md/raid:%s: couldn't allocate %dkB for buffers\n",
7171 mdname(mddev), memory);
7172 goto abort;
7173 } else
7174 pr_debug("md/raid:%s: allocated %dkB\n", mdname(mddev), memory);
7175
	/*
	 * Losing a stripe head costs more than the time to refill it,
	 * but we expect them to be rarely discarded.
	 */
7180 conf->shrinker.seeks = DEFAULT_SEEKS * conf->raid_disks * 4;
7181 conf->shrinker.scan_objects = raid5_cache_scan;
7182 conf->shrinker.count_objects = raid5_cache_count;
7183 conf->shrinker.batch = 128;
7184 conf->shrinker.flags = 0;
7185 if (register_shrinker(&conf->shrinker)) {
7186 pr_warn("md/raid:%s: couldn't register shrinker.\n",
7187 mdname(mddev));
7188 goto abort;
7189 }
7190
7191 sprintf(pers_name, "raid%d", mddev->new_level);
7192 conf->thread = md_register_thread(raid5d, mddev, pers_name);
7193 if (!conf->thread) {
7194 pr_warn("md/raid:%s: couldn't allocate thread.\n",
7195 mdname(mddev));
7196 goto abort;
7197 }
7198
7199 return conf;
7200
7201 abort:
7202 if (conf) {
7203 free_conf(conf);
7204 return ERR_PTR(-EIO);
7205 } else
7206 return ERR_PTR(-ENOMEM);
7207}
7208
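/*
 * Return 1 if, under the given layout algorithm, slot 'raid_disk'
 * holds parity only - i.e. a device whose being out of date does not
 * put any data at risk.
 */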
7209static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded)
7210{
7211 switch (algo) {
7212 case ALGORITHM_PARITY_0:
7213 if (raid_disk < max_degraded)
7214 return 1;
7215 break;
7216 case ALGORITHM_PARITY_N:
7217 if (raid_disk >= raid_disks - max_degraded)
7218 return 1;
7219 break;
7220 case ALGORITHM_PARITY_0_6:
7221 if (raid_disk == 0 ||
7222 raid_disk == raid_disks - 1)
7223 return 1;
7224 break;
7225 case ALGORITHM_LEFT_ASYMMETRIC_6:
7226 case ALGORITHM_RIGHT_ASYMMETRIC_6:
7227 case ALGORITHM_LEFT_SYMMETRIC_6:
7228 case ALGORITHM_RIGHT_SYMMETRIC_6:
7229 if (raid_disk == raid_disks - 1)
7230 return 1;
7231 }
7232 return 0;
7233}
7234
7235static int raid5_run(struct mddev *mddev)
7236{
7237 struct r5conf *conf;
7238 int working_disks = 0;
7239 int dirty_parity_disks = 0;
7240 struct md_rdev *rdev;
7241 struct md_rdev *journal_dev = NULL;
7242 sector_t reshape_offset = 0;
7243 int i;
7244 long long min_offset_diff = 0;
7245 int first = 1;
7246
7247 if (mddev_init_writes_pending(mddev) < 0)
7248 return -ENOMEM;
7249
7250 if (mddev->recovery_cp != MaxSector)
7251 pr_notice("md/raid:%s: not clean -- starting background reconstruction\n",
7252 mdname(mddev));
7253
7254 rdev_for_each(rdev, mddev) {
7255 long long diff;
7256
7257 if (test_bit(Journal, &rdev->flags)) {
7258 journal_dev = rdev;
7259 continue;
7260 }
7261 if (rdev->raid_disk < 0)
7262 continue;
7263 diff = (rdev->new_data_offset - rdev->data_offset);
7264 if (first) {
7265 min_offset_diff = diff;
7266 first = 0;
7267 } else if (mddev->reshape_backwards &&
7268 diff < min_offset_diff)
7269 min_offset_diff = diff;
7270 else if (!mddev->reshape_backwards &&
7271 diff > min_offset_diff)
7272 min_offset_diff = diff;
7273 }
7274
7275 if ((test_bit(MD_HAS_JOURNAL, &mddev->flags) || journal_dev) &&
7276 (mddev->bitmap_info.offset || mddev->bitmap_info.file)) {
7277 pr_notice("md/raid:%s: array cannot have both journal and bitmap\n",
7278 mdname(mddev));
7279 return -EINVAL;
7280 }
7281
7282 if (mddev->reshape_position != MaxSector) {
		/* Check that we can continue the reshape.
		 * Difficulties arise if the stripe we would write to
		 * next is at or after the stripe we would read from next.
		 * For a reshape that changes the number of devices, this
		 * is only possible for a very short time, and mdadm makes
		 * sure that time appears to have passed before assembling
		 * the array.  So we fail if that time hasn't passed.
		 * For a reshape that keeps the number of devices the same
		 * mdadm must be monitoring the reshape and keeping the
		 * critical areas read-only and backed up.  It will start
		 * the array in read-only mode, so we check for that.
		 */
7295 sector_t here_new, here_old;
7296 int old_disks;
7297 int max_degraded = (mddev->level == 6 ? 2 : 1);
7298 int chunk_sectors;
7299 int new_data_disks;
7300
7301 if (journal_dev) {
7302 pr_warn("md/raid:%s: don't support reshape with journal - aborting.\n",
7303 mdname(mddev));
7304 return -EINVAL;
7305 }
7306
7307 if (mddev->new_level != mddev->level) {
7308 pr_warn("md/raid:%s: unsupported reshape required - aborting.\n",
7309 mdname(mddev));
7310 return -EINVAL;
7311 }
7312 old_disks = mddev->raid_disks - mddev->delta_disks;
		/* reshape_position must be on a new-stripe boundary, and one
		 * further up in new geometry must map after here in old
		 * geometry.
		 * If the chunk sizes are different, then as we perform reshape
		 * in units of the largest of the two, reshape_position needs to
		 * be rounded down to a multiple of the largest chunk size times
		 * new data disks.
		 */
7320 here_new = mddev->reshape_position;
7321 chunk_sectors = max(mddev->chunk_sectors, mddev->new_chunk_sectors);
7322 new_data_disks = mddev->raid_disks - max_degraded;
7323 if (sector_div(here_new, chunk_sectors * new_data_disks)) {
7324 pr_warn("md/raid:%s: reshape_position not on a stripe boundary\n",
7325 mdname(mddev));
7326 return -EINVAL;
7327 }
7328 reshape_offset = here_new * chunk_sectors;
7329
7330 here_old = mddev->reshape_position;
7331 sector_div(here_old, chunk_sectors * (old_disks-max_degraded));
7332
7333
		if (mddev->delta_disks == 0) {
			/* We cannot be sure it is safe to start an in-place
			 * reshape.  It is only safe if user-space is monitoring
			 * and taking constant backups.
			 * mdadm always starts a situation like this in
			 * readonly mode so it can take control before
			 * allowing any writes.  So just check for that.
			 */
			if (abs(min_offset_diff) >= mddev->chunk_sectors &&
			    abs(min_offset_diff) >= mddev->new_chunk_sectors)
				/* not really in-place - so OK */;
7345 else if (mddev->ro == 0) {
7346 pr_warn("md/raid:%s: in-place reshape must be started in read-only mode - aborting\n",
7347 mdname(mddev));
7348 return -EINVAL;
7349 }
7350 } else if (mddev->reshape_backwards
7351 ? (here_new * chunk_sectors + min_offset_diff <=
7352 here_old * chunk_sectors)
7353 : (here_new * chunk_sectors >=
7354 here_old * chunk_sectors + (-min_offset_diff))) {
			/* Reading from the same stripe as writing to - bad */
7356 pr_warn("md/raid:%s: reshape_position too early for auto-recovery - aborting.\n",
7357 mdname(mddev));
7358 return -EINVAL;
7359 }
7360 pr_debug("md/raid:%s: reshape will continue\n", mdname(mddev));
7361
7362 } else {
7363 BUG_ON(mddev->level != mddev->new_level);
7364 BUG_ON(mddev->layout != mddev->new_layout);
7365 BUG_ON(mddev->chunk_sectors != mddev->new_chunk_sectors);
7366 BUG_ON(mddev->delta_disks != 0);
7367 }
7368
7369 if (test_bit(MD_HAS_JOURNAL, &mddev->flags) &&
7370 test_bit(MD_HAS_PPL, &mddev->flags)) {
7371 pr_warn("md/raid:%s: using journal device and PPL not allowed - disabling PPL\n",
7372 mdname(mddev));
7373 clear_bit(MD_HAS_PPL, &mddev->flags);
7374 clear_bit(MD_HAS_MULTIPLE_PPLS, &mddev->flags);
7375 }
7376
7377 if (mddev->private == NULL)
7378 conf = setup_conf(mddev);
7379 else
7380 conf = mddev->private;
7381
7382 if (IS_ERR(conf))
7383 return PTR_ERR(conf);
7384
7385 if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
7386 if (!journal_dev) {
7387 pr_warn("md/raid:%s: journal disk is missing, force array readonly\n",
7388 mdname(mddev));
7389 mddev->ro = 1;
7390 set_disk_ro(mddev->gendisk, 1);
7391 } else if (mddev->recovery_cp == MaxSector)
7392 set_bit(MD_JOURNAL_CLEAN, &mddev->flags);
7393 }
7394
7395 conf->min_offset_diff = min_offset_diff;
7396 mddev->thread = conf->thread;
7397 conf->thread = NULL;
7398 mddev->private = conf;
7399
7400 for (i = 0; i < conf->raid_disks && conf->previous_raid_disks;
7401 i++) {
7402 rdev = conf->disks[i].rdev;
7403 if (!rdev && conf->disks[i].replacement) {
			/* The replacement is fully inserted */
7405 rdev = conf->disks[i].replacement;
7406 conf->disks[i].replacement = NULL;
7407 clear_bit(Replacement, &rdev->flags);
7408 conf->disks[i].rdev = rdev;
7409 }
7410 if (!rdev)
7411 continue;
7412 if (conf->disks[i].replacement &&
7413 conf->reshape_progress != MaxSector) {
			/* replacements and reshape simply do not mix. */
7415 pr_warn("md: cannot handle concurrent replacement and reshape.\n");
7416 goto abort;
7417 }
7418 if (test_bit(In_sync, &rdev->flags)) {
7419 working_disks++;
7420 continue;
7421 }
		/* This disc is not fully in-sync.  However if it
		 * just stored parity (beyond the recovery_offset),
		 * we don't need to be concerned about the
		 * array being dirty.
		 * When reshape goes 'backwards', we never have
		 * partially completed devices, so we only need
		 * to worry about reshape going forwards.
		 */
		/* Hack because v0.91 doesn't store recovery_offset properly. */
7431 if (mddev->major_version == 0 &&
7432 mddev->minor_version > 90)
7433 rdev->recovery_offset = reshape_offset;
7434
7435 if (rdev->recovery_offset < reshape_offset) {
			/* We need to check old and new layout */
7437 if (!only_parity(rdev->raid_disk,
7438 conf->algorithm,
7439 conf->raid_disks,
7440 conf->max_degraded))
7441 continue;
7442 }
7443 if (!only_parity(rdev->raid_disk,
7444 conf->prev_algo,
7445 conf->previous_raid_disks,
7446 conf->max_degraded))
7447 continue;
7448 dirty_parity_disks++;
7449 }
7450
	/*
	 * 0 for a fully functional array, 1 or 2 for a degraded array.
	 */
7454 mddev->degraded = raid5_calc_degraded(conf);
7455
7456 if (has_failed(conf)) {
7457 pr_crit("md/raid:%s: not enough operational devices (%d/%d failed)\n",
7458 mdname(mddev), mddev->degraded, conf->raid_disks);
7459 goto abort;
7460 }
7461
	/* device size must be a multiple of chunk size */
7463 mddev->dev_sectors &= ~(mddev->chunk_sectors - 1);
7464 mddev->resync_max_sectors = mddev->dev_sectors;
7465
7466 if (mddev->degraded > dirty_parity_disks &&
7467 mddev->recovery_cp != MaxSector) {
7468 if (test_bit(MD_HAS_PPL, &mddev->flags))
7469 pr_crit("md/raid:%s: starting dirty degraded array with PPL.\n",
7470 mdname(mddev));
7471 else if (mddev->ok_start_degraded)
7472 pr_crit("md/raid:%s: starting dirty degraded array - data corruption possible.\n",
7473 mdname(mddev));
7474 else {
7475 pr_crit("md/raid:%s: cannot start dirty degraded array.\n",
7476 mdname(mddev));
7477 goto abort;
7478 }
7479 }
7480
7481 pr_info("md/raid:%s: raid level %d active with %d out of %d devices, algorithm %d\n",
7482 mdname(mddev), conf->level,
7483 mddev->raid_disks-mddev->degraded, mddev->raid_disks,
7484 mddev->new_layout);
7485
7486 print_raid5_conf(conf);
7487
7488 if (conf->reshape_progress != MaxSector) {
7489 conf->reshape_safe = conf->reshape_progress;
7490 atomic_set(&conf->reshape_stripes, 0);
7491 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
7492 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
7493 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
7494 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
7495 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
7496 "reshape");
7497 if (!mddev->sync_thread)
7498 goto abort;
7499 }
7500
	/* Ok, everything is just fine now */
7502 if (mddev->to_remove == &raid5_attrs_group)
7503 mddev->to_remove = NULL;
7504 else if (mddev->kobj.sd &&
7505 sysfs_create_group(&mddev->kobj, &raid5_attrs_group))
7506 pr_warn("raid5: failed to create sysfs attributes for %s\n",
7507 mdname(mddev));
7508 md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
7509
7510 if (mddev->queue) {
7511 int chunk_size;
		/* read-ahead size must cover two whole stripes, which
		 * is 2 * (datadisks) * chunksize
		 */
7516 int data_disks = conf->previous_raid_disks - conf->max_degraded;
7517 int stripe = data_disks *
7518 ((mddev->chunk_sectors << 9) / PAGE_SIZE);
7519 if (mddev->queue->backing_dev_info->ra_pages < 2 * stripe)
7520 mddev->queue->backing_dev_info->ra_pages = 2 * stripe;
7521
7522 chunk_size = mddev->chunk_sectors << 9;
7523 blk_queue_io_min(mddev->queue, chunk_size);
7524 blk_queue_io_opt(mddev->queue, chunk_size *
7525 (conf->raid_disks - conf->max_degraded));
7526 mddev->queue->limits.raid_partial_stripes_expensive = 1;
		/*
		 * We can only discard a whole stripe. It doesn't make sense to
		 * discard data disk but write parity disk
		 */
		stripe = stripe * PAGE_SIZE;
		/* Round up to power of 2, as discard granularity starts that way */
		while ((stripe-1) & stripe)
			stripe = (stripe | (stripe-1)) + 1;
7536 mddev->queue->limits.discard_alignment = stripe;
7537 mddev->queue->limits.discard_granularity = stripe;
7538
7539 blk_queue_max_write_same_sectors(mddev->queue, 0);
7540 blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
7541
7542 rdev_for_each(rdev, mddev) {
7543 disk_stack_limits(mddev->gendisk, rdev->bdev,
7544 rdev->data_offset << 9);
7545 disk_stack_limits(mddev->gendisk, rdev->bdev,
7546 rdev->new_data_offset << 9);
7547 }

		/*
		 * Discarding is only safe if a discarded region reliably
		 * reads back as zeroes; otherwise a later partial-stripe
		 * write could compute parity against stale data and leave
		 * the stripe inconsistent. devices_handle_discard_safely
		 * is the administrator's assertion that every member
		 * device behaves that way, so only enable DISCARD when it
		 * is set and the queue limits cover a full stripe.
		 */
7564 if (devices_handle_discard_safely &&
7565 mddev->queue->limits.max_discard_sectors >= (stripe >> 9) &&
7566 mddev->queue->limits.discard_granularity >= stripe)
7567 blk_queue_flag_set(QUEUE_FLAG_DISCARD,
7568 mddev->queue);
7569 else
7570 blk_queue_flag_clear(QUEUE_FLAG_DISCARD,
7571 mddev->queue);
7572
7573 blk_queue_max_hw_sectors(mddev->queue, UINT_MAX);
7574 }
7575
7576 if (log_init(conf, journal_dev, raid5_has_ppl(conf)))
7577 goto abort;
7578
7579 return 0;
7580abort:
7581 md_unregister_thread(&mddev->thread);
7582 print_raid5_conf(conf);
7583 free_conf(conf);
7584 mddev->private = NULL;
7585 pr_warn("md/raid:%s: failed to run raid set.\n", mdname(mddev));
7586 return -EIO;
7587}
7588
7589static void raid5_free(struct mddev *mddev, void *priv)
7590{
7591 struct r5conf *conf = priv;
7592
7593 free_conf(conf);
7594 mddev->to_remove = &raid5_attrs_group;
7595}
7596
7597static void raid5_status(struct seq_file *seq, struct mddev *mddev)
7598{
7599 struct r5conf *conf = mddev->private;
7600 int i;
7601
7602 seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level,
7603 conf->chunk_sectors / 2, mddev->layout);
	seq_printf(seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded);
7605 rcu_read_lock();
7606 for (i = 0; i < conf->raid_disks; i++) {
7607 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
		seq_printf(seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
7609 }
7610 rcu_read_unlock();
	seq_printf(seq, "]");
7612}
7613
7614static void print_raid5_conf (struct r5conf *conf)
7615{
7616 int i;
7617 struct disk_info *tmp;
7618
7619 pr_debug("RAID conf printout:\n");
7620 if (!conf) {
7621 pr_debug("(conf==NULL)\n");
7622 return;
7623 }
7624 pr_debug(" --- level:%d rd:%d wd:%d\n", conf->level,
7625 conf->raid_disks,
7626 conf->raid_disks - conf->mddev->degraded);
7627
7628 for (i = 0; i < conf->raid_disks; i++) {
7629 char b[BDEVNAME_SIZE];
7630 tmp = conf->disks + i;
7631 if (tmp->rdev)
7632 pr_debug(" disk %d, o:%d, dev:%s\n",
7633 i, !test_bit(Faulty, &tmp->rdev->flags),
7634 bdevname(tmp->rdev->bdev, b));
7635 }
7636}
7637
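/*
 * Promote devices that have finished recovery: count replacements that
 * completed and spares that became fully in-sync, then recompute
 * ->degraded under device_lock.
 */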
7638static int raid5_spare_active(struct mddev *mddev)
7639{
7640 int i;
7641 struct r5conf *conf = mddev->private;
7642 struct disk_info *tmp;
7643 int count = 0;
7644 unsigned long flags;
7645
7646 for (i = 0; i < conf->raid_disks; i++) {
7647 tmp = conf->disks + i;
7648 if (tmp->replacement
7649 && tmp->replacement->recovery_offset == MaxSector
7650 && !test_bit(Faulty, &tmp->replacement->flags)
7651 && !test_and_set_bit(In_sync, &tmp->replacement->flags)) {
			/* Replacement has just become active. */
7653 if (!tmp->rdev
7654 || !test_and_clear_bit(In_sync, &tmp->rdev->flags))
7655 count++;
7656 if (tmp->rdev) {
				/* Replaced device not technically faulty,
				 * but we need to be sure it gets removed
				 * and never re-added.
				 */
7661 set_bit(Faulty, &tmp->rdev->flags);
7662 sysfs_notify_dirent_safe(
7663 tmp->rdev->sysfs_state);
7664 }
7665 sysfs_notify_dirent_safe(tmp->replacement->sysfs_state);
7666 } else if (tmp->rdev
7667 && tmp->rdev->recovery_offset == MaxSector
7668 && !test_bit(Faulty, &tmp->rdev->flags)
7669 && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
7670 count++;
7671 sysfs_notify_dirent_safe(tmp->rdev->sysfs_state);
7672 }
7673 }
7674 spin_lock_irqsave(&conf->device_lock, flags);
7675 mddev->degraded = raid5_calc_degraded(conf);
7676 spin_unlock_irqrestore(&conf->device_lock, flags);
7677 print_raid5_conf(conf);
7678 return count;
7679}
7680
7681static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
7682{
7683 struct r5conf *conf = mddev->private;
7684 int err = 0;
7685 int number = rdev->raid_disk;
7686 struct md_rdev **rdevp;
7687 struct disk_info *p = conf->disks + number;
7688
7689 print_raid5_conf(conf);
7690 if (test_bit(Journal, &rdev->flags) && conf->log) {
		/*
		 * We can't wait pending write here, as this is called in
		 * raid5d, wait will deadlock.
		 * neilb: there is no locking about new writes here,
		 * so this cannot be safe.
		 */
7697 if (atomic_read(&conf->active_stripes) ||
7698 atomic_read(&conf->r5c_cached_full_stripes) ||
7699 atomic_read(&conf->r5c_cached_partial_stripes)) {
7700 return -EBUSY;
7701 }
7702 log_exit(conf);
7703 return 0;
7704 }
7705 if (rdev == p->rdev)
7706 rdevp = &p->rdev;
7707 else if (rdev == p->replacement)
7708 rdevp = &p->replacement;
7709 else
7710 return 0;
7711
7712 if (number >= conf->raid_disks &&
7713 conf->reshape_progress == MaxSector)
7714 clear_bit(In_sync, &rdev->flags);
7715
7716 if (test_bit(In_sync, &rdev->flags) ||
7717 atomic_read(&rdev->nr_pending)) {
7718 err = -EBUSY;
7719 goto abort;
7720 }
	/* Only remove non-faulty devices if recovery
	 * is not possible.
	 */
7724 if (!test_bit(Faulty, &rdev->flags) &&
7725 mddev->recovery_disabled != conf->recovery_disabled &&
7726 !has_failed(conf) &&
7727 (!p->replacement || p->replacement == rdev) &&
7728 number < conf->raid_disks) {
7729 err = -EBUSY;
7730 goto abort;
7731 }
7732 *rdevp = NULL;
7733 if (!test_bit(RemoveSynchronized, &rdev->flags)) {
7734 synchronize_rcu();
7735 if (atomic_read(&rdev->nr_pending)) {
			/* lost the race, try later */
7737 err = -EBUSY;
7738 *rdevp = rdev;
7739 }
7740 }
7741 if (!err) {
7742 err = log_modify(conf, rdev, false);
7743 if (err)
7744 goto abort;
7745 }
7746 if (p->replacement) {
		/* We must have just cleared 'rdev' */
7748 p->rdev = p->replacement;
7749 clear_bit(Replacement, &p->replacement->flags);
		smp_mb(); /* Make sure other CPUs may see both as identical
			   * but will never see neither - if they are careful.
			   */
7753 p->replacement = NULL;
7754
7755 if (!err)
7756 err = log_modify(conf, p->rdev, true);
7757 }
7758
7759 clear_bit(WantReplacement, &rdev->flags);
7760abort:
7761
7762 print_raid5_conf(conf);
7763 return err;
7764}
7765
7766static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
7767{
7768 struct r5conf *conf = mddev->private;
7769 int ret, err = -EEXIST;
7770 int disk;
7771 struct disk_info *p;
7772 int first = 0;
7773 int last = conf->raid_disks - 1;
7774
7775 if (test_bit(Journal, &rdev->flags)) {
7776 if (conf->log)
7777 return -EBUSY;
7778
7779 rdev->raid_disk = 0;
		/*
		 * The array is in readonly mode if journal is missing, so no
		 * write requests running. We should be safe.
		 */
7784 ret = log_init(conf, rdev, false);
7785 if (ret)
7786 return ret;
7787
7788 ret = r5l_start(conf->log);
7789 if (ret)
7790 return ret;
7791
7792 return 0;
7793 }
7794 if (mddev->recovery_disabled == conf->recovery_disabled)
7795 return -EBUSY;
7796
7797 if (rdev->saved_raid_disk < 0 && has_failed(conf))
		/* no point adding a device */
7799 return -EINVAL;
7800
7801 if (rdev->raid_disk >= 0)
7802 first = last = rdev->raid_disk;

	/*
	 * find the disk ... but prefer rdev->saved_raid_disk
	 * if possible.
	 */
7808 if (rdev->saved_raid_disk >= 0 &&
7809 rdev->saved_raid_disk >= first &&
7810 conf->disks[rdev->saved_raid_disk].rdev == NULL)
7811 first = rdev->saved_raid_disk;
7812
7813 for (disk = first; disk <= last; disk++) {
7814 p = conf->disks + disk;
7815 if (p->rdev == NULL) {
7816 clear_bit(In_sync, &rdev->flags);
7817 rdev->raid_disk = disk;
7818 if (rdev->saved_raid_disk != disk)
7819 conf->fullsync = 1;
7820 rcu_assign_pointer(p->rdev, rdev);
7821
7822 err = log_modify(conf, rdev, true);
7823
7824 goto out;
7825 }
7826 }
7827 for (disk = first; disk <= last; disk++) {
7828 p = conf->disks + disk;
7829 if (test_bit(WantReplacement, &p->rdev->flags) &&
7830 p->replacement == NULL) {
7831 clear_bit(In_sync, &rdev->flags);
7832 set_bit(Replacement, &rdev->flags);
7833 rdev->raid_disk = disk;
7834 err = 0;
7835 conf->fullsync = 1;
7836 rcu_assign_pointer(p->replacement, rdev);
7837 break;
7838 }
7839 }
7840out:
7841 print_raid5_conf(conf);
7842 return err;
7843}
7844
7845static int raid5_resize(struct mddev *mddev, sector_t sectors)
7846{
	/* no resync is happening, and there is space
	 * on all devices, so we can resize.
	 * We need to make sure resync covers any new space.
	 * If the array is shrinking we should possibly wait until
	 * any io in the removed space completes, but it hardly seems
	 * worth it.
	 */
7854 sector_t newsize;
7855 struct r5conf *conf = mddev->private;
7856
7857 if (raid5_has_log(conf) || raid5_has_ppl(conf))
7858 return -EINVAL;
7859 sectors &= ~((sector_t)conf->chunk_sectors - 1);
7860 newsize = raid5_size(mddev, sectors, mddev->raid_disks);
7861 if (mddev->external_size &&
7862 mddev->array_sectors > newsize)
7863 return -EINVAL;
7864 if (mddev->bitmap) {
7865 int ret = md_bitmap_resize(mddev->bitmap, sectors, 0, 0);
7866 if (ret)
7867 return ret;
7868 }
7869 md_set_array_sectors(mddev, newsize);
7870 if (sectors > mddev->dev_sectors &&
7871 mddev->recovery_cp > mddev->dev_sectors) {
7872 mddev->recovery_cp = mddev->dev_sectors;
7873 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
7874 }
7875 mddev->dev_sectors = sectors;
7876 mddev->resync_max_sectors = sectors;
7877 return 0;
7878}
7879
7880static int check_stripe_cache(struct mddev *mddev)
7881{
	/* Can only proceed if there are plenty of stripe_heads.
	 * We need a minimum of one full stripe, and for sensible progress
	 * it is best to have about 4 times that.
	 * If we require 4 times, then the default 256 is generally plenty.
	 */
7890 struct r5conf *conf = mddev->private;
7891 if (((mddev->chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4
7892 > conf->min_nr_stripes ||
7893 ((mddev->new_chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4
7894 > conf->min_nr_stripes) {
7895 pr_warn("md/raid:%s: reshape: not enough stripes. Needed %lu\n",
7896 mdname(mddev),
7897 ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9)
7898 / RAID5_STRIPE_SIZE(conf))*4);
7899 return 0;
7900 }
7901 return 1;
7902}
7903
7904static int check_reshape(struct mddev *mddev)
7905{
7906 struct r5conf *conf = mddev->private;
7907
7908 if (raid5_has_log(conf) || raid5_has_ppl(conf))
7909 return -EINVAL;
7910 if (mddev->delta_disks == 0 &&
7911 mddev->new_layout == mddev->layout &&
7912 mddev->new_chunk_sectors == mddev->chunk_sectors)
7913 return 0;
7914 if (has_failed(conf))
7915 return -EINVAL;
7916 if (mddev->delta_disks < 0 && mddev->reshape_position == MaxSector) {
		/* We might be able to shrink, but the devices must
		 * be made bigger first.
		 * For raid6, 4 is the minimum size.
		 * Otherwise 2 is the minimum.
		 */
7922 int min = 2;
7923 if (mddev->level == 6)
7924 min = 4;
7925 if (mddev->raid_disks + mddev->delta_disks < min)
7926 return -EINVAL;
7927 }
7928
7929 if (!check_stripe_cache(mddev))
7930 return -ENOSPC;
7931
7932 if (mddev->new_chunk_sectors > mddev->chunk_sectors ||
7933 mddev->delta_disks > 0)
7934 if (resize_chunks(conf,
7935 conf->previous_raid_disks
7936 + max(0, mddev->delta_disks),
7937 max(mddev->new_chunk_sectors,
7938 mddev->chunk_sectors)
7939 ) < 0)
7940 return -ENOMEM;
7941
7942 if (conf->previous_raid_disks + mddev->delta_disks <= conf->pool_size)
7943 return 0;
7944 return resize_stripes(conf, (conf->previous_raid_disks
7945 + mddev->delta_disks));
7946}
7947
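/*
 * Commit the new geometry under device_lock/gen_lock, add whatever
 * spares will fit, and start the "reshape" sync thread. If the thread
 * cannot be started, roll the geometry back and return -EAGAIN.
 */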
7948static int raid5_start_reshape(struct mddev *mddev)
7949{
7950 struct r5conf *conf = mddev->private;
7951 struct md_rdev *rdev;
7952 int spares = 0;
7953 unsigned long flags;
7954
7955 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
7956 return -EBUSY;
7957
7958 if (!check_stripe_cache(mddev))
7959 return -ENOSPC;
7960
7961 if (has_failed(conf))
7962 return -EINVAL;
7963
7964 rdev_for_each(rdev, mddev) {
7965 if (!test_bit(In_sync, &rdev->flags)
7966 && !test_bit(Faulty, &rdev->flags))
7967 spares++;
7968 }
7969
7970 if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded)
		/* Not enough devices even to make a degraded array
		 * of that size.
		 */
7974 return -EINVAL;
7975
	/* Refuse to reduce size of the array.  Any reductions in
	 * array size must be through explicit setting of the array_size
	 * attribute.
	 */
7980 if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks)
7981 < mddev->array_sectors) {
7982 pr_warn("md/raid:%s: array size must be reduced before number of disks\n",
7983 mdname(mddev));
7984 return -EINVAL;
7985 }
7986
7987 atomic_set(&conf->reshape_stripes, 0);
7988 spin_lock_irq(&conf->device_lock);
7989 write_seqcount_begin(&conf->gen_lock);
7990 conf->previous_raid_disks = conf->raid_disks;
7991 conf->raid_disks += mddev->delta_disks;
7992 conf->prev_chunk_sectors = conf->chunk_sectors;
7993 conf->chunk_sectors = mddev->new_chunk_sectors;
7994 conf->prev_algo = conf->algorithm;
7995 conf->algorithm = mddev->new_layout;
7996 conf->generation++;
7997
	/* Code that selects data_offset needs to see the generation update
	 * if reshape_progress has been set - so a memory barrier is needed.
	 */
8000 smp_mb();
8001 if (mddev->reshape_backwards)
8002 conf->reshape_progress = raid5_size(mddev, 0, 0);
8003 else
8004 conf->reshape_progress = 0;
8005 conf->reshape_safe = conf->reshape_progress;
8006 write_seqcount_end(&conf->gen_lock);
8007 spin_unlock_irq(&conf->device_lock);
8008
	/* Now make sure any requests that proceeded on the assumption
	 * the reshape wasn't running - like Discard or Read - have
	 * completed.
	 */
8013 mddev_suspend(mddev);
8014 mddev_resume(mddev);
8015
	/* Add some new drives, as many as will fit.
	 * We know there are enough to make the newly sized array work.
	 * Don't add devices if we are reducing the number of
	 * devices in the array.  This is because it is not possible
	 * to correctly record the "partially reconstructed" state of
	 * such devices during the reshape and confusion could result.
	 */
8023 if (mddev->delta_disks >= 0) {
8024 rdev_for_each(rdev, mddev)
8025 if (rdev->raid_disk < 0 &&
8026 !test_bit(Faulty, &rdev->flags)) {
8027 if (raid5_add_disk(mddev, rdev) == 0) {
8028 if (rdev->raid_disk
8029 >= conf->previous_raid_disks)
8030 set_bit(In_sync, &rdev->flags);
8031 else
8032 rdev->recovery_offset = 0;

					/* Failure here is OK */
8035 sysfs_link_rdev(mddev, rdev);
8036 }
8037 } else if (rdev->raid_disk >= conf->previous_raid_disks
8038 && !test_bit(Faulty, &rdev->flags)) {
				/* This is a spare that was manually added */
8040 set_bit(In_sync, &rdev->flags);
8041 }
8042
		/* When a reshape changes the number of devices,
		 * ->degraded is measured against the larger of the
		 * pre and post numbers of devices.
		 */
8047 spin_lock_irqsave(&conf->device_lock, flags);
8048 mddev->degraded = raid5_calc_degraded(conf);
8049 spin_unlock_irqrestore(&conf->device_lock, flags);
8050 }
8051 mddev->raid_disks = conf->raid_disks;
8052 mddev->reshape_position = conf->reshape_progress;
8053 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
8054
8055 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
8056 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
8057 clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
8058 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
8059 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
8060 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
8061 "reshape");
8062 if (!mddev->sync_thread) {
8063 mddev->recovery = 0;
8064 spin_lock_irq(&conf->device_lock);
8065 write_seqcount_begin(&conf->gen_lock);
8066 mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks;
8067 mddev->new_chunk_sectors =
8068 conf->chunk_sectors = conf->prev_chunk_sectors;
8069 mddev->new_layout = conf->algorithm = conf->prev_algo;
8070 rdev_for_each(rdev, mddev)
8071 rdev->new_data_offset = rdev->data_offset;
8072 smp_wmb();
		conf->generation--;
8074 conf->reshape_progress = MaxSector;
8075 mddev->reshape_position = MaxSector;
8076 write_seqcount_end(&conf->gen_lock);
8077 spin_unlock_irq(&conf->device_lock);
8078 return -EAGAIN;
8079 }
8080 conf->reshape_checkpoint = jiffies;
8081 md_wakeup_thread(mddev->sync_thread);
8082 md_new_event(mddev);
8083 return 0;
8084}
8085
/* This is called from the reshape thread and should make any
 * changes needed in 'conf'.
 */
8089static void end_reshape(struct r5conf *conf)
8090{
8091
8092 if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
8093 struct md_rdev *rdev;
8094
8095 spin_lock_irq(&conf->device_lock);
8096 conf->previous_raid_disks = conf->raid_disks;
8097 md_finish_reshape(conf->mddev);
8098 smp_wmb();
8099 conf->reshape_progress = MaxSector;
8100 conf->mddev->reshape_position = MaxSector;
8101 rdev_for_each(rdev, conf->mddev)
8102 if (rdev->raid_disk >= 0 &&
8103 !test_bit(Journal, &rdev->flags) &&
8104 !test_bit(In_sync, &rdev->flags))
8105 rdev->recovery_offset = MaxSector;
8106 spin_unlock_irq(&conf->device_lock);
8107 wake_up(&conf->wait_for_overlap);

		/* read-ahead size must cover two whole stripes, which is
		 * 2 * (datadisks) * chunksize
		 */
8112 if (conf->mddev->queue) {
8113 int data_disks = conf->raid_disks - conf->max_degraded;
8114 int stripe = data_disks * ((conf->chunk_sectors << 9)
8115 / PAGE_SIZE);
8116 if (conf->mddev->queue->backing_dev_info->ra_pages < 2 * stripe)
8117 conf->mddev->queue->backing_dev_info->ra_pages = 2 * stripe;
8118 }
8119 }
8120}
8121
/* This is called from the raid5d thread with mddev_lock held.
 * It makes config changes to the device.
 */
8125static void raid5_finish_reshape(struct mddev *mddev)
8126{
8127 struct r5conf *conf = mddev->private;
8128
8129 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
8130
8131 if (mddev->delta_disks <= 0) {
8132 int d;
8133 spin_lock_irq(&conf->device_lock);
8134 mddev->degraded = raid5_calc_degraded(conf);
8135 spin_unlock_irq(&conf->device_lock);
8136 for (d = conf->raid_disks ;
8137 d < conf->raid_disks - mddev->delta_disks;
8138 d++) {
8139 struct md_rdev *rdev = conf->disks[d].rdev;
8140 if (rdev)
8141 clear_bit(In_sync, &rdev->flags);
8142 rdev = conf->disks[d].replacement;
8143 if (rdev)
8144 clear_bit(In_sync, &rdev->flags);
8145 }
8146 }
8147 mddev->layout = conf->algorithm;
8148 mddev->chunk_sectors = conf->chunk_sectors;
8149 mddev->reshape_position = MaxSector;
8150 mddev->delta_disks = 0;
8151 mddev->reshape_backwards = 0;
8152 }
8153}
8154
8155static void raid5_quiesce(struct mddev *mddev, int quiesce)
8156{
8157 struct r5conf *conf = mddev->private;
8158
8159 if (quiesce) {
		/* stop all writes */
		lock_all_device_hash_locks_irq(conf);
		/* '2' tells resync/reshape to pause so that all
		 * active stripes can drain.
		 */
8165 r5c_flush_cache(conf, INT_MAX);
8166 conf->quiesce = 2;
8167 wait_event_cmd(conf->wait_for_quiescent,
8168 atomic_read(&conf->active_stripes) == 0 &&
8169 atomic_read(&conf->active_aligned_reads) == 0,
8170 unlock_all_device_hash_locks_irq(conf),
8171 lock_all_device_hash_locks_irq(conf));
8172 conf->quiesce = 1;
8173 unlock_all_device_hash_locks_irq(conf);
		/* allow reshape to continue */
8175 wake_up(&conf->wait_for_overlap);
8176 } else {
		/* re-enable writes */
8178 lock_all_device_hash_locks_irq(conf);
8179 conf->quiesce = 0;
8180 wake_up(&conf->wait_for_quiescent);
8181 wake_up(&conf->wait_for_overlap);
8182 unlock_all_device_hash_locks_irq(conf);
8183 }
8184 log_quiesce(conf, quiesce);
8185}
8186
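/*
 * A single-zone raid0 is re-described as a raid4/5 array with one more
 * (missing) device and ALGORITHM_PARITY_N, i.e. it starts out degraded
 * by exactly the parity disk.
 */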
8187static void *raid45_takeover_raid0(struct mddev *mddev, int level)
8188{
8189 struct r0conf *raid0_conf = mddev->private;
8190 sector_t sectors;
8191
	/* for raid0 takeover only one zone is supported */
8193 if (raid0_conf->nr_strip_zones > 1) {
8194 pr_warn("md/raid:%s: cannot takeover raid0 with more than one zone.\n",
8195 mdname(mddev));
8196 return ERR_PTR(-EINVAL);
8197 }
8198
8199 sectors = raid0_conf->strip_zone[0].zone_end;
8200 sector_div(sectors, raid0_conf->strip_zone[0].nb_dev);
8201 mddev->dev_sectors = sectors;
8202 mddev->new_level = level;
8203 mddev->new_layout = ALGORITHM_PARITY_N;
8204 mddev->new_chunk_sectors = mddev->chunk_sectors;
8205 mddev->raid_disks += 1;
8206 mddev->delta_disks = 1;
	/* make sure it will not be marked as dirty */
8208 mddev->recovery_cp = MaxSector;
8209
8210 return setup_conf(mddev);
8211}
8212
8213static void *raid5_takeover_raid1(struct mddev *mddev)
8214{
8215 int chunksect;
8216 void *ret;
8217
8218 if (mddev->raid_disks != 2 ||
8219 mddev->degraded > 1)
8220 return ERR_PTR(-EINVAL);
8221
	/* Should check if there are write-behind devices? */

	chunksect = 64*2; /* 64K by default */

	/* The array must be an exact multiple of chunksize */
	while (chunksect && (mddev->array_sectors & (chunksect-1)))
		chunksect >>= 1;

	if ((chunksect<<9) < RAID5_STRIPE_SIZE((struct r5conf *)mddev->private))
		/* array size does not allow a suitable chunk size */
8232 return ERR_PTR(-EINVAL);
8233
8234 mddev->new_level = 5;
8235 mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC;
8236 mddev->new_chunk_sectors = chunksect;
8237
8238 ret = setup_conf(mddev);
8239 if (!IS_ERR(ret))
8240 mddev_clear_unsupported_flags(mddev,
8241 UNSUPPORTED_MDDEV_FLAGS);
8242 return ret;
8243}
8244
8245static void *raid5_takeover_raid6(struct mddev *mddev)
8246{
8247 int new_layout;
8248
8249 switch (mddev->layout) {
8250 case ALGORITHM_LEFT_ASYMMETRIC_6:
8251 new_layout = ALGORITHM_LEFT_ASYMMETRIC;
8252 break;
8253 case ALGORITHM_RIGHT_ASYMMETRIC_6:
8254 new_layout = ALGORITHM_RIGHT_ASYMMETRIC;
8255 break;
8256 case ALGORITHM_LEFT_SYMMETRIC_6:
8257 new_layout = ALGORITHM_LEFT_SYMMETRIC;
8258 break;
8259 case ALGORITHM_RIGHT_SYMMETRIC_6:
8260 new_layout = ALGORITHM_RIGHT_SYMMETRIC;
8261 break;
8262 case ALGORITHM_PARITY_0_6:
8263 new_layout = ALGORITHM_PARITY_0;
8264 break;
8265 case ALGORITHM_PARITY_N:
8266 new_layout = ALGORITHM_PARITY_N;
8267 break;
8268 default:
8269 return ERR_PTR(-EINVAL);
8270 }
8271 mddev->new_level = 5;
8272 mddev->new_layout = new_layout;
8273 mddev->delta_disks = -1;
8274 mddev->raid_disks -= 1;
8275 return setup_conf(mddev);
8276}
8277
8278static int raid5_check_reshape(struct mddev *mddev)
8279{
	/* For a 2-drive array, the layout and chunk size can be changed
	 * immediately as no restriping is needed.
	 * For larger arrays we record the new value - after validation
	 * to be used by a reshape pass.
	 */
8285 struct r5conf *conf = mddev->private;
8286 int new_chunk = mddev->new_chunk_sectors;
8287
8288 if (mddev->new_layout >= 0 && !algorithm_valid_raid5(mddev->new_layout))
8289 return -EINVAL;
8290 if (new_chunk > 0) {
8291 if (!is_power_of_2(new_chunk))
8292 return -EINVAL;
8293 if (new_chunk < (PAGE_SIZE>>9))
8294 return -EINVAL;
8295 if (mddev->array_sectors & (new_chunk-1))
			/* not factor of array size */
8297 return -EINVAL;
8298 }
8299
	/* They look valid */

	if (mddev->raid_disks == 2) {
		/* can make the change immediately */
8304 if (mddev->new_layout >= 0) {
8305 conf->algorithm = mddev->new_layout;
8306 mddev->layout = mddev->new_layout;
8307 }
8308 if (new_chunk > 0) {
			conf->chunk_sectors = new_chunk;
8310 mddev->chunk_sectors = new_chunk;
8311 }
8312 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
8313 md_wakeup_thread(mddev->thread);
8314 }
8315 return check_reshape(mddev);
8316}
8317
8318static int raid6_check_reshape(struct mddev *mddev)
8319{
8320 int new_chunk = mddev->new_chunk_sectors;
8321
8322 if (mddev->new_layout >= 0 && !algorithm_valid_raid6(mddev->new_layout))
8323 return -EINVAL;
8324 if (new_chunk > 0) {
8325 if (!is_power_of_2(new_chunk))
8326 return -EINVAL;
8327 if (new_chunk < (PAGE_SIZE >> 9))
8328 return -EINVAL;
8329 if (mddev->array_sectors & (new_chunk-1))
			/* not factor of array size */
8331 return -EINVAL;
8332 }
8333
	/* They look valid */
8335 return check_reshape(mddev);
8336}
8337
8338static void *raid5_takeover(struct mddev *mddev)
8339{
	/* raid5 can take over:
	 *  raid0 - if there is only one strip zone - make it a raid4 layout
	 *  raid1 - if there are two drives.  We need to know the chunk size
	 *  raid4 - trivial - just use a raid4 layout.
	 *  raid6 - Providing it is a *_6 layout
	 */
8346 if (mddev->level == 0)
8347 return raid45_takeover_raid0(mddev, 5);
8348 if (mddev->level == 1)
8349 return raid5_takeover_raid1(mddev);
8350 if (mddev->level == 4) {
8351 mddev->new_layout = ALGORITHM_PARITY_N;
8352 mddev->new_level = 5;
8353 return setup_conf(mddev);
8354 }
8355 if (mddev->level == 6)
8356 return raid5_takeover_raid6(mddev);
8357
8358 return ERR_PTR(-EINVAL);
8359}
8360
8361static void *raid4_takeover(struct mddev *mddev)
8362{
	/* raid4 can take over:
	 *  raid0 - if there is only one strip zone
	 *  raid5 - if layout is right
	 */
8367 if (mddev->level == 0)
8368 return raid45_takeover_raid0(mddev, 4);
8369 if (mddev->level == 5 &&
8370 mddev->layout == ALGORITHM_PARITY_N) {
8371 mddev->new_layout = 0;
8372 mddev->new_level = 4;
8373 return setup_conf(mddev);
8374 }
8375 return ERR_PTR(-EINVAL);
8376}
8377
8378static struct md_personality raid5_personality;
8379
8380static void *raid6_takeover(struct mddev *mddev)
8381{
	/* Currently can only take over a raid5.  We map the
	 * personality to an equivalent raid6 personality
	 * with parity N (i.e. raid4-like) layout.
	 */
8386 int new_layout;
8387
8388 if (mddev->pers != &raid5_personality)
8389 return ERR_PTR(-EINVAL);
8390 if (mddev->degraded > 1)
8391 return ERR_PTR(-EINVAL);
8392 if (mddev->raid_disks > 253)
8393 return ERR_PTR(-EINVAL);
8394 if (mddev->raid_disks < 3)
8395 return ERR_PTR(-EINVAL);
8396
8397 switch (mddev->layout) {
8398 case ALGORITHM_LEFT_ASYMMETRIC:
8399 new_layout = ALGORITHM_LEFT_ASYMMETRIC_6;
8400 break;
8401 case ALGORITHM_RIGHT_ASYMMETRIC:
8402 new_layout = ALGORITHM_RIGHT_ASYMMETRIC_6;
8403 break;
8404 case ALGORITHM_LEFT_SYMMETRIC:
8405 new_layout = ALGORITHM_LEFT_SYMMETRIC_6;
8406 break;
8407 case ALGORITHM_RIGHT_SYMMETRIC:
8408 new_layout = ALGORITHM_RIGHT_SYMMETRIC_6;
8409 break;
8410 case ALGORITHM_PARITY_0:
8411 new_layout = ALGORITHM_PARITY_0_6;
8412 break;
8413 case ALGORITHM_PARITY_N:
8414 new_layout = ALGORITHM_PARITY_N;
8415 break;
8416 default:
8417 return ERR_PTR(-EINVAL);
8418 }
8419 mddev->new_level = 6;
8420 mddev->new_layout = new_layout;
8421 mddev->delta_disks = 1;
8422 mddev->raid_disks += 1;
8423 return setup_conf(mddev);
8424}
8425
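/*
 * Switch between the "ppl" (partial parity log, RAID5 only) and
 * "resync" consistency policies. Illustrative usage, assuming an array
 * named md0 (example name only):
 *
 *   echo ppl > /sys/block/md0/md/consistency_policy
 */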
8426static int raid5_change_consistency_policy(struct mddev *mddev, const char *buf)
8427{
8428 struct r5conf *conf;
8429 int err;
8430
8431 err = mddev_lock(mddev);
8432 if (err)
8433 return err;
8434 conf = mddev->private;
8435 if (!conf) {
8436 mddev_unlock(mddev);
8437 return -ENODEV;
8438 }
8439
8440 if (strncmp(buf, "ppl", 3) == 0) {
		/* ppl only works with RAID 5 */
8442 if (!raid5_has_ppl(conf) && conf->level == 5) {
8443 err = log_init(conf, NULL, true);
8444 if (!err) {
8445 err = resize_stripes(conf, conf->pool_size);
8446 if (err)
8447 log_exit(conf);
8448 }
8449 } else
8450 err = -EINVAL;
8451 } else if (strncmp(buf, "resync", 6) == 0) {
8452 if (raid5_has_ppl(conf)) {
8453 mddev_suspend(mddev);
8454 log_exit(conf);
8455 mddev_resume(mddev);
8456 err = resize_stripes(conf, conf->pool_size);
8457 } else if (test_bit(MD_HAS_JOURNAL, &conf->mddev->flags) &&
8458 r5l_log_disk_error(conf)) {
8459 bool journal_dev_exists = false;
8460 struct md_rdev *rdev;
8461
8462 rdev_for_each(rdev, mddev)
8463 if (test_bit(Journal, &rdev->flags)) {
8464 journal_dev_exists = true;
8465 break;
8466 }
8467
8468 if (!journal_dev_exists) {
8469 mddev_suspend(mddev);
8470 clear_bit(MD_HAS_JOURNAL, &mddev->flags);
8471 mddev_resume(mddev);
8472 } else
8473 err = -EBUSY;
8474 } else
8475 err = -EINVAL;
8476 } else {
8477 err = -EINVAL;
8478 }
8479
8480 if (!err)
8481 md_update_sb(mddev, 1);
8482
8483 mddev_unlock(mddev);
8484
8485 return err;
8486}
8487
8488static int raid5_start(struct mddev *mddev)
8489{
8490 struct r5conf *conf = mddev->private;
8491
8492 return r5l_start(conf->log);
8493}
8494
8495static struct md_personality raid6_personality =
8496{
8497 .name = "raid6",
8498 .level = 6,
8499 .owner = THIS_MODULE,
8500 .make_request = raid5_make_request,
8501 .run = raid5_run,
8502 .start = raid5_start,
8503 .free = raid5_free,
8504 .status = raid5_status,
8505 .error_handler = raid5_error,
8506 .hot_add_disk = raid5_add_disk,
8507 .hot_remove_disk= raid5_remove_disk,
8508 .spare_active = raid5_spare_active,
8509 .sync_request = raid5_sync_request,
8510 .resize = raid5_resize,
8511 .size = raid5_size,
8512 .check_reshape = raid6_check_reshape,
8513 .start_reshape = raid5_start_reshape,
8514 .finish_reshape = raid5_finish_reshape,
8515 .quiesce = raid5_quiesce,
8516 .takeover = raid6_takeover,
8517 .change_consistency_policy = raid5_change_consistency_policy,
8518};
8519static struct md_personality raid5_personality =
8520{
8521 .name = "raid5",
8522 .level = 5,
8523 .owner = THIS_MODULE,
8524 .make_request = raid5_make_request,
8525 .run = raid5_run,
8526 .start = raid5_start,
8527 .free = raid5_free,
8528 .status = raid5_status,
8529 .error_handler = raid5_error,
8530 .hot_add_disk = raid5_add_disk,
8531 .hot_remove_disk= raid5_remove_disk,
8532 .spare_active = raid5_spare_active,
8533 .sync_request = raid5_sync_request,
8534 .resize = raid5_resize,
8535 .size = raid5_size,
8536 .check_reshape = raid5_check_reshape,
8537 .start_reshape = raid5_start_reshape,
8538 .finish_reshape = raid5_finish_reshape,
8539 .quiesce = raid5_quiesce,
8540 .takeover = raid5_takeover,
8541 .change_consistency_policy = raid5_change_consistency_policy,
8542};
8543
8544static struct md_personality raid4_personality =
8545{
8546 .name = "raid4",
8547 .level = 4,
8548 .owner = THIS_MODULE,
8549 .make_request = raid5_make_request,
8550 .run = raid5_run,
8551 .start = raid5_start,
8552 .free = raid5_free,
8553 .status = raid5_status,
8554 .error_handler = raid5_error,
8555 .hot_add_disk = raid5_add_disk,
8556 .hot_remove_disk= raid5_remove_disk,
8557 .spare_active = raid5_spare_active,
8558 .sync_request = raid5_sync_request,
8559 .resize = raid5_resize,
8560 .size = raid5_size,
8561 .check_reshape = raid5_check_reshape,
8562 .start_reshape = raid5_start_reshape,
8563 .finish_reshape = raid5_finish_reshape,
8564 .quiesce = raid5_quiesce,
8565 .takeover = raid4_takeover,
8566 .change_consistency_policy = raid5_change_consistency_policy,
8567};
8568
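/*
 * Module init: create the shared raid5 workqueue, register the CPU
 * hotplug callbacks that manage the per-cpu scratch buffers, then
 * register the raid4/5/6 personalities.
 */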
8569static int __init raid5_init(void)
8570{
8571 int ret;
8572
8573 raid5_wq = alloc_workqueue("raid5wq",
8574 WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE|WQ_SYSFS, 0);
8575 if (!raid5_wq)
8576 return -ENOMEM;
8577
8578 ret = cpuhp_setup_state_multi(CPUHP_MD_RAID5_PREPARE,
8579 "md/raid5:prepare",
8580 raid456_cpu_up_prepare,
8581 raid456_cpu_dead);
8582 if (ret) {
8583 destroy_workqueue(raid5_wq);
8584 return ret;
8585 }
8586 register_md_personality(&raid6_personality);
8587 register_md_personality(&raid5_personality);
8588 register_md_personality(&raid4_personality);
8589 return 0;
8590}
8591
8592static void raid5_exit(void)
8593{
8594 unregister_md_personality(&raid6_personality);
8595 unregister_md_personality(&raid5_personality);
8596 unregister_md_personality(&raid4_personality);
8597 cpuhp_remove_multi_state(CPUHP_MD_RAID5_PREPARE);
8598 destroy_workqueue(raid5_wq);
8599}
8600
8601module_init(raid5_init);
8602module_exit(raid5_exit);
8603MODULE_LICENSE("GPL");
8604MODULE_DESCRIPTION("RAID4/5/6 (striping with parity) personality for MD");
8605MODULE_ALIAS("md-personality-4");
8606MODULE_ALIAS("md-raid5");
8607MODULE_ALIAS("md-raid4");
8608MODULE_ALIAS("md-level-5");
8609MODULE_ALIAS("md-level-4");
8610MODULE_ALIAS("md-personality-8");
8611MODULE_ALIAS("md-raid6");
8612MODULE_ALIAS("md-level-6");

/* Legacy aliases from when raid5 and raid6 were separate modules */
8615MODULE_ALIAS("raid5");
8616MODULE_ALIAS("raid6");
8617