#include <linux/blkdev.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/async_tx.h>
#include <linux/module.h>
#include <linux/async.h>
#include <linux/seq_file.h>
#include <linux/cpu.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/nodemask.h>

#include <trace/events/block.h>
#include <linux/list_sort.h>

#include "md.h"
#include "raid5.h"
#include "raid0.h"
#include "md-bitmap.h"
#include "raid5-log.h"

#define UNSUPPORTED_MDDEV_FLAGS	(1L << MD_FAILFAST_SUPPORTED)

#define cpu_to_group(cpu) cpu_to_node(cpu)
#define ANY_GROUP NUMA_NO_NODE

static bool devices_handle_discard_safely = false;
module_param(devices_handle_discard_safely, bool, 0644);
MODULE_PARM_DESC(devices_handle_discard_safely,
		 "Set to Y if all devices in each array reliably return zeroes on reads from discarded regions");
static struct workqueue_struct *raid5_wq;

static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)
{
	int hash = (sect >> STRIPE_SHIFT) & HASH_MASK;
	return &conf->stripe_hashtbl[hash];
}

static inline int stripe_hash_locks_hash(sector_t sect)
{
	return (sect >> STRIPE_SHIFT) & STRIPE_HASH_LOCKS_MASK;
}

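/*
 * The stripe hash table and inactive lists are protected by an array of
 * NR_STRIPE_HASH_LOCKS spinlocks.  conf->device_lock always nests inside
 * a hash lock; the helpers below take and release the locks in that order.
 */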
static inline void lock_device_hash_lock(struct r5conf *conf, int hash)
{
	spin_lock_irq(conf->hash_locks + hash);
	spin_lock(&conf->device_lock);
}

static inline void unlock_device_hash_lock(struct r5conf *conf, int hash)
{
	spin_unlock(&conf->device_lock);
	spin_unlock_irq(conf->hash_locks + hash);
}

static inline void lock_all_device_hash_locks_irq(struct r5conf *conf)
{
	int i;
	spin_lock_irq(conf->hash_locks);
	for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++)
		spin_lock_nest_lock(conf->hash_locks + i, conf->hash_locks);
	spin_lock(&conf->device_lock);
}

static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf)
{
	int i;
	spin_unlock(&conf->device_lock);
	for (i = NR_STRIPE_HASH_LOCKS - 1; i; i--)
		spin_unlock(conf->hash_locks + i);
	spin_unlock_irq(conf->hash_locks);
}

/* Find the first data disk in a raid6 stripe */
static inline int raid6_d0(struct stripe_head *sh)
{
	if (sh->ddf_layout)
		/* ddf layouts always start from the first device */
		return 0;

	/* md layouts start just after the Q block */
	if (sh->qd_idx == sh->disks - 1)
		return 0;
	else
		return sh->qd_idx + 1;
}

static inline int raid6_next_disk(int disk, int raid_disks)
{
	disk++;
	return (disk < raid_disks) ? disk : 0;
}

/*
 * When walking through the disks in a raid6 stripe, starting at raid6_d0(),
 * each disk maps to a syndrome 'slot': the data disks occupy slots
 * 0 .. syndrome_disks-1, P is slot syndrome_disks and Q is slot
 * syndrome_disks+1.  This helper performs that mapping.
 */
static int raid6_idx_to_slot(int idx, struct stripe_head *sh,
			     int *count, int syndrome_disks)
{
	int slot = *count;

	if (sh->ddf_layout)
		(*count)++;
	if (idx == sh->pd_idx)
		return syndrome_disks;
	if (idx == sh->qd_idx)
		return syndrome_disks + 1;
	if (!sh->ddf_layout)
		(*count)++;
	return slot;
}

static void print_raid5_conf(struct r5conf *conf);

static int stripe_operations_active(struct stripe_head *sh)
{
	return sh->check_state || sh->reconstruct_state ||
	       test_bit(STRIPE_BIOFILL_RUN, &sh->state) ||
	       test_bit(STRIPE_COMPUTE_RUN, &sh->state);
}

static bool stripe_is_lowprio(struct stripe_head *sh)
{
	return (test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state) ||
		test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) &&
	       !test_bit(STRIPE_R5C_CACHING, &sh->state);
}

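/*
 * Queue the stripe on its NUMA group's worker lists (low priority for
 * cached r5c stripes) and kick one or more group workers.  If no worker
 * threads are configured, fall back to waking the main raid5d thread.
 */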
static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
{
	struct r5conf *conf = sh->raid_conf;
	struct r5worker_group *group;
	int thread_cnt;
	int i, cpu = sh->cpu;

	if (!cpu_online(cpu)) {
		cpu = cpumask_any(cpu_online_mask);
		sh->cpu = cpu;
	}

	if (list_empty(&sh->lru)) {
		struct r5worker_group *group;
		group = conf->worker_groups + cpu_to_group(cpu);
		if (stripe_is_lowprio(sh))
			list_add_tail(&sh->lru, &group->loprio_list);
		else
			list_add_tail(&sh->lru, &group->handle_list);
		group->stripes_cnt++;
		sh->group = group;
	}

	if (conf->worker_cnt_per_group == 0) {
		md_wakeup_thread(conf->mddev->thread);
		return;
	}

	group = conf->worker_groups + cpu_to_group(sh->cpu);

	group->workers[0].working = true;
	/* at least one worker should run to avoid race */
	queue_work_on(sh->cpu, raid5_wq, &group->workers[0].work);

	thread_cnt = group->stripes_cnt / MAX_STRIPE_BATCH - 1;
	/* wake up more workers if the queue is deep enough */
	for (i = 1; i < conf->worker_cnt_per_group && thread_cnt > 0; i++) {
		if (group->workers[i].working == false) {
			group->workers[i].working = true;
			queue_work_on(sh->cpu, raid5_wq,
				      &group->workers[i].work);
			thread_cnt--;
		}
	}
}

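/*
 * Drop a stripe from the active count.  Callers hold conf->device_lock.
 * Decides whether the stripe goes back onto a handle/delayed list, onto
 * the r5c cached lists, or onto temp_inactive_list.
 */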
static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
			      struct list_head *temp_inactive_list)
{
	int i;
	int injournal = 0;	/* number of data pages with R5_InJournal */

	BUG_ON(!list_empty(&sh->lru));
	BUG_ON(atomic_read(&conf->active_stripes) == 0);

	if (r5c_is_writeback(conf->log))
		for (i = sh->disks; i--; )
			if (test_bit(R5_InJournal, &sh->dev[i].flags))
				injournal++;
	/*
	 * In the following cases, the stripe cannot be released to cached
	 * lists. Therefore, we make the stripe write out and set
	 * STRIPE_HANDLE:
	 *   1. when quiesce in r5c write back;
	 *   2. when resync is requested for the stripe.
	 */
	if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) ||
	    (conf->quiesce && r5c_is_writeback(conf->log) &&
	     !test_bit(STRIPE_HANDLE, &sh->state) && injournal != 0)) {
		if (test_bit(STRIPE_R5C_CACHING, &sh->state))
			r5c_make_stripe_write_out(sh);
		set_bit(STRIPE_HANDLE, &sh->state);
	}

	if (test_bit(STRIPE_HANDLE, &sh->state)) {
		if (test_bit(STRIPE_DELAYED, &sh->state) &&
		    !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
			list_add_tail(&sh->lru, &conf->delayed_list);
		else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
			 sh->bm_seq - conf->seq_write > 0)
			list_add_tail(&sh->lru, &conf->bitmap_list);
		else {
			clear_bit(STRIPE_DELAYED, &sh->state);
			clear_bit(STRIPE_BIT_DELAY, &sh->state);
			if (conf->worker_cnt_per_group == 0) {
				if (stripe_is_lowprio(sh))
					list_add_tail(&sh->lru,
							&conf->loprio_list);
				else
					list_add_tail(&sh->lru,
							&conf->handle_list);
			} else {
				raid5_wakeup_stripe_thread(sh);
				return;
			}
		}
		md_wakeup_thread(conf->mddev->thread);
	} else {
		BUG_ON(stripe_operations_active(sh));
		if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
			if (atomic_dec_return(&conf->preread_active_stripes)
			    < IO_THRESHOLD)
				md_wakeup_thread(conf->mddev->thread);
		atomic_dec(&conf->active_stripes);
		if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
			if (!r5c_is_writeback(conf->log))
				list_add_tail(&sh->lru, temp_inactive_list);
			else {
				WARN_ON(test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags));
				if (injournal == 0)
					list_add_tail(&sh->lru, temp_inactive_list);
				else if (injournal == conf->raid_disks - conf->max_degraded) {
					/* full stripe */
					if (!test_and_set_bit(STRIPE_R5C_FULL_STRIPE, &sh->state))
						atomic_inc(&conf->r5c_cached_full_stripes);
					if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state))
						atomic_dec(&conf->r5c_cached_partial_stripes);
					list_add_tail(&sh->lru, &conf->r5c_full_stripe_list);
					r5c_check_cached_full_stripe(conf);
				} else
					/*
					 * STRIPE_R5C_PARTIAL_STRIPE is set in
					 * r5c_try_caching_write(). No need to
					 * set it again.
					 */
					list_add_tail(&sh->lru, &conf->r5c_partial_stripe_list);
			}
		}
	}
}

static void __release_stripe(struct r5conf *conf, struct stripe_head *sh,
			     struct list_head *temp_inactive_list)
{
	if (atomic_dec_and_test(&sh->count))
		do_release_stripe(conf, sh, temp_inactive_list);
}

/*
 * Move stripes from temp_inactive_list back onto the per-hash inactive
 * lists.  If @hash is NR_STRIPE_HASH_LOCKS, temp_inactive_list is an array
 * of NR_STRIPE_HASH_LOCKS lists and all of them are released; otherwise
 * only the single list for @hash is.
 */
static void release_inactive_stripe_list(struct r5conf *conf,
					 struct list_head *temp_inactive_list,
					 int hash)
{
	int size;
	bool do_wakeup = false;
	unsigned long flags;

	if (hash == NR_STRIPE_HASH_LOCKS) {
		size = NR_STRIPE_HASH_LOCKS;
		hash = NR_STRIPE_HASH_LOCKS - 1;
	} else
		size = 1;
	while (size) {
		struct list_head *list = &temp_inactive_list[size - 1];

		/*
		 * We don't hold any lock here yet, raid5_get_active_stripe()
		 * might remove stripes from the list
		 */
		if (!list_empty_careful(list)) {
			spin_lock_irqsave(conf->hash_locks + hash, flags);
			if (list_empty(conf->inactive_list + hash) &&
			    !list_empty(list))
				atomic_dec(&conf->empty_inactive_list_nr);
			list_splice_tail_init(list, conf->inactive_list + hash);
			do_wakeup = true;
			spin_unlock_irqrestore(conf->hash_locks + hash, flags);
		}
		size--;
		hash--;
	}

	if (do_wakeup) {
		wake_up(&conf->wait_for_stripe);
		if (atomic_read(&conf->active_stripes) == 0)
			wake_up(&conf->wait_for_quiescent);
		if (conf->retry_read_aligned)
			md_wakeup_thread(conf->mddev->thread);
	}
}

/* should hold conf->device_lock already */
static int release_stripe_list(struct r5conf *conf,
			       struct list_head *temp_inactive_list)
{
	struct stripe_head *sh, *t;
	int count = 0;
	struct llist_node *head;

	head = llist_del_all(&conf->released_stripes);
	head = llist_reverse_order(head);
	llist_for_each_entry_safe(sh, t, head, release_list) {
		int hash;

		/* sh could be readded after STRIPE_ON_RELEASE_LIST is cleared */
		smp_mb();
		clear_bit(STRIPE_ON_RELEASE_LIST, &sh->state);
		/*
		 * Don't worry the bit is set here, because if the bit is set
		 * again, the count is always > 1. This is true for
		 * STRIPE_ON_BATCH_LIST bit too.
		 */
		hash = sh->hash_lock_index;
		__release_stripe(conf, sh, &temp_inactive_list[hash]);
		count++;
	}

	return count;
}

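/*
 * Drop a reference to a stripe.  The fast path pushes the stripe onto the
 * lock-free released_stripes llist for raid5d to process; the slow path
 * takes device_lock and releases the stripe directly.
 */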
void raid5_release_stripe(struct stripe_head *sh)
{
	struct r5conf *conf = sh->raid_conf;
	unsigned long flags;
	struct list_head list;
	int hash;
	bool wakeup;

	/* Avoid release_list until the last reference. */
	if (atomic_add_unless(&sh->count, -1, 1))
		return;

	if (unlikely(!conf->mddev->thread) ||
	    test_and_set_bit(STRIPE_ON_RELEASE_LIST, &sh->state))
		goto slow_path;
	wakeup = llist_add(&sh->release_list, &conf->released_stripes);
	if (wakeup)
		md_wakeup_thread(conf->mddev->thread);
	return;
slow_path:
	/* we are ok here if STRIPE_ON_RELEASE_LIST is set or not */
	if (atomic_dec_and_lock_irqsave(&sh->count, &conf->device_lock, flags)) {
		INIT_LIST_HEAD(&list);
		hash = sh->hash_lock_index;
		do_release_stripe(conf, sh, &list);
		spin_unlock_irqrestore(&conf->device_lock, flags);
		release_inactive_stripe_list(conf, &list, hash);
	}
}
412
413static inline void remove_hash(struct stripe_head *sh)
414{
415 pr_debug("remove_hash(), stripe %llu\n",
416 (unsigned long long)sh->sector);
417
418 hlist_del_init(&sh->hash);
419}
420
421static inline void insert_hash(struct r5conf *conf, struct stripe_head *sh)
422{
423 struct hlist_head *hp = stripe_hash(conf, sh->sector);
424
425 pr_debug("insert_hash(), stripe %llu\n",
426 (unsigned long long)sh->sector);
427
428 hlist_add_head(&sh->hash, hp);
429}
430
431
432static struct stripe_head *get_free_stripe(struct r5conf *conf, int hash)
433{
434 struct stripe_head *sh = NULL;
435 struct list_head *first;
436
437 if (list_empty(conf->inactive_list + hash))
438 goto out;
439 first = (conf->inactive_list + hash)->next;
440 sh = list_entry(first, struct stripe_head, lru);
441 list_del_init(first);
442 remove_hash(sh);
443 atomic_inc(&conf->active_stripes);
444 BUG_ON(hash != sh->hash_lock_index);
445 if (list_empty(conf->inactive_list + hash))
446 atomic_inc(&conf->empty_inactive_list_nr);
447out:
448 return sh;
449}
450
451static void shrink_buffers(struct stripe_head *sh)
452{
453 struct page *p;
454 int i;
455 int num = sh->raid_conf->pool_size;
456
457 for (i = 0; i < num ; i++) {
458 WARN_ON(sh->dev[i].page != sh->dev[i].orig_page);
459 p = sh->dev[i].page;
460 if (!p)
461 continue;
462 sh->dev[i].page = NULL;
463 put_page(p);
464 }
465}
466
467static int grow_buffers(struct stripe_head *sh, gfp_t gfp)
468{
469 int i;
470 int num = sh->raid_conf->pool_size;
471
472 for (i = 0; i < num; i++) {
473 struct page *page;
474
475 if (!(page = alloc_page(gfp))) {
476 return 1;
477 }
478 sh->dev[i].page = page;
479 sh->dev[i].orig_page = page;
480 }
481
482 return 0;
483}
484
static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous,
			   struct stripe_head *sh);

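/*
 * (Re)initialise a stripe_head just taken off an inactive list: set its
 * sector, generation and parity indices, reset every r5dev, and hash it so
 * raid5_get_active_stripe() can find it.  Retries if a reshape changes
 * conf->generation while we are initialising.
 */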
488static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
489{
490 struct r5conf *conf = sh->raid_conf;
491 int i, seq;
492
493 BUG_ON(atomic_read(&sh->count) != 0);
494 BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
495 BUG_ON(stripe_operations_active(sh));
496 BUG_ON(sh->batch_head);
497
498 pr_debug("init_stripe called, stripe %llu\n",
499 (unsigned long long)sector);
500retry:
501 seq = read_seqcount_begin(&conf->gen_lock);
502 sh->generation = conf->generation - previous;
503 sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks;
504 sh->sector = sector;
505 stripe_set_idx(sector, conf, previous, sh);
506 sh->state = 0;
507
508 for (i = sh->disks; i--; ) {
509 struct r5dev *dev = &sh->dev[i];
510
511 if (dev->toread || dev->read || dev->towrite || dev->written ||
512 test_bit(R5_LOCKED, &dev->flags)) {
513 pr_err("sector=%llx i=%d %p %p %p %p %d\n",
514 (unsigned long long)sh->sector, i, dev->toread,
515 dev->read, dev->towrite, dev->written,
516 test_bit(R5_LOCKED, &dev->flags));
517 WARN_ON(1);
518 }
519 dev->flags = 0;
520 dev->sector = raid5_compute_blocknr(sh, i, previous);
521 }
522 if (read_seqcount_retry(&conf->gen_lock, seq))
523 goto retry;
524 sh->overwrite_disks = 0;
525 insert_hash(conf, sh);
526 sh->cpu = smp_processor_id();
527 set_bit(STRIPE_BATCH_READY, &sh->state);
528}
529
530static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector,
531 short generation)
532{
533 struct stripe_head *sh;
534
535 pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector);
536 hlist_for_each_entry(sh, stripe_hash(conf, sector), hash)
537 if (sh->sector == sector && sh->generation == generation)
538 return sh;
539 pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector);
540 return NULL;
541}
542
/*
 * Need to check whether the array has failed when deciding whether to:
 *  - start an array
 *  - remove non-faulty devices
 *  - add a spare
 *  - allow a reshape
 * This determination is simple when no reshape is happening.
 * However if there is a reshape, we need to carefully check
 * both the before and after sections.
 * This is because some failed devices may only affect one
 * of the two sections, and some non-in_sync devices may
 * be insync in the section most affected by failed devices.
 */
int raid5_calc_degraded(struct r5conf *conf)
{
558 int degraded, degraded2;
559 int i;
560
561 rcu_read_lock();
562 degraded = 0;
563 for (i = 0; i < conf->previous_raid_disks; i++) {
564 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
565 if (rdev && test_bit(Faulty, &rdev->flags))
566 rdev = rcu_dereference(conf->disks[i].replacement);
567 if (!rdev || test_bit(Faulty, &rdev->flags))
568 degraded++;
569 else if (test_bit(In_sync, &rdev->flags))
570 ;
		else
			/*
			 * Not in-sync or faulty.
			 * If the reshape increases the number of devices,
			 * this 'previous' section is being recovered by the
			 * reshape, so it is not in_sync.  If the number of
			 * devices is being reduced, the device can only be
			 * part of the array if we are reverting a reshape,
			 * so this section will be in-sync.
			 */
			if (conf->raid_disks >= conf->previous_raid_disks)
				degraded++;
583 }
584 rcu_read_unlock();
585 if (conf->raid_disks == conf->previous_raid_disks)
586 return degraded;
587 rcu_read_lock();
588 degraded2 = 0;
589 for (i = 0; i < conf->raid_disks; i++) {
590 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
591 if (rdev && test_bit(Faulty, &rdev->flags))
592 rdev = rcu_dereference(conf->disks[i].replacement);
593 if (!rdev || test_bit(Faulty, &rdev->flags))
594 degraded2++;
595 else if (test_bit(In_sync, &rdev->flags))
596 ;
		else
			/*
			 * Not in-sync or faulty.
			 * If reshape increases the number of devices, this
			 * section has already been recovered, otherwise it
			 * almost certainly has not.
			 */
			if (conf->raid_disks <= conf->previous_raid_disks)
				degraded2++;
605 }
606 rcu_read_unlock();
607 if (degraded2 > degraded)
608 return degraded2;
609 return degraded;
610}
611
612static int has_failed(struct r5conf *conf)
613{
614 int degraded;
615
616 if (conf->mddev->reshape_position == MaxSector)
617 return conf->mddev->degraded > conf->max_degraded;
618
619 degraded = raid5_calc_degraded(conf);
620 if (degraded > conf->max_degraded)
621 return 1;
622 return 0;
623}
624
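/*
 * Find the stripe covering @sector, or take a free one and initialise it.
 * May block (unless @noblock) until a stripe becomes available, and waits
 * for the array to leave quiesce unless @noquiesce is set.
 */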
625struct stripe_head *
626raid5_get_active_stripe(struct r5conf *conf, sector_t sector,
627 int previous, int noblock, int noquiesce)
628{
629 struct stripe_head *sh;
630 int hash = stripe_hash_locks_hash(sector);
631 int inc_empty_inactive_list_flag;
632
633 pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector);
634
635 spin_lock_irq(conf->hash_locks + hash);
636
637 do {
638 wait_event_lock_irq(conf->wait_for_quiescent,
639 conf->quiesce == 0 || noquiesce,
640 *(conf->hash_locks + hash));
641 sh = __find_stripe(conf, sector, conf->generation - previous);
642 if (!sh) {
643 if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) {
644 sh = get_free_stripe(conf, hash);
645 if (!sh && !test_bit(R5_DID_ALLOC,
646 &conf->cache_state))
647 set_bit(R5_ALLOC_MORE,
648 &conf->cache_state);
649 }
650 if (noblock && sh == NULL)
651 break;
652
653 r5c_check_stripe_cache_usage(conf);
654 if (!sh) {
655 set_bit(R5_INACTIVE_BLOCKED,
656 &conf->cache_state);
657 r5l_wake_reclaim(conf->log, 0);
658 wait_event_lock_irq(
659 conf->wait_for_stripe,
660 !list_empty(conf->inactive_list + hash) &&
661 (atomic_read(&conf->active_stripes)
662 < (conf->max_nr_stripes * 3 / 4)
663 || !test_bit(R5_INACTIVE_BLOCKED,
664 &conf->cache_state)),
665 *(conf->hash_locks + hash));
666 clear_bit(R5_INACTIVE_BLOCKED,
667 &conf->cache_state);
668 } else {
669 init_stripe(sh, sector, previous);
670 atomic_inc(&sh->count);
671 }
672 } else if (!atomic_inc_not_zero(&sh->count)) {
673 spin_lock(&conf->device_lock);
674 if (!atomic_read(&sh->count)) {
675 if (!test_bit(STRIPE_HANDLE, &sh->state))
676 atomic_inc(&conf->active_stripes);
677 BUG_ON(list_empty(&sh->lru) &&
678 !test_bit(STRIPE_EXPANDING, &sh->state));
679 inc_empty_inactive_list_flag = 0;
680 if (!list_empty(conf->inactive_list + hash))
681 inc_empty_inactive_list_flag = 1;
682 list_del_init(&sh->lru);
683 if (list_empty(conf->inactive_list + hash) && inc_empty_inactive_list_flag)
684 atomic_inc(&conf->empty_inactive_list_nr);
685 if (sh->group) {
686 sh->group->stripes_cnt--;
687 sh->group = NULL;
688 }
689 }
690 atomic_inc(&sh->count);
691 spin_unlock(&conf->device_lock);
692 }
693 } while (sh == NULL);
694
695 spin_unlock_irq(conf->hash_locks + hash);
696 return sh;
697}
698
699static bool is_full_stripe_write(struct stripe_head *sh)
700{
701 BUG_ON(sh->overwrite_disks > (sh->disks - sh->raid_conf->max_degraded));
702 return sh->overwrite_disks == (sh->disks - sh->raid_conf->max_degraded);
703}
704
705static void lock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
706 __acquires(&sh1->stripe_lock)
707 __acquires(&sh2->stripe_lock)
708{
709 if (sh1 > sh2) {
710 spin_lock_irq(&sh2->stripe_lock);
711 spin_lock_nested(&sh1->stripe_lock, 1);
712 } else {
713 spin_lock_irq(&sh1->stripe_lock);
714 spin_lock_nested(&sh2->stripe_lock, 1);
715 }
716}
717
718static void unlock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
719 __releases(&sh1->stripe_lock)
720 __releases(&sh2->stripe_lock)
721{
722 spin_unlock(&sh1->stripe_lock);
723 spin_unlock_irq(&sh2->stripe_lock);
724}
725
726
727static bool stripe_can_batch(struct stripe_head *sh)
728{
729 struct r5conf *conf = sh->raid_conf;
730
731 if (raid5_has_log(conf) || raid5_has_ppl(conf))
732 return false;
733 return test_bit(STRIPE_BATCH_READY, &sh->state) &&
734 !test_bit(STRIPE_BITMAP_PENDING, &sh->state) &&
735 is_full_stripe_write(sh);
736}
737
/* we only do back search */
static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh)
{
	struct stripe_head *head;
	sector_t head_sector, tmp_sec;
	int hash;
	int dd_idx;
	int inc_empty_inactive_list_flag;

	/* Don't cross chunks, so stripe pd_idx/qd_idx is the same */
	tmp_sec = sh->sector;
	if (!sector_div(tmp_sec, conf->chunk_sectors))
		return;
	head_sector = sh->sector - STRIPE_SECTORS;

753 hash = stripe_hash_locks_hash(head_sector);
754 spin_lock_irq(conf->hash_locks + hash);
755 head = __find_stripe(conf, head_sector, conf->generation);
756 if (head && !atomic_inc_not_zero(&head->count)) {
757 spin_lock(&conf->device_lock);
758 if (!atomic_read(&head->count)) {
759 if (!test_bit(STRIPE_HANDLE, &head->state))
760 atomic_inc(&conf->active_stripes);
761 BUG_ON(list_empty(&head->lru) &&
762 !test_bit(STRIPE_EXPANDING, &head->state));
763 inc_empty_inactive_list_flag = 0;
764 if (!list_empty(conf->inactive_list + hash))
765 inc_empty_inactive_list_flag = 1;
766 list_del_init(&head->lru);
767 if (list_empty(conf->inactive_list + hash) && inc_empty_inactive_list_flag)
768 atomic_inc(&conf->empty_inactive_list_nr);
769 if (head->group) {
770 head->group->stripes_cnt--;
771 head->group = NULL;
772 }
773 }
774 atomic_inc(&head->count);
775 spin_unlock(&conf->device_lock);
776 }
777 spin_unlock_irq(conf->hash_locks + hash);
778
779 if (!head)
780 return;
781 if (!stripe_can_batch(head))
782 goto out;
783
784 lock_two_stripes(head, sh);
785
786 if (!stripe_can_batch(head) || !stripe_can_batch(sh))
787 goto unlock_out;
788
789 if (sh->batch_head)
790 goto unlock_out;
791
792 dd_idx = 0;
793 while (dd_idx == sh->pd_idx || dd_idx == sh->qd_idx)
794 dd_idx++;
795 if (head->dev[dd_idx].towrite->bi_opf != sh->dev[dd_idx].towrite->bi_opf ||
796 bio_op(head->dev[dd_idx].towrite) != bio_op(sh->dev[dd_idx].towrite))
797 goto unlock_out;
798
	if (head->batch_head) {
		spin_lock(&head->batch_head->batch_lock);
		/* This batch list is already running */
		if (!stripe_can_batch(head)) {
			spin_unlock(&head->batch_head->batch_lock);
			goto unlock_out;
		}
		/*
		 * We must assign batch_head of this stripe within the
		 * batch_lock, otherwise clear_batch_ready of batch head
		 * stripe could clear BATCH_READY bit of this stripe if
		 * this stripe is added to a batch right before
		 * clear_batch_ready is called
		 */
		sh->batch_head = head->batch_head;

		/*
		 * at this point, head's BATCH_READY could be cleared, but we
		 * can still add the stripe to batch list
		 */
		list_add(&sh->batch_list, &head->batch_list);
		spin_unlock(&head->batch_head->batch_lock);
	} else {
		head->batch_head = head;
		sh->batch_head = head->batch_head;
		spin_lock(&head->batch_lock);
		list_add_tail(&sh->batch_list, &head->batch_list);
		spin_unlock(&head->batch_lock);
	}
828
829 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
830 if (atomic_dec_return(&conf->preread_active_stripes)
831 < IO_THRESHOLD)
832 md_wakeup_thread(conf->mddev->thread);
833
834 if (test_and_clear_bit(STRIPE_BIT_DELAY, &sh->state)) {
835 int seq = sh->bm_seq;
836 if (test_bit(STRIPE_BIT_DELAY, &sh->batch_head->state) &&
837 sh->batch_head->bm_seq > seq)
838 seq = sh->batch_head->bm_seq;
839 set_bit(STRIPE_BIT_DELAY, &sh->batch_head->state);
840 sh->batch_head->bm_seq = seq;
841 }
842
843 atomic_inc(&sh->count);
844unlock_out:
845 unlock_two_stripes(head, sh);
846out:
847 raid5_release_stripe(head);
848}
/*
 * Determine if 'data_offset' or 'new_data_offset' should be used
 * in this stripe_head.
 */
static int use_new_offset(struct r5conf *conf, struct stripe_head *sh)
{
	sector_t progress = conf->reshape_progress;
	/*
	 * Need a memory barrier to make sure we see the value of
	 * conf->generation, or ->data_offset, that was set before
	 * reshape_progress was updated.
	 */
	smp_rmb();
	if (progress == MaxSector)
		return 0;
	if (sh->generation == conf->generation - 1)
		return 0;
	/*
	 * The stripe belongs to the current generation of an in-progress
	 * reshape, so the new data offset applies.
	 */
	return 1;
}
870
871static void dispatch_bio_list(struct bio_list *tmp)
872{
873 struct bio *bio;
874
875 while ((bio = bio_list_pop(tmp)))
876 generic_make_request(bio);
877}
878
879static int cmp_stripe(void *priv, struct list_head *a, struct list_head *b)
880{
881 const struct r5pending_data *da = list_entry(a,
882 struct r5pending_data, sibling);
883 const struct r5pending_data *db = list_entry(b,
884 struct r5pending_data, sibling);
885 if (da->sector > db->sector)
886 return 1;
887 if (da->sector < db->sector)
888 return -1;
889 return 0;
890}
891
892static void dispatch_defer_bios(struct r5conf *conf, int target,
893 struct bio_list *list)
894{
895 struct r5pending_data *data;
896 struct list_head *first, *next = NULL;
897 int cnt = 0;
898
899 if (conf->pending_data_cnt == 0)
900 return;
901
902 list_sort(NULL, &conf->pending_list, cmp_stripe);
903
904 first = conf->pending_list.next;
905
906
907 if (conf->next_pending_data)
908 list_move_tail(&conf->pending_list,
909 &conf->next_pending_data->sibling);
910
911 while (!list_empty(&conf->pending_list)) {
912 data = list_first_entry(&conf->pending_list,
913 struct r5pending_data, sibling);
914 if (&data->sibling == first)
915 first = data->sibling.next;
916 next = data->sibling.next;
917
918 bio_list_merge(list, &data->bios);
919 list_move(&data->sibling, &conf->free_list);
920 cnt++;
921 if (cnt >= target)
922 break;
923 }
924 conf->pending_data_cnt -= cnt;
925 BUG_ON(conf->pending_data_cnt < 0 || cnt < target);
926
927 if (next != &conf->pending_list)
928 conf->next_pending_data = list_entry(next,
929 struct r5pending_data, sibling);
930 else
931 conf->next_pending_data = NULL;
932
933 if (first != &conf->pending_list)
934 list_move_tail(&conf->pending_list, first);
935}
936
937static void flush_deferred_bios(struct r5conf *conf)
938{
939 struct bio_list tmp = BIO_EMPTY_LIST;
940
941 if (conf->pending_data_cnt == 0)
942 return;
943
944 spin_lock(&conf->pending_bios_lock);
945 dispatch_defer_bios(conf, conf->pending_data_cnt, &tmp);
946 BUG_ON(conf->pending_data_cnt != 0);
947 spin_unlock(&conf->pending_bios_lock);
948
949 dispatch_bio_list(&tmp);
950}
951
952static void defer_issue_bios(struct r5conf *conf, sector_t sector,
953 struct bio_list *bios)
954{
955 struct bio_list tmp = BIO_EMPTY_LIST;
956 struct r5pending_data *ent;
957
958 spin_lock(&conf->pending_bios_lock);
959 ent = list_first_entry(&conf->free_list, struct r5pending_data,
960 sibling);
961 list_move_tail(&ent->sibling, &conf->pending_list);
962 ent->sector = sector;
963 bio_list_init(&ent->bios);
964 bio_list_merge(&ent->bios, bios);
965 conf->pending_data_cnt++;
966 if (conf->pending_data_cnt >= PENDING_IO_MAX)
967 dispatch_defer_bios(conf, PENDING_IO_ONE_FLUSH, &tmp);
968
969 spin_unlock(&conf->pending_bios_lock);
970
971 dispatch_bio_list(&tmp);
972}
973
974static void
975raid5_end_read_request(struct bio *bi);
976static void
977raid5_end_write_request(struct bio *bi);
978
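/*
 * Issue the per-device bios that were prepared for this stripe (reads,
 * writes, discards and replacement writes), honouring bad-block ranges.
 * Writes may be gathered on a list and deferred through defer_issue_bios()
 * when batched dispatch is enabled.
 */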
979static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
980{
981 struct r5conf *conf = sh->raid_conf;
982 int i, disks = sh->disks;
983 struct stripe_head *head_sh = sh;
984 struct bio_list pending_bios = BIO_EMPTY_LIST;
985 bool should_defer;
986
987 might_sleep();
988
989 if (log_stripe(sh, s) == 0)
990 return;
991
992 should_defer = conf->batch_bio_dispatch && conf->group_cnt;
993
994 for (i = disks; i--; ) {
995 int op, op_flags = 0;
996 int replace_only = 0;
997 struct bio *bi, *rbi;
998 struct md_rdev *rdev, *rrdev = NULL;
999
1000 sh = head_sh;
1001 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
1002 op = REQ_OP_WRITE;
1003 if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags))
1004 op_flags = REQ_FUA;
1005 if (test_bit(R5_Discard, &sh->dev[i].flags))
1006 op = REQ_OP_DISCARD;
1007 } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
1008 op = REQ_OP_READ;
1009 else if (test_and_clear_bit(R5_WantReplace,
1010 &sh->dev[i].flags)) {
1011 op = REQ_OP_WRITE;
1012 replace_only = 1;
1013 } else
1014 continue;
1015 if (test_and_clear_bit(R5_SyncIO, &sh->dev[i].flags))
1016 op_flags |= REQ_SYNC;
1017
1018again:
1019 bi = &sh->dev[i].req;
1020 rbi = &sh->dev[i].rreq;
1021
1022 rcu_read_lock();
1023 rrdev = rcu_dereference(conf->disks[i].replacement);
1024 smp_mb();
1025 rdev = rcu_dereference(conf->disks[i].rdev);
1026 if (!rdev) {
1027 rdev = rrdev;
1028 rrdev = NULL;
1029 }
		if (op_is_write(op)) {
			if (replace_only)
				rdev = NULL;
			if (rdev == rrdev)
				/* We raced and saw duplicates */
				rrdev = NULL;
		} else {
1037 if (test_bit(R5_ReadRepl, &head_sh->dev[i].flags) && rrdev)
1038 rdev = rrdev;
1039 rrdev = NULL;
1040 }
1041
1042 if (rdev && test_bit(Faulty, &rdev->flags))
1043 rdev = NULL;
1044 if (rdev)
1045 atomic_inc(&rdev->nr_pending);
1046 if (rrdev && test_bit(Faulty, &rrdev->flags))
1047 rrdev = NULL;
1048 if (rrdev)
1049 atomic_inc(&rrdev->nr_pending);
1050 rcu_read_unlock();

		/*
		 * We have already checked bad blocks for reads.  Now
		 * need to check for writes.  We never accept write errors
		 * on the replacement, so we don't need to check rrdev.
		 */
		while (op_is_write(op) && rdev &&
		       test_bit(WriteErrorSeen, &rdev->flags)) {
			sector_t first_bad;
			int bad_sectors;
			int bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS,
					      &first_bad, &bad_sectors);
			if (!bad)
				break;

			if (bad < 0) {
				set_bit(BlockedBadBlocks, &rdev->flags);
				if (!conf->mddev->external &&
				    conf->mddev->sb_flags) {
					/*
					 * It is very unlikely, but we might
					 * still need to write out the
					 * bad block log - better give it
					 * a chance.
					 */
					md_check_recovery(conf->mddev);
				}
				/*
				 * Because md_wait_for_blocked_rdev
				 * will dec nr_pending, we must
				 * increment it first.
				 */
				atomic_inc(&rdev->nr_pending);
				md_wait_for_blocked_rdev(rdev, conf->mddev);
			} else {
				/* Acknowledged bad block - skip the write */
				rdev_dec_pending(rdev, conf->mddev);
				rdev = NULL;
			}
		}
1088
1089 if (rdev) {
1090 if (s->syncing || s->expanding || s->expanded
1091 || s->replacing)
1092 md_sync_acct(rdev->bdev, STRIPE_SECTORS);
1093
1094 set_bit(STRIPE_IO_STARTED, &sh->state);
1095
1096 bio_set_dev(bi, rdev->bdev);
1097 bio_set_op_attrs(bi, op, op_flags);
1098 bi->bi_end_io = op_is_write(op)
1099 ? raid5_end_write_request
1100 : raid5_end_read_request;
1101 bi->bi_private = sh;
1102
1103 pr_debug("%s: for %llu schedule op %d on disc %d\n",
1104 __func__, (unsigned long long)sh->sector,
1105 bi->bi_opf, i);
1106 atomic_inc(&sh->count);
1107 if (sh != head_sh)
1108 atomic_inc(&head_sh->count);
1109 if (use_new_offset(conf, sh))
1110 bi->bi_iter.bi_sector = (sh->sector
1111 + rdev->new_data_offset);
1112 else
1113 bi->bi_iter.bi_sector = (sh->sector
1114 + rdev->data_offset);
1115 if (test_bit(R5_ReadNoMerge, &head_sh->dev[i].flags))
1116 bi->bi_opf |= REQ_NOMERGE;
1117
1118 if (test_bit(R5_SkipCopy, &sh->dev[i].flags))
1119 WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
1120
			if (!op_is_write(op) &&
			    test_bit(R5_InJournal, &sh->dev[i].flags))
				/*
				 * issuing read for a page in journal, this
				 * must be preparing for prexor in rmw; read
				 * the data into orig_page
				 */
				sh->dev[i].vec.bv_page = sh->dev[i].orig_page;
			else
				sh->dev[i].vec.bv_page = sh->dev[i].page;
1131 bi->bi_vcnt = 1;
1132 bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
1133 bi->bi_io_vec[0].bv_offset = 0;
1134 bi->bi_iter.bi_size = STRIPE_SIZE;
1135 bi->bi_write_hint = sh->dev[i].write_hint;
1136 if (!rrdev)
1137 sh->dev[i].write_hint = RWF_WRITE_LIFE_NOT_SET;
			/*
			 * If this is a discard request, set bi_vcnt 0. We
			 * don't want to confuse SCSI because SCSI will
			 * replace the payload.
			 */
			if (op == REQ_OP_DISCARD)
				bi->bi_vcnt = 0;
1144 if (rrdev)
1145 set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags);
1146
1147 if (conf->mddev->gendisk)
1148 trace_block_bio_remap(bi->bi_disk->queue,
1149 bi, disk_devt(conf->mddev->gendisk),
1150 sh->dev[i].sector);
1151 if (should_defer && op_is_write(op))
1152 bio_list_add(&pending_bios, bi);
1153 else
1154 generic_make_request(bi);
1155 }
1156 if (rrdev) {
1157 if (s->syncing || s->expanding || s->expanded
1158 || s->replacing)
1159 md_sync_acct(rrdev->bdev, STRIPE_SECTORS);
1160
1161 set_bit(STRIPE_IO_STARTED, &sh->state);
1162
1163 bio_set_dev(rbi, rrdev->bdev);
1164 bio_set_op_attrs(rbi, op, op_flags);
1165 BUG_ON(!op_is_write(op));
1166 rbi->bi_end_io = raid5_end_write_request;
1167 rbi->bi_private = sh;
1168
1169 pr_debug("%s: for %llu schedule op %d on "
1170 "replacement disc %d\n",
1171 __func__, (unsigned long long)sh->sector,
1172 rbi->bi_opf, i);
1173 atomic_inc(&sh->count);
1174 if (sh != head_sh)
1175 atomic_inc(&head_sh->count);
1176 if (use_new_offset(conf, sh))
1177 rbi->bi_iter.bi_sector = (sh->sector
1178 + rrdev->new_data_offset);
1179 else
1180 rbi->bi_iter.bi_sector = (sh->sector
1181 + rrdev->data_offset);
1182 if (test_bit(R5_SkipCopy, &sh->dev[i].flags))
1183 WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
1184 sh->dev[i].rvec.bv_page = sh->dev[i].page;
1185 rbi->bi_vcnt = 1;
1186 rbi->bi_io_vec[0].bv_len = STRIPE_SIZE;
1187 rbi->bi_io_vec[0].bv_offset = 0;
1188 rbi->bi_iter.bi_size = STRIPE_SIZE;
1189 rbi->bi_write_hint = sh->dev[i].write_hint;
1190 sh->dev[i].write_hint = RWF_WRITE_LIFE_NOT_SET;
1191
1192
1193
1194
1195 if (op == REQ_OP_DISCARD)
1196 rbi->bi_vcnt = 0;
1197 if (conf->mddev->gendisk)
1198 trace_block_bio_remap(rbi->bi_disk->queue,
1199 rbi, disk_devt(conf->mddev->gendisk),
1200 sh->dev[i].sector);
1201 if (should_defer && op_is_write(op))
1202 bio_list_add(&pending_bios, rbi);
1203 else
1204 generic_make_request(rbi);
1205 }
1206 if (!rdev && !rrdev) {
1207 if (op_is_write(op))
1208 set_bit(STRIPE_DEGRADED, &sh->state);
1209 pr_debug("skip op %d on disc %d for sector %llu\n",
1210 bi->bi_opf, i, (unsigned long long)sh->sector);
1211 clear_bit(R5_LOCKED, &sh->dev[i].flags);
1212 set_bit(STRIPE_HANDLE, &sh->state);
1213 }
1214
1215 if (!head_sh->batch_head)
1216 continue;
1217 sh = list_first_entry(&sh->batch_list, struct stripe_head,
1218 batch_list);
1219 if (sh != head_sh)
1220 goto again;
1221 }
1222
1223 if (should_defer && !bio_list_empty(&pending_bios))
1224 defer_issue_bios(conf, head_sh->sector, &pending_bios);
1225}
1226
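/*
 * Copy data between a bio and a stripe page using the async_tx API,
 * clipping the copy to the STRIPE_SIZE window starting at @sector.
 * With skip_copy enabled, a full-page write may simply borrow the bio
 * page instead of copying (unless @no_skipcopy).
 */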
1227static struct dma_async_tx_descriptor *
1228async_copy_data(int frombio, struct bio *bio, struct page **page,
1229 sector_t sector, struct dma_async_tx_descriptor *tx,
1230 struct stripe_head *sh, int no_skipcopy)
1231{
1232 struct bio_vec bvl;
1233 struct bvec_iter iter;
1234 struct page *bio_page;
1235 int page_offset;
1236 struct async_submit_ctl submit;
1237 enum async_tx_flags flags = 0;
1238
1239 if (bio->bi_iter.bi_sector >= sector)
1240 page_offset = (signed)(bio->bi_iter.bi_sector - sector) * 512;
1241 else
1242 page_offset = (signed)(sector - bio->bi_iter.bi_sector) * -512;
1243
1244 if (frombio)
1245 flags |= ASYNC_TX_FENCE;
1246 init_async_submit(&submit, flags, tx, NULL, NULL, NULL);
1247
1248 bio_for_each_segment(bvl, bio, iter) {
1249 int len = bvl.bv_len;
1250 int clen;
1251 int b_offset = 0;
1252
1253 if (page_offset < 0) {
1254 b_offset = -page_offset;
1255 page_offset += b_offset;
1256 len -= b_offset;
1257 }
1258
1259 if (len > 0 && page_offset + len > STRIPE_SIZE)
1260 clen = STRIPE_SIZE - page_offset;
1261 else
1262 clen = len;
1263
1264 if (clen > 0) {
1265 b_offset += bvl.bv_offset;
1266 bio_page = bvl.bv_page;
1267 if (frombio) {
1268 if (sh->raid_conf->skip_copy &&
1269 b_offset == 0 && page_offset == 0 &&
1270 clen == STRIPE_SIZE &&
1271 !no_skipcopy)
1272 *page = bio_page;
1273 else
1274 tx = async_memcpy(*page, bio_page, page_offset,
1275 b_offset, clen, &submit);
1276 } else
1277 tx = async_memcpy(bio_page, *page, b_offset,
1278 page_offset, clen, &submit);
1279 }
1280
1281 submit.depend_tx = tx;
1282
1283 if (clen < len)
1284 break;
1285 page_offset += len;
1286 }
1287
1288 return tx;
1289}
1290
1291static void ops_complete_biofill(void *stripe_head_ref)
1292{
1293 struct stripe_head *sh = stripe_head_ref;
1294 int i;
1295
1296 pr_debug("%s: stripe %llu\n", __func__,
1297 (unsigned long long)sh->sector);
1298
1299
1300 for (i = sh->disks; i--; ) {
1301 struct r5dev *dev = &sh->dev[i];
1302
1303
1304
1305
1306
1307
1308 if (test_and_clear_bit(R5_Wantfill, &dev->flags)) {
1309 struct bio *rbi, *rbi2;
1310
1311 BUG_ON(!dev->read);
1312 rbi = dev->read;
1313 dev->read = NULL;
1314 while (rbi && rbi->bi_iter.bi_sector <
1315 dev->sector + STRIPE_SECTORS) {
1316 rbi2 = r5_next_bio(rbi, dev->sector);
1317 bio_endio(rbi);
1318 rbi = rbi2;
1319 }
1320 }
1321 }
1322 clear_bit(STRIPE_BIOFILL_RUN, &sh->state);
1323
1324 set_bit(STRIPE_HANDLE, &sh->state);
1325 raid5_release_stripe(sh);
1326}
1327
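/*
 * Satisfy pending read requests (R5_Wantfill) by asynchronously copying
 * data from the stripe cache pages into the waiting bios; completion is
 * handled in ops_complete_biofill().
 */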
1328static void ops_run_biofill(struct stripe_head *sh)
1329{
1330 struct dma_async_tx_descriptor *tx = NULL;
1331 struct async_submit_ctl submit;
1332 int i;
1333
1334 BUG_ON(sh->batch_head);
1335 pr_debug("%s: stripe %llu\n", __func__,
1336 (unsigned long long)sh->sector);
1337
1338 for (i = sh->disks; i--; ) {
1339 struct r5dev *dev = &sh->dev[i];
1340 if (test_bit(R5_Wantfill, &dev->flags)) {
1341 struct bio *rbi;
1342 spin_lock_irq(&sh->stripe_lock);
1343 dev->read = rbi = dev->toread;
1344 dev->toread = NULL;
1345 spin_unlock_irq(&sh->stripe_lock);
1346 while (rbi && rbi->bi_iter.bi_sector <
1347 dev->sector + STRIPE_SECTORS) {
1348 tx = async_copy_data(0, rbi, &dev->page,
1349 dev->sector, tx, sh, 0);
1350 rbi = r5_next_bio(rbi, dev->sector);
1351 }
1352 }
1353 }
1354
1355 atomic_inc(&sh->count);
1356 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_biofill, sh, NULL);
1357 async_trigger_callback(&submit);
1358}
1359
1360static void mark_target_uptodate(struct stripe_head *sh, int target)
1361{
1362 struct r5dev *tgt;
1363
1364 if (target < 0)
1365 return;
1366
1367 tgt = &sh->dev[target];
1368 set_bit(R5_UPTODATE, &tgt->flags);
1369 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
1370 clear_bit(R5_Wantcompute, &tgt->flags);
1371}
1372
1373static void ops_complete_compute(void *stripe_head_ref)
1374{
1375 struct stripe_head *sh = stripe_head_ref;
1376
1377 pr_debug("%s: stripe %llu\n", __func__,
1378 (unsigned long long)sh->sector);
1379
1380
1381 mark_target_uptodate(sh, sh->ops.target);
1382 mark_target_uptodate(sh, sh->ops.target2);
1383
1384 clear_bit(STRIPE_COMPUTE_RUN, &sh->state);
1385 if (sh->check_state == check_state_compute_run)
1386 sh->check_state = check_state_compute_result;
1387 set_bit(STRIPE_HANDLE, &sh->state);
1388 raid5_release_stripe(sh);
1389}
1390
1391
1392static struct page **to_addr_page(struct raid5_percpu *percpu, int i)
1393{
1394 return percpu->scribble + i * percpu->scribble_obj_size;
1395}
1396
1397
1398static addr_conv_t *to_addr_conv(struct stripe_head *sh,
1399 struct raid5_percpu *percpu, int i)
1400{
1401 return (void *) (to_addr_page(percpu, i) + sh->disks + 2);
1402}
1403
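/*
 * Recompute a single missing block for a RAID4/5 stripe by XOR-ing the
 * other devices' pages into the target page asynchronously.  The
 * completion callback marks the target R5_UPTODATE.
 */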
1404static struct dma_async_tx_descriptor *
1405ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
1406{
1407 int disks = sh->disks;
1408 struct page **xor_srcs = to_addr_page(percpu, 0);
1409 int target = sh->ops.target;
1410 struct r5dev *tgt = &sh->dev[target];
1411 struct page *xor_dest = tgt->page;
1412 int count = 0;
1413 struct dma_async_tx_descriptor *tx;
1414 struct async_submit_ctl submit;
1415 int i;
1416
1417 BUG_ON(sh->batch_head);
1418
1419 pr_debug("%s: stripe %llu block: %d\n",
1420 __func__, (unsigned long long)sh->sector, target);
1421 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
1422
1423 for (i = disks; i--; )
1424 if (i != target)
1425 xor_srcs[count++] = sh->dev[i].page;
1426
1427 atomic_inc(&sh->count);
1428
1429 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL,
1430 ops_complete_compute, sh, to_addr_conv(sh, percpu, 0));
1431 if (unlikely(count == 1))
1432 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
1433 else
1434 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
1435
1436 return tx;
1437}

/**
 * set_syndrome_sources - populate source buffers for gen_syndrome
 * @srcs - (struct page *) array of size sh->disks
 * @sh - stripe_head to parse
 * @srctype - SYNDROME_SRC_* type
 *
 * Populates srcs in proper layout order for the stripe and returns the
 * 'count' of sources to be used in a call to async_gen_syndrome. The P
 * destination buffer is recorded in srcs[count] and the Q destination
 * is recorded in srcs[count+1].
 */
static int set_syndrome_sources(struct page **srcs,
				struct stripe_head *sh,
				int srctype)
{
1452 int disks = sh->disks;
1453 int syndrome_disks = sh->ddf_layout ? disks : (disks - 2);
1454 int d0_idx = raid6_d0(sh);
1455 int count;
1456 int i;
1457
1458 for (i = 0; i < disks; i++)
1459 srcs[i] = NULL;
1460
1461 count = 0;
1462 i = d0_idx;
1463 do {
1464 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
1465 struct r5dev *dev = &sh->dev[i];
1466
1467 if (i == sh->qd_idx || i == sh->pd_idx ||
1468 (srctype == SYNDROME_SRC_ALL) ||
1469 (srctype == SYNDROME_SRC_WANT_DRAIN &&
1470 (test_bit(R5_Wantdrain, &dev->flags) ||
1471 test_bit(R5_InJournal, &dev->flags))) ||
1472 (srctype == SYNDROME_SRC_WRITTEN &&
1473 (dev->written ||
1474 test_bit(R5_InJournal, &dev->flags)))) {
1475 if (test_bit(R5_InJournal, &dev->flags))
1476 srcs[slot] = sh->dev[i].orig_page;
1477 else
1478 srcs[slot] = sh->dev[i].page;
1479 }
1480 i = raid6_next_disk(i, disks);
1481 } while (i != d0_idx);
1482
1483 return syndrome_disks;
1484}
1485
1486static struct dma_async_tx_descriptor *
1487ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
1488{
1489 int disks = sh->disks;
1490 struct page **blocks = to_addr_page(percpu, 0);
1491 int target;
1492 int qd_idx = sh->qd_idx;
1493 struct dma_async_tx_descriptor *tx;
1494 struct async_submit_ctl submit;
1495 struct r5dev *tgt;
1496 struct page *dest;
1497 int i;
1498 int count;
1499
1500 BUG_ON(sh->batch_head);
1501 if (sh->ops.target < 0)
1502 target = sh->ops.target2;
1503 else if (sh->ops.target2 < 0)
1504 target = sh->ops.target;
1505 else
1506
1507 BUG();
1508 BUG_ON(target < 0);
1509 pr_debug("%s: stripe %llu block: %d\n",
1510 __func__, (unsigned long long)sh->sector, target);
1511
1512 tgt = &sh->dev[target];
1513 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
1514 dest = tgt->page;
1515
1516 atomic_inc(&sh->count);
1517
1518 if (target == qd_idx) {
1519 count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL);
1520 blocks[count] = NULL;
1521 BUG_ON(blocks[count+1] != dest);
1522 init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
1523 ops_complete_compute, sh,
1524 to_addr_conv(sh, percpu, 0));
1525 tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
1526 } else {
1527
1528 count = 0;
1529 for (i = disks; i-- ; ) {
1530 if (i == target || i == qd_idx)
1531 continue;
1532 blocks[count++] = sh->dev[i].page;
1533 }
1534
1535 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
1536 NULL, ops_complete_compute, sh,
1537 to_addr_conv(sh, percpu, 0));
1538 tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit);
1539 }
1540
1541 return tx;
1542}
1543
1544static struct dma_async_tx_descriptor *
1545ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
1546{
1547 int i, count, disks = sh->disks;
1548 int syndrome_disks = sh->ddf_layout ? disks : disks-2;
1549 int d0_idx = raid6_d0(sh);
1550 int faila = -1, failb = -1;
1551 int target = sh->ops.target;
1552 int target2 = sh->ops.target2;
1553 struct r5dev *tgt = &sh->dev[target];
1554 struct r5dev *tgt2 = &sh->dev[target2];
1555 struct dma_async_tx_descriptor *tx;
1556 struct page **blocks = to_addr_page(percpu, 0);
1557 struct async_submit_ctl submit;
1558
1559 BUG_ON(sh->batch_head);
1560 pr_debug("%s: stripe %llu block1: %d block2: %d\n",
1561 __func__, (unsigned long long)sh->sector, target, target2);
1562 BUG_ON(target < 0 || target2 < 0);
1563 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
1564 BUG_ON(!test_bit(R5_Wantcompute, &tgt2->flags));
1565
1566
1567
1568
1569 for (i = 0; i < disks ; i++)
1570 blocks[i] = NULL;
1571 count = 0;
1572 i = d0_idx;
1573 do {
1574 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
1575
1576 blocks[slot] = sh->dev[i].page;
1577
1578 if (i == target)
1579 faila = slot;
1580 if (i == target2)
1581 failb = slot;
1582 i = raid6_next_disk(i, disks);
1583 } while (i != d0_idx);
1584
1585 BUG_ON(faila == failb);
1586 if (failb < faila)
1587 swap(faila, failb);
1588 pr_debug("%s: stripe: %llu faila: %d failb: %d\n",
1589 __func__, (unsigned long long)sh->sector, faila, failb);
1590
1591 atomic_inc(&sh->count);
1592
1593 if (failb == syndrome_disks+1) {
1594
1595 if (faila == syndrome_disks) {
1596
1597 init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
1598 ops_complete_compute, sh,
1599 to_addr_conv(sh, percpu, 0));
1600 return async_gen_syndrome(blocks, 0, syndrome_disks+2,
1601 STRIPE_SIZE, &submit);
1602 } else {
1603 struct page *dest;
1604 int data_target;
1605 int qd_idx = sh->qd_idx;
1606
1607
1608 if (target == qd_idx)
1609 data_target = target2;
1610 else
1611 data_target = target;
1612
1613 count = 0;
1614 for (i = disks; i-- ; ) {
1615 if (i == data_target || i == qd_idx)
1616 continue;
1617 blocks[count++] = sh->dev[i].page;
1618 }
1619 dest = sh->dev[data_target].page;
1620 init_async_submit(&submit,
1621 ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
1622 NULL, NULL, NULL,
1623 to_addr_conv(sh, percpu, 0));
1624 tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE,
1625 &submit);
1626
1627 count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL);
1628 init_async_submit(&submit, ASYNC_TX_FENCE, tx,
1629 ops_complete_compute, sh,
1630 to_addr_conv(sh, percpu, 0));
1631 return async_gen_syndrome(blocks, 0, count+2,
1632 STRIPE_SIZE, &submit);
1633 }
1634 } else {
1635 init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
1636 ops_complete_compute, sh,
1637 to_addr_conv(sh, percpu, 0));
1638 if (failb == syndrome_disks) {
1639
1640 return async_raid6_datap_recov(syndrome_disks+2,
1641 STRIPE_SIZE, faila,
1642 blocks, &submit);
1643 } else {
1644
1645 return async_raid6_2data_recov(syndrome_disks+2,
1646 STRIPE_SIZE, faila, failb,
1647 blocks, &submit);
1648 }
1649 }
1650}
1651
1652static void ops_complete_prexor(void *stripe_head_ref)
1653{
1654 struct stripe_head *sh = stripe_head_ref;
1655
1656 pr_debug("%s: stripe %llu\n", __func__,
1657 (unsigned long long)sh->sector);
1658
	if (r5c_is_writeback(sh->raid_conf->log))
		/*
		 * raid5-cache write back uses orig_page during prexor.
		 * After prexor, it is time to free orig_page
		 */
		r5c_release_extra_page(sh);
}
1666
1667static struct dma_async_tx_descriptor *
1668ops_run_prexor5(struct stripe_head *sh, struct raid5_percpu *percpu,
1669 struct dma_async_tx_descriptor *tx)
1670{
1671 int disks = sh->disks;
1672 struct page **xor_srcs = to_addr_page(percpu, 0);
1673 int count = 0, pd_idx = sh->pd_idx, i;
1674 struct async_submit_ctl submit;
1675
1676
1677 struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
1678
1679 BUG_ON(sh->batch_head);
1680 pr_debug("%s: stripe %llu\n", __func__,
1681 (unsigned long long)sh->sector);
1682
1683 for (i = disks; i--; ) {
1684 struct r5dev *dev = &sh->dev[i];
1685
1686 if (test_bit(R5_InJournal, &dev->flags))
1687 xor_srcs[count++] = dev->orig_page;
1688 else if (test_bit(R5_Wantdrain, &dev->flags))
1689 xor_srcs[count++] = dev->page;
1690 }
1691
1692 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
1693 ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0));
1694 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
1695
1696 return tx;
1697}
1698
1699static struct dma_async_tx_descriptor *
1700ops_run_prexor6(struct stripe_head *sh, struct raid5_percpu *percpu,
1701 struct dma_async_tx_descriptor *tx)
1702{
1703 struct page **blocks = to_addr_page(percpu, 0);
1704 int count;
1705 struct async_submit_ctl submit;
1706
1707 pr_debug("%s: stripe %llu\n", __func__,
1708 (unsigned long long)sh->sector);
1709
1710 count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_WANT_DRAIN);
1711
1712 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_PQ_XOR_DST, tx,
1713 ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0));
1714 tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
1715
1716 return tx;
1717}
1718
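/*
 * Drain queued write bios (dev->towrite) into the stripe cache pages for
 * every device marked R5_Wantdrain, walking the whole batch list when this
 * stripe is a batch head.
 */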
1719static struct dma_async_tx_descriptor *
1720ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
1721{
1722 struct r5conf *conf = sh->raid_conf;
1723 int disks = sh->disks;
1724 int i;
1725 struct stripe_head *head_sh = sh;
1726
1727 pr_debug("%s: stripe %llu\n", __func__,
1728 (unsigned long long)sh->sector);
1729
1730 for (i = disks; i--; ) {
1731 struct r5dev *dev;
1732 struct bio *chosen;
1733
1734 sh = head_sh;
1735 if (test_and_clear_bit(R5_Wantdrain, &head_sh->dev[i].flags)) {
1736 struct bio *wbi;
1737
1738again:
1739 dev = &sh->dev[i];
			/*
			 * clear R5_InJournal, so when rewriting a page in
			 * journal, it is not skipped by r5l_log_stripe()
			 */
			clear_bit(R5_InJournal, &dev->flags);
1745 spin_lock_irq(&sh->stripe_lock);
1746 chosen = dev->towrite;
1747 dev->towrite = NULL;
1748 sh->overwrite_disks = 0;
1749 BUG_ON(dev->written);
1750 wbi = dev->written = chosen;
1751 spin_unlock_irq(&sh->stripe_lock);
1752 WARN_ON(dev->page != dev->orig_page);
1753
1754 while (wbi && wbi->bi_iter.bi_sector <
1755 dev->sector + STRIPE_SECTORS) {
1756 if (wbi->bi_opf & REQ_FUA)
1757 set_bit(R5_WantFUA, &dev->flags);
1758 if (wbi->bi_opf & REQ_SYNC)
1759 set_bit(R5_SyncIO, &dev->flags);
1760 if (bio_op(wbi) == REQ_OP_DISCARD)
1761 set_bit(R5_Discard, &dev->flags);
1762 else {
1763 tx = async_copy_data(1, wbi, &dev->page,
1764 dev->sector, tx, sh,
1765 r5c_is_writeback(conf->log));
1766 if (dev->page != dev->orig_page &&
1767 !r5c_is_writeback(conf->log)) {
1768 set_bit(R5_SkipCopy, &dev->flags);
1769 clear_bit(R5_UPTODATE, &dev->flags);
1770 clear_bit(R5_OVERWRITE, &dev->flags);
1771 }
1772 }
1773 wbi = r5_next_bio(wbi, dev->sector);
1774 }
1775
1776 if (head_sh->batch_head) {
1777 sh = list_first_entry(&sh->batch_list,
1778 struct stripe_head,
1779 batch_list);
1780 if (sh == head_sh)
1781 continue;
1782 goto again;
1783 }
1784 }
1785 }
1786
1787 return tx;
1788}
1789
1790static void ops_complete_reconstruct(void *stripe_head_ref)
1791{
1792 struct stripe_head *sh = stripe_head_ref;
1793 int disks = sh->disks;
1794 int pd_idx = sh->pd_idx;
1795 int qd_idx = sh->qd_idx;
1796 int i;
1797 bool fua = false, sync = false, discard = false;
1798
1799 pr_debug("%s: stripe %llu\n", __func__,
1800 (unsigned long long)sh->sector);
1801
1802 for (i = disks; i--; ) {
1803 fua |= test_bit(R5_WantFUA, &sh->dev[i].flags);
1804 sync |= test_bit(R5_SyncIO, &sh->dev[i].flags);
1805 discard |= test_bit(R5_Discard, &sh->dev[i].flags);
1806 }
1807
1808 for (i = disks; i--; ) {
1809 struct r5dev *dev = &sh->dev[i];
1810
1811 if (dev->written || i == pd_idx || i == qd_idx) {
1812 if (!discard && !test_bit(R5_SkipCopy, &dev->flags)) {
1813 set_bit(R5_UPTODATE, &dev->flags);
1814 if (test_bit(STRIPE_EXPAND_READY, &sh->state))
1815 set_bit(R5_Expanded, &dev->flags);
1816 }
1817 if (fua)
1818 set_bit(R5_WantFUA, &dev->flags);
1819 if (sync)
1820 set_bit(R5_SyncIO, &dev->flags);
1821 }
1822 }
1823
1824 if (sh->reconstruct_state == reconstruct_state_drain_run)
1825 sh->reconstruct_state = reconstruct_state_drain_result;
1826 else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run)
1827 sh->reconstruct_state = reconstruct_state_prexor_drain_result;
1828 else {
1829 BUG_ON(sh->reconstruct_state != reconstruct_state_run);
1830 sh->reconstruct_state = reconstruct_state_result;
1831 }
1832
1833 set_bit(STRIPE_HANDLE, &sh->state);
1834 raid5_release_stripe(sh);
1835}
1836
1837static void
1838ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu,
1839 struct dma_async_tx_descriptor *tx)
1840{
1841 int disks = sh->disks;
1842 struct page **xor_srcs;
1843 struct async_submit_ctl submit;
1844 int count, pd_idx = sh->pd_idx, i;
1845 struct page *xor_dest;
1846 int prexor = 0;
1847 unsigned long flags;
1848 int j = 0;
1849 struct stripe_head *head_sh = sh;
1850 int last_stripe;
1851
1852 pr_debug("%s: stripe %llu\n", __func__,
1853 (unsigned long long)sh->sector);
1854
1855 for (i = 0; i < sh->disks; i++) {
1856 if (pd_idx == i)
1857 continue;
1858 if (!test_bit(R5_Discard, &sh->dev[i].flags))
1859 break;
1860 }
1861 if (i >= sh->disks) {
1862 atomic_inc(&sh->count);
1863 set_bit(R5_Discard, &sh->dev[pd_idx].flags);
1864 ops_complete_reconstruct(sh);
1865 return;
1866 }
1867again:
1868 count = 0;
1869 xor_srcs = to_addr_page(percpu, j);
	/*
	 * check if prexor is active, which means only process blocks
	 * that are part of a read-modify-write (written)
	 */
	if (head_sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
1874 prexor = 1;
1875 xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
1876 for (i = disks; i--; ) {
1877 struct r5dev *dev = &sh->dev[i];
1878 if (head_sh->dev[i].written ||
1879 test_bit(R5_InJournal, &head_sh->dev[i].flags))
1880 xor_srcs[count++] = dev->page;
1881 }
1882 } else {
1883 xor_dest = sh->dev[pd_idx].page;
1884 for (i = disks; i--; ) {
1885 struct r5dev *dev = &sh->dev[i];
1886 if (i != pd_idx)
1887 xor_srcs[count++] = dev->page;
1888 }
1889 }

	/*
	 * 1/ if we prexor'd then the dest is reused as a source
	 * 2/ if we did not prexor then we are redoing the parity
	 * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST
	 * for the synchronous xor case
	 */
	last_stripe = !head_sh->batch_head ||
1897 list_first_entry(&sh->batch_list,
1898 struct stripe_head, batch_list) == head_sh;
1899 if (last_stripe) {
1900 flags = ASYNC_TX_ACK |
1901 (prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST);
1902
1903 atomic_inc(&head_sh->count);
1904 init_async_submit(&submit, flags, tx, ops_complete_reconstruct, head_sh,
1905 to_addr_conv(sh, percpu, j));
1906 } else {
1907 flags = prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST;
1908 init_async_submit(&submit, flags, tx, NULL, NULL,
1909 to_addr_conv(sh, percpu, j));
1910 }
1911
1912 if (unlikely(count == 1))
1913 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
1914 else
1915 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
1916 if (!last_stripe) {
1917 j++;
1918 sh = list_first_entry(&sh->batch_list, struct stripe_head,
1919 batch_list);
1920 goto again;
1921 }
1922}
1923
1924static void
1925ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
1926 struct dma_async_tx_descriptor *tx)
1927{
1928 struct async_submit_ctl submit;
1929 struct page **blocks;
1930 int count, i, j = 0;
1931 struct stripe_head *head_sh = sh;
1932 int last_stripe;
1933 int synflags;
1934 unsigned long txflags;
1935
1936 pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
1937
1938 for (i = 0; i < sh->disks; i++) {
1939 if (sh->pd_idx == i || sh->qd_idx == i)
1940 continue;
1941 if (!test_bit(R5_Discard, &sh->dev[i].flags))
1942 break;
1943 }
1944 if (i >= sh->disks) {
1945 atomic_inc(&sh->count);
1946 set_bit(R5_Discard, &sh->dev[sh->pd_idx].flags);
1947 set_bit(R5_Discard, &sh->dev[sh->qd_idx].flags);
1948 ops_complete_reconstruct(sh);
1949 return;
1950 }
1951
1952again:
1953 blocks = to_addr_page(percpu, j);
1954
1955 if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
1956 synflags = SYNDROME_SRC_WRITTEN;
1957 txflags = ASYNC_TX_ACK | ASYNC_TX_PQ_XOR_DST;
1958 } else {
1959 synflags = SYNDROME_SRC_ALL;
1960 txflags = ASYNC_TX_ACK;
1961 }
1962
1963 count = set_syndrome_sources(blocks, sh, synflags);
1964 last_stripe = !head_sh->batch_head ||
1965 list_first_entry(&sh->batch_list,
1966 struct stripe_head, batch_list) == head_sh;
1967
1968 if (last_stripe) {
1969 atomic_inc(&head_sh->count);
1970 init_async_submit(&submit, txflags, tx, ops_complete_reconstruct,
1971 head_sh, to_addr_conv(sh, percpu, j));
1972 } else
1973 init_async_submit(&submit, 0, tx, NULL, NULL,
1974 to_addr_conv(sh, percpu, j));
1975 tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
1976 if (!last_stripe) {
1977 j++;
1978 sh = list_first_entry(&sh->batch_list, struct stripe_head,
1979 batch_list);
1980 goto again;
1981 }
1982}
1983
1984static void ops_complete_check(void *stripe_head_ref)
1985{
1986 struct stripe_head *sh = stripe_head_ref;
1987
1988 pr_debug("%s: stripe %llu\n", __func__,
1989 (unsigned long long)sh->sector);
1990
1991 sh->check_state = check_state_check_result;
1992 set_bit(STRIPE_HANDLE, &sh->state);
1993 raid5_release_stripe(sh);
1994}
1995
1996static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu)
1997{
1998 int disks = sh->disks;
1999 int pd_idx = sh->pd_idx;
2000 int qd_idx = sh->qd_idx;
2001 struct page *xor_dest;
2002 struct page **xor_srcs = to_addr_page(percpu, 0);
2003 struct dma_async_tx_descriptor *tx;
2004 struct async_submit_ctl submit;
2005 int count;
2006 int i;
2007
2008 pr_debug("%s: stripe %llu\n", __func__,
2009 (unsigned long long)sh->sector);
2010
2011 BUG_ON(sh->batch_head);
2012 count = 0;
2013 xor_dest = sh->dev[pd_idx].page;
2014 xor_srcs[count++] = xor_dest;
2015 for (i = disks; i--; ) {
2016 if (i == pd_idx || i == qd_idx)
2017 continue;
2018 xor_srcs[count++] = sh->dev[i].page;
2019 }
2020
2021 init_async_submit(&submit, 0, NULL, NULL, NULL,
2022 to_addr_conv(sh, percpu, 0));
2023 tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
2024 &sh->ops.zero_sum_result, &submit);
2025
2026 atomic_inc(&sh->count);
2027 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_check, sh, NULL);
2028 tx = async_trigger_callback(&submit);
2029}
2030
2031static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp)
2032{
2033 struct page **srcs = to_addr_page(percpu, 0);
2034 struct async_submit_ctl submit;
2035 int count;
2036
2037 pr_debug("%s: stripe %llu checkp: %d\n", __func__,
2038 (unsigned long long)sh->sector, checkp);
2039
2040 BUG_ON(sh->batch_head);
2041 count = set_syndrome_sources(srcs, sh, SYNDROME_SRC_ALL);
2042 if (!checkp)
2043 srcs[count] = NULL;
2044
2045 atomic_inc(&sh->count);
2046 init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check,
2047 sh, to_addr_conv(sh, percpu, 0));
2048 async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE,
2049 &sh->ops.zero_sum_result, percpu->spare_page, &submit);
2050}
2051
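/*
 * Run the asynchronous operations requested for this stripe (biofill,
 * compute, prexor, biodrain, reconstruct, check) on the per-cpu scribble
 * resources, chaining them through a single dma_async_tx_descriptor.
 */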
2052static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
2053{
2054 int overlap_clear = 0, i, disks = sh->disks;
2055 struct dma_async_tx_descriptor *tx = NULL;
2056 struct r5conf *conf = sh->raid_conf;
2057 int level = conf->level;
2058 struct raid5_percpu *percpu;
2059 unsigned long cpu;
2060
2061 cpu = get_cpu();
2062 percpu = per_cpu_ptr(conf->percpu, cpu);
2063 if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
2064 ops_run_biofill(sh);
2065 overlap_clear++;
2066 }
2067
2068 if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) {
2069 if (level < 6)
2070 tx = ops_run_compute5(sh, percpu);
2071 else {
2072 if (sh->ops.target2 < 0 || sh->ops.target < 0)
2073 tx = ops_run_compute6_1(sh, percpu);
2074 else
2075 tx = ops_run_compute6_2(sh, percpu);
2076 }
2077
2078 if (tx && !test_bit(STRIPE_OP_RECONSTRUCT, &ops_request))
2079 async_tx_ack(tx);
2080 }
2081
2082 if (test_bit(STRIPE_OP_PREXOR, &ops_request)) {
2083 if (level < 6)
2084 tx = ops_run_prexor5(sh, percpu, tx);
2085 else
2086 tx = ops_run_prexor6(sh, percpu, tx);
2087 }
2088
2089 if (test_bit(STRIPE_OP_PARTIAL_PARITY, &ops_request))
2090 tx = ops_run_partial_parity(sh, percpu, tx);
2091
2092 if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) {
2093 tx = ops_run_biodrain(sh, tx);
2094 overlap_clear++;
2095 }
2096
2097 if (test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) {
2098 if (level < 6)
2099 ops_run_reconstruct5(sh, percpu, tx);
2100 else
2101 ops_run_reconstruct6(sh, percpu, tx);
2102 }
2103
2104 if (test_bit(STRIPE_OP_CHECK, &ops_request)) {
2105 if (sh->check_state == check_state_run)
2106 ops_run_check_p(sh, percpu);
2107 else if (sh->check_state == check_state_run_q)
2108 ops_run_check_pq(sh, percpu, 0);
2109 else if (sh->check_state == check_state_run_pq)
2110 ops_run_check_pq(sh, percpu, 1);
2111 else
2112 BUG();
2113 }
2114
2115 if (overlap_clear && !sh->batch_head)
2116 for (i = disks; i--; ) {
2117 struct r5dev *dev = &sh->dev[i];
2118 if (test_and_clear_bit(R5_Overlap, &dev->flags))
2119 wake_up(&sh->raid_conf->wait_for_overlap);
2120 }
2121 put_cpu();
2122}
2123
2124static void free_stripe(struct kmem_cache *sc, struct stripe_head *sh)
2125{
2126 if (sh->ppl_page)
2127 __free_page(sh->ppl_page);
2128 kmem_cache_free(sc, sh);
2129}
2130
2131static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp,
2132 int disks, struct r5conf *conf)
2133{
2134 struct stripe_head *sh;
2135 int i;
2136
2137 sh = kmem_cache_zalloc(sc, gfp);
2138 if (sh) {
2139 spin_lock_init(&sh->stripe_lock);
2140 spin_lock_init(&sh->batch_lock);
2141 INIT_LIST_HEAD(&sh->batch_list);
2142 INIT_LIST_HEAD(&sh->lru);
2143 INIT_LIST_HEAD(&sh->r5c);
2144 INIT_LIST_HEAD(&sh->log_list);
2145 atomic_set(&sh->count, 1);
2146 sh->raid_conf = conf;
2147 sh->log_start = MaxSector;
2148 for (i = 0; i < disks; i++) {
2149 struct r5dev *dev = &sh->dev[i];
2150
2151 bio_init(&dev->req, &dev->vec, 1);
2152 bio_init(&dev->rreq, &dev->rvec, 1);
2153 }
2154
2155 if (raid5_has_ppl(conf)) {
2156 sh->ppl_page = alloc_page(gfp);
2157 if (!sh->ppl_page) {
2158 free_stripe(sc, sh);
2159 sh = NULL;
2160 }
2161 }
2162 }
2163 return sh;
2164}

2165static int grow_one_stripe(struct r5conf *conf, gfp_t gfp)
2166{
2167 struct stripe_head *sh;
2168
2169 sh = alloc_stripe(conf->slab_cache, gfp, conf->pool_size, conf);
2170 if (!sh)
2171 return 0;
2172
2173 if (grow_buffers(sh, gfp)) {
2174 shrink_buffers(sh);
2175 free_stripe(conf->slab_cache, sh);
2176 return 0;
2177 }
2178 sh->hash_lock_index =
2179 conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS;
2180
2181 atomic_inc(&conf->active_stripes);
2182
2183 raid5_release_stripe(sh);
2184 conf->max_nr_stripes++;
2185 return 1;
2186}
2187
2188static int grow_stripes(struct r5conf *conf, int num)
2189{
2190 struct kmem_cache *sc;
2191 size_t namelen = sizeof(conf->cache_name[0]);
2192 int devs = max(conf->raid_disks, conf->previous_raid_disks);
2193
2194 if (conf->mddev->gendisk)
2195 snprintf(conf->cache_name[0], namelen,
2196 "raid%d-%s", conf->level, mdname(conf->mddev));
2197 else
2198 snprintf(conf->cache_name[0], namelen,
2199 "raid%d-%p", conf->level, conf->mddev);
2200 snprintf(conf->cache_name[1], namelen, "%.27s-alt", conf->cache_name[0]);
2201
2202 conf->active_name = 0;
2203 sc = kmem_cache_create(conf->cache_name[conf->active_name],
2204 sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev),
2205 0, 0, NULL);
2206 if (!sc)
2207 return 1;
2208 conf->slab_cache = sc;
2209 conf->pool_size = devs;
2210 while (num--)
2211 if (!grow_one_stripe(conf, GFP_KERNEL))
2212 return 1;
2213
2214 return 0;
2215}
2216
2217/*
2218 * scribble_alloc - (re)allocate the per-cpu scribble buffer used by the
2219 * async xor/syndrome routines.  Each of the 'cnt' regions must hold a
2220 * struct page pointer and an addr_conv_t slot for every device plus the
2221 * two parity blocks (hence the "num + 2" in both terms).
2222 */
2230static int scribble_alloc(struct raid5_percpu *percpu,
2231 int num, int cnt, gfp_t flags)
2232{
2233 size_t obj_size =
2234 sizeof(struct page *) * (num+2) +
2235 sizeof(addr_conv_t) * (num+2);
2236 void *scribble;
2237
2238 scribble = kvmalloc_array(cnt, obj_size, flags);
2239 if (!scribble)
2240 return -ENOMEM;
2241
2242 kvfree(percpu->scribble);
2243
2244 percpu->scribble = scribble;
2245 percpu->scribble_obj_size = obj_size;
2246 return 0;
2247}
2248
2249static int resize_chunks(struct r5conf *conf, int new_disks, int new_sectors)
2250{
2251 unsigned long cpu;
2252 int err = 0;
2253
2254	/*
2255	 * Never shrink: if the existing scribble buffers already cover the
2256	 * requested number of disks and sectors there is nothing to do.
2257	 */
2259 if (conf->scribble_disks >= new_disks &&
2260 conf->scribble_sectors >= new_sectors)
2261 return 0;
2262 mddev_suspend(conf->mddev);
2263 get_online_cpus();
2264
2265 for_each_present_cpu(cpu) {
2266 struct raid5_percpu *percpu;
2267
2268 percpu = per_cpu_ptr(conf->percpu, cpu);
2269 err = scribble_alloc(percpu, new_disks,
2270 new_sectors / STRIPE_SECTORS,
2271 GFP_NOIO);
2272 if (err)
2273 break;
2274 }
2275
2276 put_online_cpus();
2277 mddev_resume(conf->mddev);
2278 if (!err) {
2279 conf->scribble_disks = new_disks;
2280 conf->scribble_sectors = new_sectors;
2281 }
2282 return err;
2283}
2284
2285static int resize_stripes(struct r5conf *conf, int newsize)
2286{
2287	/*
2288	 * Make the stripe cache able to hold 'newsize' devices per stripe_head.
2289	 * This happens in stages:
2290	 *  1/ create a new kmem_cache and allocate the required number of new,
2291	 *     larger stripe_heads from it;
2292	 *  2/ take old stripe_heads off the inactive lists one at a time, hand
2293	 *     their pages to the new stripe_heads and free the old ones;
2294	 *  3/ reallocate conf->disks to the new size;
2295	 *  4/ give the new stripe_heads pages for the newly added slots and
2296	 *     release them for service.
2297	 * On allocation failure -ENOMEM is returned.
2298	 */
2310 struct stripe_head *osh, *nsh;
2311 LIST_HEAD(newstripes);
2312 struct disk_info *ndisks;
2313 int err = 0;
2314 struct kmem_cache *sc;
2315 int i;
2316 int hash, cnt;
2317
2318 md_allow_write(conf->mddev);
2319
2320	/* Step 1: allocate new, larger stripe_heads from a fresh slab cache. */
2321 sc = kmem_cache_create(conf->cache_name[1-conf->active_name],
2322 sizeof(struct stripe_head)+(newsize-1)*sizeof(struct r5dev),
2323 0, 0, NULL);
2324 if (!sc)
2325 return -ENOMEM;
2326
2327	/* keep automatic cache resizing from interfering while we work */
2328 mutex_lock(&conf->cache_size_mutex);
2329
2330 for (i = conf->max_nr_stripes; i; i--) {
2331 nsh = alloc_stripe(sc, GFP_KERNEL, newsize, conf);
2332 if (!nsh)
2333 break;
2334
2335 list_add(&nsh->lru, &newstripes);
2336 }
2337 if (i) {
2338		/* didn't get enough new stripe_heads - free what we got and give up */
2339 while (!list_empty(&newstripes)) {
2340 nsh = list_entry(newstripes.next, struct stripe_head, lru);
2341 list_del(&nsh->lru);
2342 free_stripe(sc, nsh);
2343 }
2344 kmem_cache_destroy(sc);
2345 mutex_unlock(&conf->cache_size_mutex);
2346 return -ENOMEM;
2347 }
2348
2349	/* Step 2: transfer pages from old stripe_heads (taken off the inactive
2350	 * lists) to the new ones, freeing the old stripe_heads as we go.
2351	 */
2352 hash = 0;
2353 cnt = 0;
2354 list_for_each_entry(nsh, &newstripes, lru) {
2355 lock_device_hash_lock(conf, hash);
2356 wait_event_cmd(conf->wait_for_stripe,
2357 !list_empty(conf->inactive_list + hash),
2358 unlock_device_hash_lock(conf, hash),
2359 lock_device_hash_lock(conf, hash));
2360 osh = get_free_stripe(conf, hash);
2361 unlock_device_hash_lock(conf, hash);
2362
2363 for(i=0; i<conf->pool_size; i++) {
2364 nsh->dev[i].page = osh->dev[i].page;
2365 nsh->dev[i].orig_page = osh->dev[i].page;
2366 }
2367 nsh->hash_lock_index = hash;
2368 free_stripe(conf->slab_cache, osh);
2369 cnt++;
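		/* distribute the stripes evenly over the hash-lock lists: the
		 * first (max_nr_stripes % NR_STRIPE_HASH_LOCKS) lists each get
		 * one extra stripe
		 */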
2370 if (cnt >= conf->max_nr_stripes / NR_STRIPE_HASH_LOCKS +
2371 !!((conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS) > hash)) {
2372 hash++;
2373 cnt = 0;
2374 }
2375 }
2376 kmem_cache_destroy(conf->slab_cache);
2377
2378	/* Step 3: reallocate conf->disks to hold 'newsize' entries; existing
2379	 * entries are copied across and each new slot gets an extra_page.
2380	 */
2383 ndisks = kcalloc(newsize, sizeof(struct disk_info), GFP_NOIO);
2384 if (ndisks) {
2385 for (i = 0; i < conf->pool_size; i++)
2386 ndisks[i] = conf->disks[i];
2387
2388 for (i = conf->pool_size; i < newsize; i++) {
2389 ndisks[i].extra_page = alloc_page(GFP_NOIO);
2390 if (!ndisks[i].extra_page)
2391 err = -ENOMEM;
2392 }
2393
2394 if (err) {
2395 for (i = conf->pool_size; i < newsize; i++)
2396 if (ndisks[i].extra_page)
2397 put_page(ndisks[i].extra_page);
2398 kfree(ndisks);
2399 } else {
2400 kfree(conf->disks);
2401 conf->disks = ndisks;
2402 }
2403 } else
2404 err = -ENOMEM;
2405
2406 mutex_unlock(&conf->cache_size_mutex);
2407
2408 conf->slab_cache = sc;
2409 conf->active_name = 1-conf->active_name;
2410	/* Step 4: give the new stripe_heads pages for the newly added slots
2411	 * and return them to service. */
2412 while(!list_empty(&newstripes)) {
2413 nsh = list_entry(newstripes.next, struct stripe_head, lru);
2414 list_del_init(&nsh->lru);
2415
2416 for (i=conf->raid_disks; i < newsize; i++)
2417 if (nsh->dev[i].page == NULL) {
2418 struct page *p = alloc_page(GFP_NOIO);
2419 nsh->dev[i].page = p;
2420 nsh->dev[i].orig_page = p;
2421 if (!p)
2422 err = -ENOMEM;
2423 }
2424 raid5_release_stripe(nsh);
2425 }
2426
2427
2428 if (!err)
2429 conf->pool_size = newsize;
2430 return err;
2431}
2432
2433static int drop_one_stripe(struct r5conf *conf)
2434{
2435 struct stripe_head *sh;
2436 int hash = (conf->max_nr_stripes - 1) & STRIPE_HASH_LOCKS_MASK;
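	/* shrink from the hash list that the most recently grown stripe was
	 * assigned to (the inverse of grow_one_stripe())
	 */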
2437
2438 spin_lock_irq(conf->hash_locks + hash);
2439 sh = get_free_stripe(conf, hash);
2440 spin_unlock_irq(conf->hash_locks + hash);
2441 if (!sh)
2442 return 0;
2443 BUG_ON(atomic_read(&sh->count));
2444 shrink_buffers(sh);
2445 free_stripe(conf->slab_cache, sh);
2446 atomic_dec(&conf->active_stripes);
2447 conf->max_nr_stripes--;
2448 return 1;
2449}
2450
2451static void shrink_stripes(struct r5conf *conf)
2452{
2453 while (conf->max_nr_stripes &&
2454 drop_one_stripe(conf))
2455 ;
2456
2457 kmem_cache_destroy(conf->slab_cache);
2458 conf->slab_cache = NULL;
2459}
2460
2461static void raid5_end_read_request(struct bio * bi)
2462{
2463 struct stripe_head *sh = bi->bi_private;
2464 struct r5conf *conf = sh->raid_conf;
2465 int disks = sh->disks, i;
2466 char b[BDEVNAME_SIZE];
2467 struct md_rdev *rdev = NULL;
2468 sector_t s;
2469
2470 for (i=0 ; i<disks; i++)
2471 if (bi == &sh->dev[i].req)
2472 break;
2473
2474 pr_debug("end_read_request %llu/%d, count: %d, error %d.\n",
2475 (unsigned long long)sh->sector, i, atomic_read(&sh->count),
2476 bi->bi_status);
2477 if (i == disks) {
2478 bio_reset(bi);
2479 BUG();
2480 return;
2481 }
2482 if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
2483		/*
2484		 * The read was issued to the replacement device, so account the
2485		 * completion against it; if the replacement has meanwhile been
2486		 * removed, fall back to the original rdev.
2487		 */
2488 rdev = conf->disks[i].replacement;
2489 if (!rdev)
2490 rdev = conf->disks[i].rdev;
2491
2492 if (use_new_offset(conf, sh))
2493 s = sh->sector + rdev->new_data_offset;
2494 else
2495 s = sh->sector + rdev->data_offset;
2496 if (!bi->bi_status) {
2497 set_bit(R5_UPTODATE, &sh->dev[i].flags);
2498 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
2499			/*
2500			 * A block that previously failed to read has now been
2501			 * read successfully, so report the error as corrected.
2502			 */
2503 pr_info_ratelimited(
2504 "md/raid:%s: read error corrected (%lu sectors at %llu on %s)\n",
2505 mdname(conf->mddev), STRIPE_SECTORS,
2506 (unsigned long long)s,
2507 bdevname(rdev->bdev, b));
2508 atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
2509 clear_bit(R5_ReadError, &sh->dev[i].flags);
2510 clear_bit(R5_ReWrite, &sh->dev[i].flags);
2511 } else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
2512 clear_bit(R5_ReadNoMerge, &sh->dev[i].flags);
2513
2514 if (test_bit(R5_InJournal, &sh->dev[i].flags))
2515			/* This read fetched the on-disk (pre-journal) copy of the
2516			 * block, so orig_page can now serve as the prexor source
2517			 * for a later read-modify-write.
2518			 */
2519 set_bit(R5_OrigPageUPTDODATE, &sh->dev[i].flags);
2520
2521 if (atomic_read(&rdev->read_errors))
2522 atomic_set(&rdev->read_errors, 0);
2523 } else {
2524 const char *bdn = bdevname(rdev->bdev, b);
2525 int retry = 0;
2526 int set_bad = 0;
2527
2528 clear_bit(R5_UPTODATE, &sh->dev[i].flags);
2529 atomic_inc(&rdev->read_errors);
2530 if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
2531 pr_warn_ratelimited(
2532 "md/raid:%s: read error on replacement device (sector %llu on %s).\n",
2533 mdname(conf->mddev),
2534 (unsigned long long)s,
2535 bdn);
2536 else if (conf->mddev->degraded >= conf->max_degraded) {
2537 set_bad = 1;
2538 pr_warn_ratelimited(
2539 "md/raid:%s: read error not correctable (sector %llu on %s).\n",
2540 mdname(conf->mddev),
2541 (unsigned long long)s,
2542 bdn);
2543 } else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) {
2544
2545 set_bad = 1;
2546 pr_warn_ratelimited(
2547 "md/raid:%s: read error NOT corrected!! (sector %llu on %s).\n",
2548 mdname(conf->mddev),
2549 (unsigned long long)s,
2550 bdn);
2551 } else if (atomic_read(&rdev->read_errors)
2552 > conf->max_nr_stripes)
2553 pr_warn("md/raid:%s: Too many read errors, failing device %s.\n",
2554 mdname(conf->mddev), bdn);
2555 else
2556 retry = 1;
2557 if (set_bad && test_bit(In_sync, &rdev->flags)
2558 && !test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
2559 retry = 1;
2560 if (retry)
2561 if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) {
2562 set_bit(R5_ReadError, &sh->dev[i].flags);
2563 clear_bit(R5_ReadNoMerge, &sh->dev[i].flags);
2564 } else
2565 set_bit(R5_ReadNoMerge, &sh->dev[i].flags);
2566 else {
2567 clear_bit(R5_ReadError, &sh->dev[i].flags);
2568 clear_bit(R5_ReWrite, &sh->dev[i].flags);
2569 if (!(set_bad
2570 && test_bit(In_sync, &rdev->flags)
2571 && rdev_set_badblocks(
2572 rdev, sh->sector, STRIPE_SECTORS, 0)))
2573 md_error(conf->mddev, rdev);
2574 }
2575 }
2576 rdev_dec_pending(rdev, conf->mddev);
2577 bio_reset(bi);
2578 clear_bit(R5_LOCKED, &sh->dev[i].flags);
2579 set_bit(STRIPE_HANDLE, &sh->state);
2580 raid5_release_stripe(sh);
2581}
2582
2583static void raid5_end_write_request(struct bio *bi)
2584{
2585 struct stripe_head *sh = bi->bi_private;
2586 struct r5conf *conf = sh->raid_conf;
2587 int disks = sh->disks, i;
2588 struct md_rdev *uninitialized_var(rdev);
2589 sector_t first_bad;
2590 int bad_sectors;
2591 int replacement = 0;
2592
2593 for (i = 0 ; i < disks; i++) {
2594 if (bi == &sh->dev[i].req) {
2595 rdev = conf->disks[i].rdev;
2596 break;
2597 }
2598 if (bi == &sh->dev[i].rreq) {
2599 rdev = conf->disks[i].replacement;
2600 if (rdev)
2601 replacement = 1;
2602 else
2603				/* The replacement device has been removed, so it
2604				 * is safe to account this completion against the
2605				 * original rdev.
2606				 */
2607 rdev = conf->disks[i].rdev;
2608 break;
2609 }
2610 }
2611 pr_debug("end_write_request %llu/%d, count %d, error: %d.\n",
2612 (unsigned long long)sh->sector, i, atomic_read(&sh->count),
2613 bi->bi_status);
2614 if (i == disks) {
2615 bio_reset(bi);
2616 BUG();
2617 return;
2618 }
2619
2620 if (replacement) {
2621 if (bi->bi_status)
2622 md_error(conf->mddev, rdev);
2623 else if (is_badblock(rdev, sh->sector,
2624 STRIPE_SECTORS,
2625 &first_bad, &bad_sectors))
2626 set_bit(R5_MadeGoodRepl, &sh->dev[i].flags);
2627 } else {
2628 if (bi->bi_status) {
2629 set_bit(STRIPE_DEGRADED, &sh->state);
2630 set_bit(WriteErrorSeen, &rdev->flags);
2631 set_bit(R5_WriteError, &sh->dev[i].flags);
2632 if (!test_and_set_bit(WantReplacement, &rdev->flags))
2633 set_bit(MD_RECOVERY_NEEDED,
2634 &rdev->mddev->recovery);
2635 } else if (is_badblock(rdev, sh->sector,
2636 STRIPE_SECTORS,
2637 &first_bad, &bad_sectors)) {
2638 set_bit(R5_MadeGood, &sh->dev[i].flags);
2639 if (test_bit(R5_ReadError, &sh->dev[i].flags))
2640				/* The write succeeded, so it now looks as though
2641				 * a rewrite of the previously failed read has
2642				 * already been done.
2643				 */
2644 set_bit(R5_ReWrite, &sh->dev[i].flags);
2645 }
2646 }
2647 rdev_dec_pending(rdev, conf->mddev);
2648
2649 if (sh->batch_head && bi->bi_status && !replacement)
2650 set_bit(STRIPE_BATCH_ERR, &sh->batch_head->state);
2651
2652 bio_reset(bi);
2653 if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags))
2654 clear_bit(R5_LOCKED, &sh->dev[i].flags);
2655 set_bit(STRIPE_HANDLE, &sh->state);
2656 raid5_release_stripe(sh);
2657
2658 if (sh->batch_head && sh != sh->batch_head)
2659 raid5_release_stripe(sh->batch_head);
2660}
2661
2662static void raid5_error(struct mddev *mddev, struct md_rdev *rdev)
2663{
2664 char b[BDEVNAME_SIZE];
2665 struct r5conf *conf = mddev->private;
2666 unsigned long flags;
2667 pr_debug("raid456: error called\n");
2668
2669 spin_lock_irqsave(&conf->device_lock, flags);
2670
2671 if (test_bit(In_sync, &rdev->flags) &&
2672 mddev->degraded == conf->max_degraded) {
2673		/* Failing this in-sync device would push the array past
2674		 * max_degraded and lose data, so refuse to mark it Faulty.
2675		 */
2677 conf->recovery_disabled = mddev->recovery_disabled;
2678 spin_unlock_irqrestore(&conf->device_lock, flags);
2679 return;
2680 }
2681
2682 set_bit(Faulty, &rdev->flags);
2683 clear_bit(In_sync, &rdev->flags);
2684 mddev->degraded = raid5_calc_degraded(conf);
2685 spin_unlock_irqrestore(&conf->device_lock, flags);
2686 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
2687
2688 set_bit(Blocked, &rdev->flags);
2689 set_mask_bits(&mddev->sb_flags, 0,
2690 BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
2691 pr_crit("md/raid:%s: Disk failure on %s, disabling device.\n"
2692 "md/raid:%s: Operation continuing on %d devices.\n",
2693 mdname(mddev),
2694 bdevname(rdev->bdev, b),
2695 mdname(mddev),
2696 conf->raid_disks - mddev->degraded);
2697 r5c_update_on_rdev_error(mddev, rdev);
2698}
2699
2700
2701/*
2702 * Map an array-relative sector to its data disk (*dd_idx), the parity/Q
2703 * disk indexes (recorded in *sh) and the per-device sector number. */
2704sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector,
2705 int previous, int *dd_idx,
2706 struct stripe_head *sh)
2707{
2708 sector_t stripe, stripe2;
2709 sector_t chunk_number;
2710 unsigned int chunk_offset;
2711 int pd_idx, qd_idx;
2712 int ddf_layout = 0;
2713 sector_t new_sector;
2714 int algorithm = previous ? conf->prev_algo
2715 : conf->algorithm;
2716 int sectors_per_chunk = previous ? conf->prev_chunk_sectors
2717 : conf->chunk_sectors;
2718 int raid_disks = previous ? conf->previous_raid_disks
2719 : conf->raid_disks;
2720 int data_disks = raid_disks - conf->max_degraded;
2721
2722	/* First, split the array sector into its chunk number and the
2723	 * offset within that chunk. */
2727 chunk_offset = sector_div(r_sector, sectors_per_chunk);
2728 chunk_number = r_sector;
2729
2730	/* Then split the chunk number into the stripe number and the index
2731	 * of the data disk within the stripe. */
2733 stripe = chunk_number;
2734 *dd_idx = sector_div(stripe, data_disks);
2735 stripe2 = stripe;
2736
2737	/* Select the parity (and Q) disk according to the layout algorithm
2738	 * in use. */
2739 pd_idx = qd_idx = -1;
2740 switch(conf->level) {
2741 case 4:
2742 pd_idx = data_disks;
2743 break;
2744 case 5:
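		/* For example, with 5 devices and ALGORITHM_LEFT_ASYMMETRIC,
		 * stripe 0 places parity on device 4, stripe 1 on device 3,
		 * and so on, while the data blocks always fill the remaining
		 * devices counted upward from device 0.  The *_SYMMETRIC
		 * variants instead start the data rotation on the device just
		 * after the parity device.
		 */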
2745 switch (algorithm) {
2746 case ALGORITHM_LEFT_ASYMMETRIC:
2747 pd_idx = data_disks - sector_div(stripe2, raid_disks);
2748 if (*dd_idx >= pd_idx)
2749 (*dd_idx)++;
2750 break;
2751 case ALGORITHM_RIGHT_ASYMMETRIC:
2752 pd_idx = sector_div(stripe2, raid_disks);
2753 if (*dd_idx >= pd_idx)
2754 (*dd_idx)++;
2755 break;
2756 case ALGORITHM_LEFT_SYMMETRIC:
2757 pd_idx = data_disks - sector_div(stripe2, raid_disks);
2758 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
2759 break;
2760 case ALGORITHM_RIGHT_SYMMETRIC:
2761 pd_idx = sector_div(stripe2, raid_disks);
2762 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
2763 break;
2764 case ALGORITHM_PARITY_0:
2765 pd_idx = 0;
2766 (*dd_idx)++;
2767 break;
2768 case ALGORITHM_PARITY_N:
2769 pd_idx = data_disks;
2770 break;
2771 default:
2772 BUG();
2773 }
2774 break;
2775 case 6:
2776
2777 switch (algorithm) {
2778 case ALGORITHM_LEFT_ASYMMETRIC:
2779 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
2780 qd_idx = pd_idx + 1;
2781 if (pd_idx == raid_disks-1) {
2782 (*dd_idx)++;
2783 qd_idx = 0;
2784 } else if (*dd_idx >= pd_idx)
2785 (*dd_idx) += 2;
2786 break;
2787 case ALGORITHM_RIGHT_ASYMMETRIC:
2788 pd_idx = sector_div(stripe2, raid_disks);
2789 qd_idx = pd_idx + 1;
2790 if (pd_idx == raid_disks-1) {
2791 (*dd_idx)++;
2792 qd_idx = 0;
2793 } else if (*dd_idx >= pd_idx)
2794 (*dd_idx) += 2;
2795 break;
2796 case ALGORITHM_LEFT_SYMMETRIC:
2797 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
2798 qd_idx = (pd_idx + 1) % raid_disks;
2799 *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks;
2800 break;
2801 case ALGORITHM_RIGHT_SYMMETRIC:
2802 pd_idx = sector_div(stripe2, raid_disks);
2803 qd_idx = (pd_idx + 1) % raid_disks;
2804 *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks;
2805 break;
2806
2807 case ALGORITHM_PARITY_0:
2808 pd_idx = 0;
2809 qd_idx = 1;
2810 (*dd_idx) += 2;
2811 break;
2812 case ALGORITHM_PARITY_N:
2813 pd_idx = data_disks;
2814 qd_idx = data_disks + 1;
2815 break;
2816
2817 case ALGORITHM_ROTATING_ZERO_RESTART:
2818
2819
2820
2821 pd_idx = sector_div(stripe2, raid_disks);
2822 qd_idx = pd_idx + 1;
2823 if (pd_idx == raid_disks-1) {
2824 (*dd_idx)++;
2825 qd_idx = 0;
2826 } else if (*dd_idx >= pd_idx)
2827 (*dd_idx) += 2;
2828 ddf_layout = 1;
2829 break;
2830
2831 case ALGORITHM_ROTATING_N_RESTART:
2832
2833
2834
2835
2836 stripe2 += 1;
2837 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
2838 qd_idx = pd_idx + 1;
2839 if (pd_idx == raid_disks-1) {
2840 (*dd_idx)++;
2841 qd_idx = 0;
2842 } else if (*dd_idx >= pd_idx)
2843 (*dd_idx) += 2;
2844 ddf_layout = 1;
2845 break;
2846
2847 case ALGORITHM_ROTATING_N_CONTINUE:
2848
2849 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
2850 qd_idx = (pd_idx + raid_disks - 1) % raid_disks;
2851 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
2852 ddf_layout = 1;
2853 break;
2854
2855 case ALGORITHM_LEFT_ASYMMETRIC_6:
2856
2857 pd_idx = data_disks - sector_div(stripe2, raid_disks-1);
2858 if (*dd_idx >= pd_idx)
2859 (*dd_idx)++;
2860 qd_idx = raid_disks - 1;
2861 break;
2862
2863 case ALGORITHM_RIGHT_ASYMMETRIC_6:
2864 pd_idx = sector_div(stripe2, raid_disks-1);
2865 if (*dd_idx >= pd_idx)
2866 (*dd_idx)++;
2867 qd_idx = raid_disks - 1;
2868 break;
2869
2870 case ALGORITHM_LEFT_SYMMETRIC_6:
2871 pd_idx = data_disks - sector_div(stripe2, raid_disks-1);
2872 *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1);
2873 qd_idx = raid_disks - 1;
2874 break;
2875
2876 case ALGORITHM_RIGHT_SYMMETRIC_6:
2877 pd_idx = sector_div(stripe2, raid_disks-1);
2878 *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1);
2879 qd_idx = raid_disks - 1;
2880 break;
2881
2882 case ALGORITHM_PARITY_0_6:
2883 pd_idx = 0;
2884 (*dd_idx)++;
2885 qd_idx = raid_disks - 1;
2886 break;
2887
2888 default:
2889 BUG();
2890 }
2891 break;
2892 }
2893
2894 if (sh) {
2895 sh->pd_idx = pd_idx;
2896 sh->qd_idx = qd_idx;
2897 sh->ddf_layout = ddf_layout;
2898 }
2899
2900
2901	/* Finally, compute the sector offset on the chosen device. */
2902 new_sector = (sector_t)stripe * sectors_per_chunk + chunk_offset;
2903 return new_sector;
2904}
2905
2906sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous)
2907{
2908 struct r5conf *conf = sh->raid_conf;
2909 int raid_disks = sh->disks;
2910 int data_disks = raid_disks - conf->max_degraded;
2911 sector_t new_sector = sh->sector, check;
2912 int sectors_per_chunk = previous ? conf->prev_chunk_sectors
2913 : conf->chunk_sectors;
2914 int algorithm = previous ? conf->prev_algo
2915 : conf->algorithm;
2916 sector_t stripe;
2917 int chunk_offset;
2918 sector_t chunk_number;
2919 int dummy1, dd_idx = i;
2920 sector_t r_sector;
2921 struct stripe_head sh2;
2922
2923 chunk_offset = sector_div(new_sector, sectors_per_chunk);
2924 stripe = new_sector;
2925
2926 if (i == sh->pd_idx)
2927 return 0;
2928 switch(conf->level) {
2929 case 4: break;
2930 case 5:
2931 switch (algorithm) {
2932 case ALGORITHM_LEFT_ASYMMETRIC:
2933 case ALGORITHM_RIGHT_ASYMMETRIC:
2934 if (i > sh->pd_idx)
2935 i--;
2936 break;
2937 case ALGORITHM_LEFT_SYMMETRIC:
2938 case ALGORITHM_RIGHT_SYMMETRIC:
2939 if (i < sh->pd_idx)
2940 i += raid_disks;
2941 i -= (sh->pd_idx + 1);
2942 break;
2943 case ALGORITHM_PARITY_0:
2944 i -= 1;
2945 break;
2946 case ALGORITHM_PARITY_N:
2947 break;
2948 default:
2949 BUG();
2950 }
2951 break;
2952 case 6:
2953 if (i == sh->qd_idx)
2954 return 0;
2955 switch (algorithm) {
2956 case ALGORITHM_LEFT_ASYMMETRIC:
2957 case ALGORITHM_RIGHT_ASYMMETRIC:
2958 case ALGORITHM_ROTATING_ZERO_RESTART:
2959 case ALGORITHM_ROTATING_N_RESTART:
2960 if (sh->pd_idx == raid_disks-1)
2961 i--;
2962 else if (i > sh->pd_idx)
2963 i -= 2;
2964 break;
2965 case ALGORITHM_LEFT_SYMMETRIC:
2966 case ALGORITHM_RIGHT_SYMMETRIC:
2967 if (sh->pd_idx == raid_disks-1)
2968 i--;
2969 else {
2970
2971 if (i < sh->pd_idx)
2972 i += raid_disks;
2973 i -= (sh->pd_idx + 2);
2974 }
2975 break;
2976 case ALGORITHM_PARITY_0:
2977 i -= 2;
2978 break;
2979 case ALGORITHM_PARITY_N:
2980 break;
2981 case ALGORITHM_ROTATING_N_CONTINUE:
2982
2983 if (sh->pd_idx == 0)
2984 i--;
2985 else {
2986
2987 if (i < sh->pd_idx)
2988 i += raid_disks;
2989 i -= (sh->pd_idx + 1);
2990 }
2991 break;
2992 case ALGORITHM_LEFT_ASYMMETRIC_6:
2993 case ALGORITHM_RIGHT_ASYMMETRIC_6:
2994 if (i > sh->pd_idx)
2995 i--;
2996 break;
2997 case ALGORITHM_LEFT_SYMMETRIC_6:
2998 case ALGORITHM_RIGHT_SYMMETRIC_6:
2999 if (i < sh->pd_idx)
3000 i += data_disks + 1;
3001 i -= (sh->pd_idx + 1);
3002 break;
3003 case ALGORITHM_PARITY_0_6:
3004 i -= 1;
3005 break;
3006 default:
3007 BUG();
3008 }
3009 break;
3010 }
3011
3012 chunk_number = stripe * data_disks + i;
3013 r_sector = chunk_number * sectors_per_chunk + chunk_offset;
3014
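	/* verify the reverse mapping by running the forward mapping on the
	 * result and checking that it lands back on this stripe and device
	 */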
3015 check = raid5_compute_sector(conf, r_sector,
3016 previous, &dummy1, &sh2);
3017 if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx
3018 || sh2.qd_idx != sh->qd_idx) {
3019 pr_warn("md/raid:%s: compute_blocknr: map not correct\n",
3020 mdname(conf->mddev));
3021 return 0;
3022 }
3023 return r_sector;
3024}
3025
3026/*
3027 * delay_towrite - decide whether a pending write to @dev must be delayed.
3028 *
3029 * A write is held back when:
3030 *  1/ the stripe has data in the journal and this is a partial write to a
3031 *     device that is not in sync (the journal must be flushed to the raid
3032 *     disks first);
3033 *  2/ journal space is critical (R5C_LOG_CRITICAL) and the stripe still
3034 *     has data in the journal;
3035 *  3/ the journal device has failed while the stripe still has data in it.
3036 */
3064static inline bool delay_towrite(struct r5conf *conf,
3065 struct r5dev *dev,
3066 struct stripe_head_state *s)
3067{
3068
3069 if (!test_bit(R5_OVERWRITE, &dev->flags) &&
3070 !test_bit(R5_Insync, &dev->flags) && s->injournal)
3071 return true;
3072
3073 if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) &&
3074 s->injournal > 0)
3075 return true;
3076
3077 if (s->log_failed && s->injournal)
3078 return true;
3079 return false;
3080}
3081
3082static void
3083schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
3084 int rcw, int expand)
3085{
3086 int i, pd_idx = sh->pd_idx, qd_idx = sh->qd_idx, disks = sh->disks;
3087 struct r5conf *conf = sh->raid_conf;
3088 int level = conf->level;
3089
3090 if (rcw) {
3091		/*
3092		 * handle_stripe_dirtying may have decided on an rmw earlier and
3093		 * allocated an extra page for the prexor.  We are doing a
3094		 * reconstruct-write instead, so release that page now while we
3095		 * still can.
3096		 */
3097 r5c_release_extra_page(sh);
3098
3099 for (i = disks; i--; ) {
3100 struct r5dev *dev = &sh->dev[i];
3101
3102 if (dev->towrite && !delay_towrite(conf, dev, s)) {
3103 set_bit(R5_LOCKED, &dev->flags);
3104 set_bit(R5_Wantdrain, &dev->flags);
3105 if (!expand)
3106 clear_bit(R5_UPTODATE, &dev->flags);
3107 s->locked++;
3108 } else if (test_bit(R5_InJournal, &dev->flags)) {
3109 set_bit(R5_LOCKED, &dev->flags);
3110 s->locked++;
3111 }
3112 }
3113
3114		/* If this is not an expand, the pending bios carry new data that
3115		 * must be drained into the stripe cache before parity is computed.
3116		 */
3117 if (!expand) {
3118 if (!s->locked)
3119
3120 return;
3121 sh->reconstruct_state = reconstruct_state_drain_run;
3122 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
3123 } else
3124 sh->reconstruct_state = reconstruct_state_run;
3125
3126 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
3127
3128 if (s->locked + conf->max_degraded == disks)
3129 if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
3130 atomic_inc(&conf->pending_full_writes);
3131 } else {
3132 BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) ||
3133 test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags)));
3134 BUG_ON(level == 6 &&
3135 (!(test_bit(R5_UPTODATE, &sh->dev[qd_idx].flags) ||
3136 test_bit(R5_Wantcompute, &sh->dev[qd_idx].flags))));
3137
3138 for (i = disks; i--; ) {
3139 struct r5dev *dev = &sh->dev[i];
3140 if (i == pd_idx || i == qd_idx)
3141 continue;
3142
3143 if (dev->towrite &&
3144 (test_bit(R5_UPTODATE, &dev->flags) ||
3145 test_bit(R5_Wantcompute, &dev->flags))) {
3146 set_bit(R5_Wantdrain, &dev->flags);
3147 set_bit(R5_LOCKED, &dev->flags);
3148 clear_bit(R5_UPTODATE, &dev->flags);
3149 s->locked++;
3150 } else if (test_bit(R5_InJournal, &dev->flags)) {
3151 set_bit(R5_LOCKED, &dev->flags);
3152 s->locked++;
3153 }
3154 }
3155 if (!s->locked)
3156
3157 return;
3158 sh->reconstruct_state = reconstruct_state_prexor_drain_run;
3159 set_bit(STRIPE_OP_PREXOR, &s->ops_request);
3160 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
3161 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
3162 }
3163
3164	/* Keep the parity disk(s) locked while the asynchronous operations are
3165	 * in flight.
3166	 */
3167 set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
3168 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
3169 s->locked++;
3170
3171 if (level == 6) {
3172 int qd_idx = sh->qd_idx;
3173 struct r5dev *dev = &sh->dev[qd_idx];
3174
3175 set_bit(R5_LOCKED, &dev->flags);
3176 clear_bit(R5_UPTODATE, &dev->flags);
3177 s->locked++;
3178 }
3179
3180 if (raid5_has_ppl(sh->raid_conf) && sh->ppl_page &&
3181 test_bit(STRIPE_OP_BIODRAIN, &s->ops_request) &&
3182 !test_bit(STRIPE_FULL_WRITE, &sh->state) &&
3183 test_bit(R5_Insync, &sh->dev[pd_idx].flags))
3184 set_bit(STRIPE_OP_PARTIAL_PARITY, &s->ops_request);
3185
3186 pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n",
3187 __func__, (unsigned long long)sh->sector,
3188 s->locked, s->ops_request);
3189}
3190
3191/*
3192 * Each stripe/dev can have one or more bios attached.
3193 * toread/towrite point to the first in a chain; the bi_next chain must be
3194 * kept ordered by starting sector.
3195 */
3196static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
3197 int forwrite, int previous)
3198{
3199 struct bio **bip;
3200 struct r5conf *conf = sh->raid_conf;
3201 int firstwrite=0;
3202
3203 pr_debug("adding bi b#%llu to stripe s#%llu\n",
3204 (unsigned long long)bi->bi_iter.bi_sector,
3205 (unsigned long long)sh->sector);
3206
3207 spin_lock_irq(&sh->stripe_lock);
3208 sh->dev[dd_idx].write_hint = bi->bi_write_hint;
3209	/* don't allow new IO to be added to a stripe that is part of a batch */
3210 if (sh->batch_head)
3211 goto overlap;
3212 if (forwrite) {
3213 bip = &sh->dev[dd_idx].towrite;
3214 if (*bip == NULL)
3215 firstwrite = 1;
3216 } else
3217 bip = &sh->dev[dd_idx].toread;
3218 while (*bip && (*bip)->bi_iter.bi_sector < bi->bi_iter.bi_sector) {
3219 if (bio_end_sector(*bip) > bi->bi_iter.bi_sector)
3220 goto overlap;
3221 bip = & (*bip)->bi_next;
3222 }
3223 if (*bip && (*bip)->bi_iter.bi_sector < bio_end_sector(bi))
3224 goto overlap;
3225
3226 if (forwrite && raid5_has_ppl(conf)) {
3227		/*
3228		 * With PPL, only writes to consecutive data chunks within a
3229		 * stripe are allowed: a stripe_head carries at most one PPL
3230		 * entry, which describes a single data range.  Anything else is
3231		 * treated like an overlap so the caller will retry later.
3232		 */
3234 sector_t sector;
3235 sector_t first = 0;
3236 sector_t last = 0;
3237 int count = 0;
3238 int i;
3239
3240 for (i = 0; i < sh->disks; i++) {
3241 if (i != sh->pd_idx &&
3242 (i == dd_idx || sh->dev[i].towrite)) {
3243 sector = sh->dev[i].sector;
3244 if (count == 0 || sector < first)
3245 first = sector;
3246 if (sector > last)
3247 last = sector;
3248 count++;
3249 }
3250 }
3251
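		/*
		 * 'count' chunks are consecutive exactly when the lowest and
		 * highest logical sectors differ by (count - 1) whole chunks.
		 */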
3252 if (first + conf->chunk_sectors * (count - 1) != last)
3253 goto overlap;
3254 }
3255
3256 if (!forwrite || previous)
3257 clear_bit(STRIPE_BATCH_READY, &sh->state);
3258
3259 BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next);
3260 if (*bip)
3261 bi->bi_next = *bip;
3262 *bip = bi;
3263 bio_inc_remaining(bi);
3264 md_write_inc(conf->mddev, bi);
3265
3266 if (forwrite) {
3267		/* check whether the queued bios now cover this page completely */
3268 sector_t sector = sh->dev[dd_idx].sector;
3269 for (bi=sh->dev[dd_idx].towrite;
3270 sector < sh->dev[dd_idx].sector + STRIPE_SECTORS &&
3271 bi && bi->bi_iter.bi_sector <= sector;
3272 bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) {
3273 if (bio_end_sector(bi) >= sector)
3274 sector = bio_end_sector(bi);
3275 }
3276 if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
3277 if (!test_and_set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags))
3278 sh->overwrite_disks++;
3279 }
3280
3281 pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",
3282 (unsigned long long)(*bip)->bi_iter.bi_sector,
3283 (unsigned long long)sh->sector, dd_idx);
3284
3285 if (conf->mddev->bitmap && firstwrite) {
3286		/*
3287		 * The stripe lock cannot be held across md_bitmap_startwrite(),
3288		 * so drop it and set STRIPE_BITMAP_PENDING so the stripe is not
3289		 * added to a batch while the lock is released; once the bitmap
3290		 * write has been started, bm_seq and STRIPE_BIT_DELAY are set up
3291		 * under the lock again.
3292		 */
3298 set_bit(STRIPE_BITMAP_PENDING, &sh->state);
3299 spin_unlock_irq(&sh->stripe_lock);
3300 md_bitmap_startwrite(conf->mddev->bitmap, sh->sector,
3301 STRIPE_SECTORS, 0);
3302 spin_lock_irq(&sh->stripe_lock);
3303 clear_bit(STRIPE_BITMAP_PENDING, &sh->state);
3304 if (!sh->batch_head) {
3305 sh->bm_seq = conf->seq_flush+1;
3306 set_bit(STRIPE_BIT_DELAY, &sh->state);
3307 }
3308 }
3309 spin_unlock_irq(&sh->stripe_lock);
3310
3311 if (stripe_can_batch(sh))
3312 stripe_add_to_batch_list(conf, sh);
3313 return 1;
3314
3315 overlap:
3316 set_bit(R5_Overlap, &sh->dev[dd_idx].flags);
3317 spin_unlock_irq(&sh->stripe_lock);
3318 return 0;
3319}
3320
3321static void end_reshape(struct r5conf *conf);
3322
3323static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous,
3324 struct stripe_head *sh)
3325{
3326 int sectors_per_chunk =
3327 previous ? conf->prev_chunk_sectors : conf->chunk_sectors;
3328 int dd_idx;
3329 int chunk_offset = sector_div(stripe, sectors_per_chunk);
3330 int disks = previous ? conf->previous_raid_disks : conf->raid_disks;
3331
3332 raid5_compute_sector(conf,
3333 stripe * (disks - conf->max_degraded)
3334 *sectors_per_chunk + chunk_offset,
3335 previous,
3336 &dd_idx, sh);
3337}
3338
3339static void
3340handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
3341 struct stripe_head_state *s, int disks)
3342{
3343 int i;
3344 BUG_ON(sh->batch_head);
3345 for (i = disks; i--; ) {
3346 struct bio *bi;
3347 int bitmap_end = 0;
3348
3349 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
3350 struct md_rdev *rdev;
3351 rcu_read_lock();
3352 rdev = rcu_dereference(conf->disks[i].rdev);
3353 if (rdev && test_bit(In_sync, &rdev->flags) &&
3354 !test_bit(Faulty, &rdev->flags))
3355 atomic_inc(&rdev->nr_pending);
3356 else
3357 rdev = NULL;
3358 rcu_read_unlock();
3359 if (rdev) {
3360 if (!rdev_set_badblocks(
3361 rdev,
3362 sh->sector,
3363 STRIPE_SECTORS, 0))
3364 md_error(conf->mddev, rdev);
3365 rdev_dec_pending(rdev, conf->mddev);
3366 }
3367 }
3368 spin_lock_irq(&sh->stripe_lock);
3369		/* fail all writes first */
3370 bi = sh->dev[i].towrite;
3371 sh->dev[i].towrite = NULL;
3372 sh->overwrite_disks = 0;
3373 spin_unlock_irq(&sh->stripe_lock);
3374 if (bi)
3375 bitmap_end = 1;
3376
3377 log_stripe_write_finished(sh);
3378
3379 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
3380 wake_up(&conf->wait_for_overlap);
3381
3382 while (bi && bi->bi_iter.bi_sector <
3383 sh->dev[i].sector + STRIPE_SECTORS) {
3384 struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
3385
3386 md_write_end(conf->mddev);
3387 bio_io_error(bi);
3388 bi = nextbi;
3389 }
3390 if (bitmap_end)
3391 md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
3392 STRIPE_SECTORS, 0, 0);
3393 bitmap_end = 0;
3394
3395 bi = sh->dev[i].written;
3396 sh->dev[i].written = NULL;
3397 if (test_and_clear_bit(R5_SkipCopy, &sh->dev[i].flags)) {
3398 WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
3399 sh->dev[i].page = sh->dev[i].orig_page;
3400 }
3401
3402 if (bi) bitmap_end = 1;
3403 while (bi && bi->bi_iter.bi_sector <
3404 sh->dev[i].sector + STRIPE_SECTORS) {
3405 struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
3406
3407 md_write_end(conf->mddev);
3408 bio_io_error(bi);
3409 bi = bi2;
3410 }
3411
3412		/* fail any reads if this device is non-operational and the data
3413		 * has not yet reached the stripe cache
3414		 */
3415 if (!test_bit(R5_Wantfill, &sh->dev[i].flags) &&
3416 s->failed > conf->max_degraded &&
3417 (!test_bit(R5_Insync, &sh->dev[i].flags) ||
3418 test_bit(R5_ReadError, &sh->dev[i].flags))) {
3419 spin_lock_irq(&sh->stripe_lock);
3420 bi = sh->dev[i].toread;
3421 sh->dev[i].toread = NULL;
3422 spin_unlock_irq(&sh->stripe_lock);
3423 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
3424 wake_up(&conf->wait_for_overlap);
3425 if (bi)
3426 s->to_read--;
3427 while (bi && bi->bi_iter.bi_sector <
3428 sh->dev[i].sector + STRIPE_SECTORS) {
3429 struct bio *nextbi =
3430 r5_next_bio(bi, sh->dev[i].sector);
3431
3432 bio_io_error(bi);
3433 bi = nextbi;
3434 }
3435 }
3436 if (bitmap_end)
3437 md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
3438 STRIPE_SECTORS, 0, 0);
3439
3440		/* an interrupted write may have left the parity block locked,
3441		 * so clear R5_LOCKED on every device */
3442 clear_bit(R5_LOCKED, &sh->dev[i].flags);
3443 }
3444 s->to_write = 0;
3445 s->written = 0;
3446
3447 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
3448 if (atomic_dec_and_test(&conf->pending_full_writes))
3449 md_wakeup_thread(conf->mddev->thread);
3450}
3451
3452static void
3453handle_failed_sync(struct r5conf *conf, struct stripe_head *sh,
3454 struct stripe_head_state *s)
3455{
3456 int abort = 0;
3457 int i;
3458
3459 BUG_ON(sh->batch_head);
3460 clear_bit(STRIPE_SYNCING, &sh->state);
3461 if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags))
3462 wake_up(&conf->wait_for_overlap);
3463 s->syncing = 0;
3464 s->replacing = 0;
3465
3466	/*
3467	 * There is nothing more to do for sync/check/repair; any necessary
3468	 * abort is handled elsewhere.  For recover/replace we must record a
3469	 * bad block on all non-in-sync devices, or abort the recovery.
3470	 */
3472 if (test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery)) {
3473
3474
3475
3476 rcu_read_lock();
3477 for (i = 0; i < conf->raid_disks; i++) {
3478 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
3479 if (rdev
3480 && !test_bit(Faulty, &rdev->flags)
3481 && !test_bit(In_sync, &rdev->flags)
3482 && !rdev_set_badblocks(rdev, sh->sector,
3483 STRIPE_SECTORS, 0))
3484 abort = 1;
3485 rdev = rcu_dereference(conf->disks[i].replacement);
3486 if (rdev
3487 && !test_bit(Faulty, &rdev->flags)
3488 && !test_bit(In_sync, &rdev->flags)
3489 && !rdev_set_badblocks(rdev, sh->sector,
3490 STRIPE_SECTORS, 0))
3491 abort = 1;
3492 }
3493 rcu_read_unlock();
3494 if (abort)
3495 conf->recovery_disabled =
3496 conf->mddev->recovery_disabled;
3497 }
3498 md_done_sync(conf->mddev, STRIPE_SECTORS, !abort);
3499}
3500
3501static int want_replace(struct stripe_head *sh, int disk_idx)
3502{
3503 struct md_rdev *rdev;
3504 int rv = 0;
3505
3506 rcu_read_lock();
3507 rdev = rcu_dereference(sh->raid_conf->disks[disk_idx].replacement);
3508 if (rdev
3509 && !test_bit(Faulty, &rdev->flags)
3510 && !test_bit(In_sync, &rdev->flags)
3511 && (rdev->recovery_offset <= sh->sector
3512 || rdev->mddev->recovery_cp <= sh->sector))
3513 rv = 1;
3514 rcu_read_unlock();
3515 return rv;
3516}
3517
3518static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s,
3519 int disk_idx, int disks)
3520{
3521 struct r5dev *dev = &sh->dev[disk_idx];
3522 struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]],
3523 &sh->dev[s->failed_num[1]] };
3524 int i;
3525
3526
3527 if (test_bit(R5_LOCKED, &dev->flags) ||
3528 test_bit(R5_UPTODATE, &dev->flags))
3529		/* the block is already up to date, or a read/compute for it is
3530		 * already in flight - nothing more to do
3531		 */
3532 return 0;
3533
3534 if (dev->toread ||
3535 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)))
3536
3537 return 1;
3538
3539 if (s->syncing || s->expanding ||
3540 (s->replacing && want_replace(sh, disk_idx)))
3541		/* when syncing or expanding every block is read; when replacing,
3542		 * the blocks destined for the replacement are needed
3543		 */
3544 return 1;
3545
3546 if ((s->failed >= 1 && fdev[0]->toread) ||
3547 (s->failed >= 2 && fdev[1]->toread))
3548		/* a read is pending against a failed device, so every surviving
3549		 * block is needed to reconstruct the missing data
3550		 */
3551 return 1;
3552
3553	/*
3554	 * Nothing below matters unless the stripe is degraded and has pending
3555	 * writes: the remaining checks decide whether this block should be
3556	 * prefetched so that such a write can proceed as a reconstruct-write.
3557	 */
3561 if (!s->failed || !s->to_write)
3562 return 0;
3563
3564 if (test_bit(R5_Insync, &dev->flags) &&
3565 !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
3566		/* the device is healthy and prereads are not yet allowed, so
3567		 * this read can wait until STRIPE_PREREAD_ACTIVE is set
3568		 */
3571 return 0;
3572
3573 for (i = 0; i < s->failed && i < 2; i++) {
3574 if (fdev[i]->towrite &&
3575 !test_bit(R5_UPTODATE, &fdev[i]->flags) &&
3576 !test_bit(R5_OVERWRITE, &fdev[i]->flags))
3577			/* a partial write is queued against a failed device, so
3578			 * this block will be needed to reconstruct its data
3579			 */
3582 return 1;
3583 }
3584
3585	/*
3586	 * A reconstruct-write will be forced for RAID6, and for RAID5 whenever
3587	 * the parity beyond recovery_cp cannot yet be trusted.  If, in that
3588	 * case, a failed device's block is not fully overwritten and will have
3589	 * to be reconstructed, this block must be read in as well.
3590	 */
3593 if (sh->raid_conf->level != 6 &&
3594 sh->sector < sh->raid_conf->mddev->recovery_cp)
3595
3596 return 0;
3597 for (i = 0; i < s->failed && i < 2; i++) {
3598 if (s->failed_num[i] != sh->pd_idx &&
3599 s->failed_num[i] != sh->qd_idx &&
3600 !test_bit(R5_UPTODATE, &fdev[i]->flags) &&
3601 !test_bit(R5_OVERWRITE, &fdev[i]->flags))
3602 return 1;
3603 }
3604
3605 return 0;
3606}
3607
3608
3609/*
3610 * fetch_block - check whether the data on the given member device needs to
3611 * be read or computed to satisfy a request.  Returns 1 when no further
3612 * devices need checking, otherwise 0 so handle_stripe_fill keeps looping.
3613 */
3614static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s,
3615 int disk_idx, int disks)
3616{
3617 struct r5dev *dev = &sh->dev[disk_idx];
3618
3619
3620 if (need_this_block(sh, s, disk_idx, disks)) {
3621
3622
3623
3624 BUG_ON(test_bit(R5_Wantcompute, &dev->flags));
3625 BUG_ON(test_bit(R5_Wantread, &dev->flags));
3626 BUG_ON(sh->batch_head);
3627
3628		/*
3629		 * In the RAID6 case, if the only non-uptodate disk is P then P
3630		 * has already been trusted to compute the other failed drives,
3631		 * so it is safe to compute it rather than re-read it.  Otherwise
3632		 * only blocks on failed devices are computed, so that a
3633		 * check/repair pass can still detect real inconsistencies.
3634		 */
3637 if ((s->uptodate == disks - 1) &&
3638 ((sh->qd_idx >= 0 && sh->pd_idx == disk_idx) ||
3639 (s->failed && (disk_idx == s->failed_num[0] ||
3640 disk_idx == s->failed_num[1])))) {
3641			/* a disk has failed and this block has been requested;
3642			 * compute it from the remaining devices
3643			 */
3644 pr_debug("Computing stripe %llu block %d\n",
3645 (unsigned long long)sh->sector, disk_idx);
3646 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
3647 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
3648 set_bit(R5_Wantcompute, &dev->flags);
3649 sh->ops.target = disk_idx;
3650 sh->ops.target2 = -1;
3651 s->req_compute = 1;
3652			/* Careful: from this point on 'uptodate' is in the eye of
3653			 * raid_run_ops, which services 'compute' operations before
3654			 * writes; R5_Wantcompute marks a block that will be
3655			 * R5_UPTODATE by the time it is needed.
3656			 */
3658 s->uptodate++;
3659 return 1;
3660 } else if (s->uptodate == disks-2 && s->failed >= 2) {
3661			/* computing two blocks at once is expensive, so it is only
3662			 * done when two devices have actually failed; find the
3663			 * other missing block */
3664 int other;
3665 for (other = disks; other--; ) {
3666 if (other == disk_idx)
3667 continue;
3668 if (!test_bit(R5_UPTODATE,
3669 &sh->dev[other].flags))
3670 break;
3671 }
3672 BUG_ON(other < 0);
3673 pr_debug("Computing stripe %llu blocks %d,%d\n",
3674 (unsigned long long)sh->sector,
3675 disk_idx, other);
3676 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
3677 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
3678 set_bit(R5_Wantcompute, &sh->dev[disk_idx].flags);
3679 set_bit(R5_Wantcompute, &sh->dev[other].flags);
3680 sh->ops.target = disk_idx;
3681 sh->ops.target2 = other;
3682 s->uptodate += 2;
3683 s->req_compute = 1;
3684 return 1;
3685 } else if (test_bit(R5_Insync, &dev->flags)) {
3686 set_bit(R5_LOCKED, &dev->flags);
3687 set_bit(R5_Wantread, &dev->flags);
3688 s->locked++;
3689 pr_debug("Reading block %d (sync=%d)\n",
3690 disk_idx, s->syncing);
3691 }
3692 }
3693
3694 return 0;
3695}
3696
3697
3698/* handle_stripe_fill - read or compute data to satisfy pending requests. */
3700static void handle_stripe_fill(struct stripe_head *sh,
3701 struct stripe_head_state *s,
3702 int disks)
3703{
3704 int i;
3705
3706	/* look for blocks to read or compute, but not while a compute is
3707	 * already in flight or the stripe contents are being changed by a
3708	 * check or reconstruct operation
3709	 */
3710 if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state &&
3711 !sh->reconstruct_state) {
3712
3713		/*
3714		 * For a degraded stripe that still has data in the journal, do
3715		 * not service reads yet: write the stripe out to the raid disks
3716		 * first.  This avoids a complex rmw (prexor with orig_page, then
3717		 * xor with page) in the read path.
3718		 */
3720 if (s->injournal && s->failed) {
3721 if (test_bit(STRIPE_R5C_CACHING, &sh->state))
3722 r5c_make_stripe_write_out(sh);
3723 goto out;
3724 }
3725
3726 for (i = disks; i--; )
3727 if (fetch_block(sh, s, i, disks))
3728 break;
3729 }
3730out:
3731 set_bit(STRIPE_HANDLE, &sh->state);
3732}
3733
3734static void break_stripe_batch_list(struct stripe_head *head_sh,
3735 unsigned long handle_flags);
3736
3737
3738/* handle_stripe_clean_event - any written block on an uptodate or failed
3739 * drive can be returned; a 'write' to a failed drive will be UPTODATE but
3740 * never LOCKED, so 'failed' itself need not be tested. */
3741static void handle_stripe_clean_event(struct r5conf *conf,
3742 struct stripe_head *sh, int disks)
3743{
3744 int i;
3745 struct r5dev *dev;
3746 int discard_pending = 0;
3747 struct stripe_head *head_sh = sh;
3748 bool do_endio = false;
3749
3750 for (i = disks; i--; )
3751 if (sh->dev[i].written) {
3752 dev = &sh->dev[i];
3753 if (!test_bit(R5_LOCKED, &dev->flags) &&
3754 (test_bit(R5_UPTODATE, &dev->flags) ||
3755 test_bit(R5_Discard, &dev->flags) ||
3756 test_bit(R5_SkipCopy, &dev->flags))) {
3757
3758 struct bio *wbi, *wbi2;
3759 pr_debug("Return write for disc %d\n", i);
3760 if (test_and_clear_bit(R5_Discard, &dev->flags))
3761 clear_bit(R5_UPTODATE, &dev->flags);
3762 if (test_and_clear_bit(R5_SkipCopy, &dev->flags)) {
3763 WARN_ON(test_bit(R5_UPTODATE, &dev->flags));
3764 }
3765 do_endio = true;
3766
3767returnbi:
3768 dev->page = dev->orig_page;
3769 wbi = dev->written;
3770 dev->written = NULL;
3771 while (wbi && wbi->bi_iter.bi_sector <
3772 dev->sector + STRIPE_SECTORS) {
3773 wbi2 = r5_next_bio(wbi, dev->sector);
3774 md_write_end(conf->mddev);
3775 bio_endio(wbi);
3776 wbi = wbi2;
3777 }
3778 md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
3779 STRIPE_SECTORS,
3780 !test_bit(STRIPE_DEGRADED, &sh->state),
3781 0);
3782 if (head_sh->batch_head) {
3783 sh = list_first_entry(&sh->batch_list,
3784 struct stripe_head,
3785 batch_list);
3786 if (sh != head_sh) {
3787 dev = &sh->dev[i];
3788 goto returnbi;
3789 }
3790 }
3791 sh = head_sh;
3792 dev = &sh->dev[i];
3793 } else if (test_bit(R5_Discard, &dev->flags))
3794 discard_pending = 1;
3795 }
3796
3797 log_stripe_write_finished(sh);
3798
3799 if (!discard_pending &&
3800 test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) {
3801 int hash;
3802 clear_bit(R5_Discard, &sh->dev[sh->pd_idx].flags);
3803 clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags);
3804 if (sh->qd_idx >= 0) {
3805 clear_bit(R5_Discard, &sh->dev[sh->qd_idx].flags);
3806 clear_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags);
3807 }
3808
3809 clear_bit(STRIPE_DISCARD, &sh->state);
3810		/*
3811		 * A discard modifies some bio fields and the stripe holds no
3812		 * updated data, so remove it from the hash list; it will be
3813		 * reinitialised the next time it is used.
3814		 */
3815unhash:
3816 hash = sh->hash_lock_index;
3817 spin_lock_irq(conf->hash_locks + hash);
3818 remove_hash(sh);
3819 spin_unlock_irq(conf->hash_locks + hash);
3820 if (head_sh->batch_head) {
3821 sh = list_first_entry(&sh->batch_list,
3822 struct stripe_head, batch_list);
3823 if (sh != head_sh)
3824 goto unhash;
3825 }
3826 sh = head_sh;
3827
3828 if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state))
3829 set_bit(STRIPE_HANDLE, &sh->state);
3830
3831 }
3832
3833 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
3834 if (atomic_dec_and_test(&conf->pending_full_writes))
3835 md_wakeup_thread(conf->mddev->thread);
3836
3837 if (head_sh->batch_head && do_endio)
3838 break_stripe_batch_list(head_sh, STRIPE_EXPAND_SYNC_FLAGS);
3839}
3840
3841
3842/*
3843 * For rmw with the write-back cache, prexor needs the old data, which is
3844 * kept in dev->orig_page.  A device can feed the prexor when it is up to
3845 * date and either has no data in the journal or its orig_page still holds
3846 * the on-disk copy:
3847 *       R5_UPTODATE && (!R5_InJournal || R5_OrigPageUPTDODATE)
3848 */
3849static inline bool uptodate_for_rmw(struct r5dev *dev)
3850{
3851 return (test_bit(R5_UPTODATE, &dev->flags)) &&
3852 (!test_bit(R5_InJournal, &dev->flags) ||
3853 test_bit(R5_OrigPageUPTDODATE, &dev->flags));
3854}
3855
3856static int handle_stripe_dirtying(struct r5conf *conf,
3857 struct stripe_head *sh,
3858 struct stripe_head_state *s,
3859 int disks)
3860{
3861 int rmw = 0, rcw = 0, i;
3862 sector_t recovery_cp = conf->mddev->recovery_cp;
3863
3864	/*
3865	 * If resync is underway or still pending, the array is dirty (unclean
3866	 * shutdown or initial creation) and parity in some stripes may be
3867	 * wrong.  In that case reconstruct-write must always be used so that
3868	 * correct data is regenerated on drive failure or read-error
3869	 * correction.
3870	 */
3871 if (conf->rmw_level == PARITY_DISABLE_RMW ||
3872 (recovery_cp < MaxSector && sh->sector >= recovery_cp &&
3873 s->failed == 0)) {
3874		/* calculate the real rcw cost later - for now just make rcw look
3875		 * cheaper than rmw
3876		 */
3877 rcw = 1; rmw = 2;
3878 pr_debug("force RCW rmw_level=%u, recovery_cp=%llu sh->sector=%llu\n",
3879 conf->rmw_level, (unsigned long long)recovery_cp,
3880 (unsigned long long)sh->sector);
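	/*
	 * Cost model: each in-sync device that would have to be read counts 1
	 * towards rmw or rcw; a device that would itself need reconstructing
	 * adds 2*disks, which effectively rules that strategy out.
	 */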
3881 } else for (i = disks; i--; ) {
3882		/* would this buffer have to be read for a read-modify-write? */
3883 struct r5dev *dev = &sh->dev[i];
3884 if (((dev->towrite && !delay_towrite(conf, dev, s)) ||
3885 i == sh->pd_idx || i == sh->qd_idx ||
3886 test_bit(R5_InJournal, &dev->flags)) &&
3887 !test_bit(R5_LOCKED, &dev->flags) &&
3888 !(uptodate_for_rmw(dev) ||
3889 test_bit(R5_Wantcompute, &dev->flags))) {
3890 if (test_bit(R5_Insync, &dev->flags))
3891 rmw++;
3892 else
3893 rmw += 2*disks;
3894 }
3895		/* would this buffer have to be read for a reconstruct-write? */
3896 if (!test_bit(R5_OVERWRITE, &dev->flags) &&
3897 i != sh->pd_idx && i != sh->qd_idx &&
3898 !test_bit(R5_LOCKED, &dev->flags) &&
3899 !(test_bit(R5_UPTODATE, &dev->flags) ||
3900 test_bit(R5_Wantcompute, &dev->flags))) {
3901 if (test_bit(R5_Insync, &dev->flags))
3902 rcw++;
3903 else
3904 rcw += 2*disks;
3905 }
3906 }
3907
3908 pr_debug("for sector %llu state 0x%lx, rmw=%d rcw=%d\n",
3909 (unsigned long long)sh->sector, sh->state, rmw, rcw);
3910 set_bit(STRIPE_HANDLE, &sh->state);
3911 if ((rmw < rcw || (rmw == rcw && conf->rmw_level == PARITY_PREFER_RMW)) && rmw > 0) {
3912		/* prefer read-modify-write, but the old data must be read in first */
3913 if (conf->mddev->queue)
3914 blk_add_trace_msg(conf->mddev->queue,
3915 "raid5 rmw %llu %d",
3916 (unsigned long long)sh->sector, rmw);
3917 for (i = disks; i--; ) {
3918 struct r5dev *dev = &sh->dev[i];
3919 if (test_bit(R5_InJournal, &dev->flags) &&
3920 dev->page == dev->orig_page &&
3921 !test_bit(R5_LOCKED, &sh->dev[sh->pd_idx].flags)) {
3922
3923 struct page *p = alloc_page(GFP_NOIO);
3924
3925 if (p) {
3926 dev->orig_page = p;
3927 continue;
3928 }
3929
3930
3931
3932
3933
3934 if (!test_and_set_bit(R5C_EXTRA_PAGE_IN_USE,
3935 &conf->cache_state)) {
3936 r5c_use_extra_page(sh);
3937 break;
3938 }
3939
3940
3941 set_bit(STRIPE_DELAYED, &sh->state);
3942 s->waiting_extra_page = 1;
3943 return -EAGAIN;
3944 }
3945 }
3946
3947 for (i = disks; i--; ) {
3948 struct r5dev *dev = &sh->dev[i];
3949 if (((dev->towrite && !delay_towrite(conf, dev, s)) ||
3950 i == sh->pd_idx || i == sh->qd_idx ||
3951 test_bit(R5_InJournal, &dev->flags)) &&
3952 !test_bit(R5_LOCKED, &dev->flags) &&
3953 !(uptodate_for_rmw(dev) ||
3954 test_bit(R5_Wantcompute, &dev->flags)) &&
3955 test_bit(R5_Insync, &dev->flags)) {
3956 if (test_bit(STRIPE_PREREAD_ACTIVE,
3957 &sh->state)) {
3958 pr_debug("Read_old block %d for r-m-w\n",
3959 i);
3960 set_bit(R5_LOCKED, &dev->flags);
3961 set_bit(R5_Wantread, &dev->flags);
3962 s->locked++;
3963 } else {
3964 set_bit(STRIPE_DELAYED, &sh->state);
3965 set_bit(STRIPE_HANDLE, &sh->state);
3966 }
3967 }
3968 }
3969 }
3970 if ((rcw < rmw || (rcw == rmw && conf->rmw_level != PARITY_PREFER_RMW)) && rcw > 0) {
3971		/* prefer reconstruct-write, but missing blocks must be read first */
3972 int qread =0;
3973 rcw = 0;
3974 for (i = disks; i--; ) {
3975 struct r5dev *dev = &sh->dev[i];
3976 if (!test_bit(R5_OVERWRITE, &dev->flags) &&
3977 i != sh->pd_idx && i != sh->qd_idx &&
3978 !test_bit(R5_LOCKED, &dev->flags) &&
3979 !(test_bit(R5_UPTODATE, &dev->flags) ||
3980 test_bit(R5_Wantcompute, &dev->flags))) {
3981 rcw++;
3982 if (test_bit(R5_Insync, &dev->flags) &&
3983 test_bit(STRIPE_PREREAD_ACTIVE,
3984 &sh->state)) {
3985 pr_debug("Read_old block "
3986 "%d for Reconstruct\n", i);
3987 set_bit(R5_LOCKED, &dev->flags);
3988 set_bit(R5_Wantread, &dev->flags);
3989 s->locked++;
3990 qread++;
3991 } else {
3992 set_bit(STRIPE_DELAYED, &sh->state);
3993 set_bit(STRIPE_HANDLE, &sh->state);
3994 }
3995 }
3996 }
3997 if (rcw && conf->mddev->queue)
3998 blk_add_trace_msg(conf->mddev->queue, "raid5 rcw %llu %d %d %d",
3999 (unsigned long long)sh->sector,
4000 rcw, qread, test_bit(STRIPE_DELAYED, &sh->state));
4001 }
4002
4003 if (rcw > disks && rmw > disks &&
4004 !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
4005 set_bit(STRIPE_DELAYED, &sh->state);
4006
4007	/*
4008	 * If nothing is locked and enough data is available, a write can be
4009	 * scheduled now.  handle_stripe may be called at any time, so a
4010	 * previously submitted compute-block operation must either be
4011	 * requested together with the reconstruct (raid_run_ops handles that
4012	 * combination) or be allowed to complete before reconstruction is
4013	 * scheduled.
4014	 */
4017 if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) &&
4018 (s->locked == 0 && (rcw == 0 || rmw == 0) &&
4019 !test_bit(STRIPE_BIT_DELAY, &sh->state)))
4020 schedule_reconstruction(sh, s, rcw == 0, 0);
4021 return 0;
4022}
4023
4024static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh,
4025 struct stripe_head_state *s, int disks)
4026{
4027 struct r5dev *dev = NULL;
4028
4029 BUG_ON(sh->batch_head);
4030 set_bit(STRIPE_HANDLE, &sh->state);
4031
4032 switch (sh->check_state) {
4033 case check_state_idle:
4034
4035 if (s->failed == 0) {
4036 BUG_ON(s->uptodate != disks);
4037 sh->check_state = check_state_run;
4038 set_bit(STRIPE_OP_CHECK, &s->ops_request);
4039 clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags);
4040 s->uptodate--;
4041 break;
4042 }
4043 dev = &sh->dev[s->failed_num[0]];
4044		/* fall through */
4045 case check_state_compute_result:
4046 sh->check_state = check_state_idle;
4047 if (!dev)
4048 dev = &sh->dev[sh->pd_idx];
4049
4050
4051 if (test_bit(STRIPE_INSYNC, &sh->state))
4052 break;
4053
4054
4055 BUG_ON(!test_bit(R5_UPTODATE, &dev->flags));
4056 BUG_ON(s->uptodate != disks);
4057
4058 set_bit(R5_LOCKED, &dev->flags);
4059 s->locked++;
4060 set_bit(R5_Wantwrite, &dev->flags);
4061
4062 clear_bit(STRIPE_DEGRADED, &sh->state);
4063 set_bit(STRIPE_INSYNC, &sh->state);
4064 break;
4065 case check_state_run:
4066 break;
4067 case check_state_check_result:
4068 sh->check_state = check_state_idle;
4069
4070		/* if a failure occurred during the check operation, leave
4071		 * STRIPE_INSYNC unset and let the stripe be handled again
4072		 */
4073 if (s->failed)
4074 break;
4075
4076
4077		/* handle a successful check operation: if parity is correct we
4078		 * are done, otherwise bump the mismatch count and repair the
4079		 * parity unless this is a check-only pass (MD_RECOVERY_CHECK) */
4080 if ((sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) == 0)
4081
4082
4083
4084 set_bit(STRIPE_INSYNC, &sh->state);
4085 else {
4086 atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches);
4087 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) {
4088				/* check-only pass: don't repair */
4089 set_bit(STRIPE_INSYNC, &sh->state);
4090 pr_warn_ratelimited("%s: mismatch sector in range "
4091 "%llu-%llu\n", mdname(conf->mddev),
4092 (unsigned long long) sh->sector,
4093 (unsigned long long) sh->sector +
4094 STRIPE_SECTORS);
4095 } else {
4096 sh->check_state = check_state_compute_run;
4097 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
4098 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
4099 set_bit(R5_Wantcompute,
4100 &sh->dev[sh->pd_idx].flags);
4101 sh->ops.target = sh->pd_idx;
4102 sh->ops.target2 = -1;
4103 s->uptodate++;
4104 }
4105 }
4106 break;
4107 case check_state_compute_run:
4108 break;
4109 default:
4110 pr_err("%s: unknown check_state: %d sector: %llu\n",
4111 __func__, sh->check_state,
4112 (unsigned long long) sh->sector);
4113 BUG();
4114 }
4115}
4116
4117static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh,
4118 struct stripe_head_state *s,
4119 int disks)
4120{
4121 int pd_idx = sh->pd_idx;
4122 int qd_idx = sh->qd_idx;
4123 struct r5dev *dev;
4124
4125 BUG_ON(sh->batch_head);
4126 set_bit(STRIPE_HANDLE, &sh->state);
4127
4128 BUG_ON(s->failed > 2);
4129
4130
4131	/* We want to check and possibly repair both P and Q.  However there
4132	 * could be one 'failed' device, in which case only one of them can be
4133	 * checked, possibly using the other to regenerate the missing data.
4134	 */
4136 switch (sh->check_state) {
4137 case check_state_idle:
4138
4139 if (s->failed == s->q_failed) {
4140			/* The only possible failed device holds Q, so it makes
4141			 * sense to check P (if anything else had failed, P would
4142			 * already have been used to recreate it).
4143			 */
4144 sh->check_state = check_state_run;
4145 }
4146 if (!s->q_failed && s->failed < 2) {
4147			/* Q has not failed and was not used to regenerate
4148			 * anything, so it makes sense to check it as well.
4149			 */
4150 if (sh->check_state == check_state_run)
4151 sh->check_state = check_state_run_pq;
4152 else
4153 sh->check_state = check_state_run_q;
4154 }
4155
4156
4157 sh->ops.zero_sum_result = 0;
4158
4159 if (sh->check_state == check_state_run) {
4160
4161 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
4162 s->uptodate--;
4163 }
4164 if (sh->check_state >= check_state_run &&
4165 sh->check_state <= check_state_run_pq) {
4166			/* the syndrome check preserves P and Q, so there is no
4167			 * need to mark them !uptodate here
4168			 */
4169 set_bit(STRIPE_OP_CHECK, &s->ops_request);
4170 break;
4171 }
4172
4173
4174 BUG_ON(s->failed != 2);
4175		/* fall through */
4176 case check_state_compute_result:
4177 sh->check_state = check_state_idle;
4178
4179
4180 if (test_bit(STRIPE_INSYNC, &sh->state))
4181 break;
4182
4183		/* now write out any block on a failed drive, or P or Q if they
4184		 * were recomputed
4185		 */
4186 dev = NULL;
4187 if (s->failed == 2) {
4188 dev = &sh->dev[s->failed_num[1]];
4189 s->locked++;
4190 set_bit(R5_LOCKED, &dev->flags);
4191 set_bit(R5_Wantwrite, &dev->flags);
4192 }
4193 if (s->failed >= 1) {
4194 dev = &sh->dev[s->failed_num[0]];
4195 s->locked++;
4196 set_bit(R5_LOCKED, &dev->flags);
4197 set_bit(R5_Wantwrite, &dev->flags);
4198 }
4199 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) {
4200 dev = &sh->dev[pd_idx];
4201 s->locked++;
4202 set_bit(R5_LOCKED, &dev->flags);
4203 set_bit(R5_Wantwrite, &dev->flags);
4204 }
4205 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) {
4206 dev = &sh->dev[qd_idx];
4207 s->locked++;
4208 set_bit(R5_LOCKED, &dev->flags);
4209 set_bit(R5_Wantwrite, &dev->flags);
4210 }
4211 if (WARN_ONCE(dev && !test_bit(R5_UPTODATE, &dev->flags),
4212 "%s: disk%td not up to date\n",
4213 mdname(conf->mddev),
4214 dev - (struct r5dev *) &sh->dev)) {
4215 clear_bit(R5_LOCKED, &dev->flags);
4216 clear_bit(R5_Wantwrite, &dev->flags);
4217 s->locked--;
4218 }
4219 clear_bit(STRIPE_DEGRADED, &sh->state);
4220
4221 set_bit(STRIPE_INSYNC, &sh->state);
4222 break;
4223 case check_state_run:
4224 case check_state_run_q:
4225 case check_state_run_pq:
4226 break;
4227 case check_state_check_result:
4228 sh->check_state = check_state_idle;
4229
4230		/* handle a successful check operation: if both parities are
4231		 * correct we are done, otherwise bump the mismatch count and
4232		 * repair them unless this is a check-only pass
4233		 * (MD_RECOVERY_CHECK) */
4234 if (sh->ops.zero_sum_result == 0) {
4235
4236 if (!s->failed)
4237 set_bit(STRIPE_INSYNC, &sh->state);
4238 else {
4239
4240
4241
4242
4243 sh->check_state = check_state_compute_result;
4244
4245
4246
4247
4248
4249 }
4250 } else {
4251 atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches);
4252 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) {
4253				/* check-only pass: don't repair */
4254 set_bit(STRIPE_INSYNC, &sh->state);
4255 pr_warn_ratelimited("%s: mismatch sector in range "
4256 "%llu-%llu\n", mdname(conf->mddev),
4257 (unsigned long long) sh->sector,
4258 (unsigned long long) sh->sector +
4259 STRIPE_SECTORS);
4260 } else {
4261 int *target = &sh->ops.target;
4262
4263 sh->ops.target = -1;
4264 sh->ops.target2 = -1;
4265 sh->check_state = check_state_compute_run;
4266 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
4267 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
4268 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) {
4269 set_bit(R5_Wantcompute,
4270 &sh->dev[pd_idx].flags);
4271 *target = pd_idx;
4272 target = &sh->ops.target2;
4273 s->uptodate++;
4274 }
4275 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) {
4276 set_bit(R5_Wantcompute,
4277 &sh->dev[qd_idx].flags);
4278 *target = qd_idx;
4279 s->uptodate++;
4280 }
4281 }
4282 }
4283 break;
4284 case check_state_compute_run:
4285 break;
4286 default:
4287 pr_warn("%s: unknown check_state: %d sector: %llu\n",
4288 __func__, sh->check_state,
4289 (unsigned long long) sh->sector);
4290 BUG();
4291 }
4292}
4293
4294static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh)
4295{
4296 int i;
4297
4298	/* All blocks of this stripe have been read; copy the relevant ones
4299	 * into the target stripes of the expansion.
4300	 */
4301 struct dma_async_tx_descriptor *tx = NULL;
4302 BUG_ON(sh->batch_head);
4303 clear_bit(STRIPE_EXPAND_SOURCE, &sh->state);
4304 for (i = 0; i < sh->disks; i++)
4305 if (i != sh->pd_idx && i != sh->qd_idx) {
4306 int dd_idx, j;
4307 struct stripe_head *sh2;
4308 struct async_submit_ctl submit;
4309
4310 sector_t bn = raid5_compute_blocknr(sh, i, 1);
4311 sector_t s = raid5_compute_sector(conf, bn, 0,
4312 &dd_idx, NULL);
4313 sh2 = raid5_get_active_stripe(conf, s, 0, 1, 1);
4314 if (sh2 == NULL)
4315				/* so far only the early blocks of this stripe have
4316				 * been requested; when later blocks get requested
4317				 * we will try again
4318				 */
4319 continue;
4320 if (!test_bit(STRIPE_EXPANDING, &sh2->state) ||
4321 test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) {
4322
4323 raid5_release_stripe(sh2);
4324 continue;
4325 }
4326
4327
4328 init_async_submit(&submit, 0, tx, NULL, NULL, NULL);
4329 tx = async_memcpy(sh2->dev[dd_idx].page,
4330 sh->dev[i].page, 0, 0, STRIPE_SIZE,
4331 &submit);
4332
4333 set_bit(R5_Expanded, &sh2->dev[dd_idx].flags);
4334 set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags);
4335 for (j = 0; j < conf->raid_disks; j++)
4336 if (j != sh2->pd_idx &&
4337 j != sh2->qd_idx &&
4338 !test_bit(R5_Expanded, &sh2->dev[j].flags))
4339 break;
4340 if (j == conf->raid_disks) {
4341 set_bit(STRIPE_EXPAND_READY, &sh2->state);
4342 set_bit(STRIPE_HANDLE, &sh2->state);
4343 }
4344 raid5_release_stripe(sh2);
4345
4346 }
4347
4348 async_tx_quiesce(&tx);
4349}
4350
4351
4352/*
4353 * handle_stripe - do things to a stripe.
4354 *
4355 * The stripe is locked (STRIPE_ACTIVE) and then its state bits are examined
4356 * to see what needs to be done:
4357 *     schedule reads of missing or out-of-date blocks
4358 *     schedule parity checks or repairs
4359 *     schedule writes of drained data and recomputed parity
4360 *     return completed reads and writes to their callers
4361 *
4362 * analyse_stripe() below collects the relevant state into a
4363 * struct stripe_head_state for handle_stripe() to act on.
4364 */
4365static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
4366{
4367 struct r5conf *conf = sh->raid_conf;
4368 int disks = sh->disks;
4369 struct r5dev *dev;
4370 int i;
4371 int do_recovery = 0;
4372
4373 memset(s, 0, sizeof(*s));
4374
4375 s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state) && !sh->batch_head;
4376 s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state) && !sh->batch_head;
4377 s->failed_num[0] = -1;
4378 s->failed_num[1] = -1;
4379 s->log_failed = r5l_log_disk_error(conf);
4380
4381
4382 rcu_read_lock();
4383 for (i=disks; i--; ) {
4384 struct md_rdev *rdev;
4385 sector_t first_bad;
4386 int bad_sectors;
4387 int is_bad = 0;
4388
4389 dev = &sh->dev[i];
4390
4391 pr_debug("check %d: state 0x%lx read %p write %p written %p\n",
4392 i, dev->flags,
4393 dev->toread, dev->towrite, dev->written);
4394
4395		/* maybe we can reply to a read; new R5_Wantfill requests are
4396		 * only permitted while ops_complete_biofill is guaranteed to be
4397		 * inactive
4398		 */
4399 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread &&
4400 !test_bit(STRIPE_BIOFILL_RUN, &sh->state))
4401 set_bit(R5_Wantfill, &dev->flags);
4402
4403
4404 if (test_bit(R5_LOCKED, &dev->flags))
4405 s->locked++;
4406 if (test_bit(R5_UPTODATE, &dev->flags))
4407 s->uptodate++;
4408 if (test_bit(R5_Wantcompute, &dev->flags)) {
4409 s->compute++;
4410 BUG_ON(s->compute > 2);
4411 }
4412
4413 if (test_bit(R5_Wantfill, &dev->flags))
4414 s->to_fill++;
4415 else if (dev->toread)
4416 s->to_read++;
4417 if (dev->towrite) {
4418 s->to_write++;
4419 if (!test_bit(R5_OVERWRITE, &dev->flags))
4420 s->non_overwrite++;
4421 }
4422 if (dev->written)
4423 s->written++;
4424
4425		/* prefer the replacement for reads, but only if it has recovered
4426		 * far enough and has no bad blocks in this range */
4427 rdev = rcu_dereference(conf->disks[i].replacement);
4428 if (rdev && !test_bit(Faulty, &rdev->flags) &&
4429 rdev->recovery_offset >= sh->sector + STRIPE_SECTORS &&
4430 !is_badblock(rdev, sh->sector, STRIPE_SECTORS,
4431 &first_bad, &bad_sectors))
4432 set_bit(R5_ReadRepl, &dev->flags);
4433 else {
4434 if (rdev && !test_bit(Faulty, &rdev->flags))
4435 set_bit(R5_NeedReplace, &dev->flags);
4436 else
4437 clear_bit(R5_NeedReplace, &dev->flags);
4438 rdev = rcu_dereference(conf->disks[i].rdev);
4439 clear_bit(R5_ReadRepl, &dev->flags);
4440 }
4441 if (rdev && test_bit(Faulty, &rdev->flags))
4442 rdev = NULL;
4443 if (rdev) {
4444 is_bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS,
4445 &first_bad, &bad_sectors);
4446 if (s->blocked_rdev == NULL
4447 && (test_bit(Blocked, &rdev->flags)
4448 || is_bad < 0)) {
4449 if (is_bad < 0)
4450 set_bit(BlockedBadBlocks,
4451 &rdev->flags);
4452 s->blocked_rdev = rdev;
4453 atomic_inc(&rdev->nr_pending);
4454 }
4455 }
4456 clear_bit(R5_Insync, &dev->flags);
4457 if (!rdev)
4458 ;
4459 else if (is_bad) {
4460
4461 if (!test_bit(WriteErrorSeen, &rdev->flags) &&
4462 test_bit(R5_UPTODATE, &dev->flags)) {
4463 /* treat as in-sync, but with a read error
4464 * which we can now try to correct
4465 */
4466 set_bit(R5_Insync, &dev->flags);
4467 set_bit(R5_ReadError, &dev->flags);
4468 }
4469 } else if (test_bit(In_sync, &rdev->flags))
4470 set_bit(R5_Insync, &dev->flags);
4471 else if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset)
4472
4473 set_bit(R5_Insync, &dev->flags);
4474 else if (test_bit(R5_UPTODATE, &dev->flags) &&
4475 test_bit(R5_Expanded, &dev->flags))
4476 /* If we've reshaped into here, we assume it is Insync.
4477 * We will shortly update recovery_offset to make
4478 * it official.
4479 */
4480 set_bit(R5_Insync, &dev->flags);
4481
4482 if (test_bit(R5_WriteError, &dev->flags)) {
4483 /* This flag does not apply to '.replacement'
4484 * only to .rdev, so make sure to check that */
4485 struct md_rdev *rdev2 = rcu_dereference(
4486 conf->disks[i].rdev);
4487 if (rdev2 == rdev)
4488 clear_bit(R5_Insync, &dev->flags);
4489 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
4490 s->handle_bad_blocks = 1;
4491 atomic_inc(&rdev2->nr_pending);
4492 } else
4493 clear_bit(R5_WriteError, &dev->flags);
4494 }
4495 if (test_bit(R5_MadeGood, &dev->flags)) {
4496 /* This flag does not apply to '.replacement'
4497 * only to .rdev, so make sure to check that */
4498 struct md_rdev *rdev2 = rcu_dereference(
4499 conf->disks[i].rdev);
4500 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
4501 s->handle_bad_blocks = 1;
4502 atomic_inc(&rdev2->nr_pending);
4503 } else
4504 clear_bit(R5_MadeGood, &dev->flags);
4505 }
4506 if (test_bit(R5_MadeGoodRepl, &dev->flags)) {
4507 struct md_rdev *rdev2 = rcu_dereference(
4508 conf->disks[i].replacement);
4509 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
4510 s->handle_bad_blocks = 1;
4511 atomic_inc(&rdev2->nr_pending);
4512 } else
4513 clear_bit(R5_MadeGoodRepl, &dev->flags);
4514 }
4515 if (!test_bit(R5_Insync, &dev->flags)) {
4516
4517 clear_bit(R5_ReadError, &dev->flags);
4518 clear_bit(R5_ReWrite, &dev->flags);
4519 }
4520 if (test_bit(R5_ReadError, &dev->flags))
4521 clear_bit(R5_Insync, &dev->flags);
4522 if (!test_bit(R5_Insync, &dev->flags)) {
4523 if (s->failed < 2)
4524 s->failed_num[s->failed] = i;
4525 s->failed++;
4526 if (rdev && !test_bit(Faulty, &rdev->flags))
4527 do_recovery = 1;
4528 else if (!rdev) {
4529 rdev = rcu_dereference(
4530 conf->disks[i].replacement);
4531 if (rdev && !test_bit(Faulty, &rdev->flags))
4532 do_recovery = 1;
4533 }
4534 }
4535
4536 if (test_bit(R5_InJournal, &dev->flags))
4537 s->injournal++;
4538 if (test_bit(R5_InJournal, &dev->flags) && dev->written)
4539 s->just_cached++;
4540 }
4541 if (test_bit(STRIPE_SYNCING, &sh->state)) {
4542 /* If there is a failed device being replaced,
4543 *     we must be recovering.
4544 * else if we are after recovery_cp, we must be syncing
4545 * else if MD_RECOVERY_REQUESTED is set, we also are syncing.
4546 * else we can only be replacing
4547 * sync and recovery both need to read all devices, and so
4548 * use the same flag.
4549 */
4550 if (do_recovery ||
4551 sh->sector >= conf->mddev->recovery_cp ||
4552 test_bit(MD_RECOVERY_REQUESTED, &(conf->mddev->recovery)))
4553 s->syncing = 1;
4554 else
4555 s->replacing = 1;
4556 }
4557 rcu_read_unlock();
4558}
4559
4560static int clear_batch_ready(struct stripe_head *sh)
4561{
4562 /* Return '1' if this stripe is a member of a batch that is still
4563 * being handled via its batch head, or '0' if it is a lone stripe
4564 * or a batch head that can now be handled directly.
4565 */
4566 struct stripe_head *tmp;
4567 if (!test_and_clear_bit(STRIPE_BATCH_READY, &sh->state))
4568 return (sh->batch_head && sh->batch_head != sh);
4569 spin_lock(&sh->stripe_lock);
4570 if (!sh->batch_head) {
4571 spin_unlock(&sh->stripe_lock);
4572 return 0;
4573 }
4574
4575 /*
4576 * this stripe could be added to a batch list before we check
4577 * BATCH_READY, skip it
4578 */
4579 if (sh->batch_head != sh) {
4580 spin_unlock(&sh->stripe_lock);
4581 return 1;
4582 }
4583 spin_lock(&sh->batch_lock);
4584 list_for_each_entry(tmp, &sh->batch_list, batch_list)
4585 clear_bit(STRIPE_BATCH_READY, &tmp->state);
4586 spin_unlock(&sh->batch_lock);
4587 spin_unlock(&sh->stripe_lock);
4588
4589 /*
4590 * BATCH_READY is cleared, no new stripes can be added.
4591 * batch_list can be accessed without lock
4592 */
4593 return 0;
4594}
4595
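/*
 * break_stripe_batch_list - detach every member stripe from its batch head.
 *
 * Each member is removed from the batch list, inherits the relevant state,
 * check_state and reconstruct_state from the head, and is queued for
 * individual handling (STRIPE_HANDLE) when it matches handle_flags (or when
 * handle_flags is 0).
 */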
4596static void break_stripe_batch_list(struct stripe_head *head_sh,
4597 unsigned long handle_flags)
4598{
4599 struct stripe_head *sh, *next;
4600 int i;
4601 int do_wakeup = 0;
4602
4603 list_for_each_entry_safe(sh, next, &head_sh->batch_list, batch_list) {
4604
4605 list_del_init(&sh->batch_list);
4606
4607 WARN_ONCE(sh->state & ((1 << STRIPE_ACTIVE) |
4608 (1 << STRIPE_SYNCING) |
4609 (1 << STRIPE_REPLACED) |
4610 (1 << STRIPE_DELAYED) |
4611 (1 << STRIPE_BIT_DELAY) |
4612 (1 << STRIPE_FULL_WRITE) |
4613 (1 << STRIPE_BIOFILL_RUN) |
4614 (1 << STRIPE_COMPUTE_RUN) |
4615 (1 << STRIPE_OPS_REQ_PENDING) |
4616 (1 << STRIPE_DISCARD) |
4617 (1 << STRIPE_BATCH_READY) |
4618 (1 << STRIPE_BATCH_ERR) |
4619 (1 << STRIPE_BITMAP_PENDING)),
4620 "stripe state: %lx\n", sh->state);
4621 WARN_ONCE(head_sh->state & ((1 << STRIPE_DISCARD) |
4622 (1 << STRIPE_REPLACED)),
4623 "head stripe state: %lx\n", head_sh->state);
4624
4625 set_mask_bits(&sh->state, ~(STRIPE_EXPAND_SYNC_FLAGS |
4626 (1 << STRIPE_PREREAD_ACTIVE) |
4627 (1 << STRIPE_DEGRADED) |
4628 (1 << STRIPE_ON_UNPLUG_LIST)),
4629 head_sh->state & (1 << STRIPE_INSYNC));
4630
4631 sh->check_state = head_sh->check_state;
4632 sh->reconstruct_state = head_sh->reconstruct_state;
4633 spin_lock_irq(&sh->stripe_lock);
4634 sh->batch_head = NULL;
4635 spin_unlock_irq(&sh->stripe_lock);
4636 for (i = 0; i < sh->disks; i++) {
4637 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
4638 do_wakeup = 1;
4639 sh->dev[i].flags = head_sh->dev[i].flags &
4640 (~((1 << R5_WriteError) | (1 << R5_Overlap)));
4641 }
4642 if (handle_flags == 0 ||
4643 sh->state & handle_flags)
4644 set_bit(STRIPE_HANDLE, &sh->state);
4645 raid5_release_stripe(sh);
4646 }
4647 spin_lock_irq(&head_sh->stripe_lock);
4648 head_sh->batch_head = NULL;
4649 spin_unlock_irq(&head_sh->stripe_lock);
4650 for (i = 0; i < head_sh->disks; i++)
4651 if (test_and_clear_bit(R5_Overlap, &head_sh->dev[i].flags))
4652 do_wakeup = 1;
4653 if (head_sh->state & handle_flags)
4654 set_bit(STRIPE_HANDLE, &head_sh->state);
4655
4656 if (do_wakeup)
4657 wake_up(&head_sh->raid_conf->wait_for_overlap);
4658}
4659
4660static void handle_stripe(struct stripe_head *sh)
4661{
4662 struct stripe_head_state s;
4663 struct r5conf *conf = sh->raid_conf;
4664 int i;
4665 int prexor;
4666 int disks = sh->disks;
4667 struct r5dev *pdev, *qdev;
4668
4669 clear_bit(STRIPE_HANDLE, &sh->state);
4670 if (test_and_set_bit_lock(STRIPE_ACTIVE, &sh->state)) {
4671 /* already being handled, ensure it gets handled
4672 * again when current action finishes */
4673 set_bit(STRIPE_HANDLE, &sh->state);
4674 return;
4675 }
4676
4677 if (clear_batch_ready(sh)) {
4678 clear_bit_unlock(STRIPE_ACTIVE, &sh->state);
4679 return;
4680 }
4681
4682 if (test_and_clear_bit(STRIPE_BATCH_ERR, &sh->state))
4683 break_stripe_batch_list(sh, 0);
4684
4685 if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) && !sh->batch_head) {
4686 spin_lock(&sh->stripe_lock);
4687 /*
4688 * Cannot process 'sync' concurrently with 'discard'.
4689 * Flush data in r5cache before 'sync'.
4690 */
4691 if (!test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state) &&
4692 !test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state) &&
4693 !test_bit(STRIPE_DISCARD, &sh->state) &&
4694 test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
4695 set_bit(STRIPE_SYNCING, &sh->state);
4696 clear_bit(STRIPE_INSYNC, &sh->state);
4697 clear_bit(STRIPE_REPLACED, &sh->state);
4698 }
4699 spin_unlock(&sh->stripe_lock);
4700 }
4701 clear_bit(STRIPE_DELAYED, &sh->state);
4702
4703 pr_debug("handling stripe %llu, state=%#lx cnt=%d, "
4704 "pd_idx=%d, qd_idx=%d\n, check:%d, reconstruct:%d\n",
4705 (unsigned long long)sh->sector, sh->state,
4706 atomic_read(&sh->count), sh->pd_idx, sh->qd_idx,
4707 sh->check_state, sh->reconstruct_state);
4708
4709 analyse_stripe(sh, &s);
4710
4711 if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
4712 goto finish;
4713
4714 if (s.handle_bad_blocks ||
4715 test_bit(MD_SB_CHANGE_PENDING, &conf->mddev->sb_flags)) {
4716 set_bit(STRIPE_HANDLE, &sh->state);
4717 goto finish;
4718 }
4719
4720 if (unlikely(s.blocked_rdev)) {
4721 if (s.syncing || s.expanding || s.expanded ||
4722 s.replacing || s.to_write || s.written) {
4723 set_bit(STRIPE_HANDLE, &sh->state);
4724 goto finish;
4725 }
4726
4727 rdev_dec_pending(s.blocked_rdev, conf->mddev);
4728 s.blocked_rdev = NULL;
4729 }
4730
4731 if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) {
4732 set_bit(STRIPE_OP_BIOFILL, &s.ops_request);
4733 set_bit(STRIPE_BIOFILL_RUN, &sh->state);
4734 }
4735
4736 pr_debug("locked=%d uptodate=%d to_read=%d"
4737 " to_write=%d failed=%d failed_num=%d,%d\n",
4738 s.locked, s.uptodate, s.to_read, s.to_write, s.failed,
4739 s.failed_num[0], s.failed_num[1]);
4740
4741 /* check if the array has lost more than max_degraded devices and,
4742 * if so, some requests might need to be failed.
4743 * When the journal device has failed (log_failed), only process
4744 * the stripe if there is still data that needs writing to the
4745 * raid disks.
4746 */
4747 if (s.failed > conf->max_degraded ||
4748 (s.log_failed && s.injournal == 0)) {
4749 sh->check_state = 0;
4750 sh->reconstruct_state = 0;
4751 break_stripe_batch_list(sh, 0);
4752 if (s.to_read+s.to_write+s.written)
4753 handle_failed_stripe(conf, sh, &s, disks);
4754 if (s.syncing + s.replacing)
4755 handle_failed_sync(conf, sh, &s);
4756 }
4757
4758 /* Now we check to see if any write operations have recently
4759 * completed
4760 */
4761 prexor = 0;
4762 if (sh->reconstruct_state == reconstruct_state_prexor_drain_result)
4763 prexor = 1;
4764 if (sh->reconstruct_state == reconstruct_state_drain_result ||
4765 sh->reconstruct_state == reconstruct_state_prexor_drain_result) {
4766 sh->reconstruct_state = reconstruct_state_idle;
4767
4768 /* All the 'written' buffers and the parity block are ready to
4769 * be written back to disk
4770 */
4771 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags) &&
4772 !test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags));
4773 BUG_ON(sh->qd_idx >= 0 &&
4774 !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags) &&
4775 !test_bit(R5_Discard, &sh->dev[sh->qd_idx].flags));
4776 for (i = disks; i--; ) {
4777 struct r5dev *dev = &sh->dev[i];
4778 if (test_bit(R5_LOCKED, &dev->flags) &&
4779 (i == sh->pd_idx || i == sh->qd_idx ||
4780 dev->written || test_bit(R5_InJournal,
4781 &dev->flags))) {
4782 pr_debug("Writing block %d\n", i);
4783 set_bit(R5_Wantwrite, &dev->flags);
4784 if (prexor)
4785 continue;
4786 if (s.failed > 1)
4787 continue;
4788 if (!test_bit(R5_Insync, &dev->flags) ||
4789 ((i == sh->pd_idx || i == sh->qd_idx) &&
4790 s.failed == 0))
4791 set_bit(STRIPE_INSYNC, &sh->state);
4792 }
4793 }
4794 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
4795 s.dec_preread_active = 1;
4796 }
4797
4798 /*
4799 * might be able to return some write requests if the parity blocks
4800 * are safe, or on a failed drive
4801 */
4802 pdev = &sh->dev[sh->pd_idx];
4803 s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx)
4804 || (s.failed >= 2 && s.failed_num[1] == sh->pd_idx);
4805 qdev = &sh->dev[sh->qd_idx];
4806 s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx)
4807 || (s.failed >= 2 && s.failed_num[1] == sh->qd_idx)
4808 || conf->level < 6;
4809
4810 if (s.written &&
4811 (s.p_failed || ((test_bit(R5_Insync, &pdev->flags)
4812 && !test_bit(R5_LOCKED, &pdev->flags)
4813 && (test_bit(R5_UPTODATE, &pdev->flags) ||
4814 test_bit(R5_Discard, &pdev->flags))))) &&
4815 (s.q_failed || ((test_bit(R5_Insync, &qdev->flags)
4816 && !test_bit(R5_LOCKED, &qdev->flags)
4817 && (test_bit(R5_UPTODATE, &qdev->flags) ||
4818 test_bit(R5_Discard, &qdev->flags))))))
4819 handle_stripe_clean_event(conf, sh, disks);
4820
4821 if (s.just_cached)
4822 r5c_handle_cached_data_endio(conf, sh, disks);
4823 log_stripe_write_finished(sh);
4824
4825 /* Now we might consider reading some blocks, either to check/generate
4826 * parity, or to satisfy requests
4827 * or to load a block that is being partially written.
4828 */
4829 if (s.to_read || s.non_overwrite
4830 || (conf->level == 6 && s.to_write && s.failed)
4831 || (s.syncing && (s.uptodate + s.compute < disks))
4832 || s.replacing
4833 || s.expanding)
4834 handle_stripe_fill(sh, &s, disks);
4835
4836 /*
4837 * When the stripe finishes full journal write cycle (write to journal
4838 * and raid disk), this is the clean up procedure so it is ready for
4839 * next operation.
4840 */
4841 r5c_finish_stripe_write_out(conf, sh, &s);
4842
4843 /*
4844 * Now to consider new write requests, cache write back and what else,
4845 * if anything should be read.  We do not handle new writes when:
4846 * 1/ A 'write' operation (copy+xor) is already in flight.
4847 * 2/ A 'check' operation is in flight, as it may clobber the parity
4848 *    block.
4849 * 3/ A r5c cache log write is in flight.
4850 */
4851
4852 if (!sh->reconstruct_state && !sh->check_state && !sh->log_io) {
4853 if (!r5c_is_writeback(conf->log)) {
4854 if (s.to_write)
4855 handle_stripe_dirtying(conf, sh, &s, disks);
4856 } else {
4857 int ret = 0;
4858
4859
4860 if (s.to_write)
4861 ret = r5c_try_caching_write(conf, sh, &s,
4862 disks);
4863
4864 /*
4865 * If the caching phase failed (ret == -EAGAIN), or the stripe is
4866 * under reclaim (!caching && injournal),
4867 *
4868 * fall back to handle_stripe_dirtying()
4869 */
4870 if (ret == -EAGAIN ||
4871 /* stripe under reclaim: !caching && injournal */
4872 (!test_bit(STRIPE_R5C_CACHING, &sh->state) &&
4873 s.injournal > 0)) {
4874 ret = handle_stripe_dirtying(conf, sh, &s,
4875 disks);
4876 if (ret == -EAGAIN)
4877 goto finish;
4878 }
4879 }
4880 }
4881
4882 /* maybe we need to check and possibly fix the parity for this stripe
4883 * Any reads will already have been scheduled, so we just see if enough
4884 * data is available.  The parity check is held off while parity
4885 * dependent operations are in flight.
4886 */
4887 if (sh->check_state ||
4888 (s.syncing && s.locked == 0 &&
4889 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
4890 !test_bit(STRIPE_INSYNC, &sh->state))) {
4891 if (conf->level == 6)
4892 handle_parity_checks6(conf, sh, &s, disks);
4893 else
4894 handle_parity_checks5(conf, sh, &s, disks);
4895 }
4896
4897 if ((s.replacing || s.syncing) && s.locked == 0
4898 && !test_bit(STRIPE_COMPUTE_RUN, &sh->state)
4899 && !test_bit(STRIPE_REPLACED, &sh->state)) {
4900
4901 for (i = 0; i < conf->raid_disks; i++)
4902 if (test_bit(R5_NeedReplace, &sh->dev[i].flags)) {
4903 WARN_ON(!test_bit(R5_UPTODATE, &sh->dev[i].flags));
4904 set_bit(R5_WantReplace, &sh->dev[i].flags);
4905 set_bit(R5_LOCKED, &sh->dev[i].flags);
4906 s.locked++;
4907 }
4908 if (s.replacing)
4909 set_bit(STRIPE_INSYNC, &sh->state);
4910 set_bit(STRIPE_REPLACED, &sh->state);
4911 }
4912 if ((s.syncing || s.replacing) && s.locked == 0 &&
4913 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
4914 test_bit(STRIPE_INSYNC, &sh->state)) {
4915 md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
4916 clear_bit(STRIPE_SYNCING, &sh->state);
4917 if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags))
4918 wake_up(&conf->wait_for_overlap);
4919 }
4920
4921 /* If the failed drives are just a ReadError, then we might need
4922 * to progress the repair/check process
4923 */
4924 if (s.failed <= conf->max_degraded && !conf->mddev->ro)
4925 for (i = 0; i < s.failed; i++) {
4926 struct r5dev *dev = &sh->dev[s.failed_num[i]];
4927 if (test_bit(R5_ReadError, &dev->flags)
4928 && !test_bit(R5_LOCKED, &dev->flags)
4929 && test_bit(R5_UPTODATE, &dev->flags)
4930 ) {
4931 if (!test_bit(R5_ReWrite, &dev->flags)) {
4932 set_bit(R5_Wantwrite, &dev->flags);
4933 set_bit(R5_ReWrite, &dev->flags);
4934 set_bit(R5_LOCKED, &dev->flags);
4935 s.locked++;
4936 } else {
4937
4938 set_bit(R5_Wantread, &dev->flags);
4939 set_bit(R5_LOCKED, &dev->flags);
4940 s.locked++;
4941 }
4942 }
4943 }
4944
4945 /* Finish reconstruct operations initiated by the expansion process */
4946 if (sh->reconstruct_state == reconstruct_state_result) {
4947 struct stripe_head *sh_src
4948 = raid5_get_active_stripe(conf, sh->sector, 1, 1, 1);
4949 if (sh_src && test_bit(STRIPE_EXPAND_SOURCE, &sh_src->state)) {
4950 /* sh cannot be written until sh_src has been read,
4951 * so arrange for sh to be delayed a little
4952 */
4953 set_bit(STRIPE_DELAYED, &sh->state);
4954 set_bit(STRIPE_HANDLE, &sh->state);
4955 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE,
4956 &sh_src->state))
4957 atomic_inc(&conf->preread_active_stripes);
4958 raid5_release_stripe(sh_src);
4959 goto finish;
4960 }
4961 if (sh_src)
4962 raid5_release_stripe(sh_src);
4963
4964 sh->reconstruct_state = reconstruct_state_idle;
4965 clear_bit(STRIPE_EXPANDING, &sh->state);
4966 for (i = conf->raid_disks; i--; ) {
4967 set_bit(R5_Wantwrite, &sh->dev[i].flags);
4968 set_bit(R5_LOCKED, &sh->dev[i].flags);
4969 s.locked++;
4970 }
4971 }
4972
4973 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) &&
4974 !sh->reconstruct_state) {
4975 /* Need to write out all blocks after computing parity */
4976 sh->disks = conf->raid_disks;
4977 stripe_set_idx(sh->sector, conf, 0, sh);
4978 schedule_reconstruction(sh, &s, 1, 1);
4979 } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) {
4980 clear_bit(STRIPE_EXPAND_READY, &sh->state);
4981 atomic_dec(&conf->reshape_stripes);
4982 wake_up(&conf->wait_for_overlap);
4983 md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
4984 }
4985
4986 if (s.expanding && s.locked == 0 &&
4987 !test_bit(STRIPE_COMPUTE_RUN, &sh->state))
4988 handle_stripe_expansion(conf, sh);
4989
4990finish:
4991
4992 if (unlikely(s.blocked_rdev)) {
4993 if (conf->mddev->external)
4994 md_wait_for_blocked_rdev(s.blocked_rdev,
4995 conf->mddev);
4996 else
4997 /* Internal metadata will immediately
4998 * be written by raid5d, so we don't
4999 * need to wait here.
5000 */
5001 rdev_dec_pending(s.blocked_rdev,
5002 conf->mddev);
5003 }
5004
5005 if (s.handle_bad_blocks)
5006 for (i = disks; i--; ) {
5007 struct md_rdev *rdev;
5008 struct r5dev *dev = &sh->dev[i];
5009 if (test_and_clear_bit(R5_WriteError, &dev->flags)) {
5010
5011 rdev = conf->disks[i].rdev;
5012 if (!rdev_set_badblocks(rdev, sh->sector,
5013 STRIPE_SECTORS, 0))
5014 md_error(conf->mddev, rdev);
5015 rdev_dec_pending(rdev, conf->mddev);
5016 }
5017 if (test_and_clear_bit(R5_MadeGood, &dev->flags)) {
5018 rdev = conf->disks[i].rdev;
5019 rdev_clear_badblocks(rdev, sh->sector,
5020 STRIPE_SECTORS, 0);
5021 rdev_dec_pending(rdev, conf->mddev);
5022 }
5023 if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) {
5024 rdev = conf->disks[i].replacement;
5025 if (!rdev)
5026
5027 rdev = conf->disks[i].rdev;
5028 rdev_clear_badblocks(rdev, sh->sector,
5029 STRIPE_SECTORS, 0);
5030 rdev_dec_pending(rdev, conf->mddev);
5031 }
5032 }
5033
5034 if (s.ops_request)
5035 raid_run_ops(sh, s.ops_request);
5036
5037 ops_run_io(sh, &s);
5038
5039 if (s.dec_preread_active) {
5040 /* We delay this until after ops_run_io so that if make_request
5041 * is waiting on a flush, it won't continue until the writes
5042 * have actually been submitted.
5043 */
5044 atomic_dec(&conf->preread_active_stripes);
5045 if (atomic_read(&conf->preread_active_stripes) <
5046 IO_THRESHOLD)
5047 md_wakeup_thread(conf->mddev->thread);
5048 }
5049
5050 clear_bit_unlock(STRIPE_ACTIVE, &sh->state);
5051}
5052
5053static void raid5_activate_delayed(struct r5conf *conf)
5054{
5055 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) {
5056 while (!list_empty(&conf->delayed_list)) {
5057 struct list_head *l = conf->delayed_list.next;
5058 struct stripe_head *sh;
5059 sh = list_entry(l, struct stripe_head, lru);
5060 list_del_init(l);
5061 clear_bit(STRIPE_DELAYED, &sh->state);
5062 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
5063 atomic_inc(&conf->preread_active_stripes);
5064 list_add_tail(&sh->lru, &conf->hold_list);
5065 raid5_wakeup_stripe_thread(sh);
5066 }
5067 }
5068}
5069
5070static void activate_bit_delay(struct r5conf *conf,
5071 struct list_head *temp_inactive_list)
5072{
5073
5074 struct list_head head;
5075 list_add(&head, &conf->bitmap_list);
5076 list_del_init(&conf->bitmap_list);
5077 while (!list_empty(&head)) {
5078 struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru);
5079 int hash;
5080 list_del_init(&sh->lru);
5081 atomic_inc(&sh->count);
5082 hash = sh->hash_lock_index;
5083 __release_stripe(conf, sh, &temp_inactive_list[hash]);
5084 }
5085}
5086
5087static int raid5_congested(struct mddev *mddev, int bits)
5088{
5089 struct r5conf *conf = mddev->private;
5090
5091
5092
5093
5094
5095 if (test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state))
5096 return 1;
5097
5098
5099 if (test_bit(R5C_LOG_TIGHT, &conf->cache_state))
5100 return 1;
5101 if (conf->quiesce)
5102 return 1;
5103 if (atomic_read(&conf->empty_inactive_list_nr))
5104 return 1;
5105
5106 return 0;
5107}
5108
5109static int in_chunk_boundary(struct mddev *mddev, struct bio *bio)
5110{
5111 struct r5conf *conf = mddev->private;
5112 sector_t sector = bio->bi_iter.bi_sector;
5113 unsigned int chunk_sectors;
5114 unsigned int bio_sectors = bio_sectors(bio);
5115
5116 WARN_ON_ONCE(bio->bi_partno);
5117
5118 chunk_sectors = min(conf->chunk_sectors, conf->prev_chunk_sectors);
5119 return chunk_sectors >=
5120 ((sector & (chunk_sectors - 1)) + bio_sectors);
5121}
5122
5123 /*
5124 * add bio to the retry LIFO (in O(1) ... we are in interrupt),
5125 * later sampled by raid5d.
5126 */
5127 static void add_bio_to_retry(struct bio *bi, struct r5conf *conf)
5128{
5129 unsigned long flags;
5130
5131 spin_lock_irqsave(&conf->device_lock, flags);
5132
5133 bi->bi_next = conf->retry_read_aligned_list;
5134 conf->retry_read_aligned_list = bi;
5135
5136 spin_unlock_irqrestore(&conf->device_lock, flags);
5137 md_wakeup_thread(conf->mddev->thread);
5138}
5139
5140static struct bio *remove_bio_from_retry(struct r5conf *conf,
5141 unsigned int *offset)
5142{
5143 struct bio *bi;
5144
5145 bi = conf->retry_read_aligned;
5146 if (bi) {
5147 *offset = conf->retry_read_offset;
5148 conf->retry_read_aligned = NULL;
5149 return bi;
5150 }
5151 bi = conf->retry_read_aligned_list;
5152 if (bi) {
5153 conf->retry_read_aligned_list = bi->bi_next;
5154 bi->bi_next = NULL;
5155 *offset = 0;
5156 }
5157
5158 return bi;
5159}
5160
5161 /*
5162 * raid5_align_endio checks whether the chunk-aligned read succeeded and,
5163 * if it did, calls bio_endio on the original bio (having bio_put the
5164 * cloned bio first).
5165 * If the read failed, the original bio is queued for retry via raid5d.
5166 */
5167static void raid5_align_endio(struct bio *bi)
5168{
5169 struct bio* raid_bi = bi->bi_private;
5170 struct mddev *mddev;
5171 struct r5conf *conf;
5172 struct md_rdev *rdev;
5173 blk_status_t error = bi->bi_status;
5174
5175 bio_put(bi);
5176
5177 rdev = (void*)raid_bi->bi_next;
5178 raid_bi->bi_next = NULL;
5179 mddev = rdev->mddev;
5180 conf = mddev->private;
5181
5182 rdev_dec_pending(rdev, conf->mddev);
5183
5184 if (!error) {
5185 bio_endio(raid_bi);
5186 if (atomic_dec_and_test(&conf->active_aligned_reads))
5187 wake_up(&conf->wait_for_quiescent);
5188 return;
5189 }
5190
5191 pr_debug("raid5_align_endio : io error...handing IO for a retry\n");
5192
5193 add_bio_to_retry(raid_bi, conf);
5194}
5195
5196static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
5197{
5198 struct r5conf *conf = mddev->private;
5199 int dd_idx;
5200 struct bio* align_bi;
5201 struct md_rdev *rdev;
5202 sector_t end_sector;
5203
5204 if (!in_chunk_boundary(mddev, raid_bio)) {
5205 pr_debug("%s: non aligned\n", __func__);
5206 return 0;
5207 }
5208
5209
5210
5211 align_bi = bio_clone_fast(raid_bio, GFP_NOIO, &mddev->bio_set);
5212 if (!align_bi)
5213 return 0;
5214
5215
5216
5217
5218 align_bi->bi_end_io = raid5_align_endio;
5219 align_bi->bi_private = raid_bio;
5220
5221
5222
5223 align_bi->bi_iter.bi_sector =
5224 raid5_compute_sector(conf, raid_bio->bi_iter.bi_sector,
5225 0, &dd_idx, NULL);
5226
5227 end_sector = bio_end_sector(align_bi);
5228 rcu_read_lock();
5229 rdev = rcu_dereference(conf->disks[dd_idx].replacement);
5230 if (!rdev || test_bit(Faulty, &rdev->flags) ||
5231 rdev->recovery_offset < end_sector) {
5232 rdev = rcu_dereference(conf->disks[dd_idx].rdev);
5233 if (rdev &&
5234 (test_bit(Faulty, &rdev->flags) ||
5235 !(test_bit(In_sync, &rdev->flags) ||
5236 rdev->recovery_offset >= end_sector)))
5237 rdev = NULL;
5238 }
5239
5240 if (r5c_big_stripe_cached(conf, align_bi->bi_iter.bi_sector)) {
5241 rcu_read_unlock();
5242 bio_put(align_bi);
5243 return 0;
5244 }
5245
5246 if (rdev) {
5247 sector_t first_bad;
5248 int bad_sectors;
5249
5250 atomic_inc(&rdev->nr_pending);
5251 rcu_read_unlock();
5252 raid_bio->bi_next = (void*)rdev;
5253 bio_set_dev(align_bi, rdev->bdev);
5254
5255 if (is_badblock(rdev, align_bi->bi_iter.bi_sector,
5256 bio_sectors(align_bi),
5257 &first_bad, &bad_sectors)) {
5258 bio_put(align_bi);
5259 rdev_dec_pending(rdev, mddev);
5260 return 0;
5261 }
5262
5263
5264 align_bi->bi_iter.bi_sector += rdev->data_offset;
5265
5266 spin_lock_irq(&conf->device_lock);
5267 wait_event_lock_irq(conf->wait_for_quiescent,
5268 conf->quiesce == 0,
5269 conf->device_lock);
5270 atomic_inc(&conf->active_aligned_reads);
5271 spin_unlock_irq(&conf->device_lock);
5272
5273 if (mddev->gendisk)
5274 trace_block_bio_remap(align_bi->bi_disk->queue,
5275 align_bi, disk_devt(mddev->gendisk),
5276 raid_bio->bi_iter.bi_sector);
5277 generic_make_request(align_bi);
5278 return 1;
5279 } else {
5280 rcu_read_unlock();
5281 bio_put(align_bi);
5282 return 0;
5283 }
5284}
5285
5286static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio)
5287{
5288 struct bio *split;
5289 sector_t sector = raid_bio->bi_iter.bi_sector;
5290 unsigned chunk_sects = mddev->chunk_sectors;
5291 unsigned sectors = chunk_sects - (sector & (chunk_sects-1));
5292
5293 if (sectors < bio_sectors(raid_bio)) {
5294 struct r5conf *conf = mddev->private;
5295 split = bio_split(raid_bio, sectors, GFP_NOIO, &conf->bio_split);
5296 bio_chain(split, raid_bio);
5297 generic_make_request(raid_bio);
5298 raid_bio = split;
5299 }
5300
5301 if (!raid5_read_one_chunk(mddev, raid_bio))
5302 return raid_bio;
5303
5304 return NULL;
5305}
5306
5307 /* __get_priority_stripe - get the next stripe to process
5308 *
5309 * Full stripe writes are allowed to pass preread active stripes up until
5310 * the bypass_threshold is exceeded. In general the bypass_count
5311 * increments when the handle_list is handled before the hold_list; however, it
5312 * won't clear the bypass_count if the maximum number of full stripe writes
5313 * matches the priority of the most recently added stripe.
5314 */
5315
5316
5317static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group)
5318{
5319 struct stripe_head *sh, *tmp;
5320 struct list_head *handle_list = NULL;
5321 struct r5worker_group *wg;
5322 bool second_try = !r5c_is_writeback(conf->log) &&
5323 !r5l_log_disk_error(conf);
5324 bool try_loprio = test_bit(R5C_LOG_TIGHT, &conf->cache_state) ||
5325 r5l_log_disk_error(conf);
5326
5327again:
5328 wg = NULL;
5329 sh = NULL;
5330 if (conf->worker_cnt_per_group == 0) {
5331 handle_list = try_loprio ? &conf->loprio_list :
5332 &conf->handle_list;
5333 } else if (group != ANY_GROUP) {
5334 handle_list = try_loprio ? &conf->worker_groups[group].loprio_list :
5335 &conf->worker_groups[group].handle_list;
5336 wg = &conf->worker_groups[group];
5337 } else {
5338 int i;
5339 for (i = 0; i < conf->group_cnt; i++) {
5340 handle_list = try_loprio ? &conf->worker_groups[i].loprio_list :
5341 &conf->worker_groups[i].handle_list;
5342 wg = &conf->worker_groups[i];
5343 if (!list_empty(handle_list))
5344 break;
5345 }
5346 }
5347
5348 pr_debug("%s: handle: %s hold: %s full_writes: %d bypass_count: %d\n",
5349 __func__,
5350 list_empty(handle_list) ? "empty" : "busy",
5351 list_empty(&conf->hold_list) ? "empty" : "busy",
5352 atomic_read(&conf->pending_full_writes), conf->bypass_count);
5353
5354 if (!list_empty(handle_list)) {
5355 sh = list_entry(handle_list->next, typeof(*sh), lru);
5356
5357 if (list_empty(&conf->hold_list))
5358 conf->bypass_count = 0;
5359 else if (!test_bit(STRIPE_IO_STARTED, &sh->state)) {
5360 if (conf->hold_list.next == conf->last_hold)
5361 conf->bypass_count++;
5362 else {
5363 conf->last_hold = conf->hold_list.next;
5364 conf->bypass_count -= conf->bypass_threshold;
5365 if (conf->bypass_count < 0)
5366 conf->bypass_count = 0;
5367 }
5368 }
5369 } else if (!list_empty(&conf->hold_list) &&
5370 ((conf->bypass_threshold &&
5371 conf->bypass_count > conf->bypass_threshold) ||
5372 atomic_read(&conf->pending_full_writes) == 0)) {
5373
5374 list_for_each_entry(tmp, &conf->hold_list, lru) {
5375 if (conf->worker_cnt_per_group == 0 ||
5376 group == ANY_GROUP ||
5377 !cpu_online(tmp->cpu) ||
5378 cpu_to_group(tmp->cpu) == group) {
5379 sh = tmp;
5380 break;
5381 }
5382 }
5383
5384 if (sh) {
5385 conf->bypass_count -= conf->bypass_threshold;
5386 if (conf->bypass_count < 0)
5387 conf->bypass_count = 0;
5388 }
5389 wg = NULL;
5390 }
5391
5392 if (!sh) {
5393 if (second_try)
5394 return NULL;
5395 second_try = true;
5396 try_loprio = !try_loprio;
5397 goto again;
5398 }
5399
5400 if (wg) {
5401 wg->stripes_cnt--;
5402 sh->group = NULL;
5403 }
5404 list_del_init(&sh->lru);
5405 BUG_ON(atomic_inc_return(&sh->count) != 1);
5406 return sh;
5407}
5408
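/*
 * Per-plug context: stripes released while a blk_plug is active are parked
 * on cb->list and only moved to the per-hash inactive lists when the plug
 * is flushed (raid5_unplug), which batches the lock traffic.
 */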
5409struct raid5_plug_cb {
5410 struct blk_plug_cb cb;
5411 struct list_head list;
5412 struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS];
5413};
5414
5415static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)
5416{
5417 struct raid5_plug_cb *cb = container_of(
5418 blk_cb, struct raid5_plug_cb, cb);
5419 struct stripe_head *sh;
5420 struct mddev *mddev = cb->cb.data;
5421 struct r5conf *conf = mddev->private;
5422 int cnt = 0;
5423 int hash;
5424
5425 if (cb->list.next && !list_empty(&cb->list)) {
5426 spin_lock_irq(&conf->device_lock);
5427 while (!list_empty(&cb->list)) {
5428 sh = list_first_entry(&cb->list, struct stripe_head, lru);
5429 list_del_init(&sh->lru);
5430 /*
5431 * avoid a race where release_stripe_plug() sees
5432 * STRIPE_ON_UNPLUG_LIST clear but the stripe
5433 * is still in our list
5434 */
5435 smp_mb__before_atomic();
5436 clear_bit(STRIPE_ON_UNPLUG_LIST, &sh->state);
5437 /*
5438 * STRIPE_ON_RELEASE_LIST could be set here. In that
5439 * case, the count is always > 1 here
5440 */
5441 hash = sh->hash_lock_index;
5442 __release_stripe(conf, sh, &cb->temp_inactive_list[hash]);
5443 cnt++;
5444 }
5445 spin_unlock_irq(&conf->device_lock);
5446 }
5447 release_inactive_stripe_list(conf, cb->temp_inactive_list,
5448 NR_STRIPE_HASH_LOCKS);
5449 if (mddev->queue)
5450 trace_block_unplug(mddev->queue, cnt, !from_schedule);
5451 kfree(cb);
5452}
5453
5454static void release_stripe_plug(struct mddev *mddev,
5455 struct stripe_head *sh)
5456{
5457 struct blk_plug_cb *blk_cb = blk_check_plugged(
5458 raid5_unplug, mddev,
5459 sizeof(struct raid5_plug_cb));
5460 struct raid5_plug_cb *cb;
5461
5462 if (!blk_cb) {
5463 raid5_release_stripe(sh);
5464 return;
5465 }
5466
5467 cb = container_of(blk_cb, struct raid5_plug_cb, cb);
5468
5469 if (cb->list.next == NULL) {
5470 int i;
5471 INIT_LIST_HEAD(&cb->list);
5472 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
5473 INIT_LIST_HEAD(cb->temp_inactive_list + i);
5474 }
5475
5476 if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state))
5477 list_add_tail(&sh->lru, &cb->list);
5478 else
5479 raid5_release_stripe(sh);
5480}
5481
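/*
 * Handle a DISCARD bio: trim the range to whole data stripes, then attach
 * the bio as a full overwrite of every data block in each stripe, leaving
 * the parity blocks to be regenerated by normal stripe handling.
 */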
5482static void make_discard_request(struct mddev *mddev, struct bio *bi)
5483{
5484 struct r5conf *conf = mddev->private;
5485 sector_t logical_sector, last_sector;
5486 struct stripe_head *sh;
5487 int stripe_sectors;
5488
5489 if (mddev->reshape_position != MaxSector)
5490 /* Skip discard while reshape is happening */
5491 return;
5492
5493 logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1);
5494 last_sector = bi->bi_iter.bi_sector + (bi->bi_iter.bi_size>>9);
5495
5496 bi->bi_next = NULL;
5497
5498 stripe_sectors = conf->chunk_sectors *
5499 (conf->raid_disks - conf->max_degraded);
5500 logical_sector = DIV_ROUND_UP_SECTOR_T(logical_sector,
5501 stripe_sectors);
5502 sector_div(last_sector, stripe_sectors);
5503
5504 logical_sector *= conf->chunk_sectors;
5505 last_sector *= conf->chunk_sectors;
5506
5507 for (; logical_sector < last_sector;
5508 logical_sector += STRIPE_SECTORS) {
5509 DEFINE_WAIT(w);
5510 int d;
5511 again:
5512 sh = raid5_get_active_stripe(conf, logical_sector, 0, 0, 0);
5513 prepare_to_wait(&conf->wait_for_overlap, &w,
5514 TASK_UNINTERRUPTIBLE);
5515 set_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags);
5516 if (test_bit(STRIPE_SYNCING, &sh->state)) {
5517 raid5_release_stripe(sh);
5518 schedule();
5519 goto again;
5520 }
5521 clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags);
5522 spin_lock_irq(&sh->stripe_lock);
5523 for (d = 0; d < conf->raid_disks; d++) {
5524 if (d == sh->pd_idx || d == sh->qd_idx)
5525 continue;
5526 if (sh->dev[d].towrite || sh->dev[d].toread) {
5527 set_bit(R5_Overlap, &sh->dev[d].flags);
5528 spin_unlock_irq(&sh->stripe_lock);
5529 raid5_release_stripe(sh);
5530 schedule();
5531 goto again;
5532 }
5533 }
5534 set_bit(STRIPE_DISCARD, &sh->state);
5535 finish_wait(&conf->wait_for_overlap, &w);
5536 sh->overwrite_disks = 0;
5537 for (d = 0; d < conf->raid_disks; d++) {
5538 if (d == sh->pd_idx || d == sh->qd_idx)
5539 continue;
5540 sh->dev[d].towrite = bi;
5541 set_bit(R5_OVERWRITE, &sh->dev[d].flags);
5542 bio_inc_remaining(bi);
5543 md_write_inc(mddev, bi);
5544 sh->overwrite_disks++;
5545 }
5546 spin_unlock_irq(&sh->stripe_lock);
5547 if (conf->mddev->bitmap) {
5548 for (d = 0;
5549 d < conf->raid_disks - conf->max_degraded;
5550 d++)
5551 md_bitmap_startwrite(mddev->bitmap,
5552 sh->sector,
5553 STRIPE_SECTORS,
5554 0);
5555 sh->bm_seq = conf->seq_flush + 1;
5556 set_bit(STRIPE_BIT_DELAY, &sh->state);
5557 }
5558
5559 set_bit(STRIPE_HANDLE, &sh->state);
5560 clear_bit(STRIPE_DELAYED, &sh->state);
5561 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
5562 atomic_inc(&conf->preread_active_stripes);
5563 release_stripe_plug(mddev, sh);
5564 }
5565
5566 bio_endio(bi);
5567}
5568
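/*
 * Main entry point for normal reads and writes: split the bio into
 * STRIPE_SECTORS-sized pieces, map each piece to its stripe_head and queue
 * the stripe for handling, retrying when a reshape boundary or an
 * overlapping request gets in the way.
 */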
5569static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
5570{
5571 struct r5conf *conf = mddev->private;
5572 int dd_idx;
5573 sector_t new_sector;
5574 sector_t logical_sector, last_sector;
5575 struct stripe_head *sh;
5576 const int rw = bio_data_dir(bi);
5577 DEFINE_WAIT(w);
5578 bool do_prepare;
5579 bool do_flush = false;
5580
5581 if (unlikely(bi->bi_opf & REQ_PREFLUSH)) {
5582 int ret = log_handle_flush_request(conf, bi);
5583
5584 if (ret == 0)
5585 return true;
5586 if (ret == -ENODEV) {
5587 md_flush_request(mddev, bi);
5588 return true;
5589 }
5590 /* ret == -EAGAIN, fallback */
5591 /*
5592 * if log_handle_flush_request() didn't clear REQ_PREFLUSH,
5593 * we need to flush the journal device
5594 */
5595 do_flush = bi->bi_opf & REQ_PREFLUSH;
5596 }
5597
5598 if (!md_write_start(mddev, bi))
5599 return false;
5600
5601 /* If array is degraded, better not do chunk aligned read because
5602 * later we might have to read it again in order to reconstruct
5603 * data on failed drives.
5604 */
5605 if (rw == READ && mddev->degraded == 0 &&
5606 mddev->reshape_position == MaxSector) {
5607 bi = chunk_aligned_read(mddev, bi);
5608 if (!bi)
5609 return true;
5610 }
5611
5612 if (unlikely(bio_op(bi) == REQ_OP_DISCARD)) {
5613 make_discard_request(mddev, bi);
5614 md_write_end(mddev);
5615 return true;
5616 }
5617
5618 logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1);
5619 last_sector = bio_end_sector(bi);
5620 bi->bi_next = NULL;
5621
5622 prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
5623 for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
5624 int previous;
5625 int seq;
5626
5627 do_prepare = false;
5628 retry:
5629 seq = read_seqcount_begin(&conf->gen_lock);
5630 previous = 0;
5631 if (do_prepare)
5632 prepare_to_wait(&conf->wait_for_overlap, &w,
5633 TASK_UNINTERRUPTIBLE);
5634 if (unlikely(conf->reshape_progress != MaxSector)) {
5635 /* spinlock is needed as reshape_progress may be
5636 * 64bit on a 32bit platform, and so it might be
5637 * possible to see a half-updated value
5638 * Of course reshape_progress could change after
5639 * the lock is dropped, so once we get a reference
5640 * to the stripe that we think it is, we will have
5641 * to check again.
5642 */
5643 spin_lock_irq(&conf->device_lock);
5644 if (mddev->reshape_backwards
5645 ? logical_sector < conf->reshape_progress
5646 : logical_sector >= conf->reshape_progress) {
5647 previous = 1;
5648 } else {
5649 if (mddev->reshape_backwards
5650 ? logical_sector < conf->reshape_safe
5651 : logical_sector >= conf->reshape_safe) {
5652 spin_unlock_irq(&conf->device_lock);
5653 schedule();
5654 do_prepare = true;
5655 goto retry;
5656 }
5657 }
5658 spin_unlock_irq(&conf->device_lock);
5659 }
5660
5661 new_sector = raid5_compute_sector(conf, logical_sector,
5662 previous,
5663 &dd_idx, NULL);
5664 pr_debug("raid456: raid5_make_request, sector %llu logical %llu\n",
5665 (unsigned long long)new_sector,
5666 (unsigned long long)logical_sector);
5667
5668 sh = raid5_get_active_stripe(conf, new_sector, previous,
5669 (bi->bi_opf & REQ_RAHEAD), 0);
5670 if (sh) {
5671 if (unlikely(previous)) {
5672 /* expansion might have moved on while waiting for a
5673 * stripe, so we must do the range check again.
5674 * Expansion could still move past after this
5675 * test, but as we are holding a reference to
5676 * 'sh', we know that if that happens,
5677 * STRIPE_EXPANDING will get set and the expansion
5678 * won't proceed until we finish with the stripe.
5679 */
5680 int must_retry = 0;
5681 spin_lock_irq(&conf->device_lock);
5682 if (mddev->reshape_backwards
5683 ? logical_sector >= conf->reshape_progress
5684 : logical_sector < conf->reshape_progress)
5685
5686 must_retry = 1;
5687 spin_unlock_irq(&conf->device_lock);
5688 if (must_retry) {
5689 raid5_release_stripe(sh);
5690 schedule();
5691 do_prepare = true;
5692 goto retry;
5693 }
5694 }
5695 if (read_seqcount_retry(&conf->gen_lock, seq)) {
5696 /* Might have got the wrong stripe_head
5697 * by accident
5698 */
5699 raid5_release_stripe(sh);
5700 goto retry;
5701 }
5702
5703 if (test_bit(STRIPE_EXPANDING, &sh->state) ||
5704 !add_stripe_bio(sh, bi, dd_idx, rw, previous)) {
5705 /* Stripe is busy expanding or
5706 * add failed due to overlap.  Flush everything
5707 * and wait a while
5708 */
5709 md_wakeup_thread(mddev->thread);
5710 raid5_release_stripe(sh);
5711 schedule();
5712 do_prepare = true;
5713 goto retry;
5714 }
5715 if (do_flush) {
5716 set_bit(STRIPE_R5C_PREFLUSH, &sh->state);
5717
5718 do_flush = false;
5719 }
5720
5721 set_bit(STRIPE_HANDLE, &sh->state);
5722 clear_bit(STRIPE_DELAYED, &sh->state);
5723 if ((!sh->batch_head || sh == sh->batch_head) &&
5724 (bi->bi_opf & REQ_SYNC) &&
5725 !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
5726 atomic_inc(&conf->preread_active_stripes);
5727 release_stripe_plug(mddev, sh);
5728 } else {
5729 /* cannot get stripe for read-ahead, just give up */
5730 bi->bi_status = BLK_STS_IOERR;
5731 break;
5732 }
5733 }
5734 finish_wait(&conf->wait_for_overlap, &w);
5735
5736 if (rw == WRITE)
5737 md_write_end(mddev);
5738 bio_endio(bi);
5739 return true;
5740}
5741
5742static sector_t raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks);
5743
5744static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *skipped)
5745{
5746 /* reshaping is quite different to recovery/resync so it is
5747 * handled quite separately ... here.
5748 *
5749 * On each call to sync_request, we gather one chunk worth of
5750 * destination stripes and flag them as expanding.
5751 * Then we find all the source stripes and request reads.
5752 * As the reads complete, handle_stripe will copy the data
5753 * into the destination stripe and release that stripe.
5754 */
5755 struct r5conf *conf = mddev->private;
5756 struct stripe_head *sh;
5757 struct md_rdev *rdev;
5758 sector_t first_sector, last_sector;
5759 int raid_disks = conf->previous_raid_disks;
5760 int data_disks = raid_disks - conf->max_degraded;
5761 int new_data_disks = conf->raid_disks - conf->max_degraded;
5762 int i;
5763 int dd_idx;
5764 sector_t writepos, readpos, safepos;
5765 sector_t stripe_addr;
5766 int reshape_sectors;
5767 struct list_head stripes;
5768 sector_t retn;
5769
5770 if (sector_nr == 0) {
5771
5772 if (mddev->reshape_backwards &&
5773 conf->reshape_progress < raid5_size(mddev, 0, 0)) {
5774 sector_nr = raid5_size(mddev, 0, 0)
5775 - conf->reshape_progress;
5776 } else if (mddev->reshape_backwards &&
5777 conf->reshape_progress == MaxSector) {
5778
5779 sector_nr = MaxSector;
5780 } else if (!mddev->reshape_backwards &&
5781 conf->reshape_progress > 0)
5782 sector_nr = conf->reshape_progress;
5783 sector_div(sector_nr, new_data_disks);
5784 if (sector_nr) {
5785 mddev->curr_resync_completed = sector_nr;
5786 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
5787 *skipped = 1;
5788 retn = sector_nr;
5789 goto finish;
5790 }
5791 }
5792
5793 /* We need to process a full chunk at a time.
5794 * If old and new chunk sizes differ, we need to process the
5795 * largest of these
5796 */
5797
5798 reshape_sectors = max(conf->chunk_sectors, conf->prev_chunk_sectors);
5799
5800 /* We update the metadata at least every 10 seconds, or when
5801 * the data about to be copied would over-write the source of
5802 * the data at the front of the range.  i.e. one new_stripe
5803 * along from reshape_progress new_maps to after where
5804 * reshape_safe old_maps to
5805 */
5806 writepos = conf->reshape_progress;
5807 sector_div(writepos, new_data_disks);
5808 readpos = conf->reshape_progress;
5809 sector_div(readpos, data_disks);
5810 safepos = conf->reshape_safe;
5811 sector_div(safepos, data_disks);
5812 if (mddev->reshape_backwards) {
5813 BUG_ON(writepos < reshape_sectors);
5814 writepos -= reshape_sectors;
5815 readpos += reshape_sectors;
5816 safepos += reshape_sectors;
5817 } else {
5818 writepos += reshape_sectors;
5819 /* readpos and safepos are worst-case calculations.
5820 * A negative number is overly pessimistic, and causes
5821 * obvious problems for unsigned storage.  So clip to 0.
5822 */
5823 readpos -= min_t(sector_t, reshape_sectors, readpos);
5824 safepos -= min_t(sector_t, reshape_sectors, safepos);
5825 }
5826
5827 /* Having calculated the 'writepos' possibly use it
5828 * to set 'stripe_addr' which is where we will write to.
5829 */
5830 if (mddev->reshape_backwards) {
5831 BUG_ON(conf->reshape_progress == 0);
5832 stripe_addr = writepos;
5833 BUG_ON((mddev->dev_sectors &
5834 ~((sector_t)reshape_sectors - 1))
5835 - reshape_sectors - stripe_addr
5836 != sector_nr);
5837 } else {
5838 BUG_ON(writepos != sector_nr + reshape_sectors);
5839 stripe_addr = sector_nr;
5840 }
5841
5842 /* 'writepos' is the most advanced device address we are about to
5843 * write to, 'readpos' the least advanced address we will read from,
5844 * and 'safepos' the least advanced address recorded in the metadata
5845 * as having been reshaped.
5846 * If there is a min_offset_diff it is applied here: a negative diff
5847 * pushes readpos/safepos forward, a positive diff pushes writepos
5848 * forward, so the comparison below is done on device addresses.
5849 * Before the region being written can overlap the region not yet
5850 * recorded as safely reshaped (or at least every 10 seconds), we
5851 * must quiesce outstanding reshape stripes and write an updated
5852 * reshape_position to the metadata before advancing reshape_safe.
5853 */
5862 if (conf->min_offset_diff < 0) {
5863 safepos += -conf->min_offset_diff;
5864 readpos += -conf->min_offset_diff;
5865 } else
5866 writepos += conf->min_offset_diff;
5867
5868 if ((mddev->reshape_backwards
5869 ? (safepos > writepos && readpos < writepos)
5870 : (safepos < writepos && readpos > writepos)) ||
5871 time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
5872
5873 wait_event(conf->wait_for_overlap,
5874 atomic_read(&conf->reshape_stripes)==0
5875 || test_bit(MD_RECOVERY_INTR, &mddev->recovery));
5876 if (atomic_read(&conf->reshape_stripes) != 0)
5877 return 0;
5878 mddev->reshape_position = conf->reshape_progress;
5879 mddev->curr_resync_completed = sector_nr;
5880 if (!mddev->reshape_backwards)
5881
5882 rdev_for_each(rdev, mddev)
5883 if (rdev->raid_disk >= 0 &&
5884 !test_bit(Journal, &rdev->flags) &&
5885 !test_bit(In_sync, &rdev->flags) &&
5886 rdev->recovery_offset < sector_nr)
5887 rdev->recovery_offset = sector_nr;
5888
5889 conf->reshape_checkpoint = jiffies;
5890 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
5891 md_wakeup_thread(mddev->thread);
5892 wait_event(mddev->sb_wait, mddev->sb_flags == 0 ||
5893 test_bit(MD_RECOVERY_INTR, &mddev->recovery));
5894 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
5895 return 0;
5896 spin_lock_irq(&conf->device_lock);
5897 conf->reshape_safe = mddev->reshape_position;
5898 spin_unlock_irq(&conf->device_lock);
5899 wake_up(&conf->wait_for_overlap);
5900 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
5901 }
5902
5903 INIT_LIST_HEAD(&stripes);
5904 for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) {
5905 int j;
5906 int skipped_disk = 0;
5907 sh = raid5_get_active_stripe(conf, stripe_addr+i, 0, 0, 1);
5908 set_bit(STRIPE_EXPANDING, &sh->state);
5909 atomic_inc(&conf->reshape_stripes);
5910 /* If any of this stripe is beyond the end of the old
5911 * array, then we need to zero those blocks
5912 */
5913 for (j=sh->disks; j--;) {
5914 sector_t s;
5915 if (j == sh->pd_idx)
5916 continue;
5917 if (conf->level == 6 &&
5918 j == sh->qd_idx)
5919 continue;
5920 s = raid5_compute_blocknr(sh, j, 0);
5921 if (s < raid5_size(mddev, 0, 0)) {
5922 skipped_disk = 1;
5923 continue;
5924 }
5925 memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE);
5926 set_bit(R5_Expanded, &sh->dev[j].flags);
5927 set_bit(R5_UPTODATE, &sh->dev[j].flags);
5928 }
5929 if (!skipped_disk) {
5930 set_bit(STRIPE_EXPAND_READY, &sh->state);
5931 set_bit(STRIPE_HANDLE, &sh->state);
5932 }
5933 list_add(&sh->lru, &stripes);
5934 }
5935 spin_lock_irq(&conf->device_lock);
5936 if (mddev->reshape_backwards)
5937 conf->reshape_progress -= reshape_sectors * new_data_disks;
5938 else
5939 conf->reshape_progress += reshape_sectors * new_data_disks;
5940 spin_unlock_irq(&conf->device_lock);
5941
5942 /* Ok, those stripes are ready. We can start scheduling
5943 * reads on the source stripes, which are determined by
5944 * mapping the first and last block on the destination stripes.
5945 */
5946 first_sector =
5947 raid5_compute_sector(conf, stripe_addr*(new_data_disks),
5948 1, &dd_idx, NULL);
5949 last_sector =
5950 raid5_compute_sector(conf, ((stripe_addr+reshape_sectors)
5951 * new_data_disks - 1),
5952 1, &dd_idx, NULL);
5953 if (last_sector >= mddev->dev_sectors)
5954 last_sector = mddev->dev_sectors - 1;
5955 while (first_sector <= last_sector) {
5956 sh = raid5_get_active_stripe(conf, first_sector, 1, 0, 1);
5957 set_bit(STRIPE_EXPAND_SOURCE, &sh->state);
5958 set_bit(STRIPE_HANDLE, &sh->state);
5959 raid5_release_stripe(sh);
5960 first_sector += STRIPE_SECTORS;
5961 }
5962 /* Now that the sources are clearly marked, we can release
5963 * the destination stripes
5964 */
5965 while (!list_empty(&stripes)) {
5966 sh = list_entry(stripes.next, struct stripe_head, lru);
5967 list_del_init(&sh->lru);
5968 raid5_release_stripe(sh);
5969 }
5970
5971 /* If this takes us to the resync_max point where we have to pause,
5972 * then we need to write out the superblock. */
5973 sector_nr += reshape_sectors;
5974 retn = reshape_sectors;
5975finish:
5976 if (mddev->curr_resync_completed > mddev->resync_max ||
5977 (sector_nr - mddev->curr_resync_completed) * 2
5978 >= mddev->resync_max - mddev->curr_resync_completed) {
5979
5980 wait_event(conf->wait_for_overlap,
5981 atomic_read(&conf->reshape_stripes) == 0
5982 || test_bit(MD_RECOVERY_INTR, &mddev->recovery));
5983 if (atomic_read(&conf->reshape_stripes) != 0)
5984 goto ret;
5985 mddev->reshape_position = conf->reshape_progress;
5986 mddev->curr_resync_completed = sector_nr;
5987 if (!mddev->reshape_backwards)
5988
5989 rdev_for_each(rdev, mddev)
5990 if (rdev->raid_disk >= 0 &&
5991 !test_bit(Journal, &rdev->flags) &&
5992 !test_bit(In_sync, &rdev->flags) &&
5993 rdev->recovery_offset < sector_nr)
5994 rdev->recovery_offset = sector_nr;
5995 conf->reshape_checkpoint = jiffies;
5996 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
5997 md_wakeup_thread(mddev->thread);
5998 wait_event(mddev->sb_wait,
5999 !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)
6000 || test_bit(MD_RECOVERY_INTR, &mddev->recovery));
6001 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
6002 goto ret;
6003 spin_lock_irq(&conf->device_lock);
6004 conf->reshape_safe = mddev->reshape_position;
6005 spin_unlock_irq(&conf->device_lock);
6006 wake_up(&conf->wait_for_overlap);
6007 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
6008 }
6009ret:
6010 return retn;
6011}
6012
6013static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_nr,
6014 int *skipped)
6015{
6016 struct r5conf *conf = mddev->private;
6017 struct stripe_head *sh;
6018 sector_t max_sector = mddev->dev_sectors;
6019 sector_t sync_blocks;
6020 int still_degraded = 0;
6021 int i;
6022
6023 if (sector_nr >= max_sector) {
6024
6025
6026 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
6027 end_reshape(conf);
6028 return 0;
6029 }
6030
6031 if (mddev->curr_resync < max_sector)
6032 md_bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
6033 &sync_blocks, 1);
6034 else
6035 conf->fullsync = 0;
6036 md_bitmap_close_sync(mddev->bitmap);
6037
6038 return 0;
6039 }
6040
6041
6042 wait_event(conf->wait_for_overlap, conf->quiesce != 2);
6043
6044 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
6045 return reshape_request(mddev, sector_nr, skipped);
6046
6047 /* No need to check resync_max as we never do more than one
6048 * stripe, and as resync_max will always be on a chunk boundary,
6049 * if the check in md_do_sync didn't fire, there is no chance
6050 * of overstepping resync_max here
6051 */
6052
6053 /* if there are too many failed drives and we are trying
6054 * to resync, then assert that we are finished, because there is
6055 * nothing we can do.
6056 */
6057 if (mddev->degraded >= conf->max_degraded &&
6058 test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
6059 sector_t rv = mddev->dev_sectors - sector_nr;
6060 *skipped = 1;
6061 return rv;
6062 }
6063 if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
6064 !conf->fullsync &&
6065 !md_bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
6066 sync_blocks >= STRIPE_SECTORS) {
6067
6068 sync_blocks /= STRIPE_SECTORS;
6069 *skipped = 1;
6070 return sync_blocks * STRIPE_SECTORS;
6071 }
6072
6073 md_bitmap_cond_end_sync(mddev->bitmap, sector_nr, false);
6074
6075 sh = raid5_get_active_stripe(conf, sector_nr, 0, 1, 0);
6076 if (sh == NULL) {
6077 sh = raid5_get_active_stripe(conf, sector_nr, 0, 0, 0);
6078 /* make sure we don't swamp the stripe cache if someone else
6079 * is trying to get access
6080 */
6081 schedule_timeout_uninterruptible(1);
6082 }
6083
6084 /* Need to check if array will still be degraded after recovery/resync
6085 * Note in case of > 1 drive failures it's possible we're rebuilding
6086 * one drive while leaving another faulty drive in array. */
6087 rcu_read_lock();
6088 for (i = 0; i < conf->raid_disks; i++) {
6089 struct md_rdev *rdev = READ_ONCE(conf->disks[i].rdev);
6090
6091 if (rdev == NULL || test_bit(Faulty, &rdev->flags))
6092 still_degraded = 1;
6093 }
6094 rcu_read_unlock();
6095
6096 md_bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded);
6097
6098 set_bit(STRIPE_SYNC_REQUESTED, &sh->state);
6099 set_bit(STRIPE_HANDLE, &sh->state);
6100
6101 raid5_release_stripe(sh);
6102
6103 return STRIPE_SECTORS;
6104}
6105
6106static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio,
6107 unsigned int offset)
6108{
6109 /* We may not be able to submit a whole bio at once as there
6110 * may not be enough stripe_heads available.
6111 * We cannot pre-allocate enough stripe_heads as we may need
6112 * more than exist in the cache (if we allow ever large chunks).
6113 * So we do one stripe head at a time and record in
6114 * conf->retry_read_offset how many have been done.
6115 *
6116 * We *know* that this entire raid_bio is in one chunk, so
6117 * it will use only one dd_idx and need only one call to raid5_compute_sector.
6118 */
6119 struct stripe_head *sh;
6120 int dd_idx;
6121 sector_t sector, logical_sector, last_sector;
6122 int scnt = 0;
6123 int handled = 0;
6124
6125 logical_sector = raid_bio->bi_iter.bi_sector &
6126 ~((sector_t)STRIPE_SECTORS-1);
6127 sector = raid5_compute_sector(conf, logical_sector,
6128 0, &dd_idx, NULL);
6129 last_sector = bio_end_sector(raid_bio);
6130
6131 for (; logical_sector < last_sector;
6132 logical_sector += STRIPE_SECTORS,
6133 sector += STRIPE_SECTORS,
6134 scnt++) {
6135
6136 if (scnt < offset)
6137
6138 continue;
6139
6140 sh = raid5_get_active_stripe(conf, sector, 0, 1, 1);
6141
6142 if (!sh) {
6143
6144 conf->retry_read_aligned = raid_bio;
6145 conf->retry_read_offset = scnt;
6146 return handled;
6147 }
6148
6149 if (!add_stripe_bio(sh, raid_bio, dd_idx, 0, 0)) {
6150 raid5_release_stripe(sh);
6151 conf->retry_read_aligned = raid_bio;
6152 conf->retry_read_offset = scnt;
6153 return handled;
6154 }
6155
6156 set_bit(R5_ReadNoMerge, &sh->dev[dd_idx].flags);
6157 handle_stripe(sh);
6158 raid5_release_stripe(sh);
6159 handled++;
6160 }
6161
6162 bio_endio(raid_bio);
6163
6164 if (atomic_dec_and_test(&conf->active_aligned_reads))
6165 wake_up(&conf->wait_for_quiescent);
6166 return handled;
6167}
6168
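/*
 * Pull up to MAX_STRIPE_BATCH stripes off the priority lists and handle
 * them with conf->device_lock dropped; the lock is re-taken before the
 * handled stripes are released back onto the temporary inactive lists.
 */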
6169static int handle_active_stripes(struct r5conf *conf, int group,
6170 struct r5worker *worker,
6171 struct list_head *temp_inactive_list)
6172 __releases(&conf->device_lock)
6173 __acquires(&conf->device_lock)
6174{
6175 struct stripe_head *batch[MAX_STRIPE_BATCH], *sh;
6176 int i, batch_size = 0, hash;
6177 bool release_inactive = false;
6178
6179 while (batch_size < MAX_STRIPE_BATCH &&
6180 (sh = __get_priority_stripe(conf, group)) != NULL)
6181 batch[batch_size++] = sh;
6182
6183 if (batch_size == 0) {
6184 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
6185 if (!list_empty(temp_inactive_list + i))
6186 break;
6187 if (i == NR_STRIPE_HASH_LOCKS) {
6188 spin_unlock_irq(&conf->device_lock);
6189 log_flush_stripe_to_raid(conf);
6190 spin_lock_irq(&conf->device_lock);
6191 return batch_size;
6192 }
6193 release_inactive = true;
6194 }
6195 spin_unlock_irq(&conf->device_lock);
6196
6197 release_inactive_stripe_list(conf, temp_inactive_list,
6198 NR_STRIPE_HASH_LOCKS);
6199
6200 r5l_flush_stripe_to_raid(conf->log);
6201 if (release_inactive) {
6202 spin_lock_irq(&conf->device_lock);
6203 return 0;
6204 }
6205
6206 for (i = 0; i < batch_size; i++)
6207 handle_stripe(batch[i]);
6208 log_write_stripe_run(conf);
6209
6210 cond_resched();
6211
6212 spin_lock_irq(&conf->device_lock);
6213 for (i = 0; i < batch_size; i++) {
6214 hash = batch[i]->hash_lock_index;
6215 __release_stripe(conf, batch[i], &temp_inactive_list[hash]);
6216 }
6217 return batch_size;
6218}
6219
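/*
 * Work item for one stripe-handling worker: repeatedly drain the released
 * stripe list and handle batches of active stripes for this worker's group
 * until no further progress can be made.
 */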
6220static void raid5_do_work(struct work_struct *work)
6221{
6222 struct r5worker *worker = container_of(work, struct r5worker, work);
6223 struct r5worker_group *group = worker->group;
6224 struct r5conf *conf = group->conf;
6225 struct mddev *mddev = conf->mddev;
6226 int group_id = group - conf->worker_groups;
6227 int handled;
6228 struct blk_plug plug;
6229
6230 pr_debug("+++ raid5worker active\n");
6231
6232 blk_start_plug(&plug);
6233 handled = 0;
6234 spin_lock_irq(&conf->device_lock);
6235 while (1) {
6236 int batch_size, released;
6237
6238 released = release_stripe_list(conf, worker->temp_inactive_list);
6239
6240 batch_size = handle_active_stripes(conf, group_id, worker,
6241 worker->temp_inactive_list);
6242 worker->working = false;
6243 if (!batch_size && !released)
6244 break;
6245 handled += batch_size;
6246 wait_event_lock_irq(mddev->sb_wait,
6247 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags),
6248 conf->device_lock);
6249 }
6250 pr_debug("%d stripes handled\n", handled);
6251
6252 spin_unlock_irq(&conf->device_lock);
6253
6254 flush_deferred_bios(conf);
6255
6256 r5l_flush_stripe_to_raid(conf->log);
6257
6258 async_tx_issue_pending_all();
6259 blk_finish_plug(&plug);
6260
6261 pr_debug("--- raid5worker inactive\n");
6262}
6263
6264 /*
6265 * This is our raid5 kernel thread.
6266 *
6267 * We scan the hash table for stripes which can be handled now.
6268 * During the scan, completed stripes are saved for us by the interrupt
6269 * handler, so that they will not have to wait for our next wakeup.
6270 */
6271static void raid5d(struct md_thread *thread)
6272{
6273 struct mddev *mddev = thread->mddev;
6274 struct r5conf *conf = mddev->private;
6275 int handled;
6276 struct blk_plug plug;
6277
6278 pr_debug("+++ raid5d active\n");
6279
6280 md_check_recovery(mddev);
6281
6282 blk_start_plug(&plug);
6283 handled = 0;
6284 spin_lock_irq(&conf->device_lock);
6285 while (1) {
6286 struct bio *bio;
6287 int batch_size, released;
6288 unsigned int offset;
6289
6290 released = release_stripe_list(conf, conf->temp_inactive_list);
6291 if (released)
6292 clear_bit(R5_DID_ALLOC, &conf->cache_state);
6293
6294 if (!list_empty(&conf->bitmap_list)) {
6295 /* Now is a good time to flush some bitmap updates */
6297 conf->seq_flush++;
6298 spin_unlock_irq(&conf->device_lock);
6299 md_bitmap_unplug(mddev->bitmap);
6300 spin_lock_irq(&conf->device_lock);
6301 conf->seq_write = conf->seq_flush;
6302 activate_bit_delay(conf, conf->temp_inactive_list);
6303 }
6304 raid5_activate_delayed(conf);
6305
6306 while ((bio = remove_bio_from_retry(conf, &offset))) {
6307 int ok;
6308 spin_unlock_irq(&conf->device_lock);
6309 ok = retry_aligned_read(conf, bio, offset);
6310 spin_lock_irq(&conf->device_lock);
6311 if (!ok)
6312 break;
6313 handled++;
6314 }
6315
6316 batch_size = handle_active_stripes(conf, ANY_GROUP, NULL,
6317 conf->temp_inactive_list);
6318 if (!batch_size && !released)
6319 break;
6320 handled += batch_size;
6321
6322 if (mddev->sb_flags & ~(1 << MD_SB_CHANGE_PENDING)) {
6323 spin_unlock_irq(&conf->device_lock);
6324 md_check_recovery(mddev);
6325 spin_lock_irq(&conf->device_lock);
6326 }
6327 }
6328 pr_debug("%d stripes handled\n", handled);
6329
6330 spin_unlock_irq(&conf->device_lock);
6331 if (test_and_clear_bit(R5_ALLOC_MORE, &conf->cache_state) &&
6332 mutex_trylock(&conf->cache_size_mutex)) {
6333 grow_one_stripe(conf, __GFP_NOWARN);
6334 /* Set flag even if allocation failed.  This helps
6335 * slow down allocation requests when mem is short
6336 */
6337 set_bit(R5_DID_ALLOC, &conf->cache_state);
6338 mutex_unlock(&conf->cache_size_mutex);
6339 }
6340
6341 flush_deferred_bios(conf);
6342
6343 r5l_flush_stripe_to_raid(conf->log);
6344
6345 async_tx_issue_pending_all();
6346 blk_finish_plug(&plug);
6347
6348 pr_debug("--- raid5d inactive\n");
6349}
6350
6351static ssize_t
6352raid5_show_stripe_cache_size(struct mddev *mddev, char *page)
6353{
6354 struct r5conf *conf;
6355 int ret = 0;
6356 spin_lock(&mddev->lock);
6357 conf = mddev->private;
6358 if (conf)
6359 ret = sprintf(page, "%d\n", conf->min_nr_stripes);
6360 spin_unlock(&mddev->lock);
6361 return ret;
6362}
6363
6364int
6365raid5_set_cache_size(struct mddev *mddev, int size)
6366{
6367 int result = 0;
6368 struct r5conf *conf = mddev->private;
6369
6370 if (size <= 16 || size > 32768)
6371 return -EINVAL;
6372
6373 conf->min_nr_stripes = size;
6374 mutex_lock(&conf->cache_size_mutex);
6375 while (size < conf->max_nr_stripes &&
6376 drop_one_stripe(conf))
6377 ;
6378 mutex_unlock(&conf->cache_size_mutex);
6379
6380 md_allow_write(mddev);
6381
6382 mutex_lock(&conf->cache_size_mutex);
6383 while (size > conf->max_nr_stripes)
6384 if (!grow_one_stripe(conf, GFP_KERNEL)) {
6385 conf->min_nr_stripes = conf->max_nr_stripes;
6386 result = -ENOMEM;
6387 break;
6388 }
6389 mutex_unlock(&conf->cache_size_mutex);
6390
6391 return result;
6392}
6393EXPORT_SYMBOL(raid5_set_cache_size);
6394
6395static ssize_t
6396raid5_store_stripe_cache_size(struct mddev *mddev, const char *page, size_t len)
6397{
6398 struct r5conf *conf;
6399 unsigned long new;
6400 int err;
6401
6402 if (len >= PAGE_SIZE)
6403 return -EINVAL;
6404 if (kstrtoul(page, 10, &new))
6405 return -EINVAL;
6406 err = mddev_lock(mddev);
6407 if (err)
6408 return err;
6409 conf = mddev->private;
6410 if (!conf)
6411 err = -ENODEV;
6412 else
6413 err = raid5_set_cache_size(mddev, new);
6414 mddev_unlock(mddev);
6415
6416 return err ?: len;
6417}
6418
6419static struct md_sysfs_entry
6420raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR,
6421 raid5_show_stripe_cache_size,
6422 raid5_store_stripe_cache_size);
6423
6424static ssize_t
6425raid5_show_rmw_level(struct mddev *mddev, char *page)
6426{
6427 struct r5conf *conf = mddev->private;
6428 if (conf)
6429 return sprintf(page, "%d\n", conf->rmw_level);
6430 else
6431 return 0;
6432}
6433
6434static ssize_t
6435raid5_store_rmw_level(struct mddev *mddev, const char *page, size_t len)
6436{
6437 struct r5conf *conf = mddev->private;
6438 unsigned long new;
6439
6440 if (!conf)
6441 return -ENODEV;
6442
6443 if (len >= PAGE_SIZE)
6444 return -EINVAL;
6445
6446 if (kstrtoul(page, 10, &new))
6447 return -EINVAL;
6448
6449 if (new != PARITY_DISABLE_RMW && !raid6_call.xor_syndrome)
6450 return -EINVAL;
6451
6452 if (new != PARITY_DISABLE_RMW &&
6453 new != PARITY_ENABLE_RMW &&
6454 new != PARITY_PREFER_RMW)
6455 return -EINVAL;
6456
6457 conf->rmw_level = new;
6458 return len;
6459}
6460
6461static struct md_sysfs_entry
6462raid5_rmw_level = __ATTR(rmw_level, S_IRUGO | S_IWUSR,
6463 raid5_show_rmw_level,
6464 raid5_store_rmw_level);
6465
6466
6467static ssize_t
6468raid5_show_preread_threshold(struct mddev *mddev, char *page)
6469{
6470 struct r5conf *conf;
6471 int ret = 0;
6472 spin_lock(&mddev->lock);
6473 conf = mddev->private;
6474 if (conf)
6475 ret = sprintf(page, "%d\n", conf->bypass_threshold);
6476 spin_unlock(&mddev->lock);
6477 return ret;
6478}
6479
6480static ssize_t
6481raid5_store_preread_threshold(struct mddev *mddev, const char *page, size_t len)
6482{
6483 struct r5conf *conf;
6484 unsigned long new;
6485 int err;
6486
6487 if (len >= PAGE_SIZE)
6488 return -EINVAL;
6489 if (kstrtoul(page, 10, &new))
6490 return -EINVAL;
6491
6492 err = mddev_lock(mddev);
6493 if (err)
6494 return err;
6495 conf = mddev->private;
6496 if (!conf)
6497 err = -ENODEV;
6498 else if (new > conf->min_nr_stripes)
6499 err = -EINVAL;
6500 else
6501 conf->bypass_threshold = new;
6502 mddev_unlock(mddev);
6503 return err ?: len;
6504}
6505
6506static struct md_sysfs_entry
6507raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold,
6508 S_IRUGO | S_IWUSR,
6509 raid5_show_preread_threshold,
6510 raid5_store_preread_threshold);
6511
6512static ssize_t
6513raid5_show_skip_copy(struct mddev *mddev, char *page)
6514{
6515 struct r5conf *conf;
6516 int ret = 0;
6517 spin_lock(&mddev->lock);
6518 conf = mddev->private;
6519 if (conf)
6520 ret = sprintf(page, "%d\n", conf->skip_copy);
6521 spin_unlock(&mddev->lock);
6522 return ret;
6523}
6524
6525static ssize_t
6526raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len)
6527{
6528 struct r5conf *conf;
6529 unsigned long new;
6530 int err;
6531
6532 if (len >= PAGE_SIZE)
6533 return -EINVAL;
6534 if (kstrtoul(page, 10, &new))
6535 return -EINVAL;
6536 new = !!new;
6537
6538 err = mddev_lock(mddev);
6539 if (err)
6540 return err;
6541 conf = mddev->private;
6542 if (!conf)
6543 err = -ENODEV;
6544 else if (new != conf->skip_copy) {
6545 mddev_suspend(mddev);
6546 conf->skip_copy = new;
6547 if (new)
6548 mddev->queue->backing_dev_info->capabilities |=
6549 BDI_CAP_STABLE_WRITES;
6550 else
6551 mddev->queue->backing_dev_info->capabilities &=
6552 ~BDI_CAP_STABLE_WRITES;
6553 mddev_resume(mddev);
6554 }
6555 mddev_unlock(mddev);
6556 return err ?: len;
6557}
6558
6559static struct md_sysfs_entry
6560raid5_skip_copy = __ATTR(skip_copy, S_IRUGO | S_IWUSR,
6561 raid5_show_skip_copy,
6562 raid5_store_skip_copy);
6563
6564static ssize_t
6565stripe_cache_active_show(struct mddev *mddev, char *page)
6566{
6567 struct r5conf *conf = mddev->private;
6568 if (conf)
6569 return sprintf(page, "%d\n", atomic_read(&conf->active_stripes));
6570 else
6571 return 0;
6572}
6573
6574static struct md_sysfs_entry
6575raid5_stripecache_active = __ATTR_RO(stripe_cache_active);
6576
6577static ssize_t
6578raid5_show_group_thread_cnt(struct mddev *mddev, char *page)
6579{
6580 struct r5conf *conf;
6581 int ret = 0;
6582 spin_lock(&mddev->lock);
6583 conf = mddev->private;
6584 if (conf)
6585 ret = sprintf(page, "%d\n", conf->worker_cnt_per_group);
6586 spin_unlock(&mddev->lock);
6587 return ret;
6588}
6589
6590static int alloc_thread_groups(struct r5conf *conf, int cnt,
6591 int *group_cnt,
6592 int *worker_cnt_per_group,
6593 struct r5worker_group **worker_groups);
6594static ssize_t
6595raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len)
6596{
6597 struct r5conf *conf;
6598 unsigned int new;
6599 int err;
6600 struct r5worker_group *new_groups, *old_groups;
6601 int group_cnt, worker_cnt_per_group;
6602
6603 if (len >= PAGE_SIZE)
6604 return -EINVAL;
6605 if (kstrtouint(page, 10, &new))
6606 return -EINVAL;
6607
6608 if (new > 8192)
6609 return -EINVAL;
6610
6611 err = mddev_lock(mddev);
6612 if (err)
6613 return err;
6614 conf = mddev->private;
6615 if (!conf)
6616 err = -ENODEV;
6617 else if (new != conf->worker_cnt_per_group) {
6618 mddev_suspend(mddev);
6619
6620 old_groups = conf->worker_groups;
6621 if (old_groups)
6622 flush_workqueue(raid5_wq);
6623
6624 err = alloc_thread_groups(conf, new,
6625 &group_cnt, &worker_cnt_per_group,
6626 &new_groups);
6627 if (!err) {
6628 spin_lock_irq(&conf->device_lock);
6629 conf->group_cnt = group_cnt;
6630 conf->worker_cnt_per_group = worker_cnt_per_group;
6631 conf->worker_groups = new_groups;
6632 spin_unlock_irq(&conf->device_lock);
6633
6634 if (old_groups)
6635 kfree(old_groups[0].workers);
6636 kfree(old_groups);
6637 }
6638 mddev_resume(mddev);
6639 }
6640 mddev_unlock(mddev);
6641
6642 return err ?: len;
6643}
6644
6645static struct md_sysfs_entry
6646raid5_group_thread_cnt = __ATTR(group_thread_cnt, S_IRUGO | S_IWUSR,
6647 raid5_show_group_thread_cnt,
6648 raid5_store_group_thread_cnt);
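/*
 * Usage sketch for group_thread_cnt (illustrative; "md0" is hypothetical).
 * The value is the number of worker threads per group, with one group per
 * NUMA node (see alloc_thread_groups() below); 0 disables the worker groups
 * so raid5d handles all stripes itself, and values above 8192 are rejected:
 *
 *   echo 4 > /sys/block/md0/md/group_thread_cnt
 */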
6649
6650static struct attribute *raid5_attrs[] = {
6651 &raid5_stripecache_size.attr,
6652 &raid5_stripecache_active.attr,
6653 &raid5_preread_bypass_threshold.attr,
6654 &raid5_group_thread_cnt.attr,
6655 &raid5_skip_copy.attr,
6656 &raid5_rmw_level.attr,
6657 &r5c_journal_mode.attr,
6658 &ppl_write_hint.attr,
6659 NULL,
6660};
6661static struct attribute_group raid5_attrs_group = {
6662 .name = NULL,
6663 .attrs = raid5_attrs,
6664};
6665
6666static int alloc_thread_groups(struct r5conf *conf, int cnt,
6667 int *group_cnt,
6668 int *worker_cnt_per_group,
6669 struct r5worker_group **worker_groups)
6670{
6671 int i, j, k;
6672 ssize_t size;
6673 struct r5worker *workers;
6674
6675 *worker_cnt_per_group = cnt;
6676 if (cnt == 0) {
6677 *group_cnt = 0;
6678 *worker_groups = NULL;
6679 return 0;
6680 }
6681 *group_cnt = num_possible_nodes();
6682 size = sizeof(struct r5worker) * cnt;
6683 workers = kcalloc(size, *group_cnt, GFP_NOIO);
6684 *worker_groups = kcalloc(*group_cnt, sizeof(struct r5worker_group),
6685 GFP_NOIO);
6686 if (!*worker_groups || !workers) {
6687 kfree(workers);
6688 kfree(*worker_groups);
6689 return -ENOMEM;
6690 }
6691
6692 for (i = 0; i < *group_cnt; i++) {
6693 struct r5worker_group *group;
6694
6695 group = &(*worker_groups)[i];
6696 INIT_LIST_HEAD(&group->handle_list);
6697 INIT_LIST_HEAD(&group->loprio_list);
6698 group->conf = conf;
6699 group->workers = workers + i * cnt;
6700
6701 for (j = 0; j < cnt; j++) {
6702 struct r5worker *worker = group->workers + j;
6703 worker->group = group;
6704 INIT_WORK(&worker->work, raid5_do_work);
6705
6706 for (k = 0; k < NR_STRIPE_HASH_LOCKS; k++)
6707 INIT_LIST_HEAD(worker->temp_inactive_list + k);
6708 }
6709 }
6710
6711 return 0;
6712}
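/*
 * Worked example for alloc_thread_groups() (illustrative numbers): with
 * cnt == 2 on a machine where num_possible_nodes() == 2, a single array of
 * 2 * 2 r5worker structures is allocated and two r5worker_group entries are
 * created; group 0 uses workers[0..1] and group 1 uses workers[2..3],
 * i.e. group i starts at workers + i * cnt.
 */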
6713
6714static void free_thread_groups(struct r5conf *conf)
6715{
6716 if (conf->worker_groups)
6717 kfree(conf->worker_groups[0].workers);
6718 kfree(conf->worker_groups);
6719 conf->worker_groups = NULL;
6720}
6721
6722static sector_t
6723raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks)
6724{
6725 struct r5conf *conf = mddev->private;
6726
6727 if (!sectors)
6728 sectors = mddev->dev_sectors;
	if (!raid_disks)
		/* size is defined by the smallest of previous and new size */
		raid_disks = min(conf->raid_disks, conf->previous_raid_disks);
6732
6733 sectors &= ~((sector_t)conf->chunk_sectors - 1);
6734 sectors &= ~((sector_t)conf->prev_chunk_sectors - 1);
6735 return sectors * (raid_disks - conf->max_degraded);
6736}
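/*
 * Worked example for raid5_size() (illustrative numbers): a 4-device RAID5
 * (max_degraded == 1) with chunk_sectors == 1024 (and the same previous
 * chunk size) and per-device dev_sectors == 2000000 rounds each device down
 * to 1999872 sectors, a multiple of the chunk size, and reports
 * 1999872 * (4 - 1) == 5999616 sectors of array capacity.
 */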
6737
6738static void free_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu)
6739{
6740 safe_put_page(percpu->spare_page);
6741 percpu->spare_page = NULL;
6742 kvfree(percpu->scribble);
6743 percpu->scribble = NULL;
6744}
6745
6746static int alloc_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu)
6747{
6748 if (conf->level == 6 && !percpu->spare_page) {
6749 percpu->spare_page = alloc_page(GFP_KERNEL);
6750 if (!percpu->spare_page)
6751 return -ENOMEM;
6752 }
6753
6754 if (scribble_alloc(percpu,
6755 max(conf->raid_disks,
6756 conf->previous_raid_disks),
6757 max(conf->chunk_sectors,
6758 conf->prev_chunk_sectors)
6759 / STRIPE_SECTORS,
6760 GFP_KERNEL)) {
6761 free_scratch_buffer(conf, percpu);
6762 return -ENOMEM;
6763 }
6764
6765 return 0;
6766}
6767
6768static int raid456_cpu_dead(unsigned int cpu, struct hlist_node *node)
6769{
6770 struct r5conf *conf = hlist_entry_safe(node, struct r5conf, node);
6771
6772 free_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu));
6773 return 0;
6774}
6775
6776static void raid5_free_percpu(struct r5conf *conf)
6777{
6778 if (!conf->percpu)
6779 return;
6780
6781 cpuhp_state_remove_instance(CPUHP_MD_RAID5_PREPARE, &conf->node);
6782 free_percpu(conf->percpu);
6783}
6784
6785static void free_conf(struct r5conf *conf)
6786{
6787 int i;
6788
6789 log_exit(conf);
6790
6791 unregister_shrinker(&conf->shrinker);
6792 free_thread_groups(conf);
6793 shrink_stripes(conf);
6794 raid5_free_percpu(conf);
6795 for (i = 0; i < conf->pool_size; i++)
6796 if (conf->disks[i].extra_page)
6797 put_page(conf->disks[i].extra_page);
6798 kfree(conf->disks);
6799 bioset_exit(&conf->bio_split);
6800 kfree(conf->stripe_hashtbl);
6801 kfree(conf->pending_data);
6802 kfree(conf);
6803}
6804
6805static int raid456_cpu_up_prepare(unsigned int cpu, struct hlist_node *node)
6806{
6807 struct r5conf *conf = hlist_entry_safe(node, struct r5conf, node);
6808 struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu);
6809
6810 if (alloc_scratch_buffer(conf, percpu)) {
6811 pr_warn("%s: failed memory allocation for cpu%u\n",
6812 __func__, cpu);
6813 return -ENOMEM;
6814 }
6815 return 0;
6816}
6817
6818static int raid5_alloc_percpu(struct r5conf *conf)
6819{
6820 int err = 0;
6821
6822 conf->percpu = alloc_percpu(struct raid5_percpu);
6823 if (!conf->percpu)
6824 return -ENOMEM;
6825
6826 err = cpuhp_state_add_instance(CPUHP_MD_RAID5_PREPARE, &conf->node);
6827 if (!err) {
6828 conf->scribble_disks = max(conf->raid_disks,
6829 conf->previous_raid_disks);
6830 conf->scribble_sectors = max(conf->chunk_sectors,
6831 conf->prev_chunk_sectors);
6832 }
6833 return err;
6834}
6835
6836static unsigned long raid5_cache_scan(struct shrinker *shrink,
6837 struct shrink_control *sc)
6838{
6839 struct r5conf *conf = container_of(shrink, struct r5conf, shrinker);
6840 unsigned long ret = SHRINK_STOP;
6841
6842 if (mutex_trylock(&conf->cache_size_mutex)) {
		ret = 0;
6844 while (ret < sc->nr_to_scan &&
6845 conf->max_nr_stripes > conf->min_nr_stripes) {
6846 if (drop_one_stripe(conf) == 0) {
6847 ret = SHRINK_STOP;
6848 break;
6849 }
6850 ret++;
6851 }
6852 mutex_unlock(&conf->cache_size_mutex);
6853 }
6854 return ret;
6855}
6856
6857static unsigned long raid5_cache_count(struct shrinker *shrink,
6858 struct shrink_control *sc)
6859{
6860 struct r5conf *conf = container_of(shrink, struct r5conf, shrinker);
6861
	if (conf->max_nr_stripes < conf->min_nr_stripes)
		/* unlikely, but not impossible */
		return 0;
6865 return conf->max_nr_stripes - conf->min_nr_stripes;
6866}
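/*
 * Note on the stripe cache shrinker: raid5_cache_count() advertises only the
 * stripes above conf->min_nr_stripes as reclaimable, and raid5_cache_scan()
 * drops them one at a time while the cache stays above that floor.  For
 * example (illustrative numbers), with max_nr_stripes == 512 and
 * min_nr_stripes == 256 the shrinker sees 256 reclaimable objects and can
 * shrink the cache back down to 256 stripes under memory pressure.
 */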
6867
6868static struct r5conf *setup_conf(struct mddev *mddev)
6869{
6870 struct r5conf *conf;
6871 int raid_disk, memory, max_disks;
6872 struct md_rdev *rdev;
6873 struct disk_info *disk;
6874 char pers_name[6];
6875 int i;
6876 int group_cnt, worker_cnt_per_group;
6877 struct r5worker_group *new_group;
6878 int ret;
6879
6880 if (mddev->new_level != 5
6881 && mddev->new_level != 4
6882 && mddev->new_level != 6) {
6883 pr_warn("md/raid:%s: raid level not set to 4/5/6 (%d)\n",
6884 mdname(mddev), mddev->new_level);
6885 return ERR_PTR(-EIO);
6886 }
6887 if ((mddev->new_level == 5
6888 && !algorithm_valid_raid5(mddev->new_layout)) ||
6889 (mddev->new_level == 6
6890 && !algorithm_valid_raid6(mddev->new_layout))) {
6891 pr_warn("md/raid:%s: layout %d not supported\n",
6892 mdname(mddev), mddev->new_layout);
6893 return ERR_PTR(-EIO);
6894 }
6895 if (mddev->new_level == 6 && mddev->raid_disks < 4) {
6896 pr_warn("md/raid:%s: not enough configured devices (%d, minimum 4)\n",
6897 mdname(mddev), mddev->raid_disks);
6898 return ERR_PTR(-EINVAL);
6899 }
6900
6901 if (!mddev->new_chunk_sectors ||
6902 (mddev->new_chunk_sectors << 9) % PAGE_SIZE ||
6903 !is_power_of_2(mddev->new_chunk_sectors)) {
6904 pr_warn("md/raid:%s: invalid chunk size %d\n",
6905 mdname(mddev), mddev->new_chunk_sectors << 9);
6906 return ERR_PTR(-EINVAL);
6907 }
6908
6909 conf = kzalloc(sizeof(struct r5conf), GFP_KERNEL);
6910 if (conf == NULL)
6911 goto abort;
6912 INIT_LIST_HEAD(&conf->free_list);
6913 INIT_LIST_HEAD(&conf->pending_list);
6914 conf->pending_data = kcalloc(PENDING_IO_MAX,
6915 sizeof(struct r5pending_data),
6916 GFP_KERNEL);
6917 if (!conf->pending_data)
6918 goto abort;
6919 for (i = 0; i < PENDING_IO_MAX; i++)
6920 list_add(&conf->pending_data[i].sibling, &conf->free_list);
6921
6922 if (!alloc_thread_groups(conf, 0, &group_cnt, &worker_cnt_per_group,
6923 &new_group)) {
6924 conf->group_cnt = group_cnt;
6925 conf->worker_cnt_per_group = worker_cnt_per_group;
6926 conf->worker_groups = new_group;
6927 } else
6928 goto abort;
6929 spin_lock_init(&conf->device_lock);
6930 seqcount_init(&conf->gen_lock);
6931 mutex_init(&conf->cache_size_mutex);
6932 init_waitqueue_head(&conf->wait_for_quiescent);
6933 init_waitqueue_head(&conf->wait_for_stripe);
6934 init_waitqueue_head(&conf->wait_for_overlap);
6935 INIT_LIST_HEAD(&conf->handle_list);
6936 INIT_LIST_HEAD(&conf->loprio_list);
6937 INIT_LIST_HEAD(&conf->hold_list);
6938 INIT_LIST_HEAD(&conf->delayed_list);
6939 INIT_LIST_HEAD(&conf->bitmap_list);
6940 init_llist_head(&conf->released_stripes);
6941 atomic_set(&conf->active_stripes, 0);
6942 atomic_set(&conf->preread_active_stripes, 0);
6943 atomic_set(&conf->active_aligned_reads, 0);
6944 spin_lock_init(&conf->pending_bios_lock);
6945 conf->batch_bio_dispatch = true;
6946 rdev_for_each(rdev, mddev) {
6947 if (test_bit(Journal, &rdev->flags))
6948 continue;
6949 if (blk_queue_nonrot(bdev_get_queue(rdev->bdev))) {
6950 conf->batch_bio_dispatch = false;
6951 break;
6952 }
6953 }
6954
6955 conf->bypass_threshold = BYPASS_THRESHOLD;
6956 conf->recovery_disabled = mddev->recovery_disabled - 1;
6957
6958 conf->raid_disks = mddev->raid_disks;
6959 if (mddev->reshape_position == MaxSector)
6960 conf->previous_raid_disks = mddev->raid_disks;
6961 else
6962 conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks;
6963 max_disks = max(conf->raid_disks, conf->previous_raid_disks);
6964
6965 conf->disks = kcalloc(max_disks, sizeof(struct disk_info),
6966 GFP_KERNEL);
6967
6968 if (!conf->disks)
6969 goto abort;
6970
6971 for (i = 0; i < max_disks; i++) {
6972 conf->disks[i].extra_page = alloc_page(GFP_KERNEL);
6973 if (!conf->disks[i].extra_page)
6974 goto abort;
6975 }
6976
6977 ret = bioset_init(&conf->bio_split, BIO_POOL_SIZE, 0, 0);
6978 if (ret)
6979 goto abort;
6980 conf->mddev = mddev;
6981
6982 if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
6983 goto abort;
6984
	/* We init hash_locks[0] separately so that it can be used
	 * as the reference lock in the spin_lock_nest_lock() call in
	 * lock_all_device_hash_locks_irq(), in order to convince lockdep
	 * that we know what we are doing.
	 */
	spin_lock_init(conf->hash_locks);
	for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++)
		spin_lock_init(conf->hash_locks + i);
6993
6994 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
6995 INIT_LIST_HEAD(conf->inactive_list + i);
6996
6997 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
6998 INIT_LIST_HEAD(conf->temp_inactive_list + i);
6999
7000 atomic_set(&conf->r5c_cached_full_stripes, 0);
7001 INIT_LIST_HEAD(&conf->r5c_full_stripe_list);
7002 atomic_set(&conf->r5c_cached_partial_stripes, 0);
7003 INIT_LIST_HEAD(&conf->r5c_partial_stripe_list);
7004 atomic_set(&conf->r5c_flushing_full_stripes, 0);
7005 atomic_set(&conf->r5c_flushing_partial_stripes, 0);
7006
7007 conf->level = mddev->new_level;
7008 conf->chunk_sectors = mddev->new_chunk_sectors;
7009 if (raid5_alloc_percpu(conf) != 0)
7010 goto abort;
7011
7012 pr_debug("raid456: run(%s) called.\n", mdname(mddev));
7013
7014 rdev_for_each(rdev, mddev) {
7015 raid_disk = rdev->raid_disk;
7016 if (raid_disk >= max_disks
7017 || raid_disk < 0 || test_bit(Journal, &rdev->flags))
7018 continue;
7019 disk = conf->disks + raid_disk;
7020
7021 if (test_bit(Replacement, &rdev->flags)) {
7022 if (disk->replacement)
7023 goto abort;
7024 disk->replacement = rdev;
7025 } else {
7026 if (disk->rdev)
7027 goto abort;
7028 disk->rdev = rdev;
7029 }
7030
7031 if (test_bit(In_sync, &rdev->flags)) {
7032 char b[BDEVNAME_SIZE];
7033 pr_info("md/raid:%s: device %s operational as raid disk %d\n",
7034 mdname(mddev), bdevname(rdev->bdev, b), raid_disk);
		} else if (rdev->saved_raid_disk != raid_disk)
			/* Cannot rely on bitmap to complete recovery */
			conf->fullsync = 1;
7038 }
7039
7040 conf->level = mddev->new_level;
7041 if (conf->level == 6) {
7042 conf->max_degraded = 2;
7043 if (raid6_call.xor_syndrome)
7044 conf->rmw_level = PARITY_ENABLE_RMW;
7045 else
7046 conf->rmw_level = PARITY_DISABLE_RMW;
7047 } else {
7048 conf->max_degraded = 1;
7049 conf->rmw_level = PARITY_ENABLE_RMW;
7050 }
7051 conf->algorithm = mddev->new_layout;
7052 conf->reshape_progress = mddev->reshape_position;
7053 if (conf->reshape_progress != MaxSector) {
7054 conf->prev_chunk_sectors = mddev->chunk_sectors;
7055 conf->prev_algo = mddev->layout;
7056 } else {
7057 conf->prev_chunk_sectors = conf->chunk_sectors;
7058 conf->prev_algo = conf->algorithm;
7059 }
7060
7061 conf->min_nr_stripes = NR_STRIPES;
7062 if (mddev->reshape_position != MaxSector) {
7063 int stripes = max_t(int,
7064 ((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4,
7065 ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4);
7066 conf->min_nr_stripes = max(NR_STRIPES, stripes);
7067 if (conf->min_nr_stripes != NR_STRIPES)
7068 pr_info("md/raid:%s: force stripe size %d for reshape\n",
7069 mdname(mddev), conf->min_nr_stripes);
7070 }
7071 memory = conf->min_nr_stripes * (sizeof(struct stripe_head) +
7072 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
7073 atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS);
7074 if (grow_stripes(conf, conf->min_nr_stripes)) {
7075 pr_warn("md/raid:%s: couldn't allocate %dkB for buffers\n",
7076 mdname(mddev), memory);
7077 goto abort;
7078 } else
7079 pr_debug("md/raid:%s: allocated %dkB\n", mdname(mddev), memory);
7080
	/*
	 * Losing a stripe head costs more than the time to refill it, so
	 * weight the shrinker accordingly: each cached stripe is treated as
	 * much more expensive to recreate than an ordinary page.
	 */
	conf->shrinker.seeks = DEFAULT_SEEKS * conf->raid_disks * 4;
7086 conf->shrinker.scan_objects = raid5_cache_scan;
7087 conf->shrinker.count_objects = raid5_cache_count;
7088 conf->shrinker.batch = 128;
7089 conf->shrinker.flags = 0;
7090 if (register_shrinker(&conf->shrinker)) {
7091 pr_warn("md/raid:%s: couldn't register shrinker.\n",
7092 mdname(mddev));
7093 goto abort;
7094 }
7095
7096 sprintf(pers_name, "raid%d", mddev->new_level);
7097 conf->thread = md_register_thread(raid5d, mddev, pers_name);
7098 if (!conf->thread) {
7099 pr_warn("md/raid:%s: couldn't allocate thread.\n",
7100 mdname(mddev));
7101 goto abort;
7102 }
7103
7104 return conf;
7105
7106 abort:
7107 if (conf) {
7108 free_conf(conf);
7109 return ERR_PTR(-EIO);
7110 } else
7111 return ERR_PTR(-ENOMEM);
7112}
7113
7114static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded)
7115{
7116 switch (algo) {
7117 case ALGORITHM_PARITY_0:
7118 if (raid_disk < max_degraded)
7119 return 1;
7120 break;
7121 case ALGORITHM_PARITY_N:
7122 if (raid_disk >= raid_disks - max_degraded)
7123 return 1;
7124 break;
7125 case ALGORITHM_PARITY_0_6:
7126 if (raid_disk == 0 ||
7127 raid_disk == raid_disks - 1)
7128 return 1;
7129 break;
7130 case ALGORITHM_LEFT_ASYMMETRIC_6:
7131 case ALGORITHM_RIGHT_ASYMMETRIC_6:
7132 case ALGORITHM_LEFT_SYMMETRIC_6:
7133 case ALGORITHM_RIGHT_SYMMETRIC_6:
7134 if (raid_disk == raid_disks - 1)
7135 return 1;
7136 }
7137 return 0;
7138}
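/*
 * Worked example for only_parity() (illustrative): on a 4-device RAID5 using
 * ALGORITHM_PARITY_0 (max_degraded == 1), disk 0 holds nothing but parity,
 * so only_parity(0, ALGORITHM_PARITY_0, 4, 1) returns 1 while the
 * data-bearing disks 1..3 return 0.  raid5_run() counts such out-of-date,
 * parity-only devices as dirty_parity_disks, which lets a not-clean array
 * still start when only regenerable parity is stale.
 */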
7139
7140static int raid5_run(struct mddev *mddev)
7141{
7142 struct r5conf *conf;
7143 int working_disks = 0;
7144 int dirty_parity_disks = 0;
7145 struct md_rdev *rdev;
7146 struct md_rdev *journal_dev = NULL;
7147 sector_t reshape_offset = 0;
7148 int i;
7149 long long min_offset_diff = 0;
7150 int first = 1;
7151
7152 if (mddev_init_writes_pending(mddev) < 0)
7153 return -ENOMEM;
7154
7155 if (mddev->recovery_cp != MaxSector)
7156 pr_notice("md/raid:%s: not clean -- starting background reconstruction\n",
7157 mdname(mddev));
7158
7159 rdev_for_each(rdev, mddev) {
7160 long long diff;
7161
7162 if (test_bit(Journal, &rdev->flags)) {
7163 journal_dev = rdev;
7164 continue;
7165 }
7166 if (rdev->raid_disk < 0)
7167 continue;
7168 diff = (rdev->new_data_offset - rdev->data_offset);
7169 if (first) {
7170 min_offset_diff = diff;
7171 first = 0;
7172 } else if (mddev->reshape_backwards &&
7173 diff < min_offset_diff)
7174 min_offset_diff = diff;
7175 else if (!mddev->reshape_backwards &&
7176 diff > min_offset_diff)
7177 min_offset_diff = diff;
7178 }
7179
7180 if ((test_bit(MD_HAS_JOURNAL, &mddev->flags) || journal_dev) &&
7181 (mddev->bitmap_info.offset || mddev->bitmap_info.file)) {
7182 pr_notice("md/raid:%s: array cannot have both journal and bitmap\n",
7183 mdname(mddev));
7184 return -EINVAL;
7185 }
7186
	if (mddev->reshape_position != MaxSector) {
		/* Check that we can continue the reshape.
		 * Difficulties arise if the stripe we would write to
		 * next is at or after the stripe we would read from next.
		 * For a reshape that changes the number of devices, this
		 * is only possible for a very short time, and mdadm makes
		 * sure that time appears to have passed before assembling
		 * the array.  So we fail if that time hasn't passed.
		 * For a reshape that keeps the number of devices the same
		 * mdadm must be monitoring the reshape and keeping the
		 * critical areas read-only and backed up.  It will start
		 * the array in read-only mode, so we check for that.
		 */
		sector_t here_new, here_old;
7201 int old_disks;
7202 int max_degraded = (mddev->level == 6 ? 2 : 1);
7203 int chunk_sectors;
7204 int new_data_disks;
7205
7206 if (journal_dev) {
7207 pr_warn("md/raid:%s: don't support reshape with journal - aborting.\n",
7208 mdname(mddev));
7209 return -EINVAL;
7210 }
7211
7212 if (mddev->new_level != mddev->level) {
7213 pr_warn("md/raid:%s: unsupported reshape required - aborting.\n",
7214 mdname(mddev));
7215 return -EINVAL;
7216 }
7217 old_disks = mddev->raid_disks - mddev->delta_disks;
		/* reshape_position must be on a new-stripe boundary, and one
		 * further up in new geometry must map after here in old
		 * geometry.
		 * If the chunk sizes are different, then as we perform reshape
		 * in units of the largest of the two, reshape_position needs
		 * to be a multiple of the largest chunk size times new data
		 * disks.
		 */
		here_new = mddev->reshape_position;
		chunk_sectors = max(mddev->chunk_sectors, mddev->new_chunk_sectors);
		new_data_disks = mddev->raid_disks - max_degraded;
		if (sector_div(here_new, chunk_sectors * new_data_disks)) {
7229 pr_warn("md/raid:%s: reshape_position not on a stripe boundary\n",
7230 mdname(mddev));
7231 return -EINVAL;
7232 }
7233 reshape_offset = here_new * chunk_sectors;
7234
7235 here_old = mddev->reshape_position;
7236 sector_div(here_old, chunk_sectors * (old_disks-max_degraded));
7237
7238
		if (mddev->delta_disks == 0) {
			/* We cannot be sure it is safe to start an in-place
			 * reshape.  It is only safe if user-space is
			 * monitoring and taking constant backups.
			 * mdadm always starts a situation like this in
			 * readonly mode so it can take control before
			 * allowing any writes.  So just check for that.
			 */
			if (abs(min_offset_diff) >= mddev->chunk_sectors &&
			    abs(min_offset_diff) >= mddev->new_chunk_sectors)
				/* not really in-place - so OK */;
7250 else if (mddev->ro == 0) {
7251 pr_warn("md/raid:%s: in-place reshape must be started in read-only mode - aborting\n",
7252 mdname(mddev));
7253 return -EINVAL;
7254 }
7255 } else if (mddev->reshape_backwards
7256 ? (here_new * chunk_sectors + min_offset_diff <=
7257 here_old * chunk_sectors)
7258 : (here_new * chunk_sectors >=
7259 here_old * chunk_sectors + (-min_offset_diff))) {
			/* Reading from the same stripe as writing to - bad */
7261 pr_warn("md/raid:%s: reshape_position too early for auto-recovery - aborting.\n",
7262 mdname(mddev));
7263 return -EINVAL;
7264 }
7265 pr_debug("md/raid:%s: reshape will continue\n", mdname(mddev));
7266
7267 } else {
7268 BUG_ON(mddev->level != mddev->new_level);
7269 BUG_ON(mddev->layout != mddev->new_layout);
7270 BUG_ON(mddev->chunk_sectors != mddev->new_chunk_sectors);
7271 BUG_ON(mddev->delta_disks != 0);
7272 }
7273
7274 if (test_bit(MD_HAS_JOURNAL, &mddev->flags) &&
7275 test_bit(MD_HAS_PPL, &mddev->flags)) {
7276 pr_warn("md/raid:%s: using journal device and PPL not allowed - disabling PPL\n",
7277 mdname(mddev));
7278 clear_bit(MD_HAS_PPL, &mddev->flags);
7279 clear_bit(MD_HAS_MULTIPLE_PPLS, &mddev->flags);
7280 }
7281
7282 if (mddev->private == NULL)
7283 conf = setup_conf(mddev);
7284 else
7285 conf = mddev->private;
7286
7287 if (IS_ERR(conf))
7288 return PTR_ERR(conf);
7289
7290 if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
7291 if (!journal_dev) {
7292 pr_warn("md/raid:%s: journal disk is missing, force array readonly\n",
7293 mdname(mddev));
7294 mddev->ro = 1;
7295 set_disk_ro(mddev->gendisk, 1);
7296 } else if (mddev->recovery_cp == MaxSector)
7297 set_bit(MD_JOURNAL_CLEAN, &mddev->flags);
7298 }
7299
7300 conf->min_offset_diff = min_offset_diff;
7301 mddev->thread = conf->thread;
7302 conf->thread = NULL;
7303 mddev->private = conf;
7304
7305 for (i = 0; i < conf->raid_disks && conf->previous_raid_disks;
7306 i++) {
7307 rdev = conf->disks[i].rdev;
7308 if (!rdev && conf->disks[i].replacement) {
7309
7310 rdev = conf->disks[i].replacement;
7311 conf->disks[i].replacement = NULL;
7312 clear_bit(Replacement, &rdev->flags);
7313 conf->disks[i].rdev = rdev;
7314 }
7315 if (!rdev)
7316 continue;
7317 if (conf->disks[i].replacement &&
7318 conf->reshape_progress != MaxSector) {
7319
7320 pr_warn("md: cannot handle concurrent replacement and reshape.\n");
7321 goto abort;
7322 }
7323 if (test_bit(In_sync, &rdev->flags)) {
7324 working_disks++;
7325 continue;
7326 }
		/* This disc is not fully in-sync.  However if it just stored
		 * parity (beyond the recovery_offset), then we don't need to
		 * think about the array being dirty.
		 * When reshape goes 'backwards', we never have partially
		 * completed devices, so we only need to check against
		 * reshape_offset at the start of reshape.
		 */
		/* Hack because v0.91 doesn't store recovery_offset properly. */
		if (mddev->major_version == 0 &&
		    mddev->minor_version > 90)
			rdev->recovery_offset = reshape_offset;

		if (rdev->recovery_offset < reshape_offset) {
			/* We need to check old and new layout */
7342 if (!only_parity(rdev->raid_disk,
7343 conf->algorithm,
7344 conf->raid_disks,
7345 conf->max_degraded))
7346 continue;
7347 }
7348 if (!only_parity(rdev->raid_disk,
7349 conf->prev_algo,
7350 conf->previous_raid_disks,
7351 conf->max_degraded))
7352 continue;
7353 dirty_parity_disks++;
7354 }
7355
	/*
	 * 0 for a fully functional array, 1 or 2 for a degraded array.
	 */
	mddev->degraded = raid5_calc_degraded(conf);
7360
7361 if (has_failed(conf)) {
7362 pr_crit("md/raid:%s: not enough operational devices (%d/%d failed)\n",
7363 mdname(mddev), mddev->degraded, conf->raid_disks);
7364 goto abort;
7365 }
7366
	/* device size must be a multiple of chunk size */
	mddev->dev_sectors &= ~(mddev->chunk_sectors - 1);
7369 mddev->resync_max_sectors = mddev->dev_sectors;
7370
7371 if (mddev->degraded > dirty_parity_disks &&
7372 mddev->recovery_cp != MaxSector) {
7373 if (test_bit(MD_HAS_PPL, &mddev->flags))
7374 pr_crit("md/raid:%s: starting dirty degraded array with PPL.\n",
7375 mdname(mddev));
7376 else if (mddev->ok_start_degraded)
7377 pr_crit("md/raid:%s: starting dirty degraded array - data corruption possible.\n",
7378 mdname(mddev));
7379 else {
7380 pr_crit("md/raid:%s: cannot start dirty degraded array.\n",
7381 mdname(mddev));
7382 goto abort;
7383 }
7384 }
7385
7386 pr_info("md/raid:%s: raid level %d active with %d out of %d devices, algorithm %d\n",
7387 mdname(mddev), conf->level,
7388 mddev->raid_disks-mddev->degraded, mddev->raid_disks,
7389 mddev->new_layout);
7390
7391 print_raid5_conf(conf);
7392
7393 if (conf->reshape_progress != MaxSector) {
7394 conf->reshape_safe = conf->reshape_progress;
7395 atomic_set(&conf->reshape_stripes, 0);
7396 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
7397 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
7398 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
7399 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
7400 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
7401 "reshape");
7402 if (!mddev->sync_thread)
7403 goto abort;
7404 }
7405
7406
7407 if (mddev->to_remove == &raid5_attrs_group)
7408 mddev->to_remove = NULL;
7409 else if (mddev->kobj.sd &&
7410 sysfs_create_group(&mddev->kobj, &raid5_attrs_group))
7411 pr_warn("raid5: failed to create sysfs attributes for %s\n",
7412 mdname(mddev));
7413 md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
7414
7415 if (mddev->queue) {
7416 int chunk_size;
		/* read-ahead size must cover two whole stripes, which
		 * is 2 * (number of data disks) * chunksize.
		 */
		int data_disks = conf->previous_raid_disks - conf->max_degraded;
		int stripe = data_disks *
			((mddev->chunk_sectors << 9) / PAGE_SIZE);
		if (mddev->queue->backing_dev_info->ra_pages < 2 * stripe)
			mddev->queue->backing_dev_info->ra_pages = 2 * stripe;
7426
7427 chunk_size = mddev->chunk_sectors << 9;
7428 blk_queue_io_min(mddev->queue, chunk_size);
7429 blk_queue_io_opt(mddev->queue, chunk_size *
7430 (conf->raid_disks - conf->max_degraded));
7431 mddev->queue->limits.raid_partial_stripes_expensive = 1;

		/*
		 * We can only discard a whole stripe.  It doesn't make sense
		 * to discard the data disks but write the parity disk.
		 */
		stripe = stripe * PAGE_SIZE;
		/* Round up to power of 2, as discard granularity is not well
		 * defined yet.
		 */
		while ((stripe-1) & stripe)
			stripe = (stripe | (stripe-1)) + 1;
		mddev->queue->limits.discard_alignment = stripe;
		mddev->queue->limits.discard_granularity = stripe;
7443
7444 blk_queue_max_write_same_sectors(mddev->queue, 0);
7445 blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
7446
7447 rdev_for_each(rdev, mddev) {
7448 disk_stack_limits(mddev->gendisk, rdev->bdev,
7449 rdev->data_offset << 9);
7450 disk_stack_limits(mddev->gendisk, rdev->bdev,
7451 rdev->new_data_offset << 9);
7452 }

		/*
		 * zeroing is required, otherwise data could be lost.
		 * Consider a scenario: discard a stripe (the stripe could be
		 * inconsistent if discard_zeroes_data is 0); write one disk
		 * of the stripe (the stripe could be inconsistent again
		 * depending on which disks are used to calculate parity);
		 * the disk is broken; the stripe data of this disk is lost.
		 *
		 * We only enable DISCARD if the sysadmin has confirmed, via
		 * the devices_handle_discard_safely module parameter, that
		 * all devices reliably return zeroes for discarded regions,
		 * and the stacked limits are large enough to cover a whole
		 * stripe.
		 */
		if (devices_handle_discard_safely &&
		    mddev->queue->limits.max_discard_sectors >= (stripe >> 9) &&
		    mddev->queue->limits.discard_granularity >= stripe)
7472 blk_queue_flag_set(QUEUE_FLAG_DISCARD,
7473 mddev->queue);
7474 else
7475 blk_queue_flag_clear(QUEUE_FLAG_DISCARD,
7476 mddev->queue);
7477
7478 blk_queue_max_hw_sectors(mddev->queue, UINT_MAX);
7479 }
7480
7481 if (log_init(conf, journal_dev, raid5_has_ppl(conf)))
7482 goto abort;
7483
7484 return 0;
7485abort:
7486 md_unregister_thread(&mddev->thread);
7487 print_raid5_conf(conf);
7488 free_conf(conf);
7489 mddev->private = NULL;
7490 pr_warn("md/raid:%s: failed to run raid set.\n", mdname(mddev));
7491 return -EIO;
7492}
7493
7494static void raid5_free(struct mddev *mddev, void *priv)
7495{
7496 struct r5conf *conf = priv;
7497
7498 free_conf(conf);
7499 mddev->to_remove = &raid5_attrs_group;
7500}
7501
7502static void raid5_status(struct seq_file *seq, struct mddev *mddev)
7503{
7504 struct r5conf *conf = mddev->private;
7505 int i;
7506
7507 seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level,
7508 conf->chunk_sectors / 2, mddev->layout);
	seq_printf(seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded);
	rcu_read_lock();
	for (i = 0; i < conf->raid_disks; i++) {
		struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
		seq_printf(seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
	}
	rcu_read_unlock();
	seq_printf(seq, "]");
7517}
7518
7519static void print_raid5_conf (struct r5conf *conf)
7520{
7521 int i;
7522 struct disk_info *tmp;
7523
7524 pr_debug("RAID conf printout:\n");
7525 if (!conf) {
7526 pr_debug("(conf==NULL)\n");
7527 return;
7528 }
7529 pr_debug(" --- level:%d rd:%d wd:%d\n", conf->level,
7530 conf->raid_disks,
7531 conf->raid_disks - conf->mddev->degraded);
7532
7533 for (i = 0; i < conf->raid_disks; i++) {
7534 char b[BDEVNAME_SIZE];
7535 tmp = conf->disks + i;
7536 if (tmp->rdev)
7537 pr_debug(" disk %d, o:%d, dev:%s\n",
7538 i, !test_bit(Faulty, &tmp->rdev->flags),
7539 bdevname(tmp->rdev->bdev, b));
7540 }
7541}
7542
7543static int raid5_spare_active(struct mddev *mddev)
7544{
7545 int i;
7546 struct r5conf *conf = mddev->private;
7547 struct disk_info *tmp;
7548 int count = 0;
7549 unsigned long flags;
7550
7551 for (i = 0; i < conf->raid_disks; i++) {
7552 tmp = conf->disks + i;
7553 if (tmp->replacement
7554 && tmp->replacement->recovery_offset == MaxSector
7555 && !test_bit(Faulty, &tmp->replacement->flags)
7556 && !test_and_set_bit(In_sync, &tmp->replacement->flags)) {
			/* Replacement has just become active. */
			if (!tmp->rdev
			    || !test_and_clear_bit(In_sync, &tmp->rdev->flags))
				count++;
			if (tmp->rdev) {
				/* Replaced device not technically faulty,
				 * but we need to be sure it gets removed
				 * and never re-added.
				 */
				set_bit(Faulty, &tmp->rdev->flags);
7567 sysfs_notify_dirent_safe(
7568 tmp->rdev->sysfs_state);
7569 }
7570 sysfs_notify_dirent_safe(tmp->replacement->sysfs_state);
7571 } else if (tmp->rdev
7572 && tmp->rdev->recovery_offset == MaxSector
7573 && !test_bit(Faulty, &tmp->rdev->flags)
7574 && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
7575 count++;
7576 sysfs_notify_dirent_safe(tmp->rdev->sysfs_state);
7577 }
7578 }
7579 spin_lock_irqsave(&conf->device_lock, flags);
7580 mddev->degraded = raid5_calc_degraded(conf);
7581 spin_unlock_irqrestore(&conf->device_lock, flags);
7582 print_raid5_conf(conf);
7583 return count;
7584}
7585
7586static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
7587{
7588 struct r5conf *conf = mddev->private;
7589 int err = 0;
7590 int number = rdev->raid_disk;
7591 struct md_rdev **rdevp;
7592 struct disk_info *p = conf->disks + number;
7593
7594 print_raid5_conf(conf);
7595 if (test_bit(Journal, &rdev->flags) && conf->log) {
		/*
		 * We can't wait for pending writes here, as this is called
		 * from raid5d and waiting would deadlock; there is also no
		 * locking against new writes, so only allow removal of the
		 * journal while the cache is completely idle.
		 */
7602 if (atomic_read(&conf->active_stripes) ||
7603 atomic_read(&conf->r5c_cached_full_stripes) ||
7604 atomic_read(&conf->r5c_cached_partial_stripes)) {
7605 return -EBUSY;
7606 }
7607 log_exit(conf);
7608 return 0;
7609 }
7610 if (rdev == p->rdev)
7611 rdevp = &p->rdev;
7612 else if (rdev == p->replacement)
7613 rdevp = &p->replacement;
7614 else
7615 return 0;
7616
7617 if (number >= conf->raid_disks &&
7618 conf->reshape_progress == MaxSector)
7619 clear_bit(In_sync, &rdev->flags);
7620
7621 if (test_bit(In_sync, &rdev->flags) ||
7622 atomic_read(&rdev->nr_pending)) {
7623 err = -EBUSY;
7624 goto abort;
7625 }
	/* Only remove non-faulty devices if recovery
	 * is not possible.
	 */
7629 if (!test_bit(Faulty, &rdev->flags) &&
7630 mddev->recovery_disabled != conf->recovery_disabled &&
7631 !has_failed(conf) &&
7632 (!p->replacement || p->replacement == rdev) &&
7633 number < conf->raid_disks) {
7634 err = -EBUSY;
7635 goto abort;
7636 }
7637 *rdevp = NULL;
7638 if (!test_bit(RemoveSynchronized, &rdev->flags)) {
7639 synchronize_rcu();
7640 if (atomic_read(&rdev->nr_pending)) {
			/* lost the race, try later */
7642 err = -EBUSY;
7643 *rdevp = rdev;
7644 }
7645 }
7646 if (!err) {
7647 err = log_modify(conf, rdev, false);
7648 if (err)
7649 goto abort;
7650 }
	if (p->replacement) {
		/* We must have just cleared 'rdev' */
		p->rdev = p->replacement;
		clear_bit(Replacement, &p->replacement->flags);
		smp_mb(); /* Make sure other CPUs may see both as identical
			   * but will never see neither - if they are careful.
			   */
		p->replacement = NULL;

		if (!err)
			err = log_modify(conf, p->rdev, true);
7662 }
7663
7664 clear_bit(WantReplacement, &rdev->flags);
7665abort:
7666
7667 print_raid5_conf(conf);
7668 return err;
7669}
7670
7671static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
7672{
7673 struct r5conf *conf = mddev->private;
7674 int ret, err = -EEXIST;
7675 int disk;
7676 struct disk_info *p;
7677 int first = 0;
7678 int last = conf->raid_disks - 1;
7679
7680 if (test_bit(Journal, &rdev->flags)) {
7681 if (conf->log)
7682 return -EBUSY;
7683
		rdev->raid_disk = 0;
		/*
		 * The array is in readonly mode if the journal is missing, so
		 * no write requests are running.  We should be safe.
		 */
7689 ret = log_init(conf, rdev, false);
7690 if (ret)
7691 return ret;
7692
7693 ret = r5l_start(conf->log);
7694 if (ret)
7695 return ret;
7696
7697 return 0;
7698 }
7699 if (mddev->recovery_disabled == conf->recovery_disabled)
7700 return -EBUSY;
7701
	if (rdev->saved_raid_disk < 0 && has_failed(conf))
		/* no point adding a device */
		return -EINVAL;
7705
7706 if (rdev->raid_disk >= 0)
7707 first = last = rdev->raid_disk;
7708
	/*
	 * find the disk ... but prefer rdev->saved_raid_disk
	 * if possible.
	 */
	if (rdev->saved_raid_disk >= 0 &&
	    rdev->saved_raid_disk >= first &&
	    conf->disks[rdev->saved_raid_disk].rdev == NULL)
		first = rdev->saved_raid_disk;
7717
7718 for (disk = first; disk <= last; disk++) {
7719 p = conf->disks + disk;
7720 if (p->rdev == NULL) {
7721 clear_bit(In_sync, &rdev->flags);
7722 rdev->raid_disk = disk;
7723 if (rdev->saved_raid_disk != disk)
7724 conf->fullsync = 1;
7725 rcu_assign_pointer(p->rdev, rdev);
7726
7727 err = log_modify(conf, rdev, true);
7728
7729 goto out;
7730 }
7731 }
7732 for (disk = first; disk <= last; disk++) {
7733 p = conf->disks + disk;
7734 if (test_bit(WantReplacement, &p->rdev->flags) &&
7735 p->replacement == NULL) {
7736 clear_bit(In_sync, &rdev->flags);
7737 set_bit(Replacement, &rdev->flags);
7738 rdev->raid_disk = disk;
7739 err = 0;
7740 conf->fullsync = 1;
7741 rcu_assign_pointer(p->replacement, rdev);
7742 break;
7743 }
7744 }
7745out:
7746 print_raid5_conf(conf);
7747 return err;
7748}
7749
7750static int raid5_resize(struct mddev *mddev, sector_t sectors)
7751{
	/* no resync is happening, and there is enough space
	 * on all devices, so we can resize.
	 * We need to make sure resync covers any new space.
	 * If the array is shrinking we should possibly wait until
	 * any io in the removed space completes, but it hardly seems
	 * worth it.
	 */
	sector_t newsize;
	struct r5conf *conf = mddev->private;
7761
7762 if (raid5_has_log(conf) || raid5_has_ppl(conf))
7763 return -EINVAL;
7764 sectors &= ~((sector_t)conf->chunk_sectors - 1);
7765 newsize = raid5_size(mddev, sectors, mddev->raid_disks);
7766 if (mddev->external_size &&
7767 mddev->array_sectors > newsize)
7768 return -EINVAL;
7769 if (mddev->bitmap) {
7770 int ret = md_bitmap_resize(mddev->bitmap, sectors, 0, 0);
7771 if (ret)
7772 return ret;
7773 }
7774 md_set_array_sectors(mddev, newsize);
7775 if (sectors > mddev->dev_sectors &&
7776 mddev->recovery_cp > mddev->dev_sectors) {
7777 mddev->recovery_cp = mddev->dev_sectors;
7778 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
7779 }
7780 mddev->dev_sectors = sectors;
7781 mddev->resync_max_sectors = sectors;
7782 return 0;
7783}
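/*
 * Usage sketch for raid5_resize() (illustrative; /dev/md0 is hypothetical):
 * after every member device has been enlarged, the array can be grown to use
 * the new space with
 *
 *   mdadm --grow /dev/md0 --size=max
 *
 * which ends up here: the new per-device size is rounded down to a chunk
 * multiple, the bitmap is resized, and resync is scheduled to cover the
 * added space.  Arrays using a journal or PPL are rejected.
 */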
7784
7785static int check_stripe_cache(struct mddev *mddev)
7786{
	/* Can only proceed if there are plenty of stripe_heads.
	 * We need a minimum of one full stripe, and for sensible progress
	 * it is best to have about 4 times that.
	 * If we require 4 times, then the default cache is not enough for
	 * large chunk sizes, and stripe_cache_size must be raised before
	 * the reshape can start.
	 */
	struct r5conf *conf = mddev->private;
7796 if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4
7797 > conf->min_nr_stripes ||
7798 ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4
7799 > conf->min_nr_stripes) {
7800 pr_warn("md/raid:%s: reshape: not enough stripes. Needed %lu\n",
7801 mdname(mddev),
7802 ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9)
7803 / STRIPE_SIZE)*4);
7804 return 0;
7805 }
7806 return 1;
7807}
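/*
 * Worked example for check_stripe_cache() (illustrative numbers, assuming a
 * 4 KiB STRIPE_SIZE): with a 512 KiB chunk (chunk_sectors == 1024) a reshape
 * needs ((1024 << 9) / 4096) * 4 == 512 stripe_heads, so a default-sized
 * cache (NR_STRIPES) is typically too small and must be enlarged first,
 * e.g. (with a hypothetical md0)
 *
 *   echo 1024 > /sys/block/md0/md/stripe_cache_size
 */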
7808
7809static int check_reshape(struct mddev *mddev)
7810{
7811 struct r5conf *conf = mddev->private;
7812
7813 if (raid5_has_log(conf) || raid5_has_ppl(conf))
7814 return -EINVAL;
7815 if (mddev->delta_disks == 0 &&
7816 mddev->new_layout == mddev->layout &&
7817 mddev->new_chunk_sectors == mddev->chunk_sectors)
7818 return 0;
7819 if (has_failed(conf))
7820 return -EINVAL;
	if (mddev->delta_disks < 0 && mddev->reshape_position == MaxSector) {
		/* We might be able to shrink, but the devices must
		 * be made bigger first.
		 * For raid6, 4 is the minimum size.
		 * Otherwise 2 is the minimum.
		 */
		int min = 2;
		if (mddev->level == 6)
			min = 4;
		if (mddev->raid_disks + mddev->delta_disks < min)
			return -EINVAL;
7832 }
7833
7834 if (!check_stripe_cache(mddev))
7835 return -ENOSPC;
7836
7837 if (mddev->new_chunk_sectors > mddev->chunk_sectors ||
7838 mddev->delta_disks > 0)
7839 if (resize_chunks(conf,
7840 conf->previous_raid_disks
7841 + max(0, mddev->delta_disks),
7842 max(mddev->new_chunk_sectors,
7843 mddev->chunk_sectors)
7844 ) < 0)
7845 return -ENOMEM;
7846
7847 if (conf->previous_raid_disks + mddev->delta_disks <= conf->pool_size)
7848 return 0;
7849 return resize_stripes(conf, (conf->previous_raid_disks
7850 + mddev->delta_disks));
7851}
7852
7853static int raid5_start_reshape(struct mddev *mddev)
7854{
7855 struct r5conf *conf = mddev->private;
7856 struct md_rdev *rdev;
7857 int spares = 0;
7858 unsigned long flags;
7859
7860 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
7861 return -EBUSY;
7862
7863 if (!check_stripe_cache(mddev))
7864 return -ENOSPC;
7865
7866 if (has_failed(conf))
7867 return -EINVAL;
7868
7869 rdev_for_each(rdev, mddev) {
7870 if (!test_bit(In_sync, &rdev->flags)
7871 && !test_bit(Faulty, &rdev->flags))
7872 spares++;
7873 }
7874
	if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded)
		/* Not enough devices even to make a degraded array
		 * of that size.
		 */
		return -EINVAL;

	/* Refuse to reduce size of the array.  Any reductions in
	 * array size must be through explicit setting of the array_size
	 * attribute.
	 */
7885 if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks)
7886 < mddev->array_sectors) {
7887 pr_warn("md/raid:%s: array size must be reduced before number of disks\n",
7888 mdname(mddev));
7889 return -EINVAL;
7890 }
7891
7892 atomic_set(&conf->reshape_stripes, 0);
7893 spin_lock_irq(&conf->device_lock);
7894 write_seqcount_begin(&conf->gen_lock);
7895 conf->previous_raid_disks = conf->raid_disks;
7896 conf->raid_disks += mddev->delta_disks;
7897 conf->prev_chunk_sectors = conf->chunk_sectors;
7898 conf->chunk_sectors = mddev->new_chunk_sectors;
7899 conf->prev_algo = conf->algorithm;
7900 conf->algorithm = mddev->new_layout;
	conf->generation++;
	/* Code that selects data_offset needs to see the generation update
	 * if reshape_progress has been set - so a memory barrier is needed.
	 */
	smp_mb();
7906 if (mddev->reshape_backwards)
7907 conf->reshape_progress = raid5_size(mddev, 0, 0);
7908 else
7909 conf->reshape_progress = 0;
7910 conf->reshape_safe = conf->reshape_progress;
7911 write_seqcount_end(&conf->gen_lock);
7912 spin_unlock_irq(&conf->device_lock);
7913
	/*
	 * Make sure any IO issued against the old geometry has drained
	 * before new devices are added and the reshape thread starts.
	 */
	mddev_suspend(mddev);
	mddev_resume(mddev);
7920
	/* Add some new drives, as many as will fit.
	 * We know there are enough to make the newly sized array work.
	 * Don't add devices if we are reducing the number of
	 * devices in the array.  This is because it is not possible
	 * to correctly record the "end of reshape" state when devices
	 * are being removed.
	 */
	if (mddev->delta_disks >= 0) {
7929 rdev_for_each(rdev, mddev)
7930 if (rdev->raid_disk < 0 &&
7931 !test_bit(Faulty, &rdev->flags)) {
7932 if (raid5_add_disk(mddev, rdev) == 0) {
7933 if (rdev->raid_disk
7934 >= conf->previous_raid_disks)
7935 set_bit(In_sync, &rdev->flags);
7936 else
7937 rdev->recovery_offset = 0;
7938
				if (sysfs_link_rdev(mddev, rdev))
					/* Failure here is OK */;
			}
		} else if (rdev->raid_disk >= conf->previous_raid_disks
			   && !test_bit(Faulty, &rdev->flags)) {
			/* This is a spare that was manually added */
			set_bit(In_sync, &rdev->flags);
		}

		/* When a reshape changes the number of devices,
		 * ->degraded is measured against the larger of the
		 * pre and post number of devices.
		 */
7952 spin_lock_irqsave(&conf->device_lock, flags);
7953 mddev->degraded = raid5_calc_degraded(conf);
7954 spin_unlock_irqrestore(&conf->device_lock, flags);
7955 }
7956 mddev->raid_disks = conf->raid_disks;
7957 mddev->reshape_position = conf->reshape_progress;
7958 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
7959
7960 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
7961 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
7962 clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
7963 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
7964 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
7965 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
7966 "reshape");
7967 if (!mddev->sync_thread) {
7968 mddev->recovery = 0;
7969 spin_lock_irq(&conf->device_lock);
7970 write_seqcount_begin(&conf->gen_lock);
7971 mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks;
7972 mddev->new_chunk_sectors =
7973 conf->chunk_sectors = conf->prev_chunk_sectors;
7974 mddev->new_layout = conf->algorithm = conf->prev_algo;
7975 rdev_for_each(rdev, mddev)
7976 rdev->new_data_offset = rdev->data_offset;
7977 smp_wmb();
		conf->generation--;
7979 conf->reshape_progress = MaxSector;
7980 mddev->reshape_position = MaxSector;
7981 write_seqcount_end(&conf->gen_lock);
7982 spin_unlock_irq(&conf->device_lock);
7983 return -EAGAIN;
7984 }
7985 conf->reshape_checkpoint = jiffies;
7986 md_wakeup_thread(mddev->sync_thread);
7987 md_new_event(mddev);
7988 return 0;
7989}
7990
/* This is called from the reshape thread and should make any
 * changes needed in 'conf'.
 */
7994static void end_reshape(struct r5conf *conf)
7995{
7996
7997 if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
7998 struct md_rdev *rdev;
7999
8000 spin_lock_irq(&conf->device_lock);
8001 conf->previous_raid_disks = conf->raid_disks;
8002 md_finish_reshape(conf->mddev);
8003 smp_wmb();
8004 conf->reshape_progress = MaxSector;
8005 conf->mddev->reshape_position = MaxSector;
8006 rdev_for_each(rdev, conf->mddev)
8007 if (rdev->raid_disk >= 0 &&
8008 !test_bit(Journal, &rdev->flags) &&
8009 !test_bit(In_sync, &rdev->flags))
8010 rdev->recovery_offset = MaxSector;
8011 spin_unlock_irq(&conf->device_lock);
8012 wake_up(&conf->wait_for_overlap);

		/* read-ahead size must cover two whole stripes, which is
		 * 2 * (number of data disks) * chunksize.
		 */
8017 if (conf->mddev->queue) {
8018 int data_disks = conf->raid_disks - conf->max_degraded;
8019 int stripe = data_disks * ((conf->chunk_sectors << 9)
8020 / PAGE_SIZE);
8021 if (conf->mddev->queue->backing_dev_info->ra_pages < 2 * stripe)
8022 conf->mddev->queue->backing_dev_info->ra_pages = 2 * stripe;
8023 }
8024 }
8025}
8026
/* This is called from the raid5d thread with mddev_lock held.  It makes
 * config changes to the active array.
 */
8030static void raid5_finish_reshape(struct mddev *mddev)
8031{
8032 struct r5conf *conf = mddev->private;
8033
8034 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
8035
8036 if (mddev->delta_disks <= 0) {
8037 int d;
8038 spin_lock_irq(&conf->device_lock);
8039 mddev->degraded = raid5_calc_degraded(conf);
8040 spin_unlock_irq(&conf->device_lock);
8041 for (d = conf->raid_disks ;
8042 d < conf->raid_disks - mddev->delta_disks;
8043 d++) {
8044 struct md_rdev *rdev = conf->disks[d].rdev;
8045 if (rdev)
8046 clear_bit(In_sync, &rdev->flags);
8047 rdev = conf->disks[d].replacement;
8048 if (rdev)
8049 clear_bit(In_sync, &rdev->flags);
8050 }
8051 }
8052 mddev->layout = conf->algorithm;
8053 mddev->chunk_sectors = conf->chunk_sectors;
8054 mddev->reshape_position = MaxSector;
8055 mddev->delta_disks = 0;
8056 mddev->reshape_backwards = 0;
8057 }
8058}
8059
8060static void raid5_quiesce(struct mddev *mddev, int quiesce)
8061{
8062 struct r5conf *conf = mddev->private;
8063
	if (quiesce) {
		/* stop all writes */
		lock_all_device_hash_locks_irq(conf);
		/* '2' tells resync/reshape to pause so that all
		 * active stripes can drain.
		 */
8070 r5c_flush_cache(conf, INT_MAX);
8071 conf->quiesce = 2;
8072 wait_event_cmd(conf->wait_for_quiescent,
8073 atomic_read(&conf->active_stripes) == 0 &&
8074 atomic_read(&conf->active_aligned_reads) == 0,
8075 unlock_all_device_hash_locks_irq(conf),
8076 lock_all_device_hash_locks_irq(conf));
8077 conf->quiesce = 1;
8078 unlock_all_device_hash_locks_irq(conf);
		/* allow reshape to continue */
		wake_up(&conf->wait_for_overlap);
	} else {
		/* re-enable writes */
8083 lock_all_device_hash_locks_irq(conf);
8084 conf->quiesce = 0;
8085 wake_up(&conf->wait_for_quiescent);
8086 wake_up(&conf->wait_for_overlap);
8087 unlock_all_device_hash_locks_irq(conf);
8088 }
8089 log_quiesce(conf, quiesce);
8090}
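/*
 * Note on the quiesce handshake above: conf->quiesce moves from 0 (normal
 * IO) to 2 while active stripes and aligned reads drain, and then to 1 once
 * the array is fully quiescent; setting it back to 0 re-enables writes.
 * Callers that change configuration (for example the sysfs stores above
 * that wrap their updates in mddev_suspend()/mddev_resume()) depend on the
 * array being drained like this before touching shared state.
 */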
8091
8092static void *raid45_takeover_raid0(struct mddev *mddev, int level)
8093{
8094 struct r0conf *raid0_conf = mddev->private;
8095 sector_t sectors;
8096
	/* for raid0 takeover only one zone is supported */
8098 if (raid0_conf->nr_strip_zones > 1) {
8099 pr_warn("md/raid:%s: cannot takeover raid0 with more than one zone.\n",
8100 mdname(mddev));
8101 return ERR_PTR(-EINVAL);
8102 }
8103
8104 sectors = raid0_conf->strip_zone[0].zone_end;
8105 sector_div(sectors, raid0_conf->strip_zone[0].nb_dev);
8106 mddev->dev_sectors = sectors;
8107 mddev->new_level = level;
8108 mddev->new_layout = ALGORITHM_PARITY_N;
8109 mddev->new_chunk_sectors = mddev->chunk_sectors;
8110 mddev->raid_disks += 1;
8111 mddev->delta_disks = 1;
	/* make sure it will not be marked as dirty */
8113 mddev->recovery_cp = MaxSector;
8114
8115 return setup_conf(mddev);
8116}
8117
8118static void *raid5_takeover_raid1(struct mddev *mddev)
8119{
8120 int chunksect;
8121 void *ret;
8122
8123 if (mddev->raid_disks != 2 ||
8124 mddev->degraded > 1)
8125 return ERR_PTR(-EINVAL);
8126
	/* Should check if there are write-behind devices? */

	chunksect = 64*2; /* 64K by default */

	/* The array must be a multiple of the chunk size */
	while (chunksect && (mddev->array_sectors & (chunksect-1)))
		chunksect >>= 1;

	if ((chunksect<<9) < STRIPE_SIZE)
		/* array size does not allow a suitable chunk size */
8137 return ERR_PTR(-EINVAL);
8138
8139 mddev->new_level = 5;
8140 mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC;
8141 mddev->new_chunk_sectors = chunksect;
8142
8143 ret = setup_conf(mddev);
8144 if (!IS_ERR(ret))
8145 mddev_clear_unsupported_flags(mddev,
8146 UNSUPPORTED_MDDEV_FLAGS);
8147 return ret;
8148}
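/*
 * Worked example for raid5_takeover_raid1() (illustrative numbers): the
 * candidate chunk starts at 128 sectors (64 KiB) and is halved until it
 * divides the array size.  For an array of 1000000 sectors, 128 does not
 * divide evenly but 64 does, so the resulting RAID5 uses a 32 KiB chunk;
 * had the loop fallen below one page (STRIPE_SIZE) the takeover would be
 * refused.
 */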
8149
8150static void *raid5_takeover_raid6(struct mddev *mddev)
8151{
8152 int new_layout;
8153
8154 switch (mddev->layout) {
8155 case ALGORITHM_LEFT_ASYMMETRIC_6:
8156 new_layout = ALGORITHM_LEFT_ASYMMETRIC;
8157 break;
8158 case ALGORITHM_RIGHT_ASYMMETRIC_6:
8159 new_layout = ALGORITHM_RIGHT_ASYMMETRIC;
8160 break;
8161 case ALGORITHM_LEFT_SYMMETRIC_6:
8162 new_layout = ALGORITHM_LEFT_SYMMETRIC;
8163 break;
8164 case ALGORITHM_RIGHT_SYMMETRIC_6:
8165 new_layout = ALGORITHM_RIGHT_SYMMETRIC;
8166 break;
8167 case ALGORITHM_PARITY_0_6:
8168 new_layout = ALGORITHM_PARITY_0;
8169 break;
8170 case ALGORITHM_PARITY_N:
8171 new_layout = ALGORITHM_PARITY_N;
8172 break;
8173 default:
8174 return ERR_PTR(-EINVAL);
8175 }
8176 mddev->new_level = 5;
8177 mddev->new_layout = new_layout;
8178 mddev->delta_disks = -1;
8179 mddev->raid_disks -= 1;
8180 return setup_conf(mddev);
8181}
8182
8183static int raid5_check_reshape(struct mddev *mddev)
8184{
	/* For a 2-drive array, the layout and chunk size can be changed
	 * immediately as no restriping is needed.
	 * For larger arrays we record the new value - after validation
	 * to be used by a reshape pass.
	 */
8190 struct r5conf *conf = mddev->private;
8191 int new_chunk = mddev->new_chunk_sectors;
8192
8193 if (mddev->new_layout >= 0 && !algorithm_valid_raid5(mddev->new_layout))
8194 return -EINVAL;
8195 if (new_chunk > 0) {
8196 if (!is_power_of_2(new_chunk))
8197 return -EINVAL;
8198 if (new_chunk < (PAGE_SIZE>>9))
8199 return -EINVAL;
		if (mddev->array_sectors & (new_chunk-1))
			/* not factor of array size */
			return -EINVAL;
	}

	/* They look valid */

	if (mddev->raid_disks == 2) {
		/* can make the change immediately */
8209 if (mddev->new_layout >= 0) {
8210 conf->algorithm = mddev->new_layout;
8211 mddev->layout = mddev->new_layout;
8212 }
8213 if (new_chunk > 0) {
			conf->chunk_sectors = new_chunk;
8215 mddev->chunk_sectors = new_chunk;
8216 }
8217 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
8218 md_wakeup_thread(mddev->thread);
8219 }
8220 return check_reshape(mddev);
8221}
8222
8223static int raid6_check_reshape(struct mddev *mddev)
8224{
8225 int new_chunk = mddev->new_chunk_sectors;
8226
8227 if (mddev->new_layout >= 0 && !algorithm_valid_raid6(mddev->new_layout))
8228 return -EINVAL;
8229 if (new_chunk > 0) {
8230 if (!is_power_of_2(new_chunk))
8231 return -EINVAL;
8232 if (new_chunk < (PAGE_SIZE >> 9))
8233 return -EINVAL;
		if (mddev->array_sectors & (new_chunk-1))
			/* not factor of array size */
			return -EINVAL;
	}

	/* They look valid */
8240 return check_reshape(mddev);
8241}
8242
8243static void *raid5_takeover(struct mddev *mddev)
8244{
	/* raid5 can take over:
	 *  raid0 - if there is only one strip zone - make it a raid4 layout
	 *  raid1 - if there are two drives.  We need to know the chunk size
	 *  raid4 - trivial - just use a raid4 layout.
	 *  raid6 - Providing it is a *_6 layout
	 */
8251 if (mddev->level == 0)
8252 return raid45_takeover_raid0(mddev, 5);
8253 if (mddev->level == 1)
8254 return raid5_takeover_raid1(mddev);
8255 if (mddev->level == 4) {
8256 mddev->new_layout = ALGORITHM_PARITY_N;
8257 mddev->new_level = 5;
8258 return setup_conf(mddev);
8259 }
8260 if (mddev->level == 6)
8261 return raid5_takeover_raid6(mddev);
8262
8263 return ERR_PTR(-EINVAL);
8264}
8265
8266static void *raid4_takeover(struct mddev *mddev)
8267{
	/* raid4 can take over:
	 *  raid0 - if there is only one strip zone
	 *  raid5 - if layout is right
	 */
8272 if (mddev->level == 0)
8273 return raid45_takeover_raid0(mddev, 4);
8274 if (mddev->level == 5 &&
8275 mddev->layout == ALGORITHM_PARITY_N) {
8276 mddev->new_layout = 0;
8277 mddev->new_level = 4;
8278 return setup_conf(mddev);
8279 }
8280 return ERR_PTR(-EINVAL);
8281}
8282
8283static struct md_personality raid5_personality;
8284
8285static void *raid6_takeover(struct mddev *mddev)
8286{
	/* Currently can only take over a raid5.  We map the
	 * personality_data to the correct algorithm.
	 */
8291 int new_layout;
8292
8293 if (mddev->pers != &raid5_personality)
8294 return ERR_PTR(-EINVAL);
8295 if (mddev->degraded > 1)
8296 return ERR_PTR(-EINVAL);
8297 if (mddev->raid_disks > 253)
8298 return ERR_PTR(-EINVAL);
8299 if (mddev->raid_disks < 3)
8300 return ERR_PTR(-EINVAL);
8301
8302 switch (mddev->layout) {
8303 case ALGORITHM_LEFT_ASYMMETRIC:
8304 new_layout = ALGORITHM_LEFT_ASYMMETRIC_6;
8305 break;
8306 case ALGORITHM_RIGHT_ASYMMETRIC:
8307 new_layout = ALGORITHM_RIGHT_ASYMMETRIC_6;
8308 break;
8309 case ALGORITHM_LEFT_SYMMETRIC:
8310 new_layout = ALGORITHM_LEFT_SYMMETRIC_6;
8311 break;
8312 case ALGORITHM_RIGHT_SYMMETRIC:
8313 new_layout = ALGORITHM_RIGHT_SYMMETRIC_6;
8314 break;
8315 case ALGORITHM_PARITY_0:
8316 new_layout = ALGORITHM_PARITY_0_6;
8317 break;
8318 case ALGORITHM_PARITY_N:
8319 new_layout = ALGORITHM_PARITY_N;
8320 break;
8321 default:
8322 return ERR_PTR(-EINVAL);
8323 }
8324 mddev->new_level = 6;
8325 mddev->new_layout = new_layout;
8326 mddev->delta_disks = 1;
8327 mddev->raid_disks += 1;
8328 return setup_conf(mddev);
8329}
8330
8331static int raid5_change_consistency_policy(struct mddev *mddev, const char *buf)
8332{
8333 struct r5conf *conf;
8334 int err;
8335
8336 err = mddev_lock(mddev);
8337 if (err)
8338 return err;
8339 conf = mddev->private;
8340 if (!conf) {
8341 mddev_unlock(mddev);
8342 return -ENODEV;
8343 }
8344
8345 if (strncmp(buf, "ppl", 3) == 0) {
		/* ppl only works with RAID 5 */
8347 if (!raid5_has_ppl(conf) && conf->level == 5) {
8348 err = log_init(conf, NULL, true);
8349 if (!err) {
8350 err = resize_stripes(conf, conf->pool_size);
8351 if (err)
8352 log_exit(conf);
8353 }
8354 } else
8355 err = -EINVAL;
8356 } else if (strncmp(buf, "resync", 6) == 0) {
8357 if (raid5_has_ppl(conf)) {
8358 mddev_suspend(mddev);
8359 log_exit(conf);
8360 mddev_resume(mddev);
8361 err = resize_stripes(conf, conf->pool_size);
8362 } else if (test_bit(MD_HAS_JOURNAL, &conf->mddev->flags) &&
8363 r5l_log_disk_error(conf)) {
8364 bool journal_dev_exists = false;
8365 struct md_rdev *rdev;
8366
8367 rdev_for_each(rdev, mddev)
8368 if (test_bit(Journal, &rdev->flags)) {
8369 journal_dev_exists = true;
8370 break;
8371 }
8372
8373 if (!journal_dev_exists) {
8374 mddev_suspend(mddev);
8375 clear_bit(MD_HAS_JOURNAL, &mddev->flags);
8376 mddev_resume(mddev);
8377 } else
8378 err = -EBUSY;
8379 } else
8380 err = -EINVAL;
8381 } else {
8382 err = -EINVAL;
8383 }
8384
8385 if (!err)
8386 md_update_sb(mddev, 1);
8387
8388 mddev_unlock(mddev);
8389
8390 return err;
8391}
8392
8393static int raid5_start(struct mddev *mddev)
8394{
8395 struct r5conf *conf = mddev->private;
8396
8397 return r5l_start(conf->log);
8398}
8399
8400static struct md_personality raid6_personality =
8401{
8402 .name = "raid6",
8403 .level = 6,
8404 .owner = THIS_MODULE,
8405 .make_request = raid5_make_request,
8406 .run = raid5_run,
8407 .start = raid5_start,
8408 .free = raid5_free,
8409 .status = raid5_status,
8410 .error_handler = raid5_error,
8411 .hot_add_disk = raid5_add_disk,
8412 .hot_remove_disk= raid5_remove_disk,
8413 .spare_active = raid5_spare_active,
8414 .sync_request = raid5_sync_request,
8415 .resize = raid5_resize,
8416 .size = raid5_size,
8417 .check_reshape = raid6_check_reshape,
8418 .start_reshape = raid5_start_reshape,
8419 .finish_reshape = raid5_finish_reshape,
8420 .quiesce = raid5_quiesce,
8421 .takeover = raid6_takeover,
8422 .congested = raid5_congested,
8423 .change_consistency_policy = raid5_change_consistency_policy,
8424};
8425static struct md_personality raid5_personality =
8426{
8427 .name = "raid5",
8428 .level = 5,
8429 .owner = THIS_MODULE,
8430 .make_request = raid5_make_request,
8431 .run = raid5_run,
8432 .start = raid5_start,
8433 .free = raid5_free,
8434 .status = raid5_status,
8435 .error_handler = raid5_error,
8436 .hot_add_disk = raid5_add_disk,
8437 .hot_remove_disk= raid5_remove_disk,
8438 .spare_active = raid5_spare_active,
8439 .sync_request = raid5_sync_request,
8440 .resize = raid5_resize,
8441 .size = raid5_size,
8442 .check_reshape = raid5_check_reshape,
8443 .start_reshape = raid5_start_reshape,
8444 .finish_reshape = raid5_finish_reshape,
8445 .quiesce = raid5_quiesce,
8446 .takeover = raid5_takeover,
8447 .congested = raid5_congested,
8448 .change_consistency_policy = raid5_change_consistency_policy,
8449};
8450
8451static struct md_personality raid4_personality =
8452{
8453 .name = "raid4",
8454 .level = 4,
8455 .owner = THIS_MODULE,
8456 .make_request = raid5_make_request,
8457 .run = raid5_run,
8458 .start = raid5_start,
8459 .free = raid5_free,
8460 .status = raid5_status,
8461 .error_handler = raid5_error,
8462 .hot_add_disk = raid5_add_disk,
8463 .hot_remove_disk= raid5_remove_disk,
8464 .spare_active = raid5_spare_active,
8465 .sync_request = raid5_sync_request,
8466 .resize = raid5_resize,
8467 .size = raid5_size,
8468 .check_reshape = raid5_check_reshape,
8469 .start_reshape = raid5_start_reshape,
8470 .finish_reshape = raid5_finish_reshape,
8471 .quiesce = raid5_quiesce,
8472 .takeover = raid4_takeover,
8473 .congested = raid5_congested,
8474 .change_consistency_policy = raid5_change_consistency_policy,
8475};
8476
8477static int __init raid5_init(void)
8478{
8479 int ret;
8480
8481 raid5_wq = alloc_workqueue("raid5wq",
8482 WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE|WQ_SYSFS, 0);
8483 if (!raid5_wq)
8484 return -ENOMEM;
8485
8486 ret = cpuhp_setup_state_multi(CPUHP_MD_RAID5_PREPARE,
8487 "md/raid5:prepare",
8488 raid456_cpu_up_prepare,
8489 raid456_cpu_dead);
8490 if (ret) {
8491 destroy_workqueue(raid5_wq);
8492 return ret;
8493 }
8494 register_md_personality(&raid6_personality);
8495 register_md_personality(&raid5_personality);
8496 register_md_personality(&raid4_personality);
8497 return 0;
8498}
8499
8500static void raid5_exit(void)
8501{
8502 unregister_md_personality(&raid6_personality);
8503 unregister_md_personality(&raid5_personality);
8504 unregister_md_personality(&raid4_personality);
8505 cpuhp_remove_multi_state(CPUHP_MD_RAID5_PREPARE);
8506 destroy_workqueue(raid5_wq);
8507}
8508
8509module_init(raid5_init);
8510module_exit(raid5_exit);
8511MODULE_LICENSE("GPL");
8512MODULE_DESCRIPTION("RAID4/5/6 (striping with parity) personality for MD");
8513MODULE_ALIAS("md-personality-4");
8514MODULE_ALIAS("md-raid5");
8515MODULE_ALIAS("md-raid4");
8516MODULE_ALIAS("md-level-5");
8517MODULE_ALIAS("md-level-4");
8518MODULE_ALIAS("md-personality-8");
8519MODULE_ALIAS("md-raid6");
8520MODULE_ALIAS("md-level-6");

/* This used to be two separate modules, they were: */
8523MODULE_ALIAS("raid5");
8524MODULE_ALIAS("raid6");
8525