1
2
3
4
5
6
7
8
9
10
11
12
13
14
15#include <linux/kernel.h>
16#include <linux/wait.h>
17#include <linux/blkdev.h>
18#include <linux/slab.h>
19#include <linux/raid/md_p.h>
20#include <linux/crc32c.h>
21#include <linux/random.h>
22#include <linux/kthread.h>
23#include "md.h"
24#include "raid5.h"
25#include "bitmap.h"
26
27
28
29
30
31#define BLOCK_SECTORS (8)
32
33
34
35
36
37
38
39#define RECLAIM_MAX_FREE_SPACE (10 * 1024 * 1024 * 2)
40#define RECLAIM_MAX_FREE_SPACE_SHIFT (2)
41
42
43#define R5C_RECLAIM_WAKEUP_INTERVAL (30 * HZ)
44
45#define R5C_FULL_STRIPE_FLUSH_BATCH 256
46
47#define R5C_RECLAIM_STRIPE_GROUP (NR_STRIPE_HASH_LOCKS * 2)
48
49
50
51
52
53#define R5L_POOL_SIZE 4
54
55
56
57
58
59
60enum r5c_journal_mode {
61 R5C_JOURNAL_MODE_WRITE_THROUGH = 0,
62 R5C_JOURNAL_MODE_WRITE_BACK = 1,
63};
64
65static char *r5c_journal_mode_str[] = {"write-through",
66 "write-back"};
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98struct r5l_log {
99 struct md_rdev *rdev;
100
101 u32 uuid_checksum;
102
103 sector_t device_size;
104
105 sector_t max_free_space;
106
107
108 sector_t last_checkpoint;
109
110 u64 last_cp_seq;
111
112 sector_t log_start;
113 u64 seq;
114
115 sector_t next_checkpoint;
116
117 struct mutex io_mutex;
118 struct r5l_io_unit *current_io;
119
120 spinlock_t io_list_lock;
121 struct list_head running_ios;
122
123
124 struct list_head io_end_ios;
125
126
127 struct list_head flushing_ios;
128
129 struct list_head finished_ios;
130 struct bio flush_bio;
131
132 struct list_head no_mem_stripes;
133
134 struct kmem_cache *io_kc;
135 mempool_t *io_pool;
136 struct bio_set *bs;
137 mempool_t *meta_pool;
138
139 struct md_thread *reclaim_thread;
140 unsigned long reclaim_target;
141
142
143
144
145
146
147 wait_queue_head_t iounit_wait;
148
149 struct list_head no_space_stripes;
150 spinlock_t no_space_stripes_lock;
151
152 bool need_cache_flush;
153
154
155 enum r5c_journal_mode r5c_journal_mode;
156
157
158 struct list_head stripe_in_journal_list;
159
160 spinlock_t stripe_in_journal_lock;
161 atomic_t stripe_in_journal_count;
162
163
164 struct work_struct deferred_io_work;
165
166 struct work_struct disable_writeback_work;
167};
168
169
170
171
172
173
174
175
176struct r5l_io_unit {
177 struct r5l_log *log;
178
179 struct page *meta_page;
180 int meta_offset;
181
182 struct bio *current_bio;
183
184 atomic_t pending_stripe;
185 u64 seq;
186 sector_t log_start;
187 sector_t log_end;
188 struct list_head log_sibling;
189 struct list_head stripe_list;
190
191 int state;
192 bool need_split_bio;
193 struct bio *split_bio;
194
195 unsigned int has_flush:1;
196 unsigned int has_fua:1;
197 unsigned int has_null_flush:1;
198
199
200
201
202 unsigned int io_deferred:1;
203
204 struct bio_list flush_barriers;
205};
206
207
208enum r5l_io_unit_state {
209 IO_UNIT_RUNNING = 0,
210 IO_UNIT_IO_START = 1,
211
212 IO_UNIT_IO_END = 2,
213 IO_UNIT_STRIPE_END = 3,
214};
215
216bool r5c_is_writeback(struct r5l_log *log)
217{
218 return (log != NULL &&
219 log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK);
220}
221
222static sector_t r5l_ring_add(struct r5l_log *log, sector_t start, sector_t inc)
223{
224 start += inc;
225 if (start >= log->device_size)
226 start = start - log->device_size;
227 return start;
228}
229
230static sector_t r5l_ring_distance(struct r5l_log *log, sector_t start,
231 sector_t end)
232{
233 if (end >= start)
234 return end - start;
235 else
236 return end + log->device_size - start;
237}
238
239static bool r5l_has_free_space(struct r5l_log *log, sector_t size)
240{
241 sector_t used_size;
242
243 used_size = r5l_ring_distance(log, log->last_checkpoint,
244 log->log_start);
245
246 return log->device_size > used_size + size;
247}
248
249static void __r5l_set_io_unit_state(struct r5l_io_unit *io,
250 enum r5l_io_unit_state state)
251{
252 if (WARN_ON(io->state >= state))
253 return;
254 io->state = state;
255}
256
257static void
258r5c_return_dev_pending_writes(struct r5conf *conf, struct r5dev *dev,
259 struct bio_list *return_bi)
260{
261 struct bio *wbi, *wbi2;
262
263 wbi = dev->written;
264 dev->written = NULL;
265 while (wbi && wbi->bi_iter.bi_sector <
266 dev->sector + STRIPE_SECTORS) {
267 wbi2 = r5_next_bio(wbi, dev->sector);
268 if (!raid5_dec_bi_active_stripes(wbi)) {
269 md_write_end(conf->mddev);
270 bio_list_add(return_bi, wbi);
271 }
272 wbi = wbi2;
273 }
274}
275
276void r5c_handle_cached_data_endio(struct r5conf *conf,
277 struct stripe_head *sh, int disks, struct bio_list *return_bi)
278{
279 int i;
280
281 for (i = sh->disks; i--; ) {
282 if (sh->dev[i].written) {
283 set_bit(R5_UPTODATE, &sh->dev[i].flags);
284 r5c_return_dev_pending_writes(conf, &sh->dev[i],
285 return_bi);
286 bitmap_endwrite(conf->mddev->bitmap, sh->sector,
287 STRIPE_SECTORS,
288 !test_bit(STRIPE_DEGRADED, &sh->state),
289 0);
290 }
291 }
292}
293
294
295void r5c_check_stripe_cache_usage(struct r5conf *conf)
296{
297 int total_cached;
298
299 if (!r5c_is_writeback(conf->log))
300 return;
301
302 total_cached = atomic_read(&conf->r5c_cached_partial_stripes) +
303 atomic_read(&conf->r5c_cached_full_stripes);
304
305
306
307
308
309
310
311
312
313 if (total_cached > conf->min_nr_stripes * 1 / 2 ||
314 atomic_read(&conf->empty_inactive_list_nr) > 0)
315 r5l_wake_reclaim(conf->log, 0);
316}
317
318
319
320
321
322void r5c_check_cached_full_stripe(struct r5conf *conf)
323{
324 if (!r5c_is_writeback(conf->log))
325 return;
326
327
328
329
330
331 if (atomic_read(&conf->r5c_cached_full_stripes) >=
332 min(R5C_FULL_STRIPE_FLUSH_BATCH,
333 conf->chunk_sectors >> STRIPE_SHIFT))
334 r5l_wake_reclaim(conf->log, 0);
335}
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352static sector_t r5c_log_required_to_flush_cache(struct r5conf *conf)
353{
354 struct r5l_log *log = conf->log;
355
356 if (!r5c_is_writeback(log))
357 return 0;
358
359 return BLOCK_SECTORS * (conf->raid_disks + 1) *
360 atomic_read(&log->stripe_in_journal_count);
361}
362
363
364
365
366
367
368
369
370static inline void r5c_update_log_state(struct r5l_log *log)
371{
372 struct r5conf *conf = log->rdev->mddev->private;
373 sector_t free_space;
374 sector_t reclaim_space;
375 bool wake_reclaim = false;
376
377 if (!r5c_is_writeback(log))
378 return;
379
380 free_space = r5l_ring_distance(log, log->log_start,
381 log->last_checkpoint);
382 reclaim_space = r5c_log_required_to_flush_cache(conf);
383 if (free_space < 2 * reclaim_space)
384 set_bit(R5C_LOG_CRITICAL, &conf->cache_state);
385 else {
386 if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state))
387 wake_reclaim = true;
388 clear_bit(R5C_LOG_CRITICAL, &conf->cache_state);
389 }
390 if (free_space < 3 * reclaim_space)
391 set_bit(R5C_LOG_TIGHT, &conf->cache_state);
392 else
393 clear_bit(R5C_LOG_TIGHT, &conf->cache_state);
394
395 if (wake_reclaim)
396 r5l_wake_reclaim(log, 0);
397}
398
399
400
401
402
403void r5c_make_stripe_write_out(struct stripe_head *sh)
404{
405 struct r5conf *conf = sh->raid_conf;
406 struct r5l_log *log = conf->log;
407
408 BUG_ON(!r5c_is_writeback(log));
409
410 WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
411 clear_bit(STRIPE_R5C_CACHING, &sh->state);
412
413 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
414 atomic_inc(&conf->preread_active_stripes);
415
416 if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) {
417 BUG_ON(atomic_read(&conf->r5c_cached_partial_stripes) == 0);
418 atomic_dec(&conf->r5c_cached_partial_stripes);
419 }
420
421 if (test_and_clear_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) {
422 BUG_ON(atomic_read(&conf->r5c_cached_full_stripes) == 0);
423 atomic_dec(&conf->r5c_cached_full_stripes);
424 }
425}
426
427static void r5c_handle_data_cached(struct stripe_head *sh)
428{
429 int i;
430
431 for (i = sh->disks; i--; )
432 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
433 set_bit(R5_InJournal, &sh->dev[i].flags);
434 clear_bit(R5_LOCKED, &sh->dev[i].flags);
435 }
436 clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
437}
438
439
440
441
442
443static void r5c_handle_parity_cached(struct stripe_head *sh)
444{
445 int i;
446
447 for (i = sh->disks; i--; )
448 if (test_bit(R5_InJournal, &sh->dev[i].flags))
449 set_bit(R5_Wantwrite, &sh->dev[i].flags);
450}
451
452
453
454
455
456static void r5c_finish_cache_stripe(struct stripe_head *sh)
457{
458 struct r5l_log *log = sh->raid_conf->log;
459
460 if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) {
461 BUG_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
462
463
464
465
466
467
468 set_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags);
469 } else if (test_bit(STRIPE_R5C_CACHING, &sh->state)) {
470 r5c_handle_data_cached(sh);
471 } else {
472 r5c_handle_parity_cached(sh);
473 set_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags);
474 }
475}
476
477static void r5l_io_run_stripes(struct r5l_io_unit *io)
478{
479 struct stripe_head *sh, *next;
480
481 list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) {
482 list_del_init(&sh->log_list);
483
484 r5c_finish_cache_stripe(sh);
485
486 set_bit(STRIPE_HANDLE, &sh->state);
487 raid5_release_stripe(sh);
488 }
489}
490
491static void r5l_log_run_stripes(struct r5l_log *log)
492{
493 struct r5l_io_unit *io, *next;
494
495 assert_spin_locked(&log->io_list_lock);
496
497 list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) {
498
499 if (io->state < IO_UNIT_IO_END)
500 break;
501
502 list_move_tail(&io->log_sibling, &log->finished_ios);
503 r5l_io_run_stripes(io);
504 }
505}
506
507static void r5l_move_to_end_ios(struct r5l_log *log)
508{
509 struct r5l_io_unit *io, *next;
510
511 assert_spin_locked(&log->io_list_lock);
512
513 list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) {
514
515 if (io->state < IO_UNIT_IO_END)
516 break;
517 list_move_tail(&io->log_sibling, &log->io_end_ios);
518 }
519}
520
521static void __r5l_stripe_write_finished(struct r5l_io_unit *io);
522static void r5l_log_endio(struct bio *bio)
523{
524 struct r5l_io_unit *io = bio->bi_private;
525 struct r5l_io_unit *io_deferred;
526 struct r5l_log *log = io->log;
527 unsigned long flags;
528
529 if (bio->bi_error)
530 md_error(log->rdev->mddev, log->rdev);
531
532 bio_put(bio);
533 mempool_free(io->meta_page, log->meta_pool);
534
535 spin_lock_irqsave(&log->io_list_lock, flags);
536 __r5l_set_io_unit_state(io, IO_UNIT_IO_END);
537 if (log->need_cache_flush)
538 r5l_move_to_end_ios(log);
539 else
540 r5l_log_run_stripes(log);
541 if (!list_empty(&log->running_ios)) {
542
543
544
545
546 io_deferred = list_first_entry(&log->running_ios,
547 struct r5l_io_unit, log_sibling);
548 if (io_deferred->io_deferred)
549 schedule_work(&log->deferred_io_work);
550 }
551
552 spin_unlock_irqrestore(&log->io_list_lock, flags);
553
554 if (log->need_cache_flush)
555 md_wakeup_thread(log->rdev->mddev->thread);
556
557 if (io->has_null_flush) {
558 struct bio *bi;
559
560 WARN_ON(bio_list_empty(&io->flush_barriers));
561 while ((bi = bio_list_pop(&io->flush_barriers)) != NULL) {
562 bio_endio(bi);
563 atomic_dec(&io->pending_stripe);
564 }
565 if (atomic_read(&io->pending_stripe) == 0)
566 __r5l_stripe_write_finished(io);
567 }
568}
569
570static void r5l_do_submit_io(struct r5l_log *log, struct r5l_io_unit *io)
571{
572 unsigned long flags;
573
574 spin_lock_irqsave(&log->io_list_lock, flags);
575 __r5l_set_io_unit_state(io, IO_UNIT_IO_START);
576 spin_unlock_irqrestore(&log->io_list_lock, flags);
577
578 if (io->has_flush)
579 io->current_bio->bi_opf |= REQ_PREFLUSH;
580 if (io->has_fua)
581 io->current_bio->bi_opf |= REQ_FUA;
582 submit_bio(io->current_bio);
583
584 if (!io->split_bio)
585 return;
586
587 if (io->has_flush)
588 io->split_bio->bi_opf |= REQ_PREFLUSH;
589 if (io->has_fua)
590 io->split_bio->bi_opf |= REQ_FUA;
591 submit_bio(io->split_bio);
592}
593
594
595static void r5l_submit_io_async(struct work_struct *work)
596{
597 struct r5l_log *log = container_of(work, struct r5l_log,
598 deferred_io_work);
599 struct r5l_io_unit *io = NULL;
600 unsigned long flags;
601
602 spin_lock_irqsave(&log->io_list_lock, flags);
603 if (!list_empty(&log->running_ios)) {
604 io = list_first_entry(&log->running_ios, struct r5l_io_unit,
605 log_sibling);
606 if (!io->io_deferred)
607 io = NULL;
608 else
609 io->io_deferred = 0;
610 }
611 spin_unlock_irqrestore(&log->io_list_lock, flags);
612 if (io)
613 r5l_do_submit_io(log, io);
614}
615
616static void r5c_disable_writeback_async(struct work_struct *work)
617{
618 struct r5l_log *log = container_of(work, struct r5l_log,
619 disable_writeback_work);
620 struct mddev *mddev = log->rdev->mddev;
621
622 if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
623 return;
624 pr_info("md/raid:%s: Disabling writeback cache for degraded array.\n",
625 mdname(mddev));
626 mddev_suspend(mddev);
627 log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
628 mddev_resume(mddev);
629}
630
631static void r5l_submit_current_io(struct r5l_log *log)
632{
633 struct r5l_io_unit *io = log->current_io;
634 struct bio *bio;
635 struct r5l_meta_block *block;
636 unsigned long flags;
637 u32 crc;
638 bool do_submit = true;
639
640 if (!io)
641 return;
642
643 block = page_address(io->meta_page);
644 block->meta_size = cpu_to_le32(io->meta_offset);
645 crc = crc32c_le(log->uuid_checksum, block, PAGE_SIZE);
646 block->checksum = cpu_to_le32(crc);
647 bio = io->current_bio;
648
649 log->current_io = NULL;
650 spin_lock_irqsave(&log->io_list_lock, flags);
651 if (io->has_flush || io->has_fua) {
652 if (io != list_first_entry(&log->running_ios,
653 struct r5l_io_unit, log_sibling)) {
654 io->io_deferred = 1;
655 do_submit = false;
656 }
657 }
658 spin_unlock_irqrestore(&log->io_list_lock, flags);
659 if (do_submit)
660 r5l_do_submit_io(log, io);
661}
662
663static struct bio *r5l_bio_alloc(struct r5l_log *log)
664{
665 struct bio *bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES, log->bs);
666
667 bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
668 bio->bi_bdev = log->rdev->bdev;
669 bio->bi_iter.bi_sector = log->rdev->data_offset + log->log_start;
670
671 return bio;
672}
673
674static void r5_reserve_log_entry(struct r5l_log *log, struct r5l_io_unit *io)
675{
676 log->log_start = r5l_ring_add(log, log->log_start, BLOCK_SECTORS);
677
678 r5c_update_log_state(log);
679
680
681
682
683
684
685
686 if (log->log_start == 0)
687 io->need_split_bio = true;
688
689 io->log_end = log->log_start;
690}
691
692static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log)
693{
694 struct r5l_io_unit *io;
695 struct r5l_meta_block *block;
696
697 io = mempool_alloc(log->io_pool, GFP_ATOMIC);
698 if (!io)
699 return NULL;
700 memset(io, 0, sizeof(*io));
701
702 io->log = log;
703 INIT_LIST_HEAD(&io->log_sibling);
704 INIT_LIST_HEAD(&io->stripe_list);
705 bio_list_init(&io->flush_barriers);
706 io->state = IO_UNIT_RUNNING;
707
708 io->meta_page = mempool_alloc(log->meta_pool, GFP_NOIO);
709 block = page_address(io->meta_page);
710 clear_page(block);
711 block->magic = cpu_to_le32(R5LOG_MAGIC);
712 block->version = R5LOG_VERSION;
713 block->seq = cpu_to_le64(log->seq);
714 block->position = cpu_to_le64(log->log_start);
715
716 io->log_start = log->log_start;
717 io->meta_offset = sizeof(struct r5l_meta_block);
718 io->seq = log->seq++;
719
720 io->current_bio = r5l_bio_alloc(log);
721 io->current_bio->bi_end_io = r5l_log_endio;
722 io->current_bio->bi_private = io;
723 bio_add_page(io->current_bio, io->meta_page, PAGE_SIZE, 0);
724
725 r5_reserve_log_entry(log, io);
726
727 spin_lock_irq(&log->io_list_lock);
728 list_add_tail(&io->log_sibling, &log->running_ios);
729 spin_unlock_irq(&log->io_list_lock);
730
731 return io;
732}
733
734static int r5l_get_meta(struct r5l_log *log, unsigned int payload_size)
735{
736 if (log->current_io &&
737 log->current_io->meta_offset + payload_size > PAGE_SIZE)
738 r5l_submit_current_io(log);
739
740 if (!log->current_io) {
741 log->current_io = r5l_new_meta(log);
742 if (!log->current_io)
743 return -ENOMEM;
744 }
745
746 return 0;
747}
748
749static void r5l_append_payload_meta(struct r5l_log *log, u16 type,
750 sector_t location,
751 u32 checksum1, u32 checksum2,
752 bool checksum2_valid)
753{
754 struct r5l_io_unit *io = log->current_io;
755 struct r5l_payload_data_parity *payload;
756
757 payload = page_address(io->meta_page) + io->meta_offset;
758 payload->header.type = cpu_to_le16(type);
759 payload->header.flags = cpu_to_le16(0);
760 payload->size = cpu_to_le32((1 + !!checksum2_valid) <<
761 (PAGE_SHIFT - 9));
762 payload->location = cpu_to_le64(location);
763 payload->checksum[0] = cpu_to_le32(checksum1);
764 if (checksum2_valid)
765 payload->checksum[1] = cpu_to_le32(checksum2);
766
767 io->meta_offset += sizeof(struct r5l_payload_data_parity) +
768 sizeof(__le32) * (1 + !!checksum2_valid);
769}
770
771static void r5l_append_payload_page(struct r5l_log *log, struct page *page)
772{
773 struct r5l_io_unit *io = log->current_io;
774
775 if (io->need_split_bio) {
776 BUG_ON(io->split_bio);
777 io->split_bio = io->current_bio;
778 io->current_bio = r5l_bio_alloc(log);
779 bio_chain(io->current_bio, io->split_bio);
780 io->need_split_bio = false;
781 }
782
783 if (!bio_add_page(io->current_bio, page, PAGE_SIZE, 0))
784 BUG();
785
786 r5_reserve_log_entry(log, io);
787}
788
789static int r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
790 int data_pages, int parity_pages)
791{
792 int i;
793 int meta_size;
794 int ret;
795 struct r5l_io_unit *io;
796
797 meta_size =
798 ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32))
799 * data_pages) +
800 sizeof(struct r5l_payload_data_parity) +
801 sizeof(__le32) * parity_pages;
802
803 ret = r5l_get_meta(log, meta_size);
804 if (ret)
805 return ret;
806
807 io = log->current_io;
808
809 if (test_and_clear_bit(STRIPE_R5C_PREFLUSH, &sh->state))
810 io->has_flush = 1;
811
812 for (i = 0; i < sh->disks; i++) {
813 if (!test_bit(R5_Wantwrite, &sh->dev[i].flags) ||
814 test_bit(R5_InJournal, &sh->dev[i].flags))
815 continue;
816 if (i == sh->pd_idx || i == sh->qd_idx)
817 continue;
818 if (test_bit(R5_WantFUA, &sh->dev[i].flags) &&
819 log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK) {
820 io->has_fua = 1;
821
822
823
824
825 io->has_flush = 1;
826 }
827 r5l_append_payload_meta(log, R5LOG_PAYLOAD_DATA,
828 raid5_compute_blocknr(sh, i, 0),
829 sh->dev[i].log_checksum, 0, false);
830 r5l_append_payload_page(log, sh->dev[i].page);
831 }
832
833 if (parity_pages == 2) {
834 r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
835 sh->sector, sh->dev[sh->pd_idx].log_checksum,
836 sh->dev[sh->qd_idx].log_checksum, true);
837 r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
838 r5l_append_payload_page(log, sh->dev[sh->qd_idx].page);
839 } else if (parity_pages == 1) {
840 r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
841 sh->sector, sh->dev[sh->pd_idx].log_checksum,
842 0, false);
843 r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
844 } else
845 BUG_ON(parity_pages != 0);
846
847 list_add_tail(&sh->log_list, &io->stripe_list);
848 atomic_inc(&io->pending_stripe);
849 sh->log_io = io;
850
851 if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
852 return 0;
853
854 if (sh->log_start == MaxSector) {
855 BUG_ON(!list_empty(&sh->r5c));
856 sh->log_start = io->log_start;
857 spin_lock_irq(&log->stripe_in_journal_lock);
858 list_add_tail(&sh->r5c,
859 &log->stripe_in_journal_list);
860 spin_unlock_irq(&log->stripe_in_journal_lock);
861 atomic_inc(&log->stripe_in_journal_count);
862 }
863 return 0;
864}
865
866
867static inline void r5l_add_no_space_stripe(struct r5l_log *log,
868 struct stripe_head *sh)
869{
870 spin_lock(&log->no_space_stripes_lock);
871 list_add_tail(&sh->log_list, &log->no_space_stripes);
872 spin_unlock(&log->no_space_stripes_lock);
873}
874
875
876
877
878
879int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
880{
881 struct r5conf *conf = sh->raid_conf;
882 int write_disks = 0;
883 int data_pages, parity_pages;
884 int reserve;
885 int i;
886 int ret = 0;
887 bool wake_reclaim = false;
888
889 if (!log)
890 return -EAGAIN;
891
892 if (sh->log_io || !test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) ||
893 test_bit(STRIPE_SYNCING, &sh->state)) {
894
895 clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
896 return -EAGAIN;
897 }
898
899 WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
900
901 for (i = 0; i < sh->disks; i++) {
902 void *addr;
903
904 if (!test_bit(R5_Wantwrite, &sh->dev[i].flags) ||
905 test_bit(R5_InJournal, &sh->dev[i].flags))
906 continue;
907
908 write_disks++;
909
910 if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
911 continue;
912 addr = kmap_atomic(sh->dev[i].page);
913 sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum,
914 addr, PAGE_SIZE);
915 kunmap_atomic(addr);
916 }
917 parity_pages = 1 + !!(sh->qd_idx >= 0);
918 data_pages = write_disks - parity_pages;
919
920 set_bit(STRIPE_LOG_TRAPPED, &sh->state);
921
922
923
924
925 clear_bit(STRIPE_DELAYED, &sh->state);
926 atomic_inc(&sh->count);
927
928 mutex_lock(&log->io_mutex);
929
930 reserve = (1 + write_disks) << (PAGE_SHIFT - 9);
931
932 if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) {
933 if (!r5l_has_free_space(log, reserve)) {
934 r5l_add_no_space_stripe(log, sh);
935 wake_reclaim = true;
936 } else {
937 ret = r5l_log_stripe(log, sh, data_pages, parity_pages);
938 if (ret) {
939 spin_lock_irq(&log->io_list_lock);
940 list_add_tail(&sh->log_list,
941 &log->no_mem_stripes);
942 spin_unlock_irq(&log->io_list_lock);
943 }
944 }
945 } else {
946
947
948
949
950 if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) &&
951 sh->log_start == MaxSector) {
952 r5l_add_no_space_stripe(log, sh);
953 wake_reclaim = true;
954 reserve = 0;
955 } else if (!r5l_has_free_space(log, reserve)) {
956 if (sh->log_start == log->last_checkpoint)
957 BUG();
958 else
959 r5l_add_no_space_stripe(log, sh);
960 } else {
961 ret = r5l_log_stripe(log, sh, data_pages, parity_pages);
962 if (ret) {
963 spin_lock_irq(&log->io_list_lock);
964 list_add_tail(&sh->log_list,
965 &log->no_mem_stripes);
966 spin_unlock_irq(&log->io_list_lock);
967 }
968 }
969 }
970
971 mutex_unlock(&log->io_mutex);
972 if (wake_reclaim)
973 r5l_wake_reclaim(log, reserve);
974 return 0;
975}
976
977void r5l_write_stripe_run(struct r5l_log *log)
978{
979 if (!log)
980 return;
981 mutex_lock(&log->io_mutex);
982 r5l_submit_current_io(log);
983 mutex_unlock(&log->io_mutex);
984}
985
986int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio)
987{
988 if (!log)
989 return -ENODEV;
990
991 if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) {
992
993
994
995
996
997
998
999 if (bio->bi_iter.bi_size == 0) {
1000 bio_endio(bio);
1001 return 0;
1002 }
1003 bio->bi_opf &= ~REQ_PREFLUSH;
1004 } else {
1005
1006 if (bio->bi_iter.bi_size == 0) {
1007 mutex_lock(&log->io_mutex);
1008 r5l_get_meta(log, 0);
1009 bio_list_add(&log->current_io->flush_barriers, bio);
1010 log->current_io->has_flush = 1;
1011 log->current_io->has_null_flush = 1;
1012 atomic_inc(&log->current_io->pending_stripe);
1013 r5l_submit_current_io(log);
1014 mutex_unlock(&log->io_mutex);
1015 return 0;
1016 }
1017 }
1018 return -EAGAIN;
1019}
1020
1021
1022static void r5l_run_no_space_stripes(struct r5l_log *log)
1023{
1024 struct stripe_head *sh;
1025
1026 spin_lock(&log->no_space_stripes_lock);
1027 while (!list_empty(&log->no_space_stripes)) {
1028 sh = list_first_entry(&log->no_space_stripes,
1029 struct stripe_head, log_list);
1030 list_del_init(&sh->log_list);
1031 set_bit(STRIPE_HANDLE, &sh->state);
1032 raid5_release_stripe(sh);
1033 }
1034 spin_unlock(&log->no_space_stripes_lock);
1035}
1036
1037
1038
1039
1040
1041
1042static sector_t r5c_calculate_new_cp(struct r5conf *conf)
1043{
1044 struct stripe_head *sh;
1045 struct r5l_log *log = conf->log;
1046 sector_t new_cp;
1047 unsigned long flags;
1048
1049 if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
1050 return log->next_checkpoint;
1051
1052 spin_lock_irqsave(&log->stripe_in_journal_lock, flags);
1053 if (list_empty(&conf->log->stripe_in_journal_list)) {
1054
1055 spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags);
1056 return log->next_checkpoint;
1057 }
1058 sh = list_first_entry(&conf->log->stripe_in_journal_list,
1059 struct stripe_head, r5c);
1060 new_cp = sh->log_start;
1061 spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags);
1062 return new_cp;
1063}
1064
1065static sector_t r5l_reclaimable_space(struct r5l_log *log)
1066{
1067 struct r5conf *conf = log->rdev->mddev->private;
1068
1069 return r5l_ring_distance(log, log->last_checkpoint,
1070 r5c_calculate_new_cp(conf));
1071}
1072
1073static void r5l_run_no_mem_stripe(struct r5l_log *log)
1074{
1075 struct stripe_head *sh;
1076
1077 assert_spin_locked(&log->io_list_lock);
1078
1079 if (!list_empty(&log->no_mem_stripes)) {
1080 sh = list_first_entry(&log->no_mem_stripes,
1081 struct stripe_head, log_list);
1082 list_del_init(&sh->log_list);
1083 set_bit(STRIPE_HANDLE, &sh->state);
1084 raid5_release_stripe(sh);
1085 }
1086}
1087
1088static bool r5l_complete_finished_ios(struct r5l_log *log)
1089{
1090 struct r5l_io_unit *io, *next;
1091 bool found = false;
1092
1093 assert_spin_locked(&log->io_list_lock);
1094
1095 list_for_each_entry_safe(io, next, &log->finished_ios, log_sibling) {
1096
1097 if (io->state < IO_UNIT_STRIPE_END)
1098 break;
1099
1100 log->next_checkpoint = io->log_start;
1101
1102 list_del(&io->log_sibling);
1103 mempool_free(io, log->io_pool);
1104 r5l_run_no_mem_stripe(log);
1105
1106 found = true;
1107 }
1108
1109 return found;
1110}
1111
1112static void __r5l_stripe_write_finished(struct r5l_io_unit *io)
1113{
1114 struct r5l_log *log = io->log;
1115 struct r5conf *conf = log->rdev->mddev->private;
1116 unsigned long flags;
1117
1118 spin_lock_irqsave(&log->io_list_lock, flags);
1119 __r5l_set_io_unit_state(io, IO_UNIT_STRIPE_END);
1120
1121 if (!r5l_complete_finished_ios(log)) {
1122 spin_unlock_irqrestore(&log->io_list_lock, flags);
1123 return;
1124 }
1125
1126 if (r5l_reclaimable_space(log) > log->max_free_space ||
1127 test_bit(R5C_LOG_TIGHT, &conf->cache_state))
1128 r5l_wake_reclaim(log, 0);
1129
1130 spin_unlock_irqrestore(&log->io_list_lock, flags);
1131 wake_up(&log->iounit_wait);
1132}
1133
1134void r5l_stripe_write_finished(struct stripe_head *sh)
1135{
1136 struct r5l_io_unit *io;
1137
1138 io = sh->log_io;
1139 sh->log_io = NULL;
1140
1141 if (io && atomic_dec_and_test(&io->pending_stripe))
1142 __r5l_stripe_write_finished(io);
1143}
1144
1145static void r5l_log_flush_endio(struct bio *bio)
1146{
1147 struct r5l_log *log = container_of(bio, struct r5l_log,
1148 flush_bio);
1149 unsigned long flags;
1150 struct r5l_io_unit *io;
1151
1152 if (bio->bi_error)
1153 md_error(log->rdev->mddev, log->rdev);
1154
1155 spin_lock_irqsave(&log->io_list_lock, flags);
1156 list_for_each_entry(io, &log->flushing_ios, log_sibling)
1157 r5l_io_run_stripes(io);
1158 list_splice_tail_init(&log->flushing_ios, &log->finished_ios);
1159 spin_unlock_irqrestore(&log->io_list_lock, flags);
1160}
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176void r5l_flush_stripe_to_raid(struct r5l_log *log)
1177{
1178 bool do_flush;
1179
1180 if (!log || !log->need_cache_flush)
1181 return;
1182
1183 spin_lock_irq(&log->io_list_lock);
1184
1185 if (!list_empty(&log->flushing_ios)) {
1186 spin_unlock_irq(&log->io_list_lock);
1187 return;
1188 }
1189 list_splice_tail_init(&log->io_end_ios, &log->flushing_ios);
1190 do_flush = !list_empty(&log->flushing_ios);
1191 spin_unlock_irq(&log->io_list_lock);
1192
1193 if (!do_flush)
1194 return;
1195 bio_reset(&log->flush_bio);
1196 log->flush_bio.bi_bdev = log->rdev->bdev;
1197 log->flush_bio.bi_end_io = r5l_log_flush_endio;
1198 log->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
1199 submit_bio(&log->flush_bio);
1200}
1201
1202static void r5l_write_super(struct r5l_log *log, sector_t cp);
1203static void r5l_write_super_and_discard_space(struct r5l_log *log,
1204 sector_t end)
1205{
1206 struct block_device *bdev = log->rdev->bdev;
1207 struct mddev *mddev;
1208
1209 r5l_write_super(log, end);
1210
1211 if (!blk_queue_discard(bdev_get_queue(bdev)))
1212 return;
1213
1214 mddev = log->rdev->mddev;
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226 set_mask_bits(&mddev->sb_flags, 0,
1227 BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
1228 if (!mddev_trylock(mddev))
1229 return;
1230 md_update_sb(mddev, 1);
1231 mddev_unlock(mddev);
1232
1233
1234 if (log->last_checkpoint < end) {
1235 blkdev_issue_discard(bdev,
1236 log->last_checkpoint + log->rdev->data_offset,
1237 end - log->last_checkpoint, GFP_NOIO, 0);
1238 } else {
1239 blkdev_issue_discard(bdev,
1240 log->last_checkpoint + log->rdev->data_offset,
1241 log->device_size - log->last_checkpoint,
1242 GFP_NOIO, 0);
1243 blkdev_issue_discard(bdev, log->rdev->data_offset, end,
1244 GFP_NOIO, 0);
1245 }
1246}
1247
1248
1249
1250
1251
1252
1253
1254static void r5c_flush_stripe(struct r5conf *conf, struct stripe_head *sh)
1255{
1256 BUG_ON(list_empty(&sh->lru));
1257 BUG_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
1258 BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
1259
1260
1261
1262
1263
1264 BUG_ON(test_bit(STRIPE_ON_RELEASE_LIST, &sh->state));
1265 assert_spin_locked(&conf->device_lock);
1266
1267 list_del_init(&sh->lru);
1268 atomic_inc(&sh->count);
1269
1270 set_bit(STRIPE_HANDLE, &sh->state);
1271 atomic_inc(&conf->active_stripes);
1272 r5c_make_stripe_write_out(sh);
1273
1274 raid5_release_stripe(sh);
1275}
1276
1277
1278
1279
1280
1281
1282
1283void r5c_flush_cache(struct r5conf *conf, int num)
1284{
1285 int count;
1286 struct stripe_head *sh, *next;
1287
1288 assert_spin_locked(&conf->device_lock);
1289 if (!conf->log)
1290 return;
1291
1292 count = 0;
1293 list_for_each_entry_safe(sh, next, &conf->r5c_full_stripe_list, lru) {
1294 r5c_flush_stripe(conf, sh);
1295 count++;
1296 }
1297
1298 if (count >= num)
1299 return;
1300 list_for_each_entry_safe(sh, next,
1301 &conf->r5c_partial_stripe_list, lru) {
1302 r5c_flush_stripe(conf, sh);
1303 if (++count >= num)
1304 break;
1305 }
1306}
1307
1308static void r5c_do_reclaim(struct r5conf *conf)
1309{
1310 struct r5l_log *log = conf->log;
1311 struct stripe_head *sh;
1312 int count = 0;
1313 unsigned long flags;
1314 int total_cached;
1315 int stripes_to_flush;
1316
1317 if (!r5c_is_writeback(log))
1318 return;
1319
1320 total_cached = atomic_read(&conf->r5c_cached_partial_stripes) +
1321 atomic_read(&conf->r5c_cached_full_stripes);
1322
1323 if (total_cached > conf->min_nr_stripes * 3 / 4 ||
1324 atomic_read(&conf->empty_inactive_list_nr) > 0)
1325
1326
1327
1328
1329 stripes_to_flush = R5C_RECLAIM_STRIPE_GROUP;
1330 else if (total_cached > conf->min_nr_stripes * 1 / 2 ||
1331 atomic_read(&conf->r5c_cached_full_stripes) >
1332 R5C_FULL_STRIPE_FLUSH_BATCH)
1333
1334
1335
1336
1337 stripes_to_flush = 0;
1338 else
1339
1340 stripes_to_flush = -1;
1341
1342 if (stripes_to_flush >= 0) {
1343 spin_lock_irqsave(&conf->device_lock, flags);
1344 r5c_flush_cache(conf, stripes_to_flush);
1345 spin_unlock_irqrestore(&conf->device_lock, flags);
1346 }
1347
1348
1349 if (test_bit(R5C_LOG_TIGHT, &conf->cache_state)) {
1350 spin_lock_irqsave(&log->stripe_in_journal_lock, flags);
1351 spin_lock(&conf->device_lock);
1352 list_for_each_entry(sh, &log->stripe_in_journal_list, r5c) {
1353
1354
1355
1356
1357
1358
1359
1360
1361 if (!list_empty(&sh->lru) &&
1362 !test_bit(STRIPE_HANDLE, &sh->state) &&
1363 atomic_read(&sh->count) == 0) {
1364 r5c_flush_stripe(conf, sh);
1365 }
1366 if (count++ >= R5C_RECLAIM_STRIPE_GROUP)
1367 break;
1368 }
1369 spin_unlock(&conf->device_lock);
1370 spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags);
1371 }
1372
1373 if (!test_bit(R5C_LOG_CRITICAL, &conf->cache_state))
1374 r5l_run_no_space_stripes(log);
1375
1376 md_wakeup_thread(conf->mddev->thread);
1377}
1378
1379static void r5l_do_reclaim(struct r5l_log *log)
1380{
1381 struct r5conf *conf = log->rdev->mddev->private;
1382 sector_t reclaim_target = xchg(&log->reclaim_target, 0);
1383 sector_t reclaimable;
1384 sector_t next_checkpoint;
1385 bool write_super;
1386
1387 spin_lock_irq(&log->io_list_lock);
1388 write_super = r5l_reclaimable_space(log) > log->max_free_space ||
1389 reclaim_target != 0 || !list_empty(&log->no_space_stripes);
1390
1391
1392
1393
1394
1395 while (1) {
1396 reclaimable = r5l_reclaimable_space(log);
1397 if (reclaimable >= reclaim_target ||
1398 (list_empty(&log->running_ios) &&
1399 list_empty(&log->io_end_ios) &&
1400 list_empty(&log->flushing_ios) &&
1401 list_empty(&log->finished_ios)))
1402 break;
1403
1404 md_wakeup_thread(log->rdev->mddev->thread);
1405 wait_event_lock_irq(log->iounit_wait,
1406 r5l_reclaimable_space(log) > reclaimable,
1407 log->io_list_lock);
1408 }
1409
1410 next_checkpoint = r5c_calculate_new_cp(conf);
1411 spin_unlock_irq(&log->io_list_lock);
1412
1413 if (reclaimable == 0 || !write_super)
1414 return;
1415
1416
1417
1418
1419
1420
1421 r5l_write_super_and_discard_space(log, next_checkpoint);
1422
1423 mutex_lock(&log->io_mutex);
1424 log->last_checkpoint = next_checkpoint;
1425 r5c_update_log_state(log);
1426 mutex_unlock(&log->io_mutex);
1427
1428 r5l_run_no_space_stripes(log);
1429}
1430
1431static void r5l_reclaim_thread(struct md_thread *thread)
1432{
1433 struct mddev *mddev = thread->mddev;
1434 struct r5conf *conf = mddev->private;
1435 struct r5l_log *log = conf->log;
1436
1437 if (!log)
1438 return;
1439 r5c_do_reclaim(conf);
1440 r5l_do_reclaim(log);
1441}
1442
1443void r5l_wake_reclaim(struct r5l_log *log, sector_t space)
1444{
1445 unsigned long target;
1446 unsigned long new = (unsigned long)space;
1447
1448 if (!log)
1449 return;
1450 do {
1451 target = log->reclaim_target;
1452 if (new < target)
1453 return;
1454 } while (cmpxchg(&log->reclaim_target, target, new) != target);
1455 md_wakeup_thread(log->reclaim_thread);
1456}
1457
1458void r5l_quiesce(struct r5l_log *log, int state)
1459{
1460 struct mddev *mddev;
1461 if (!log || state == 2)
1462 return;
1463 if (state == 0)
1464 kthread_unpark(log->reclaim_thread->tsk);
1465 else if (state == 1) {
1466
1467 mddev = log->rdev->mddev;
1468 wake_up(&mddev->sb_wait);
1469 kthread_park(log->reclaim_thread->tsk);
1470 r5l_wake_reclaim(log, MaxSector);
1471 r5l_do_reclaim(log);
1472 }
1473}
1474
1475bool r5l_log_disk_error(struct r5conf *conf)
1476{
1477 struct r5l_log *log;
1478 bool ret;
1479
1480 rcu_read_lock();
1481 log = rcu_dereference(conf->log);
1482
1483 if (!log)
1484 ret = test_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
1485 else
1486 ret = test_bit(Faulty, &log->rdev->flags);
1487 rcu_read_unlock();
1488 return ret;
1489}
1490
1491struct r5l_recovery_ctx {
1492 struct page *meta_page;
1493 sector_t meta_total_blocks;
1494 sector_t pos;
1495 u64 seq;
1496 int data_parity_stripes;
1497 int data_only_stripes;
1498 struct list_head cached_list;
1499};
1500
1501static int r5l_recovery_read_meta_block(struct r5l_log *log,
1502 struct r5l_recovery_ctx *ctx)
1503{
1504 struct page *page = ctx->meta_page;
1505 struct r5l_meta_block *mb;
1506 u32 crc, stored_crc;
1507
1508 if (!sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page, REQ_OP_READ, 0,
1509 false))
1510 return -EIO;
1511
1512 mb = page_address(page);
1513 stored_crc = le32_to_cpu(mb->checksum);
1514 mb->checksum = 0;
1515
1516 if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
1517 le64_to_cpu(mb->seq) != ctx->seq ||
1518 mb->version != R5LOG_VERSION ||
1519 le64_to_cpu(mb->position) != ctx->pos)
1520 return -EINVAL;
1521
1522 crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
1523 if (stored_crc != crc)
1524 return -EINVAL;
1525
1526 if (le32_to_cpu(mb->meta_size) > PAGE_SIZE)
1527 return -EINVAL;
1528
1529 ctx->meta_total_blocks = BLOCK_SECTORS;
1530
1531 return 0;
1532}
1533
1534static void
1535r5l_recovery_create_empty_meta_block(struct r5l_log *log,
1536 struct page *page,
1537 sector_t pos, u64 seq)
1538{
1539 struct r5l_meta_block *mb;
1540
1541 mb = page_address(page);
1542 clear_page(mb);
1543 mb->magic = cpu_to_le32(R5LOG_MAGIC);
1544 mb->version = R5LOG_VERSION;
1545 mb->meta_size = cpu_to_le32(sizeof(struct r5l_meta_block));
1546 mb->seq = cpu_to_le64(seq);
1547 mb->position = cpu_to_le64(pos);
1548}
1549
1550static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos,
1551 u64 seq)
1552{
1553 struct page *page;
1554 struct r5l_meta_block *mb;
1555
1556 page = alloc_page(GFP_KERNEL);
1557 if (!page)
1558 return -ENOMEM;
1559 r5l_recovery_create_empty_meta_block(log, page, pos, seq);
1560 mb = page_address(page);
1561 mb->checksum = cpu_to_le32(crc32c_le(log->uuid_checksum,
1562 mb, PAGE_SIZE));
1563 if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, REQ_OP_WRITE,
1564 REQ_FUA, false)) {
1565 __free_page(page);
1566 return -EIO;
1567 }
1568 __free_page(page);
1569 return 0;
1570}
1571
1572
1573
1574
1575
1576
1577
1578
1579static void r5l_recovery_load_data(struct r5l_log *log,
1580 struct stripe_head *sh,
1581 struct r5l_recovery_ctx *ctx,
1582 struct r5l_payload_data_parity *payload,
1583 sector_t log_offset)
1584{
1585 struct mddev *mddev = log->rdev->mddev;
1586 struct r5conf *conf = mddev->private;
1587 int dd_idx;
1588
1589 raid5_compute_sector(conf,
1590 le64_to_cpu(payload->location), 0,
1591 &dd_idx, sh);
1592 sync_page_io(log->rdev, log_offset, PAGE_SIZE,
1593 sh->dev[dd_idx].page, REQ_OP_READ, 0, false);
1594 sh->dev[dd_idx].log_checksum =
1595 le32_to_cpu(payload->checksum[0]);
1596 ctx->meta_total_blocks += BLOCK_SECTORS;
1597
1598 set_bit(R5_Wantwrite, &sh->dev[dd_idx].flags);
1599 set_bit(STRIPE_R5C_CACHING, &sh->state);
1600}
1601
1602static void r5l_recovery_load_parity(struct r5l_log *log,
1603 struct stripe_head *sh,
1604 struct r5l_recovery_ctx *ctx,
1605 struct r5l_payload_data_parity *payload,
1606 sector_t log_offset)
1607{
1608 struct mddev *mddev = log->rdev->mddev;
1609 struct r5conf *conf = mddev->private;
1610
1611 ctx->meta_total_blocks += BLOCK_SECTORS * conf->max_degraded;
1612 sync_page_io(log->rdev, log_offset, PAGE_SIZE,
1613 sh->dev[sh->pd_idx].page, REQ_OP_READ, 0, false);
1614 sh->dev[sh->pd_idx].log_checksum =
1615 le32_to_cpu(payload->checksum[0]);
1616 set_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags);
1617
1618 if (sh->qd_idx >= 0) {
1619 sync_page_io(log->rdev,
1620 r5l_ring_add(log, log_offset, BLOCK_SECTORS),
1621 PAGE_SIZE, sh->dev[sh->qd_idx].page,
1622 REQ_OP_READ, 0, false);
1623 sh->dev[sh->qd_idx].log_checksum =
1624 le32_to_cpu(payload->checksum[1]);
1625 set_bit(R5_Wantwrite, &sh->dev[sh->qd_idx].flags);
1626 }
1627 clear_bit(STRIPE_R5C_CACHING, &sh->state);
1628}
1629
1630static void r5l_recovery_reset_stripe(struct stripe_head *sh)
1631{
1632 int i;
1633
1634 sh->state = 0;
1635 sh->log_start = MaxSector;
1636 for (i = sh->disks; i--; )
1637 sh->dev[i].flags = 0;
1638}
1639
1640static void
1641r5l_recovery_replay_one_stripe(struct r5conf *conf,
1642 struct stripe_head *sh,
1643 struct r5l_recovery_ctx *ctx)
1644{
1645 struct md_rdev *rdev, *rrdev;
1646 int disk_index;
1647 int data_count = 0;
1648
1649 for (disk_index = 0; disk_index < sh->disks; disk_index++) {
1650 if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags))
1651 continue;
1652 if (disk_index == sh->qd_idx || disk_index == sh->pd_idx)
1653 continue;
1654 data_count++;
1655 }
1656
1657
1658
1659
1660
1661
1662 if (data_count == 0)
1663 goto out;
1664
1665 for (disk_index = 0; disk_index < sh->disks; disk_index++) {
1666 if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags))
1667 continue;
1668
1669
1670 rcu_read_lock();
1671 rdev = rcu_dereference(conf->disks[disk_index].rdev);
1672 if (rdev) {
1673 atomic_inc(&rdev->nr_pending);
1674 rcu_read_unlock();
1675 sync_page_io(rdev, sh->sector, PAGE_SIZE,
1676 sh->dev[disk_index].page, REQ_OP_WRITE, 0,
1677 false);
1678 rdev_dec_pending(rdev, rdev->mddev);
1679 rcu_read_lock();
1680 }
1681 rrdev = rcu_dereference(conf->disks[disk_index].replacement);
1682 if (rrdev) {
1683 atomic_inc(&rrdev->nr_pending);
1684 rcu_read_unlock();
1685 sync_page_io(rrdev, sh->sector, PAGE_SIZE,
1686 sh->dev[disk_index].page, REQ_OP_WRITE, 0,
1687 false);
1688 rdev_dec_pending(rrdev, rrdev->mddev);
1689 rcu_read_lock();
1690 }
1691 rcu_read_unlock();
1692 }
1693 ctx->data_parity_stripes++;
1694out:
1695 r5l_recovery_reset_stripe(sh);
1696}
1697
1698static struct stripe_head *
1699r5c_recovery_alloc_stripe(struct r5conf *conf,
1700 sector_t stripe_sect)
1701{
1702 struct stripe_head *sh;
1703
1704 sh = raid5_get_active_stripe(conf, stripe_sect, 0, 1, 0);
1705 if (!sh)
1706 return NULL;
1707
1708 r5l_recovery_reset_stripe(sh);
1709
1710 return sh;
1711}
1712
1713static struct stripe_head *
1714r5c_recovery_lookup_stripe(struct list_head *list, sector_t sect)
1715{
1716 struct stripe_head *sh;
1717
1718 list_for_each_entry(sh, list, lru)
1719 if (sh->sector == sect)
1720 return sh;
1721 return NULL;
1722}
1723
1724static void
1725r5c_recovery_drop_stripes(struct list_head *cached_stripe_list,
1726 struct r5l_recovery_ctx *ctx)
1727{
1728 struct stripe_head *sh, *next;
1729
1730 list_for_each_entry_safe(sh, next, cached_stripe_list, lru) {
1731 r5l_recovery_reset_stripe(sh);
1732 list_del_init(&sh->lru);
1733 raid5_release_stripe(sh);
1734 }
1735}
1736
1737static void
1738r5c_recovery_replay_stripes(struct list_head *cached_stripe_list,
1739 struct r5l_recovery_ctx *ctx)
1740{
1741 struct stripe_head *sh, *next;
1742
1743 list_for_each_entry_safe(sh, next, cached_stripe_list, lru)
1744 if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) {
1745 r5l_recovery_replay_one_stripe(sh->raid_conf, sh, ctx);
1746 list_del_init(&sh->lru);
1747 raid5_release_stripe(sh);
1748 }
1749}
1750
1751
1752static int
1753r5l_recovery_verify_data_checksum(struct r5l_log *log, struct page *page,
1754 sector_t log_offset, __le32 log_checksum)
1755{
1756 void *addr;
1757 u32 checksum;
1758
1759 sync_page_io(log->rdev, log_offset, PAGE_SIZE,
1760 page, REQ_OP_READ, 0, false);
1761 addr = kmap_atomic(page);
1762 checksum = crc32c_le(log->uuid_checksum, addr, PAGE_SIZE);
1763 kunmap_atomic(addr);
1764 return (le32_to_cpu(log_checksum) == checksum) ? 0 : -EINVAL;
1765}
1766
1767
1768
1769
1770
1771static int
1772r5l_recovery_verify_data_checksum_for_mb(struct r5l_log *log,
1773 struct r5l_recovery_ctx *ctx)
1774{
1775 struct mddev *mddev = log->rdev->mddev;
1776 struct r5conf *conf = mddev->private;
1777 struct r5l_meta_block *mb = page_address(ctx->meta_page);
1778 sector_t mb_offset = sizeof(struct r5l_meta_block);
1779 sector_t log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
1780 struct page *page;
1781 struct r5l_payload_data_parity *payload;
1782
1783 page = alloc_page(GFP_KERNEL);
1784 if (!page)
1785 return -ENOMEM;
1786
1787 while (mb_offset < le32_to_cpu(mb->meta_size)) {
1788 payload = (void *)mb + mb_offset;
1789
1790 if (payload->header.type == R5LOG_PAYLOAD_DATA) {
1791 if (r5l_recovery_verify_data_checksum(
1792 log, page, log_offset,
1793 payload->checksum[0]) < 0)
1794 goto mismatch;
1795 } else if (payload->header.type == R5LOG_PAYLOAD_PARITY) {
1796 if (r5l_recovery_verify_data_checksum(
1797 log, page, log_offset,
1798 payload->checksum[0]) < 0)
1799 goto mismatch;
1800 if (conf->max_degraded == 2 &&
1801 r5l_recovery_verify_data_checksum(
1802 log, page,
1803 r5l_ring_add(log, log_offset,
1804 BLOCK_SECTORS),
1805 payload->checksum[1]) < 0)
1806 goto mismatch;
1807 } else
1808 goto mismatch;
1809
1810 log_offset = r5l_ring_add(log, log_offset,
1811 le32_to_cpu(payload->size));
1812
1813 mb_offset += sizeof(struct r5l_payload_data_parity) +
1814 sizeof(__le32) *
1815 (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
1816 }
1817
1818 put_page(page);
1819 return 0;
1820
1821mismatch:
1822 put_page(page);
1823 return -EINVAL;
1824}
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834static int
1835r5c_recovery_analyze_meta_block(struct r5l_log *log,
1836 struct r5l_recovery_ctx *ctx,
1837 struct list_head *cached_stripe_list)
1838{
1839 struct mddev *mddev = log->rdev->mddev;
1840 struct r5conf *conf = mddev->private;
1841 struct r5l_meta_block *mb;
1842 struct r5l_payload_data_parity *payload;
1843 int mb_offset;
1844 sector_t log_offset;
1845 sector_t stripe_sect;
1846 struct stripe_head *sh;
1847 int ret;
1848
1849
1850
1851
1852
1853
1854 ret = r5l_recovery_verify_data_checksum_for_mb(log, ctx);
1855 if (ret == -EINVAL)
1856 return -EAGAIN;
1857 else if (ret)
1858 return ret;
1859
1860 mb = page_address(ctx->meta_page);
1861 mb_offset = sizeof(struct r5l_meta_block);
1862 log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
1863
1864 while (mb_offset < le32_to_cpu(mb->meta_size)) {
1865 int dd;
1866
1867 payload = (void *)mb + mb_offset;
1868 stripe_sect = (payload->header.type == R5LOG_PAYLOAD_DATA) ?
1869 raid5_compute_sector(
1870 conf, le64_to_cpu(payload->location), 0, &dd,
1871 NULL)
1872 : le64_to_cpu(payload->location);
1873
1874 sh = r5c_recovery_lookup_stripe(cached_stripe_list,
1875 stripe_sect);
1876
1877 if (!sh) {
1878 sh = r5c_recovery_alloc_stripe(conf, stripe_sect);
1879
1880
1881
1882
1883 if (!sh) {
1884 r5c_recovery_replay_stripes(
1885 cached_stripe_list, ctx);
1886 sh = r5c_recovery_alloc_stripe(
1887 conf, stripe_sect);
1888 }
1889 if (!sh) {
1890 pr_debug("md/raid:%s: Increasing stripe cache size to %d to recovery data on journal.\n",
1891 mdname(mddev),
1892 conf->min_nr_stripes * 2);
1893 raid5_set_cache_size(mddev,
1894 conf->min_nr_stripes * 2);
1895 sh = r5c_recovery_alloc_stripe(conf,
1896 stripe_sect);
1897 }
1898 if (!sh) {
1899 pr_err("md/raid:%s: Cannot get enough stripes due to memory pressure. Recovery failed.\n",
1900 mdname(mddev));
1901 return -ENOMEM;
1902 }
1903 list_add_tail(&sh->lru, cached_stripe_list);
1904 }
1905
1906 if (payload->header.type == R5LOG_PAYLOAD_DATA) {
1907 if (!test_bit(STRIPE_R5C_CACHING, &sh->state) &&
1908 test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags)) {
1909 r5l_recovery_replay_one_stripe(conf, sh, ctx);
1910 list_move_tail(&sh->lru, cached_stripe_list);
1911 }
1912 r5l_recovery_load_data(log, sh, ctx, payload,
1913 log_offset);
1914 } else if (payload->header.type == R5LOG_PAYLOAD_PARITY)
1915 r5l_recovery_load_parity(log, sh, ctx, payload,
1916 log_offset);
1917 else
1918 return -EINVAL;
1919
1920 log_offset = r5l_ring_add(log, log_offset,
1921 le32_to_cpu(payload->size));
1922
1923 mb_offset += sizeof(struct r5l_payload_data_parity) +
1924 sizeof(__le32) *
1925 (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
1926 }
1927
1928 return 0;
1929}
1930
1931
1932
1933
1934
1935static void r5c_recovery_load_one_stripe(struct r5l_log *log,
1936 struct stripe_head *sh)
1937{
1938 struct r5dev *dev;
1939 int i;
1940
1941 for (i = sh->disks; i--; ) {
1942 dev = sh->dev + i;
1943 if (test_and_clear_bit(R5_Wantwrite, &dev->flags)) {
1944 set_bit(R5_InJournal, &dev->flags);
1945 set_bit(R5_UPTODATE, &dev->flags);
1946 }
1947 }
1948}
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966static int r5c_recovery_flush_log(struct r5l_log *log,
1967 struct r5l_recovery_ctx *ctx)
1968{
1969 struct stripe_head *sh;
1970 int ret = 0;
1971
1972
1973 while (1) {
1974 if (r5l_recovery_read_meta_block(log, ctx))
1975 break;
1976
1977 ret = r5c_recovery_analyze_meta_block(log, ctx,
1978 &ctx->cached_list);
1979
1980
1981
1982
1983 if (ret && ret != -EAGAIN)
1984 break;
1985 ctx->seq++;
1986 ctx->pos = r5l_ring_add(log, ctx->pos, ctx->meta_total_blocks);
1987 }
1988
1989 if (ret == -ENOMEM) {
1990 r5c_recovery_drop_stripes(&ctx->cached_list, ctx);
1991 return ret;
1992 }
1993
1994
1995 r5c_recovery_replay_stripes(&ctx->cached_list, ctx);
1996
1997
1998 list_for_each_entry(sh, &ctx->cached_list, lru) {
1999 WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
2000 r5c_recovery_load_one_stripe(log, sh);
2001 ctx->data_only_stripes++;
2002 }
2003
2004 return 0;
2005}
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076static int
2077r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log,
2078 struct r5l_recovery_ctx *ctx)
2079{
2080 struct stripe_head *sh;
2081 struct mddev *mddev = log->rdev->mddev;
2082 struct page *page;
2083 sector_t next_checkpoint = MaxSector;
2084
2085 page = alloc_page(GFP_KERNEL);
2086 if (!page) {
2087 pr_err("md/raid:%s: cannot allocate memory to rewrite data only stripes\n",
2088 mdname(mddev));
2089 return -ENOMEM;
2090 }
2091
2092 WARN_ON(list_empty(&ctx->cached_list));
2093
2094 list_for_each_entry(sh, &ctx->cached_list, lru) {
2095 struct r5l_meta_block *mb;
2096 int i;
2097 int offset;
2098 sector_t write_pos;
2099
2100 WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
2101 r5l_recovery_create_empty_meta_block(log, page,
2102 ctx->pos, ctx->seq);
2103 mb = page_address(page);
2104 offset = le32_to_cpu(mb->meta_size);
2105 write_pos = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
2106
2107 for (i = sh->disks; i--; ) {
2108 struct r5dev *dev = &sh->dev[i];
2109 struct r5l_payload_data_parity *payload;
2110 void *addr;
2111
2112 if (test_bit(R5_InJournal, &dev->flags)) {
2113 payload = (void *)mb + offset;
2114 payload->header.type = cpu_to_le16(
2115 R5LOG_PAYLOAD_DATA);
2116 payload->size = BLOCK_SECTORS;
2117 payload->location = cpu_to_le64(
2118 raid5_compute_blocknr(sh, i, 0));
2119 addr = kmap_atomic(dev->page);
2120 payload->checksum[0] = cpu_to_le32(
2121 crc32c_le(log->uuid_checksum, addr,
2122 PAGE_SIZE));
2123 kunmap_atomic(addr);
2124 sync_page_io(log->rdev, write_pos, PAGE_SIZE,
2125 dev->page, REQ_OP_WRITE, 0, false);
2126 write_pos = r5l_ring_add(log, write_pos,
2127 BLOCK_SECTORS);
2128 offset += sizeof(__le32) +
2129 sizeof(struct r5l_payload_data_parity);
2130
2131 }
2132 }
2133 mb->meta_size = cpu_to_le32(offset);
2134 mb->checksum = cpu_to_le32(crc32c_le(log->uuid_checksum,
2135 mb, PAGE_SIZE));
2136 sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page,
2137 REQ_OP_WRITE, REQ_FUA, false);
2138 sh->log_start = ctx->pos;
2139 list_add_tail(&sh->r5c, &log->stripe_in_journal_list);
2140 atomic_inc(&log->stripe_in_journal_count);
2141 ctx->pos = write_pos;
2142 ctx->seq += 1;
2143 next_checkpoint = sh->log_start;
2144 }
2145 log->next_checkpoint = next_checkpoint;
2146 __free_page(page);
2147 return 0;
2148}
2149
2150static void r5c_recovery_flush_data_only_stripes(struct r5l_log *log,
2151 struct r5l_recovery_ctx *ctx)
2152{
2153 struct mddev *mddev = log->rdev->mddev;
2154 struct r5conf *conf = mddev->private;
2155 struct stripe_head *sh, *next;
2156
2157 if (ctx->data_only_stripes == 0)
2158 return;
2159
2160 log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_BACK;
2161
2162 list_for_each_entry_safe(sh, next, &ctx->cached_list, lru) {
2163 r5c_make_stripe_write_out(sh);
2164 set_bit(STRIPE_HANDLE, &sh->state);
2165 list_del_init(&sh->lru);
2166 raid5_release_stripe(sh);
2167 }
2168
2169 md_wakeup_thread(conf->mddev->thread);
2170
2171 wait_event(conf->wait_for_quiescent,
2172 atomic_read(&conf->active_stripes) == 0);
2173
2174 log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
2175}
2176
2177static int r5l_recovery_log(struct r5l_log *log)
2178{
2179 struct mddev *mddev = log->rdev->mddev;
2180 struct r5l_recovery_ctx ctx;
2181 int ret;
2182 sector_t pos;
2183
2184 ctx.pos = log->last_checkpoint;
2185 ctx.seq = log->last_cp_seq;
2186 ctx.meta_page = alloc_page(GFP_KERNEL);
2187 ctx.data_only_stripes = 0;
2188 ctx.data_parity_stripes = 0;
2189 INIT_LIST_HEAD(&ctx.cached_list);
2190
2191 if (!ctx.meta_page)
2192 return -ENOMEM;
2193
2194 ret = r5c_recovery_flush_log(log, &ctx);
2195 __free_page(ctx.meta_page);
2196
2197 if (ret)
2198 return ret;
2199
2200 pos = ctx.pos;
2201 ctx.seq += 10000;
2202
2203
2204 if ((ctx.data_only_stripes == 0) && (ctx.data_parity_stripes == 0))
2205 pr_debug("md/raid:%s: starting from clean shutdown\n",
2206 mdname(mddev));
2207 else
2208 pr_debug("md/raid:%s: recovering %d data-only stripes and %d data-parity stripes\n",
2209 mdname(mddev), ctx.data_only_stripes,
2210 ctx.data_parity_stripes);
2211
2212 if (ctx.data_only_stripes == 0) {
2213 log->next_checkpoint = ctx.pos;
2214 r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq++);
2215 ctx.pos = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS);
2216 } else if (r5c_recovery_rewrite_data_only_stripes(log, &ctx)) {
2217 pr_err("md/raid:%s: failed to rewrite stripes to journal\n",
2218 mdname(mddev));
2219 return -EIO;
2220 }
2221
2222 log->log_start = ctx.pos;
2223 log->seq = ctx.seq;
2224 log->last_checkpoint = pos;
2225 r5l_write_super(log, pos);
2226
2227 r5c_recovery_flush_data_only_stripes(log, &ctx);
2228 return 0;
2229}
2230
2231static void r5l_write_super(struct r5l_log *log, sector_t cp)
2232{
2233 struct mddev *mddev = log->rdev->mddev;
2234
2235 log->rdev->journal_tail = cp;
2236 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
2237}
2238
2239static ssize_t r5c_journal_mode_show(struct mddev *mddev, char *page)
2240{
2241 struct r5conf *conf = mddev->private;
2242 int ret;
2243
2244 if (!conf->log)
2245 return 0;
2246
2247 switch (conf->log->r5c_journal_mode) {
2248 case R5C_JOURNAL_MODE_WRITE_THROUGH:
2249 ret = snprintf(
2250 page, PAGE_SIZE, "[%s] %s\n",
2251 r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_THROUGH],
2252 r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_BACK]);
2253 break;
2254 case R5C_JOURNAL_MODE_WRITE_BACK:
2255 ret = snprintf(
2256 page, PAGE_SIZE, "%s [%s]\n",
2257 r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_THROUGH],
2258 r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_BACK]);
2259 break;
2260 default:
2261 ret = 0;
2262 }
2263 return ret;
2264}
2265
2266static ssize_t r5c_journal_mode_store(struct mddev *mddev,
2267 const char *page, size_t length)
2268{
2269 struct r5conf *conf = mddev->private;
2270 struct r5l_log *log = conf->log;
2271 int val = -1, i;
2272 int len = length;
2273
2274 if (!log)
2275 return -ENODEV;
2276
2277 if (len && page[len - 1] == '\n')
2278 len -= 1;
2279 for (i = 0; i < ARRAY_SIZE(r5c_journal_mode_str); i++)
2280 if (strlen(r5c_journal_mode_str[i]) == len &&
2281 strncmp(page, r5c_journal_mode_str[i], len) == 0) {
2282 val = i;
2283 break;
2284 }
2285 if (val < R5C_JOURNAL_MODE_WRITE_THROUGH ||
2286 val > R5C_JOURNAL_MODE_WRITE_BACK)
2287 return -EINVAL;
2288
2289 if (raid5_calc_degraded(conf) > 0 &&
2290 val == R5C_JOURNAL_MODE_WRITE_BACK)
2291 return -EINVAL;
2292
2293 mddev_suspend(mddev);
2294 conf->log->r5c_journal_mode = val;
2295 mddev_resume(mddev);
2296
2297 pr_debug("md/raid:%s: setting r5c cache mode to %d: %s\n",
2298 mdname(mddev), val, r5c_journal_mode_str[val]);
2299 return length;
2300}
2301
2302struct md_sysfs_entry
2303r5c_journal_mode = __ATTR(journal_mode, 0644,
2304 r5c_journal_mode_show, r5c_journal_mode_store);
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314int r5c_try_caching_write(struct r5conf *conf,
2315 struct stripe_head *sh,
2316 struct stripe_head_state *s,
2317 int disks)
2318{
2319 struct r5l_log *log = conf->log;
2320 int i;
2321 struct r5dev *dev;
2322 int to_cache = 0;
2323
2324 BUG_ON(!r5c_is_writeback(log));
2325
2326 if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) {
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341 if (s->injournal > 0 || s->written > 0)
2342 return -EAGAIN;
2343
2344 set_bit(STRIPE_R5C_CACHING, &sh->state);
2345 }
2346
2347
2348
2349
2350
2351
2352 if (s->failed) {
2353 r5c_make_stripe_write_out(sh);
2354 return -EAGAIN;
2355 }
2356
2357 for (i = disks; i--; ) {
2358 dev = &sh->dev[i];
2359
2360 if (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags) &&
2361 !test_bit(R5_InJournal, &dev->flags)) {
2362 r5c_make_stripe_write_out(sh);
2363 return -EAGAIN;
2364 }
2365 }
2366
2367 for (i = disks; i--; ) {
2368 dev = &sh->dev[i];
2369 if (dev->towrite) {
2370 set_bit(R5_Wantwrite, &dev->flags);
2371 set_bit(R5_Wantdrain, &dev->flags);
2372 set_bit(R5_LOCKED, &dev->flags);
2373 to_cache++;
2374 }
2375 }
2376
2377 if (to_cache) {
2378 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
2379
2380
2381
2382
2383
2384 set_bit(STRIPE_LOG_TRAPPED, &sh->state);
2385 }
2386
2387 return 0;
2388}
2389
2390
2391
2392
void r5c_release_extra_page(struct stripe_head *sh)
{
	struct r5conf *conf = sh->raid_conf;
	int i;
	bool using_disk_info_extra_page;

	using_disk_info_extra_page =
		sh->dev[0].orig_page == conf->disks[0].extra_page;

	for (i = sh->disks; i--; )
		if (sh->dev[i].page != sh->dev[i].orig_page) {
			struct page *p = sh->dev[i].orig_page;

			sh->dev[i].orig_page = sh->dev[i].page;
			clear_bit(R5_OrigPageUPTDODATE, &sh->dev[i].flags);

			if (!using_disk_info_extra_page)
				put_page(p);
		}

	if (using_disk_info_extra_page) {
		clear_bit(R5C_EXTRA_PAGE_IN_USE, &conf->cache_state);
		md_wakeup_thread(conf->mddev->thread);
	}
}

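/*
 * Point every dev's orig_page at the per-disk extra_page (dropping whatever
 * page was attached before), so dev->page and dev->orig_page are distinct
 * and the write-back path can keep old and new data apart.
 */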
void r5c_use_extra_page(struct stripe_head *sh)
{
	struct r5conf *conf = sh->raid_conf;
	int i;
	struct r5dev *dev;

	for (i = sh->disks; i--; ) {
		dev = &sh->dev[i];
		if (dev->orig_page != dev->page)
			put_page(dev->orig_page);
		dev->orig_page = conf->disks[i].extra_page;
	}
}

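/*
 * Clean up the stripe once its data and parity have been written to the
 * RAID disks: clear the R5_InJournal bits, take the stripe off the
 * stripe_in_journal_list and update the log state so the journal space it
 * pinned can be reclaimed.
 */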
void r5c_finish_stripe_write_out(struct r5conf *conf,
				 struct stripe_head *sh,
				 struct stripe_head_state *s)
{
	int i;
	int do_wakeup = 0;

	if (!conf->log ||
	    !test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags))
		return;

	WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
	clear_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags);

	if (conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
		return;

	for (i = sh->disks; i--; ) {
		clear_bit(R5_InJournal, &sh->dev[i].flags);
		if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
			do_wakeup = 1;
	}

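	/*
	 * s->injournal was counted before the R5_InJournal bits above were
	 * cleared; reset it so the rest of handle_stripe() sees a
	 * consistent state.
	 */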
	s->injournal = 0;

	if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
		if (atomic_dec_and_test(&conf->pending_full_writes))
			md_wakeup_thread(conf->mddev->thread);

	if (do_wakeup)
		wake_up(&conf->wait_for_overlap);

	spin_lock_irq(&conf->log->stripe_in_journal_lock);
	list_del_init(&sh->r5c);
	spin_unlock_irq(&conf->log->stripe_in_journal_lock);
	sh->log_start = MaxSector;
	atomic_dec(&conf->log->stripe_in_journal_count);
	r5c_update_log_state(conf->log);
}

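/*
 * Write the dirty data pages of a caching-phase stripe to the journal.
 * Space is reserved for one meta block plus one block per data page; if
 * the reservation cannot be satisfied, the stripe is parked on the
 * no_space_stripes (or no_mem_stripes) list and retried later.
 */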
int
r5c_cache_data(struct r5l_log *log, struct stripe_head *sh,
	       struct stripe_head_state *s)
{
	struct r5conf *conf = sh->raid_conf;
	int pages = 0;
	int reserve;
	int i;
	int ret = 0;

	BUG_ON(!log);

	for (i = 0; i < sh->disks; i++) {
		void *addr;

		if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
			continue;
		addr = kmap_atomic(sh->dev[i].page);
		sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum,
						    addr, PAGE_SIZE);
		kunmap_atomic(addr);
		pages++;
	}
	WARN_ON(pages == 0);

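	/*
	 * The stripe has to run through the state machine again to finish
	 * the write, so do not leave it delayed; take a stripe reference
	 * for the pending log I/O.
	 */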
	clear_bit(STRIPE_DELAYED, &sh->state);
	atomic_inc(&sh->count);

	mutex_lock(&log->io_mutex);
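	/*
	 * One meta block plus the data pages, in 512B sectors: with 4KB
	 * pages (enforced in r5l_init_log()) this is (1 + pages) * 8.
	 */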
	reserve = (1 + pages) << (PAGE_SHIFT - 9);

	if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) &&
	    sh->log_start == MaxSector)
		r5l_add_no_space_stripe(log, sh);
	else if (!r5l_has_free_space(log, reserve)) {
		if (sh->log_start == log->last_checkpoint)
			BUG();
		else
			r5l_add_no_space_stripe(log, sh);
	} else {
		ret = r5l_log_stripe(log, sh, pages, 0);
		if (ret) {
			spin_lock_irq(&log->io_list_lock);
			list_add_tail(&sh->log_list, &log->no_mem_stripes);
			spin_unlock_irq(&log->io_list_lock);
		}
	}

	mutex_unlock(&log->io_mutex);
	return 0;
}

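/*
 * Read and validate the meta block at rdev->journal_tail.  If it is missing
 * or invalid (bad magic, version, checksum or position), start a fresh log
 * by writing an empty meta block and a new super; otherwise recover the log
 * from that checkpoint.
 */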
static int r5l_load_log(struct r5l_log *log)
{
	struct md_rdev *rdev = log->rdev;
	struct page *page;
	struct r5l_meta_block *mb;
	sector_t cp = log->rdev->journal_tail;
	u32 stored_crc, expected_crc;
	bool create_super = false;
	int ret = 0;

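	/* a checkpoint outside the device or not block aligned cannot be trusted */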
	if (cp >= rdev->sectors || round_down(cp, BLOCK_SECTORS) != cp)
		cp = 0;
	page = alloc_page(GFP_KERNEL);
	if (!page)
		return -ENOMEM;

	if (!sync_page_io(rdev, cp, PAGE_SIZE, page, REQ_OP_READ, 0, false)) {
		ret = -EIO;
		goto ioerr;
	}
	mb = page_address(page);

	if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
	    mb->version != R5LOG_VERSION) {
		create_super = true;
		goto create;
	}
	stored_crc = le32_to_cpu(mb->checksum);
	mb->checksum = 0;
	expected_crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
	if (stored_crc != expected_crc) {
		create_super = true;
		goto create;
	}
	if (le64_to_cpu(mb->position) != cp) {
		create_super = true;
		goto create;
	}
create:
	if (create_super) {
		log->last_cp_seq = prandom_u32();
		cp = 0;
		r5l_log_write_empty_meta_block(log, cp, log->last_cp_seq);
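		/*
		 * Point the superblock at the new, empty log right away:
		 * the journal may receive data very soon, and recovery can
		 * only find it if the super carries the correct log tail.
		 */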
		r5l_write_super(log, cp);
	} else
		log->last_cp_seq = le64_to_cpu(mb->seq);

	log->device_size = round_down(rdev->sectors, BLOCK_SECTORS);
	log->max_free_space = log->device_size >> RECLAIM_MAX_FREE_SPACE_SHIFT;
	if (log->max_free_space > RECLAIM_MAX_FREE_SPACE)
		log->max_free_space = RECLAIM_MAX_FREE_SPACE;
	log->last_checkpoint = cp;

	__free_page(page);

	if (create_super) {
		log->log_start = r5l_ring_add(log, cp, BLOCK_SECTORS);
		log->seq = log->last_cp_seq + 1;
		log->next_checkpoint = cp;
	} else
		ret = r5l_recovery_log(log);

	r5c_update_log_state(log);
	return ret;
ioerr:
	__free_page(page);
	return ret;
}

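/*
 * Called when a member device fails.  A degraded array must not stay in
 * write-back mode, so schedule the work that switches the journal back to
 * write-through.
 */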
void r5c_update_on_rdev_error(struct mddev *mddev)
{
	struct r5conf *conf = mddev->private;
	struct r5l_log *log = conf->log;

	if (!log)
		return;

	if (raid5_calc_degraded(conf) > 0 &&
	    conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK)
		schedule_work(&log->disable_writeback_work);
}

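/*
 * Set up the journal for this array: allocate the r5l_log and its pools,
 * start the reclaim thread, default to write-through mode and load (or
 * create) the on-disk log.
 */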
int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
{
	struct request_queue *q = bdev_get_queue(rdev->bdev);
	struct r5l_log *log;

	if (PAGE_SIZE != 4096)
		return -EINVAL;

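	/*
	 * A single meta block (one page) must be able to describe a full
	 * stripe: the r5l_meta_block header plus one data/parity payload
	 * and one checksum per member disk.  Arrays with too many disks
	 * therefore cannot use the journal/cache.
	 */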
	if (sizeof(struct r5l_meta_block) +
	    ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32)) *
	     conf->raid_disks) > PAGE_SIZE) {
		pr_err("md/raid:%s: write journal/cache doesn't work for array with %d disks\n",
		       mdname(conf->mddev), conf->raid_disks);
		return -EINVAL;
	}

	log = kzalloc(sizeof(*log), GFP_KERNEL);
	if (!log)
		return -ENOMEM;
	log->rdev = rdev;

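	/* flushes are only needed if the journal device has a volatile write cache */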
	log->need_cache_flush = test_bit(QUEUE_FLAG_WC, &q->queue_flags) != 0;

	log->uuid_checksum = crc32c_le(~0, rdev->mddev->uuid,
				       sizeof(rdev->mddev->uuid));

	mutex_init(&log->io_mutex);

	spin_lock_init(&log->io_list_lock);
	INIT_LIST_HEAD(&log->running_ios);
	INIT_LIST_HEAD(&log->io_end_ios);
	INIT_LIST_HEAD(&log->flushing_ios);
	INIT_LIST_HEAD(&log->finished_ios);
	bio_init(&log->flush_bio, NULL, 0);

	log->io_kc = KMEM_CACHE(r5l_io_unit, 0);
	if (!log->io_kc)
		goto io_kc;

	log->io_pool = mempool_create_slab_pool(R5L_POOL_SIZE, log->io_kc);
	if (!log->io_pool)
		goto io_pool;

	log->bs = bioset_create(R5L_POOL_SIZE, 0);
	if (!log->bs)
		goto io_bs;

	log->meta_pool = mempool_create_page_pool(R5L_POOL_SIZE, 0);
	if (!log->meta_pool)
		goto out_mempool;

	log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
						 log->rdev->mddev, "reclaim");
	if (!log->reclaim_thread)
		goto reclaim_thread;
	log->reclaim_thread->timeout = R5C_RECLAIM_WAKEUP_INTERVAL;

	init_waitqueue_head(&log->iounit_wait);

	INIT_LIST_HEAD(&log->no_mem_stripes);

	INIT_LIST_HEAD(&log->no_space_stripes);
	spin_lock_init(&log->no_space_stripes_lock);

	INIT_WORK(&log->deferred_io_work, r5l_submit_io_async);
	INIT_WORK(&log->disable_writeback_work, r5c_disable_writeback_async);

	log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
	INIT_LIST_HEAD(&log->stripe_in_journal_list);
	spin_lock_init(&log->stripe_in_journal_lock);
	atomic_set(&log->stripe_in_journal_count, 0);

	rcu_assign_pointer(conf->log, log);

	if (r5l_load_log(log))
		goto error;

	set_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
	return 0;

error:
	rcu_assign_pointer(conf->log, NULL);
	md_unregister_thread(&log->reclaim_thread);
reclaim_thread:
	mempool_destroy(log->meta_pool);
out_mempool:
	bioset_free(log->bs);
io_bs:
	mempool_destroy(log->io_pool);
io_pool:
	kmem_cache_destroy(log->io_kc);
io_kc:
	kfree(log);
	return -EINVAL;
}

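/*
 * Tear down the journal: flush any pending writeback-disable work, stop
 * the reclaim thread and free the pools.
 */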
void r5l_exit_log(struct r5l_log *log)
{
	flush_work(&log->disable_writeback_work);
	md_unregister_thread(&log->reclaim_thread);
	mempool_destroy(log->meta_pool);
	bioset_free(log->bs);
	mempool_destroy(log->io_pool);
	kmem_cache_destroy(log->io_kc);
	kfree(log);
}