/*
 * Copyright (C) 2015 Shaohua Li <shli@fb.com>
 * Copyright (C) 2016 Song Liu <songliubraving@fb.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 */
#include <linux/kernel.h>
#include <linux/wait.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/raid/md_p.h>
#include <linux/crc32c.h>
#include <linux/random.h>
#include <linux/kthread.h>
#include <linux/types.h>
#include "md.h"
#include "raid5.h"
#include "bitmap.h"

/*
 * metadata/data stored in disk with 4k size unit (a block) regardless
 * md/raid cache size
 */
#define BLOCK_SECTORS (8)

/*
 * log->max_free_space is min(1/4 disk size, 10G reclaimable space).
 * This can prevent recovery from scanning a very long log.
 */
#define RECLAIM_MAX_FREE_SPACE (10 * 1024 * 1024 * 2) /* sector */
#define RECLAIM_MAX_FREE_SPACE_SHIFT (2)

/* wake up reclaim thread periodically */
#define R5C_RECLAIM_WAKEUP_INTERVAL (30 * HZ)
/* start flush with these full stripes */
#define R5C_FULL_STRIPE_FLUSH_BATCH 256
/* reclaim stripes in groups */
#define R5C_RECLAIM_STRIPE_GROUP (NR_STRIPE_HASH_LOCKS * 2)

/*
 * We only need 2 bios per I/O unit to make progress, but ensure we
 * have a few more available to not get too tight.
 */
#define R5L_POOL_SIZE 4

/*
 * r5c journal modes of the array: write-back or write-through.
 * write-through mode has identical behavior as existing log only
 * implementation.
 */
enum r5c_journal_mode {
	R5C_JOURNAL_MODE_WRITE_THROUGH = 0,
	R5C_JOURNAL_MODE_WRITE_BACK = 1,
};

static char *r5c_journal_mode_str[] = {"write-through",
				       "write-back"};

/*
 * raid5 cache state machine
 *
 * With the RAID cache, each stripe works in two phases:
 *	- caching phase
 *	- writing-out phase
 *
 * These two phases are controlled by bit STRIPE_R5C_CACHING:
 *	- STRIPE_R5C_CACHING == 0 means the stripe is in writing-out phase
 *	- STRIPE_R5C_CACHING == 1 means the stripe is in caching phase
 *
 * When there is no journal, or the journal is in write-through mode,
 * the stripe is always in writing-out phase.
 *
 * For write-back cache, the stripe is sent to caching phase on write
 * (r5c_try_caching_write). r5c_make_stripe_write_out() kicks off
 * the write-out phase by clearing STRIPE_R5C_CACHING.
 *
 * Stripes in caching phase do not write the raid disks. Instead, all
 * writes are committed from the log device. Therefore, a stripe in
 * caching phase handles writes as:
 *	- write to log device
 *	- return IO
 *
 * Stripes in writing-out phase handle writes as:
 *	- calculate parity
 *	- write pending data and parity to journal
 *	- write data and parity to raid disks
 *	- return IO for pending writes
 */
struct r5l_log {
	struct md_rdev *rdev;

	u32 uuid_checksum;

	sector_t device_size;		/* log device size, round to
					 * BLOCK_SECTORS */
	sector_t max_free_space;	/* reclaim run if free space is at
					 * this size */

	sector_t last_checkpoint;	/* log tail. where recovery scan
					 * starts from */
	u64 last_cp_seq;		/* log tail sequence */

	sector_t log_start;		/* log head. where new data appends */
	u64 seq;			/* log head sequence */

	sector_t next_checkpoint;

	struct mutex io_mutex;
	struct r5l_io_unit *current_io;	/* current io_unit accepting new data */

	spinlock_t io_list_lock;
	struct list_head running_ios;	/* io_units which are still running,
					 * and have not yet been completely
					 * written to the log */
	struct list_head io_end_ios;	/* io_units which have been completely
					 * written to the log but not yet
					 * written to the RAID */
	struct list_head flushing_ios;	/* io_units which are waiting for log
					 * cache flush */
	struct list_head finished_ios;	/* io_units which settle down in log disk */
	struct bio flush_bio;

	struct list_head no_mem_stripes;   /* pending stripes, -ENOMEM */

	struct kmem_cache *io_kc;
	mempool_t *io_pool;
	struct bio_set *bs;
	mempool_t *meta_pool;

	struct md_thread *reclaim_thread;
	unsigned long reclaim_target;	/* number of space that need to be
					 * reclaimed.  if it's 0, reclaim spaces
					 * used by io_units which are in
					 * IO_UNIT_STRIPE_END state (eg, reclaim
					 * doesn't wait for specific io_unit
					 * switching to IO_UNIT_STRIPE_END
					 * state) */
	wait_queue_head_t iounit_wait;

	struct list_head no_space_stripes; /* pending stripes, log has no space */
	spinlock_t no_space_stripes_lock;

	bool need_cache_flush;

	/* for r5c_cache */
	enum r5c_journal_mode r5c_journal_mode;

	/* all stripes in r5cache, in the order of seq at sh->log_start */
	struct list_head stripe_in_journal_list;

	spinlock_t stripe_in_journal_lock;
	atomic_t stripe_in_journal_count;

	/* to submit async io_units, to fulfill ordering of flush */
	struct work_struct deferred_io_work;
	/* to disable write back during a degraded array */
	struct work_struct disable_writeback_work;

	/* to track big stripes in the write-back cache (see below) */
	spinlock_t tree_lock;
	struct radix_tree_root big_stripe_tree;
};

/*
 * Enable chunk_aligned_read() with write back cache.
 *
 * Each chunk may contain more than one stripe (for example, a 256kB
 * chunk contains 64 4kB pages, so this chunk contains 64 stripes). For
 * chunk_aligned_read, these stripes are grouped into one "big_stripe".
 * For each big_stripe, we count how many stripes of this big_stripe
 * are in the write back cache. This count is tracked in a radix tree
 * (big_stripe_tree), using the radix_tree item pointer as the counter.
 * r5c_tree_index() computes the tree index (big_stripe) of a stripe
 * sector.
 *
 * For write through mode, the big_stripe_tree is only used during
 * recovery (we don't need to track stripes in journal-only mode).
 *
 * With write back cache, chunk_aligned_read() looks up the big_stripe
 * of each chunk in the tree and aborts if the big_stripe is cached.
 * This look up requires holding tree_lock.
 *
 * It is necessary to remember whether a stripe is counted in
 * big_stripe_tree. Instead of adding a new flag, we reuse existing flags:
 * STRIPE_R5C_PARTIAL_STRIPE and STRIPE_R5C_FULL_STRIPE. If either of these
 * two flags is set, the stripe is counted in big_stripe_tree. The count is
 * added in r5c_try_caching_write() and dropped when the stripe is flushed
 * to the RAID disks in r5c_finish_stripe_write_out().
 */
/*
 * radix tree requires the lowest 2 bits of the data pointer to be 2b'00,
 * so the counter is left-shifted by 2 bits before being stored as the
 * tree item pointer.
 */
#define R5C_RADIX_COUNT_SHIFT 2

/*
 * calculate key for big_stripe_tree
 *
 * sect: align_bi->bi_iter.bi_sector or sh->sector
 */
static inline sector_t r5c_tree_index(struct r5conf *conf,
				      sector_t sect)
{
	sector_t offset;

	offset = sector_div(sect, conf->chunk_sectors);
	return sect;
}

/*
 * an IO range starts from a meta data block and ends at the next meta data
 * block. The io unit's meta data block tracks the data/parity that follows
 * it. io units are written to the log disk with normal writes; as we always
 * flush the log disk first and only then start moving data to the raid
 * disks, there is no requirement to write an io unit with FLUSH/FUA.
 */
struct r5l_io_unit {
	struct r5l_log *log;

	struct page *meta_page;	/* store meta block */
	int meta_offset;	/* current offset in meta_page */

	struct bio *current_bio;/* current_bio accepting new pages */

	atomic_t pending_stripe;/* how many stripes not flushed to raid */
	u64 seq;		/* seq number of io_unit */
	sector_t log_start;	/* where the io_unit starts */
	sector_t log_end;	/* where the io_unit ends */
	struct list_head log_sibling; /* log->running_ios */
	struct list_head stripe_list; /* stripes added to the io_unit */

	int state;
	bool need_split_bio;
	struct bio *split_bio;

	unsigned int has_flush:1;	/* include flush request */
	unsigned int has_fua:1;		/* include fua request */
	unsigned int has_null_flush:1;	/* include empty flush request */
	/*
	 * io isn't sent yet, flush/fua request can only be submitted till it's
	 * the first IO in running_ios list
	 */
	unsigned int io_deferred:1;

	struct bio_list flush_barriers;	/* size == 0 flush bios */
};

/* r5l_io_unit state */
enum r5l_io_unit_state {
	IO_UNIT_RUNNING = 0,	/* accepting new IO */
	IO_UNIT_IO_START = 1,	/* io_unit bio started writing to log,
				 * no longer accepting new pages */
	IO_UNIT_IO_END = 2,	/* io_unit bio finished writing to log */
	IO_UNIT_STRIPE_END = 3,	/* stripes data finished writing to raid */
};

268bool r5c_is_writeback(struct r5l_log *log)
269{
270 return (log != NULL &&
271 log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK);
272}
273
274static sector_t r5l_ring_add(struct r5l_log *log, sector_t start, sector_t inc)
275{
276 start += inc;
277 if (start >= log->device_size)
278 start = start - log->device_size;
279 return start;
280}
281
282static sector_t r5l_ring_distance(struct r5l_log *log, sector_t start,
283 sector_t end)
284{
285 if (end >= start)
286 return end - start;
287 else
288 return end + log->device_size - start;
289}
290
291static bool r5l_has_free_space(struct r5l_log *log, sector_t size)
292{
293 sector_t used_size;
294
295 used_size = r5l_ring_distance(log, log->last_checkpoint,
296 log->log_start);
297
298 return log->device_size > used_size + size;
299}
300
301static void __r5l_set_io_unit_state(struct r5l_io_unit *io,
302 enum r5l_io_unit_state state)
303{
304 if (WARN_ON(io->state >= state))
305 return;
306 io->state = state;
307}
308
309static void
310r5c_return_dev_pending_writes(struct r5conf *conf, struct r5dev *dev,
311 struct bio_list *return_bi)
312{
313 struct bio *wbi, *wbi2;
314
315 wbi = dev->written;
316 dev->written = NULL;
317 while (wbi && wbi->bi_iter.bi_sector <
318 dev->sector + STRIPE_SECTORS) {
319 wbi2 = r5_next_bio(wbi, dev->sector);
320 if (!raid5_dec_bi_active_stripes(wbi)) {
321 md_write_end(conf->mddev);
322 bio_list_add(return_bi, wbi);
323 }
324 wbi = wbi2;
325 }
326}
327
328void r5c_handle_cached_data_endio(struct r5conf *conf,
329 struct stripe_head *sh, int disks, struct bio_list *return_bi)
330{
331 int i;
332
333 for (i = sh->disks; i--; ) {
334 if (sh->dev[i].written) {
335 set_bit(R5_UPTODATE, &sh->dev[i].flags);
336 r5c_return_dev_pending_writes(conf, &sh->dev[i],
337 return_bi);
338 bitmap_endwrite(conf->mddev->bitmap, sh->sector,
339 STRIPE_SECTORS,
340 !test_bit(STRIPE_DEGRADED, &sh->state),
341 0);
342 }
343 }
344}

/*
 * Check whether we should flush some stripes to free up stripe cache
 */
347void r5c_check_stripe_cache_usage(struct r5conf *conf)
348{
349 int total_cached;
350
351 if (!r5c_is_writeback(conf->log))
352 return;
353
354 total_cached = atomic_read(&conf->r5c_cached_partial_stripes) +
355 atomic_read(&conf->r5c_cached_full_stripes);

	/*
	 * Wake up the reclaim thread when the stripe cache is under
	 * pressure: more than half of min_nr_stripes are cached, or some
	 * inactive list is empty.
	 */
365 if (total_cached > conf->min_nr_stripes * 1 / 2 ||
366 atomic_read(&conf->empty_inactive_list_nr) > 0)
367 r5l_wake_reclaim(conf->log, 0);
368}

/*
 * flush cache when there are R5C_FULL_STRIPE_FLUSH_BATCH or more full
 * stripes in the cache
 */
374void r5c_check_cached_full_stripe(struct r5conf *conf)
375{
376 if (!r5c_is_writeback(conf->log))
377 return;
378
	/*
	 * wake up reclaim for R5C_FULL_STRIPE_FLUSH_BATCH cached stripes
	 * or a full chunk worth of stripes (chunk_sectors / 4kB pages)
	 */
383 if (atomic_read(&conf->r5c_cached_full_stripes) >=
384 min(R5C_FULL_STRIPE_FLUSH_BATCH,
385 conf->chunk_sectors >> STRIPE_SHIFT))
386 r5l_wake_reclaim(conf->log, 0);
387}

/*
 * Total log space (in sectors) needed to flush all data in cache
 *
 * To avoid deadlock due to log space, it is necessary to reserve log
 * space to flush critical stripes (stripes that occupy log space near
 * last_checkpoint). This function helps check how much log space is
 * required to flush all cached stripes.
 *
 * To reduce log space requirements, two mechanisms are used to give cache
 * flush higher priority:
 *    1. In handle_stripe_dirtying() and schedule_reconstruction(),
 *       stripes ALREADY in journal can be flushed w/o pending writes;
 *    2. In r5l_write_stripe() and r5c_cache_data(), stripes NOT in journal
 *       can be delayed (r5l_add_no_space_stripe).
 *
 * A stripe that is already in the journal needs at most
 * (conf->max_degraded + 1) blocks to flush (parity plus meta); a stripe
 * that still has to drain its data first needs
 * (conf->raid_disks - conf->max_degraded) data blocks, and at most
 * (conf->group_cnt + 1) such stripes are flushed at a time. Hence the
 * formula below.
 */
417static sector_t r5c_log_required_to_flush_cache(struct r5conf *conf)
418{
419 struct r5l_log *log = conf->log;
420
421 if (!r5c_is_writeback(log))
422 return 0;
423
424 return BLOCK_SECTORS *
425 ((conf->max_degraded + 1) * atomic_read(&log->stripe_in_journal_count) +
426 (conf->raid_disks - conf->max_degraded) * (conf->group_cnt + 1));
427}

/*
 * evaluate log space usage and update R5C_LOG_TIGHT and R5C_LOG_CRITICAL
 *
 * R5C_LOG_TIGHT is set when free space on the log device is less than 3x of
 * reclaim_required_space. R5C_LOG_CRITICAL is set when free space on the log
 * device is less than 2x of reclaim_required_space.
 */
436static inline void r5c_update_log_state(struct r5l_log *log)
437{
438 struct r5conf *conf = log->rdev->mddev->private;
439 sector_t free_space;
440 sector_t reclaim_space;
441 bool wake_reclaim = false;
442
443 if (!r5c_is_writeback(log))
444 return;
445
446 free_space = r5l_ring_distance(log, log->log_start,
447 log->last_checkpoint);
448 reclaim_space = r5c_log_required_to_flush_cache(conf);
449 if (free_space < 2 * reclaim_space)
450 set_bit(R5C_LOG_CRITICAL, &conf->cache_state);
451 else {
452 if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state))
453 wake_reclaim = true;
454 clear_bit(R5C_LOG_CRITICAL, &conf->cache_state);
455 }
456 if (free_space < 3 * reclaim_space)
457 set_bit(R5C_LOG_TIGHT, &conf->cache_state);
458 else
459 clear_bit(R5C_LOG_TIGHT, &conf->cache_state);
460
461 if (wake_reclaim)
462 r5l_wake_reclaim(log, 0);
463}

/*
 * Put the stripe into writing-out phase by clearing STRIPE_R5C_CACHING.
 * This function should only be called in write-back mode.
 */
469void r5c_make_stripe_write_out(struct stripe_head *sh)
470{
471 struct r5conf *conf = sh->raid_conf;
472 struct r5l_log *log = conf->log;
473
474 BUG_ON(!r5c_is_writeback(log));
475
476 WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
477 clear_bit(STRIPE_R5C_CACHING, &sh->state);
478
479 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
480 atomic_inc(&conf->preread_active_stripes);
481}
482
483static void r5c_handle_data_cached(struct stripe_head *sh)
484{
485 int i;
486
487 for (i = sh->disks; i--; )
488 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
489 set_bit(R5_InJournal, &sh->dev[i].flags);
490 clear_bit(R5_LOCKED, &sh->dev[i].flags);
491 }
492 clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
493}

/*
 * this journal write must contain full parity,
 * it may also contain some data pages
 */
499static void r5c_handle_parity_cached(struct stripe_head *sh)
500{
501 int i;
502
503 for (i = sh->disks; i--; )
504 if (test_bit(R5_InJournal, &sh->dev[i].flags))
505 set_bit(R5_Wantwrite, &sh->dev[i].flags);
506}
507

/*
 * Setting proper flags after writing (or flushing) data and/or parity to the
 * log device. This is called from r5l_io_run_stripes().
 */
512static void r5c_finish_cache_stripe(struct stripe_head *sh)
513{
514 struct r5l_log *log = sh->raid_conf->log;
515
516 if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) {
517 BUG_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
		/*
		 * Set R5_InJournal for parity dev[pd_idx]. This means
		 * all data AND parity are in the journal. For RAID 6, it is
		 * NOT necessary to set the flag for dev[qd_idx], as the
		 * two parities are written out together.
		 */
524 set_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags);
525 } else if (test_bit(STRIPE_R5C_CACHING, &sh->state)) {
526 r5c_handle_data_cached(sh);
527 } else {
528 r5c_handle_parity_cached(sh);
529 set_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags);
530 }
531}
532
533static void r5l_io_run_stripes(struct r5l_io_unit *io)
534{
535 struct stripe_head *sh, *next;
536
537 list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) {
538 list_del_init(&sh->log_list);
539
540 r5c_finish_cache_stripe(sh);
541
542 set_bit(STRIPE_HANDLE, &sh->state);
543 raid5_release_stripe(sh);
544 }
545}
546
547static void r5l_log_run_stripes(struct r5l_log *log)
548{
549 struct r5l_io_unit *io, *next;
550
551 assert_spin_locked(&log->io_list_lock);
552
553 list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) {
554
555 if (io->state < IO_UNIT_IO_END)
556 break;
557
558 list_move_tail(&io->log_sibling, &log->finished_ios);
559 r5l_io_run_stripes(io);
560 }
561}
562
563static void r5l_move_to_end_ios(struct r5l_log *log)
564{
565 struct r5l_io_unit *io, *next;
566
567 assert_spin_locked(&log->io_list_lock);
568
569 list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) {
570
571 if (io->state < IO_UNIT_IO_END)
572 break;
573 list_move_tail(&io->log_sibling, &log->io_end_ios);
574 }
575}
576
577static void __r5l_stripe_write_finished(struct r5l_io_unit *io);
578static void r5l_log_endio(struct bio *bio)
579{
580 struct r5l_io_unit *io = bio->bi_private;
581 struct r5l_io_unit *io_deferred;
582 struct r5l_log *log = io->log;
583 unsigned long flags;
584
585 if (bio->bi_error)
586 md_error(log->rdev->mddev, log->rdev);
587
588 bio_put(bio);
589 mempool_free(io->meta_page, log->meta_pool);
590
591 spin_lock_irqsave(&log->io_list_lock, flags);
592 __r5l_set_io_unit_state(io, IO_UNIT_IO_END);
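	/*
	 * if the log device needs an explicit cache flush, completed io_units
	 * are parked on io_end_ios and flushed from the md thread; otherwise
	 * their stripes can be handed to the RAID state machine right away
	 */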
593 if (log->need_cache_flush)
594 r5l_move_to_end_ios(log);
595 else
596 r5l_log_run_stripes(log);
597 if (!list_empty(&log->running_ios)) {
		/*
		 * FLUSH/FUA io_unit is deferred because of ordering, now we
		 * can dispatch it
		 */
602 io_deferred = list_first_entry(&log->running_ios,
603 struct r5l_io_unit, log_sibling);
604 if (io_deferred->io_deferred)
605 schedule_work(&log->deferred_io_work);
606 }
607
608 spin_unlock_irqrestore(&log->io_list_lock, flags);
609
610 if (log->need_cache_flush)
611 md_wakeup_thread(log->rdev->mddev->thread);
612
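	/* complete the zero-length flush bios that piggy-backed on this io_unit */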
613 if (io->has_null_flush) {
614 struct bio *bi;
615
616 WARN_ON(bio_list_empty(&io->flush_barriers));
617 while ((bi = bio_list_pop(&io->flush_barriers)) != NULL) {
618 bio_endio(bi);
619 atomic_dec(&io->pending_stripe);
620 }
621 if (atomic_read(&io->pending_stripe) == 0)
622 __r5l_stripe_write_finished(io);
623 }
624}
625
626static void r5l_do_submit_io(struct r5l_log *log, struct r5l_io_unit *io)
627{
628 unsigned long flags;
629
630 spin_lock_irqsave(&log->io_list_lock, flags);
631 __r5l_set_io_unit_state(io, IO_UNIT_IO_START);
632 spin_unlock_irqrestore(&log->io_list_lock, flags);
633
634 if (io->has_flush)
635 io->current_bio->bi_opf |= REQ_PREFLUSH;
636 if (io->has_fua)
637 io->current_bio->bi_opf |= REQ_FUA;
638 submit_bio(io->current_bio);
639
640 if (!io->split_bio)
641 return;
642
643 if (io->has_flush)
644 io->split_bio->bi_opf |= REQ_PREFLUSH;
645 if (io->has_fua)
646 io->split_bio->bi_opf |= REQ_FUA;
647 submit_bio(io->split_bio);
648}
649
650
651static void r5l_submit_io_async(struct work_struct *work)
652{
653 struct r5l_log *log = container_of(work, struct r5l_log,
654 deferred_io_work);
655 struct r5l_io_unit *io = NULL;
656 unsigned long flags;
657
658 spin_lock_irqsave(&log->io_list_lock, flags);
659 if (!list_empty(&log->running_ios)) {
660 io = list_first_entry(&log->running_ios, struct r5l_io_unit,
661 log_sibling);
662 if (!io->io_deferred)
663 io = NULL;
664 else
665 io->io_deferred = 0;
666 }
667 spin_unlock_irqrestore(&log->io_list_lock, flags);
668 if (io)
669 r5l_do_submit_io(log, io);
670}
671
672static void r5c_disable_writeback_async(struct work_struct *work)
673{
674 struct r5l_log *log = container_of(work, struct r5l_log,
675 disable_writeback_work);
676 struct mddev *mddev = log->rdev->mddev;
677
678 if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
679 return;
680 pr_info("md/raid:%s: Disabling writeback cache for degraded array.\n",
681 mdname(mddev));
682 mddev_suspend(mddev);
683 log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
684 mddev_resume(mddev);
685}
686
687static void r5l_submit_current_io(struct r5l_log *log)
688{
689 struct r5l_io_unit *io = log->current_io;
690 struct bio *bio;
691 struct r5l_meta_block *block;
692 unsigned long flags;
693 u32 crc;
694 bool do_submit = true;
695
696 if (!io)
697 return;
698
699 block = page_address(io->meta_page);
700 block->meta_size = cpu_to_le32(io->meta_offset);
701 crc = crc32c_le(log->uuid_checksum, block, PAGE_SIZE);
702 block->checksum = cpu_to_le32(crc);
703 bio = io->current_bio;
704
705 log->current_io = NULL;
706 spin_lock_irqsave(&log->io_list_lock, flags);
707 if (io->has_flush || io->has_fua) {
708 if (io != list_first_entry(&log->running_ios,
709 struct r5l_io_unit, log_sibling)) {
710 io->io_deferred = 1;
711 do_submit = false;
712 }
713 }
714 spin_unlock_irqrestore(&log->io_list_lock, flags);
715 if (do_submit)
716 r5l_do_submit_io(log, io);
717}
718
719static struct bio *r5l_bio_alloc(struct r5l_log *log)
720{
721 struct bio *bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES, log->bs);
722
723 bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
724 bio->bi_bdev = log->rdev->bdev;
725 bio->bi_iter.bi_sector = log->rdev->data_offset + log->log_start;
726
727 return bio;
728}
729
730static void r5_reserve_log_entry(struct r5l_log *log, struct r5l_io_unit *io)
731{
732 log->log_start = r5l_ring_add(log, log->log_start, BLOCK_SECTORS);
733
734 r5c_update_log_state(log);

	/*
	 * If we filled up the log device start from the beginning again,
	 * which will require a new bio.
	 *
	 * Note: for this to work properly the log size needs to be a multiple
	 * of BLOCK_SECTORS.
	 */
742 if (log->log_start == 0)
743 io->need_split_bio = true;
744
745 io->log_end = log->log_start;
746}
747
748static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log)
749{
750 struct r5l_io_unit *io;
751 struct r5l_meta_block *block;
752
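	/*
	 * GFP_ATOMIC may fail under memory pressure; callers then park the
	 * stripe on log->no_mem_stripes and retry later
	 */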
753 io = mempool_alloc(log->io_pool, GFP_ATOMIC);
754 if (!io)
755 return NULL;
756 memset(io, 0, sizeof(*io));
757
758 io->log = log;
759 INIT_LIST_HEAD(&io->log_sibling);
760 INIT_LIST_HEAD(&io->stripe_list);
761 bio_list_init(&io->flush_barriers);
762 io->state = IO_UNIT_RUNNING;
763
764 io->meta_page = mempool_alloc(log->meta_pool, GFP_NOIO);
765 block = page_address(io->meta_page);
766 clear_page(block);
767 block->magic = cpu_to_le32(R5LOG_MAGIC);
768 block->version = R5LOG_VERSION;
769 block->seq = cpu_to_le64(log->seq);
770 block->position = cpu_to_le64(log->log_start);
771
772 io->log_start = log->log_start;
773 io->meta_offset = sizeof(struct r5l_meta_block);
774 io->seq = log->seq++;
775
776 io->current_bio = r5l_bio_alloc(log);
777 io->current_bio->bi_end_io = r5l_log_endio;
778 io->current_bio->bi_private = io;
779 bio_add_page(io->current_bio, io->meta_page, PAGE_SIZE, 0);
780
781 r5_reserve_log_entry(log, io);
782
783 spin_lock_irq(&log->io_list_lock);
784 list_add_tail(&io->log_sibling, &log->running_ios);
785 spin_unlock_irq(&log->io_list_lock);
786
787 return io;
788}
789
790static int r5l_get_meta(struct r5l_log *log, unsigned int payload_size)
791{
792 if (log->current_io &&
793 log->current_io->meta_offset + payload_size > PAGE_SIZE)
794 r5l_submit_current_io(log);
795
796 if (!log->current_io) {
797 log->current_io = r5l_new_meta(log);
798 if (!log->current_io)
799 return -ENOMEM;
800 }
801
802 return 0;
803}
804
805static void r5l_append_payload_meta(struct r5l_log *log, u16 type,
806 sector_t location,
807 u32 checksum1, u32 checksum2,
808 bool checksum2_valid)
809{
810 struct r5l_io_unit *io = log->current_io;
811 struct r5l_payload_data_parity *payload;
812
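	/* append the payload descriptor at the current offset in the meta block */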
813 payload = page_address(io->meta_page) + io->meta_offset;
814 payload->header.type = cpu_to_le16(type);
815 payload->header.flags = cpu_to_le16(0);
816 payload->size = cpu_to_le32((1 + !!checksum2_valid) <<
817 (PAGE_SHIFT - 9));
818 payload->location = cpu_to_le64(location);
819 payload->checksum[0] = cpu_to_le32(checksum1);
820 if (checksum2_valid)
821 payload->checksum[1] = cpu_to_le32(checksum2);
822
823 io->meta_offset += sizeof(struct r5l_payload_data_parity) +
824 sizeof(__le32) * (1 + !!checksum2_valid);
825}
826
827static void r5l_append_payload_page(struct r5l_log *log, struct page *page)
828{
829 struct r5l_io_unit *io = log->current_io;
830
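	/*
	 * the previous page ended exactly at the end of the log ring; chain a
	 * new bio that continues from the beginning of the log device
	 */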
831 if (io->need_split_bio) {
832 BUG_ON(io->split_bio);
833 io->split_bio = io->current_bio;
834 io->current_bio = r5l_bio_alloc(log);
835 bio_chain(io->current_bio, io->split_bio);
836 io->need_split_bio = false;
837 }
838
839 if (!bio_add_page(io->current_bio, page, PAGE_SIZE, 0))
840 BUG();
841
842 r5_reserve_log_entry(log, io);
843}
844
845static int r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
846 int data_pages, int parity_pages)
847{
848 int i;
849 int meta_size;
850 int ret;
851 struct r5l_io_unit *io;
852
853 meta_size =
854 ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32))
855 * data_pages) +
856 sizeof(struct r5l_payload_data_parity) +
857 sizeof(__le32) * parity_pages;
858
859 ret = r5l_get_meta(log, meta_size);
860 if (ret)
861 return ret;
862
863 io = log->current_io;
864
865 if (test_and_clear_bit(STRIPE_R5C_PREFLUSH, &sh->state))
866 io->has_flush = 1;
867
868 for (i = 0; i < sh->disks; i++) {
869 if (!test_bit(R5_Wantwrite, &sh->dev[i].flags) ||
870 test_bit(R5_InJournal, &sh->dev[i].flags))
871 continue;
872 if (i == sh->pd_idx || i == sh->qd_idx)
873 continue;
874 if (test_bit(R5_WantFUA, &sh->dev[i].flags) &&
875 log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK) {
876 io->has_fua = 1;
877
878
879
880
881 io->has_flush = 1;
882 }
883 r5l_append_payload_meta(log, R5LOG_PAYLOAD_DATA,
884 raid5_compute_blocknr(sh, i, 0),
885 sh->dev[i].log_checksum, 0, false);
886 r5l_append_payload_page(log, sh->dev[i].page);
887 }
888
889 if (parity_pages == 2) {
890 r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
891 sh->sector, sh->dev[sh->pd_idx].log_checksum,
892 sh->dev[sh->qd_idx].log_checksum, true);
893 r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
894 r5l_append_payload_page(log, sh->dev[sh->qd_idx].page);
895 } else if (parity_pages == 1) {
896 r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
897 sh->sector, sh->dev[sh->pd_idx].log_checksum,
898 0, false);
899 r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
900 } else
901 BUG_ON(parity_pages != 0);
902
903 list_add_tail(&sh->log_list, &io->stripe_list);
904 atomic_inc(&io->pending_stripe);
905 sh->log_io = io;
906
907 if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
908 return 0;
909
910 if (sh->log_start == MaxSector) {
911 BUG_ON(!list_empty(&sh->r5c));
912 sh->log_start = io->log_start;
913 spin_lock_irq(&log->stripe_in_journal_lock);
914 list_add_tail(&sh->r5c,
915 &log->stripe_in_journal_list);
916 spin_unlock_irq(&log->stripe_in_journal_lock);
917 atomic_inc(&log->stripe_in_journal_count);
918 }
919 return 0;
920}

/* stripes added here are revisited by r5l_run_no_space_stripes() after reclaim */
923static inline void r5l_add_no_space_stripe(struct r5l_log *log,
924 struct stripe_head *sh)
925{
926 spin_lock(&log->no_space_stripes_lock);
927 list_add_tail(&sh->log_list, &log->no_space_stripes);
928 spin_unlock(&log->no_space_stripes_lock);
929}

/*
 * running in raid5d, where reclaim could wait for raid5d too (when it flushes
 * data from log to raid disks), so we shouldn't wait for reclaim here
 */
935int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
936{
937 struct r5conf *conf = sh->raid_conf;
938 int write_disks = 0;
939 int data_pages, parity_pages;
940 int reserve;
941 int i;
942 int ret = 0;
943 bool wake_reclaim = false;
944
945 if (!log)
946 return -EAGAIN;
947
948 if (sh->log_io || !test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) ||
949 test_bit(STRIPE_SYNCING, &sh->state)) {
		/* the stripe is written to log, we start writing it to raid */
951 clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
952 return -EAGAIN;
953 }
954
955 WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
956
957 for (i = 0; i < sh->disks; i++) {
958 void *addr;
959
960 if (!test_bit(R5_Wantwrite, &sh->dev[i].flags) ||
961 test_bit(R5_InJournal, &sh->dev[i].flags))
962 continue;
963
964 write_disks++;
965
966 if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
967 continue;
968 addr = kmap_atomic(sh->dev[i].page);
969 sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum,
970 addr, PAGE_SIZE);
971 kunmap_atomic(addr);
972 }
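	/* one parity page for RAID4/5; two (P and Q) when there is a Q disk */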
973 parity_pages = 1 + !!(sh->qd_idx >= 0);
974 data_pages = write_disks - parity_pages;
975
976 set_bit(STRIPE_LOG_TRAPPED, &sh->state);

	/*
	 * The stripe must enter state machine again to finish the write, so
	 * don't delay.
	 */
981 clear_bit(STRIPE_DELAYED, &sh->state);
982 atomic_inc(&sh->count);
983
984 mutex_lock(&log->io_mutex);
985
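	/* meta block plus one page per written data/parity disk, in sectors */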
986 reserve = (1 + write_disks) << (PAGE_SHIFT - 9);
987
988 if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) {
989 if (!r5l_has_free_space(log, reserve)) {
990 r5l_add_no_space_stripe(log, sh);
991 wake_reclaim = true;
992 } else {
993 ret = r5l_log_stripe(log, sh, data_pages, parity_pages);
994 if (ret) {
995 spin_lock_irq(&log->io_list_lock);
996 list_add_tail(&sh->log_list,
997 &log->no_mem_stripes);
998 spin_unlock_irq(&log->io_list_lock);
999 }
1000 }
1001 } else {
		/*
		 * log space critical, do not process stripes that are
		 * not in cache yet (sh->log_start == MaxSector).
		 */
1006 if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) &&
1007 sh->log_start == MaxSector) {
1008 r5l_add_no_space_stripe(log, sh);
1009 wake_reclaim = true;
1010 reserve = 0;
1011 } else if (!r5l_has_free_space(log, reserve)) {
1012 if (sh->log_start == log->last_checkpoint)
1013 BUG();
1014 else
1015 r5l_add_no_space_stripe(log, sh);
1016 } else {
1017 ret = r5l_log_stripe(log, sh, data_pages, parity_pages);
1018 if (ret) {
1019 spin_lock_irq(&log->io_list_lock);
1020 list_add_tail(&sh->log_list,
1021 &log->no_mem_stripes);
1022 spin_unlock_irq(&log->io_list_lock);
1023 }
1024 }
1025 }
1026
1027 mutex_unlock(&log->io_mutex);
1028 if (wake_reclaim)
1029 r5l_wake_reclaim(log, reserve);
1030 return 0;
1031}
1032
1033void r5l_write_stripe_run(struct r5l_log *log)
1034{
1035 if (!log)
1036 return;
1037 mutex_lock(&log->io_mutex);
1038 r5l_submit_current_io(log);
1039 mutex_unlock(&log->io_mutex);
1040}
1041
1042int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio)
1043{
1044 if (!log)
1045 return -ENODEV;
1046
1047 if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) {
		/*
		 * in write through (journal only) mode we flush the log disk
		 * cache first, then write stripe data to raid disks. So if the
		 * bio is finished, the log disk cache is already flushed, and
		 * recovery guarantees we can recover the bio from the log
		 * disk, so there is no need to flush again.
		 */
1055 if (bio->bi_iter.bi_size == 0) {
1056 bio_endio(bio);
1057 return 0;
1058 }
1059 bio->bi_opf &= ~REQ_PREFLUSH;
1060 } else {
		/* write back (with cache) */
1062 if (bio->bi_iter.bi_size == 0) {
1063 mutex_lock(&log->io_mutex);
1064 r5l_get_meta(log, 0);
1065 bio_list_add(&log->current_io->flush_barriers, bio);
1066 log->current_io->has_flush = 1;
1067 log->current_io->has_null_flush = 1;
1068 atomic_inc(&log->current_io->pending_stripe);
1069 r5l_submit_current_io(log);
1070 mutex_unlock(&log->io_mutex);
1071 return 0;
1072 }
1073 }
1074 return -EAGAIN;
1075}
1076
1077
1078static void r5l_run_no_space_stripes(struct r5l_log *log)
1079{
1080 struct stripe_head *sh;
1081
1082 spin_lock(&log->no_space_stripes_lock);
1083 while (!list_empty(&log->no_space_stripes)) {
1084 sh = list_first_entry(&log->no_space_stripes,
1085 struct stripe_head, log_list);
1086 list_del_init(&sh->log_list);
1087 set_bit(STRIPE_HANDLE, &sh->state);
1088 raid5_release_stripe(sh);
1089 }
1090 spin_unlock(&log->no_space_stripes_lock);
1091}

/*
 * calculate new last_checkpoint
 * for write through mode, returns log->next_checkpoint
 * for write back, returns log_start of first sh in stripe_in_journal_list
 */
1098static sector_t r5c_calculate_new_cp(struct r5conf *conf)
1099{
1100 struct stripe_head *sh;
1101 struct r5l_log *log = conf->log;
1102 sector_t new_cp;
1103 unsigned long flags;
1104
1105 if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
1106 return log->next_checkpoint;
1107
1108 spin_lock_irqsave(&log->stripe_in_journal_lock, flags);
1109 if (list_empty(&conf->log->stripe_in_journal_list)) {
		/* all stripes flushed */
1111 spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags);
1112 return log->next_checkpoint;
1113 }
1114 sh = list_first_entry(&conf->log->stripe_in_journal_list,
1115 struct stripe_head, r5c);
1116 new_cp = sh->log_start;
1117 spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags);
1118 return new_cp;
1119}
1120
1121static sector_t r5l_reclaimable_space(struct r5l_log *log)
1122{
1123 struct r5conf *conf = log->rdev->mddev->private;
1124
1125 return r5l_ring_distance(log, log->last_checkpoint,
1126 r5c_calculate_new_cp(conf));
1127}
1128
1129static void r5l_run_no_mem_stripe(struct r5l_log *log)
1130{
1131 struct stripe_head *sh;
1132
1133 assert_spin_locked(&log->io_list_lock);
1134
1135 if (!list_empty(&log->no_mem_stripes)) {
1136 sh = list_first_entry(&log->no_mem_stripes,
1137 struct stripe_head, log_list);
1138 list_del_init(&sh->log_list);
1139 set_bit(STRIPE_HANDLE, &sh->state);
1140 raid5_release_stripe(sh);
1141 }
1142}
1143
1144static bool r5l_complete_finished_ios(struct r5l_log *log)
1145{
1146 struct r5l_io_unit *io, *next;
1147 bool found = false;
1148
1149 assert_spin_locked(&log->io_list_lock);
1150
1151 list_for_each_entry_safe(io, next, &log->finished_ios, log_sibling) {
1152
1153 if (io->state < IO_UNIT_STRIPE_END)
1154 break;
1155
1156 log->next_checkpoint = io->log_start;
1157
1158 list_del(&io->log_sibling);
1159 mempool_free(io, log->io_pool);
1160 r5l_run_no_mem_stripe(log);
1161
1162 found = true;
1163 }
1164
1165 return found;
1166}
1167
1168static void __r5l_stripe_write_finished(struct r5l_io_unit *io)
1169{
1170 struct r5l_log *log = io->log;
1171 struct r5conf *conf = log->rdev->mddev->private;
1172 unsigned long flags;
1173
1174 spin_lock_irqsave(&log->io_list_lock, flags);
1175 __r5l_set_io_unit_state(io, IO_UNIT_STRIPE_END);
1176
1177 if (!r5l_complete_finished_ios(log)) {
1178 spin_unlock_irqrestore(&log->io_list_lock, flags);
1179 return;
1180 }
1181
1182 if (r5l_reclaimable_space(log) > log->max_free_space ||
1183 test_bit(R5C_LOG_TIGHT, &conf->cache_state))
1184 r5l_wake_reclaim(log, 0);
1185
1186 spin_unlock_irqrestore(&log->io_list_lock, flags);
1187 wake_up(&log->iounit_wait);
1188}
1189
1190void r5l_stripe_write_finished(struct stripe_head *sh)
1191{
1192 struct r5l_io_unit *io;
1193
1194 io = sh->log_io;
1195 sh->log_io = NULL;
1196
1197 if (io && atomic_dec_and_test(&io->pending_stripe))
1198 __r5l_stripe_write_finished(io);
1199}
1200
1201static void r5l_log_flush_endio(struct bio *bio)
1202{
1203 struct r5l_log *log = container_of(bio, struct r5l_log,
1204 flush_bio);
1205 unsigned long flags;
1206 struct r5l_io_unit *io;
1207
1208 if (bio->bi_error)
1209 md_error(log->rdev->mddev, log->rdev);
1210
1211 spin_lock_irqsave(&log->io_list_lock, flags);
1212 list_for_each_entry(io, &log->flushing_ios, log_sibling)
1213 r5l_io_run_stripes(io);
1214 list_splice_tail_init(&log->flushing_ios, &log->finished_ios);
1215 spin_unlock_irqrestore(&log->io_list_lock, flags);
1216}

/*
 * Starting dispatch IO to raid.
 * io_unit(meta) consists of a log. There is one situation we want to avoid. A
 * broken meta in the middle of a log causes recovery to miss meta at the
 * head of the log. If an operation requires meta at the head to be persistent
 * in the log, we must make sure the meta before it is persistent too. A case:
 *
 * stripe data/parity is in log, we start writing the stripe to raid disks.
 * stripe data/parity must be persistent in log before we do the write to raid
 * disks.
 *
 * The solution is that we strictly maintain io_unit list order. We only write
 * stripes of an io_unit to raid disks until the io_unit is the first one whose
 * data/parity is in the log.
 */
1232void r5l_flush_stripe_to_raid(struct r5l_log *log)
1233{
1234 bool do_flush;
1235
1236 if (!log || !log->need_cache_flush)
1237 return;
1238
1239 spin_lock_irq(&log->io_list_lock);
1240
1241 if (!list_empty(&log->flushing_ios)) {
1242 spin_unlock_irq(&log->io_list_lock);
1243 return;
1244 }
1245 list_splice_tail_init(&log->io_end_ios, &log->flushing_ios);
1246 do_flush = !list_empty(&log->flushing_ios);
1247 spin_unlock_irq(&log->io_list_lock);
1248
1249 if (!do_flush)
1250 return;
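	/*
	 * flush the log device cache; r5l_log_flush_endio() then moves
	 * flushing_ios to finished_ios and releases their stripes to raid
	 */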
1251 bio_reset(&log->flush_bio);
1252 log->flush_bio.bi_bdev = log->rdev->bdev;
1253 log->flush_bio.bi_end_io = r5l_log_flush_endio;
1254 log->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
1255 submit_bio(&log->flush_bio);
1256}
1257
1258static void r5l_write_super(struct r5l_log *log, sector_t cp);
1259static void r5l_write_super_and_discard_space(struct r5l_log *log,
1260 sector_t end)
1261{
1262 struct block_device *bdev = log->rdev->bdev;
1263 struct mddev *mddev;
1264
1265 r5l_write_super(log, end);
1266
1267 if (!blk_queue_discard(bdev_get_queue(bdev)))
1268 return;
1269
1270 mddev = log->rdev->mddev;

	/*
	 * Discard could zero data, so before discard we must make sure
	 * superblock is updated to new log tail. Updating superblock (either
	 * directly call md_update_sb() or depend on md thread) must hold
	 * reconfig mutex. On the other hand, raid5_quiesce is called with
	 * reconfig_mutex hold. The first step of raid5_quiesce() is waiting
	 * for all IO finish, hence waiting for reclaim thread, while reclaim
	 * thread is calling this function and waiting for reconfig mutex. So
	 * there is a deadlock. We workaround this issue with a trylock.
	 * FIXME: we could miss discard if we can't take reconfig mutex
	 */
1282 set_mask_bits(&mddev->sb_flags, 0,
1283 BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
1284 if (!mddev_trylock(mddev))
1285 return;
1286 md_update_sb(mddev, 1);
1287 mddev_unlock(mddev);
1288
1289
1290 if (log->last_checkpoint < end) {
1291 blkdev_issue_discard(bdev,
1292 log->last_checkpoint + log->rdev->data_offset,
1293 end - log->last_checkpoint, GFP_NOIO, 0);
1294 } else {
1295 blkdev_issue_discard(bdev,
1296 log->last_checkpoint + log->rdev->data_offset,
1297 log->device_size - log->last_checkpoint,
1298 GFP_NOIO, 0);
1299 blkdev_issue_discard(bdev, log->rdev->data_offset, end,
1300 GFP_NOIO, 0);
1301 }
1302}

/*
 * r5c_flush_stripe moves stripe from cached list to handle_list. When called,
 * the stripe must be on r5c_full_stripe_list or r5c_partial_stripe_list.
 *
 * must hold conf->device_lock
 */
1310static void r5c_flush_stripe(struct r5conf *conf, struct stripe_head *sh)
1311{
1312 BUG_ON(list_empty(&sh->lru));
1313 BUG_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
1314 BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));

	/*
	 * The stripe is not ON_RELEASE_LIST, so it is safe to call
	 * raid5_release_stripe() while holding conf->device_lock
	 */
1320 BUG_ON(test_bit(STRIPE_ON_RELEASE_LIST, &sh->state));
1321 assert_spin_locked(&conf->device_lock);
1322
1323 list_del_init(&sh->lru);
1324 atomic_inc(&sh->count);
1325
1326 set_bit(STRIPE_HANDLE, &sh->state);
1327 atomic_inc(&conf->active_stripes);
1328 r5c_make_stripe_write_out(sh);
1329
1330 if (test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state))
1331 atomic_inc(&conf->r5c_flushing_partial_stripes);
1332 else
1333 atomic_inc(&conf->r5c_flushing_full_stripes);
1334 raid5_release_stripe(sh);
1335}

/*
 * if num == 0, flush all full stripes
 * if num > 0, flush all full stripes. If less than num full stripes are
 *             flushed, flush some partial stripes until totally num stripes are
 *             flushed or there are no more cached stripes.
 */
1343void r5c_flush_cache(struct r5conf *conf, int num)
1344{
1345 int count;
1346 struct stripe_head *sh, *next;
1347
1348 assert_spin_locked(&conf->device_lock);
1349 if (!conf->log)
1350 return;
1351
1352 count = 0;
1353 list_for_each_entry_safe(sh, next, &conf->r5c_full_stripe_list, lru) {
1354 r5c_flush_stripe(conf, sh);
1355 count++;
1356 }
1357
1358 if (count >= num)
1359 return;
1360 list_for_each_entry_safe(sh, next,
1361 &conf->r5c_partial_stripe_list, lru) {
1362 r5c_flush_stripe(conf, sh);
1363 if (++count >= num)
1364 break;
1365 }
1366}
1367
1368static void r5c_do_reclaim(struct r5conf *conf)
1369{
1370 struct r5l_log *log = conf->log;
1371 struct stripe_head *sh;
1372 int count = 0;
1373 unsigned long flags;
1374 int total_cached;
1375 int stripes_to_flush;
1376 int flushing_partial, flushing_full;
1377
1378 if (!r5c_is_writeback(log))
1379 return;
1380
1381 flushing_partial = atomic_read(&conf->r5c_flushing_partial_stripes);
1382 flushing_full = atomic_read(&conf->r5c_flushing_full_stripes);
1383 total_cached = atomic_read(&conf->r5c_cached_partial_stripes) +
1384 atomic_read(&conf->r5c_cached_full_stripes) -
1385 flushing_full - flushing_partial;
1386
1387 if (total_cached > conf->min_nr_stripes * 3 / 4 ||
1388 atomic_read(&conf->empty_inactive_list_nr) > 0)
		/*
		 * if stripe cache pressure high, flush all full stripes and
		 * some partial stripes
		 */
		stripes_to_flush = R5C_RECLAIM_STRIPE_GROUP;
	else if (total_cached > conf->min_nr_stripes * 1 / 2 ||
		 atomic_read(&conf->r5c_cached_full_stripes) - flushing_full >
		 R5C_FULL_STRIPE_FLUSH_BATCH)
		/*
		 * if stripe cache pressure moderate, or if there are many full
		 * stripes, flush all full stripes
		 */
		stripes_to_flush = 0;
	else
		/* no need to flush */
		stripes_to_flush = -1;
1405
1406 if (stripes_to_flush >= 0) {
1407 spin_lock_irqsave(&conf->device_lock, flags);
1408 r5c_flush_cache(conf, stripes_to_flush);
1409 spin_unlock_irqrestore(&conf->device_lock, flags);
1410 }

	/* if log space is tight, flush stripes on stripe_in_journal_list */
1413 if (test_bit(R5C_LOG_TIGHT, &conf->cache_state)) {
1414 spin_lock_irqsave(&log->stripe_in_journal_lock, flags);
1415 spin_lock(&conf->device_lock);
1416 list_for_each_entry(sh, &log->stripe_in_journal_list, r5c) {
			/*
			 * stripes on stripe_in_journal_list could be in any
			 * state of the stripe_cache state machine. In this
			 * case, we only want to flush stripes on
			 * r5c_cached_full/partial_stripes. The following
			 * condition makes sure the stripe is on one of the
			 * two lists.
			 */
1425 if (!list_empty(&sh->lru) &&
1426 !test_bit(STRIPE_HANDLE, &sh->state) &&
1427 atomic_read(&sh->count) == 0) {
1428 r5c_flush_stripe(conf, sh);
1429 if (count++ >= R5C_RECLAIM_STRIPE_GROUP)
1430 break;
1431 }
1432 }
1433 spin_unlock(&conf->device_lock);
1434 spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags);
1435 }
1436
1437 if (!test_bit(R5C_LOG_CRITICAL, &conf->cache_state))
1438 r5l_run_no_space_stripes(log);
1439
1440 md_wakeup_thread(conf->mddev->thread);
1441}
1442
1443static void r5l_do_reclaim(struct r5l_log *log)
1444{
1445 struct r5conf *conf = log->rdev->mddev->private;
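	/* atomically consume the reclaim target set by r5l_wake_reclaim() */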
1446 sector_t reclaim_target = xchg(&log->reclaim_target, 0);
1447 sector_t reclaimable;
1448 sector_t next_checkpoint;
1449 bool write_super;
1450
1451 spin_lock_irq(&log->io_list_lock);
1452 write_super = r5l_reclaimable_space(log) > log->max_free_space ||
1453 reclaim_target != 0 || !list_empty(&log->no_space_stripes);

	/*
	 * move proper io_units to the reclaim list. We should not change the
	 * order: reclaimable/unreclaimable io_units can be mixed in the list,
	 * and we shouldn't reuse space of an unreclaimable io_unit.
	 */
1459 while (1) {
1460 reclaimable = r5l_reclaimable_space(log);
1461 if (reclaimable >= reclaim_target ||
1462 (list_empty(&log->running_ios) &&
1463 list_empty(&log->io_end_ios) &&
1464 list_empty(&log->flushing_ios) &&
1465 list_empty(&log->finished_ios)))
1466 break;
1467
1468 md_wakeup_thread(log->rdev->mddev->thread);
1469 wait_event_lock_irq(log->iounit_wait,
1470 r5l_reclaimable_space(log) > reclaimable,
1471 log->io_list_lock);
1472 }
1473
1474 next_checkpoint = r5c_calculate_new_cp(conf);
1475 spin_unlock_irq(&log->io_list_lock);
1476
1477 if (reclaimable == 0 || !write_super)
1478 return;

	/*
	 * write_super will flush cache of each raid disk. We must write super
	 * here, because the log area might be reused soon and we don't want to
	 * confuse recovery
	 */
1485 r5l_write_super_and_discard_space(log, next_checkpoint);
1486
1487 mutex_lock(&log->io_mutex);
1488 log->last_checkpoint = next_checkpoint;
1489 r5c_update_log_state(log);
1490 mutex_unlock(&log->io_mutex);
1491
1492 r5l_run_no_space_stripes(log);
1493}
1494
1495static void r5l_reclaim_thread(struct md_thread *thread)
1496{
1497 struct mddev *mddev = thread->mddev;
1498 struct r5conf *conf = mddev->private;
1499 struct r5l_log *log = conf->log;
1500
1501 if (!log)
1502 return;
1503 r5c_do_reclaim(conf);
1504 r5l_do_reclaim(log);
1505}
1506
1507void r5l_wake_reclaim(struct r5l_log *log, sector_t space)
1508{
1509 unsigned long target;
1510 unsigned long new = (unsigned long)space;
1511
1512 if (!log)
1513 return;
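	/* only ever raise the target; a concurrent caller may have asked for more */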
1514 do {
1515 target = log->reclaim_target;
1516 if (new < target)
1517 return;
1518 } while (cmpxchg(&log->reclaim_target, target, new) != target);
1519 md_wakeup_thread(log->reclaim_thread);
1520}
1521
1522void r5l_quiesce(struct r5l_log *log, int state)
1523{
1524 struct mddev *mddev;
1525 if (!log || state == 2)
1526 return;
1527 if (state == 0)
1528 kthread_unpark(log->reclaim_thread->tsk);
1529 else if (state == 1) {
		/* make sure r5l_write_super_and_discard_space exits */
1531 mddev = log->rdev->mddev;
1532 wake_up(&mddev->sb_wait);
1533 kthread_park(log->reclaim_thread->tsk);
1534 r5l_wake_reclaim(log, MaxSector);
1535 r5l_do_reclaim(log);
1536 }
1537}
1538
1539bool r5l_log_disk_error(struct r5conf *conf)
1540{
1541 struct r5l_log *log;
1542 bool ret;
1543
1544 rcu_read_lock();
1545 log = rcu_dereference(conf->log);
1546
1547 if (!log)
1548 ret = test_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
1549 else
1550 ret = test_bit(Faulty, &log->rdev->flags);
1551 rcu_read_unlock();
1552 return ret;
1553}
1554
1555struct r5l_recovery_ctx {
1556 struct page *meta_page;
1557 sector_t meta_total_blocks;
1558 sector_t pos;
1559 u64 seq;
1560 int data_parity_stripes;
1561 int data_only_stripes;
1562 struct list_head cached_list;
1563};
1564
1565static int r5l_recovery_read_meta_block(struct r5l_log *log,
1566 struct r5l_recovery_ctx *ctx)
1567{
1568 struct page *page = ctx->meta_page;
1569 struct r5l_meta_block *mb;
1570 u32 crc, stored_crc;
1571
1572 if (!sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page, REQ_OP_READ, 0,
1573 false))
1574 return -EIO;
1575
1576 mb = page_address(page);
1577 stored_crc = le32_to_cpu(mb->checksum);
1578 mb->checksum = 0;
1579
1580 if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
1581 le64_to_cpu(mb->seq) != ctx->seq ||
1582 mb->version != R5LOG_VERSION ||
1583 le64_to_cpu(mb->position) != ctx->pos)
1584 return -EINVAL;
1585
1586 crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
1587 if (stored_crc != crc)
1588 return -EINVAL;
1589
1590 if (le32_to_cpu(mb->meta_size) > PAGE_SIZE)
1591 return -EINVAL;
1592
1593 ctx->meta_total_blocks = BLOCK_SECTORS;
1594
1595 return 0;
1596}
1597
1598static void
1599r5l_recovery_create_empty_meta_block(struct r5l_log *log,
1600 struct page *page,
1601 sector_t pos, u64 seq)
1602{
1603 struct r5l_meta_block *mb;
1604
1605 mb = page_address(page);
1606 clear_page(mb);
1607 mb->magic = cpu_to_le32(R5LOG_MAGIC);
1608 mb->version = R5LOG_VERSION;
1609 mb->meta_size = cpu_to_le32(sizeof(struct r5l_meta_block));
1610 mb->seq = cpu_to_le64(seq);
1611 mb->position = cpu_to_le64(pos);
1612}
1613
1614static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos,
1615 u64 seq)
1616{
1617 struct page *page;
1618 struct r5l_meta_block *mb;
1619
1620 page = alloc_page(GFP_KERNEL);
1621 if (!page)
1622 return -ENOMEM;
1623 r5l_recovery_create_empty_meta_block(log, page, pos, seq);
1624 mb = page_address(page);
1625 mb->checksum = cpu_to_le32(crc32c_le(log->uuid_checksum,
1626 mb, PAGE_SIZE));
1627 if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, REQ_OP_WRITE,
1628 REQ_FUA, false)) {
1629 __free_page(page);
1630 return -EIO;
1631 }
1632 __free_page(page);
1633 return 0;
1634}

/*
 * r5l_recovery_load_data and r5l_recovery_load_parity use flag R5_Wantwrite
 * to mark valid (potentially not flushed) data in the journal.
 *
 * The checksums have already been verified by
 * r5l_recovery_verify_data_checksum_for_mb(), so the pages are read here
 * without re-checking them.
 */
1643static void r5l_recovery_load_data(struct r5l_log *log,
1644 struct stripe_head *sh,
1645 struct r5l_recovery_ctx *ctx,
1646 struct r5l_payload_data_parity *payload,
1647 sector_t log_offset)
1648{
1649 struct mddev *mddev = log->rdev->mddev;
1650 struct r5conf *conf = mddev->private;
1651 int dd_idx;
1652
1653 raid5_compute_sector(conf,
1654 le64_to_cpu(payload->location), 0,
1655 &dd_idx, sh);
1656 sync_page_io(log->rdev, log_offset, PAGE_SIZE,
1657 sh->dev[dd_idx].page, REQ_OP_READ, 0, false);
1658 sh->dev[dd_idx].log_checksum =
1659 le32_to_cpu(payload->checksum[0]);
1660 ctx->meta_total_blocks += BLOCK_SECTORS;
1661
1662 set_bit(R5_Wantwrite, &sh->dev[dd_idx].flags);
1663 set_bit(STRIPE_R5C_CACHING, &sh->state);
1664}
1665
1666static void r5l_recovery_load_parity(struct r5l_log *log,
1667 struct stripe_head *sh,
1668 struct r5l_recovery_ctx *ctx,
1669 struct r5l_payload_data_parity *payload,
1670 sector_t log_offset)
1671{
1672 struct mddev *mddev = log->rdev->mddev;
1673 struct r5conf *conf = mddev->private;
1674
1675 ctx->meta_total_blocks += BLOCK_SECTORS * conf->max_degraded;
1676 sync_page_io(log->rdev, log_offset, PAGE_SIZE,
1677 sh->dev[sh->pd_idx].page, REQ_OP_READ, 0, false);
1678 sh->dev[sh->pd_idx].log_checksum =
1679 le32_to_cpu(payload->checksum[0]);
1680 set_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags);
1681
1682 if (sh->qd_idx >= 0) {
1683 sync_page_io(log->rdev,
1684 r5l_ring_add(log, log_offset, BLOCK_SECTORS),
1685 PAGE_SIZE, sh->dev[sh->qd_idx].page,
1686 REQ_OP_READ, 0, false);
1687 sh->dev[sh->qd_idx].log_checksum =
1688 le32_to_cpu(payload->checksum[1]);
1689 set_bit(R5_Wantwrite, &sh->dev[sh->qd_idx].flags);
1690 }
1691 clear_bit(STRIPE_R5C_CACHING, &sh->state);
1692}
1693
1694static void r5l_recovery_reset_stripe(struct stripe_head *sh)
1695{
1696 int i;
1697
1698 sh->state = 0;
1699 sh->log_start = MaxSector;
1700 for (i = sh->disks; i--; )
1701 sh->dev[i].flags = 0;
1702}
1703
1704static void
1705r5l_recovery_replay_one_stripe(struct r5conf *conf,
1706 struct stripe_head *sh,
1707 struct r5l_recovery_ctx *ctx)
1708{
1709 struct md_rdev *rdev, *rrdev;
1710 int disk_index;
1711 int data_count = 0;
1712
1713 for (disk_index = 0; disk_index < sh->disks; disk_index++) {
1714 if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags))
1715 continue;
1716 if (disk_index == sh->qd_idx || disk_index == sh->pd_idx)
1717 continue;
1718 data_count++;
1719 }

	/*
	 * stripes that only have parity must have been flushed
	 * before the crash that we are now recovering from, so
	 * there is nothing more to recover.
	 */
1726 if (data_count == 0)
1727 goto out;
1728
1729 for (disk_index = 0; disk_index < sh->disks; disk_index++) {
1730 if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags))
1731 continue;

		/* in case device is broken */
1734 rcu_read_lock();
1735 rdev = rcu_dereference(conf->disks[disk_index].rdev);
1736 if (rdev) {
1737 atomic_inc(&rdev->nr_pending);
1738 rcu_read_unlock();
1739 sync_page_io(rdev, sh->sector, PAGE_SIZE,
1740 sh->dev[disk_index].page, REQ_OP_WRITE, 0,
1741 false);
1742 rdev_dec_pending(rdev, rdev->mddev);
1743 rcu_read_lock();
1744 }
1745 rrdev = rcu_dereference(conf->disks[disk_index].replacement);
1746 if (rrdev) {
1747 atomic_inc(&rrdev->nr_pending);
1748 rcu_read_unlock();
1749 sync_page_io(rrdev, sh->sector, PAGE_SIZE,
1750 sh->dev[disk_index].page, REQ_OP_WRITE, 0,
1751 false);
1752 rdev_dec_pending(rrdev, rrdev->mddev);
1753 rcu_read_lock();
1754 }
1755 rcu_read_unlock();
1756 }
1757 ctx->data_parity_stripes++;
1758out:
1759 r5l_recovery_reset_stripe(sh);
1760}
1761
1762static struct stripe_head *
1763r5c_recovery_alloc_stripe(struct r5conf *conf,
1764 sector_t stripe_sect)
1765{
1766 struct stripe_head *sh;
1767
1768 sh = raid5_get_active_stripe(conf, stripe_sect, 0, 1, 0);
1769 if (!sh)
1770 return NULL;
1771
1772 r5l_recovery_reset_stripe(sh);
1773
1774 return sh;
1775}
1776
1777static struct stripe_head *
1778r5c_recovery_lookup_stripe(struct list_head *list, sector_t sect)
1779{
1780 struct stripe_head *sh;
1781
1782 list_for_each_entry(sh, list, lru)
1783 if (sh->sector == sect)
1784 return sh;
1785 return NULL;
1786}
1787
1788static void
1789r5c_recovery_drop_stripes(struct list_head *cached_stripe_list,
1790 struct r5l_recovery_ctx *ctx)
1791{
1792 struct stripe_head *sh, *next;
1793
1794 list_for_each_entry_safe(sh, next, cached_stripe_list, lru) {
1795 r5l_recovery_reset_stripe(sh);
1796 list_del_init(&sh->lru);
1797 raid5_release_stripe(sh);
1798 }
1799}
1800
1801static void
1802r5c_recovery_replay_stripes(struct list_head *cached_stripe_list,
1803 struct r5l_recovery_ctx *ctx)
1804{
1805 struct stripe_head *sh, *next;
1806
1807 list_for_each_entry_safe(sh, next, cached_stripe_list, lru)
1808 if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) {
1809 r5l_recovery_replay_one_stripe(sh->raid_conf, sh, ctx);
1810 list_del_init(&sh->lru);
1811 raid5_release_stripe(sh);
1812 }
1813}

/* if it matches return 0; otherwise return -EINVAL */
1816static int
1817r5l_recovery_verify_data_checksum(struct r5l_log *log, struct page *page,
1818 sector_t log_offset, __le32 log_checksum)
1819{
1820 void *addr;
1821 u32 checksum;
1822
1823 sync_page_io(log->rdev, log_offset, PAGE_SIZE,
1824 page, REQ_OP_READ, 0, false);
1825 addr = kmap_atomic(page);
1826 checksum = crc32c_le(log->uuid_checksum, addr, PAGE_SIZE);
1827 kunmap_atomic(addr);
1828 return (le32_to_cpu(log_checksum) == checksum) ? 0 : -EINVAL;
1829}

/*
 * before loading data to stripe cache, we need verify checksum for all data,
 * if there is mismatch for any data page, we drop all data in the meta block
 */
1835static int
1836r5l_recovery_verify_data_checksum_for_mb(struct r5l_log *log,
1837 struct r5l_recovery_ctx *ctx)
1838{
1839 struct mddev *mddev = log->rdev->mddev;
1840 struct r5conf *conf = mddev->private;
1841 struct r5l_meta_block *mb = page_address(ctx->meta_page);
1842 sector_t mb_offset = sizeof(struct r5l_meta_block);
1843 sector_t log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
1844 struct page *page;
1845 struct r5l_payload_data_parity *payload;
1846
1847 page = alloc_page(GFP_KERNEL);
1848 if (!page)
1849 return -ENOMEM;
1850
1851 while (mb_offset < le32_to_cpu(mb->meta_size)) {
1852 payload = (void *)mb + mb_offset;
1853
1854 if (payload->header.type == R5LOG_PAYLOAD_DATA) {
1855 if (r5l_recovery_verify_data_checksum(
1856 log, page, log_offset,
1857 payload->checksum[0]) < 0)
1858 goto mismatch;
1859 } else if (payload->header.type == R5LOG_PAYLOAD_PARITY) {
1860 if (r5l_recovery_verify_data_checksum(
1861 log, page, log_offset,
1862 payload->checksum[0]) < 0)
1863 goto mismatch;
1864 if (conf->max_degraded == 2 &&
1865 r5l_recovery_verify_data_checksum(
1866 log, page,
1867 r5l_ring_add(log, log_offset,
1868 BLOCK_SECTORS),
1869 payload->checksum[1]) < 0)
1870 goto mismatch;
1871 } else
1872 goto mismatch;
1873
1874 log_offset = r5l_ring_add(log, log_offset,
1875 le32_to_cpu(payload->size));
1876
1877 mb_offset += sizeof(struct r5l_payload_data_parity) +
1878 sizeof(__le32) *
1879 (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
1880 }
1881
1882 put_page(page);
1883 return 0;
1884
1885mismatch:
1886 put_page(page);
1887 return -EINVAL;
1888}

/*
 * Analyze all data/parity pages in one meta block
 * Returns:
 * 0 for success
 * -EINVAL for unknown payload type
 * -EAGAIN for checksum mismatch of data page
 * -ENOMEM for running out of memory (alloc_page failed or run out of stripes)
 */
1898static int
1899r5c_recovery_analyze_meta_block(struct r5l_log *log,
1900 struct r5l_recovery_ctx *ctx,
1901 struct list_head *cached_stripe_list)
1902{
1903 struct mddev *mddev = log->rdev->mddev;
1904 struct r5conf *conf = mddev->private;
1905 struct r5l_meta_block *mb;
1906 struct r5l_payload_data_parity *payload;
1907 int mb_offset;
1908 sector_t log_offset;
1909 sector_t stripe_sect;
1910 struct stripe_head *sh;
1911 int ret;

	/*
	 * for mismatch in data blocks, we will drop all data in this mb, but
	 * we will still read the next mb for other data with FLUSH flag, as
	 * io_unit could finish out of order.
	 */
1918 ret = r5l_recovery_verify_data_checksum_for_mb(log, ctx);
1919 if (ret == -EINVAL)
1920 return -EAGAIN;
1921 else if (ret)
1922 return ret;
1923
1924 mb = page_address(ctx->meta_page);
1925 mb_offset = sizeof(struct r5l_meta_block);
1926 log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
1927
1928 while (mb_offset < le32_to_cpu(mb->meta_size)) {
1929 int dd;
1930
1931 payload = (void *)mb + mb_offset;
1932 stripe_sect = (payload->header.type == R5LOG_PAYLOAD_DATA) ?
1933 raid5_compute_sector(
1934 conf, le64_to_cpu(payload->location), 0, &dd,
1935 NULL)
1936 : le64_to_cpu(payload->location);
1937
1938 sh = r5c_recovery_lookup_stripe(cached_stripe_list,
1939 stripe_sect);
1940
1941 if (!sh) {
1942 sh = r5c_recovery_alloc_stripe(conf, stripe_sect);

			/*
			 * cannot get stripe from raid5_get_active_stripe
			 * try replay some stripes
			 */
1947 if (!sh) {
1948 r5c_recovery_replay_stripes(
1949 cached_stripe_list, ctx);
1950 sh = r5c_recovery_alloc_stripe(
1951 conf, stripe_sect);
1952 }
1953 if (!sh) {
				pr_debug("md/raid:%s: Increasing stripe cache size to %d to recover data on journal.\n",
1955 mdname(mddev),
1956 conf->min_nr_stripes * 2);
1957 raid5_set_cache_size(mddev,
1958 conf->min_nr_stripes * 2);
1959 sh = r5c_recovery_alloc_stripe(conf,
1960 stripe_sect);
1961 }
1962 if (!sh) {
1963 pr_err("md/raid:%s: Cannot get enough stripes due to memory pressure. Recovery failed.\n",
1964 mdname(mddev));
1965 return -ENOMEM;
1966 }
1967 list_add_tail(&sh->lru, cached_stripe_list);
1968 }
1969
1970 if (payload->header.type == R5LOG_PAYLOAD_DATA) {
1971 if (!test_bit(STRIPE_R5C_CACHING, &sh->state) &&
1972 test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags)) {
1973 r5l_recovery_replay_one_stripe(conf, sh, ctx);
1974 list_move_tail(&sh->lru, cached_stripe_list);
1975 }
1976 r5l_recovery_load_data(log, sh, ctx, payload,
1977 log_offset);
1978 } else if (payload->header.type == R5LOG_PAYLOAD_PARITY)
1979 r5l_recovery_load_parity(log, sh, ctx, payload,
1980 log_offset);
1981 else
1982 return -EINVAL;
1983
1984 log_offset = r5l_ring_add(log, log_offset,
1985 le32_to_cpu(payload->size));
1986
1987 mb_offset += sizeof(struct r5l_payload_data_parity) +
1988 sizeof(__le32) *
1989 (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
1990 }
1991
1992 return 0;
1993}

/*
 * Load the stripe into cache. The stripe will be written out later by
 * the stripe cache state machine.
 */
1999static void r5c_recovery_load_one_stripe(struct r5l_log *log,
2000 struct stripe_head *sh)
2001{
2002 struct r5dev *dev;
2003 int i;
2004
2005 for (i = sh->disks; i--; ) {
2006 dev = sh->dev + i;
2007 if (test_and_clear_bit(R5_Wantwrite, &dev->flags)) {
2008 set_bit(R5_InJournal, &dev->flags);
2009 set_bit(R5_UPTODATE, &dev->flags);
2010 }
2011 }
2012}

/*
 * Scan through the log for all to-be-flushed data
 *
 * For stripes with data and parity, namely Data-Parity stripe
 * (STRIPE_R5C_CACHING == 0), we simply replay all the writes.
 *
 * For stripes with only data, namely Data-Only stripe
 * (STRIPE_R5C_CACHING == 1), we load them to the stripe cache state machine.
 *
 * For a stripe, if we see data after parity, we should discard all previous
 * data and parity for this stripe, as these data are already flushed to
 * the RAID disks.
 *
 * At the end of the scan, we return the new journal_tail, which points to
 * the first data-only stripe on the journal device, or the next invalid
 * meta block.
 */
2030static int r5c_recovery_flush_log(struct r5l_log *log,
2031 struct r5l_recovery_ctx *ctx)
2032{
2033 struct stripe_head *sh;
2034 int ret = 0;

	/* scan through the log */
2037 while (1) {
2038 if (r5l_recovery_read_meta_block(log, ctx))
2039 break;
2040
2041 ret = r5c_recovery_analyze_meta_block(log, ctx,
2042 &ctx->cached_list);

		/*
		 * -EAGAIN means mismatch in data block, in this case, we still
		 * try scanning the next metablock
		 */
2047 if (ret && ret != -EAGAIN)
2048 break;
2049 ctx->seq++;
2050 ctx->pos = r5l_ring_add(log, ctx->pos, ctx->meta_total_blocks);
2051 }
2052
2053 if (ret == -ENOMEM) {
2054 r5c_recovery_drop_stripes(&ctx->cached_list, ctx);
2055 return ret;
2056 }

	/* replay data-parity stripes */
	r5c_recovery_replay_stripes(&ctx->cached_list, ctx);

	/* load data-only stripes to stripe cache */
2062 list_for_each_entry(sh, &ctx->cached_list, lru) {
2063 WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
2064 r5c_recovery_load_one_stripe(log, sh);
2065 ctx->data_only_stripes++;
2066 }
2067
2068 return 0;
2069}

/*
 * Recovery rewrites the data-only stripes and bumps the sequence number.
 *
 * After the scan, ctx->pos points at the first invalid meta block and the
 * new log head starts there. If the superblock kept pointing at the old
 * checkpoint, a stale meta block further along the ring could still look
 * valid after the next crash and confuse recovery. r5l_recovery_log()
 * therefore bumps ctx->seq by 10000 before anything new is written, so
 * stale blocks no longer match the expected sequence.
 *
 * The data-only stripes found by the scan are not yet safe on the RAID
 * disks (their parity was never written). This function writes their data
 * pages back into the journal at the bumped sequence, links them on
 * stripe_in_journal_list and sets log->next_checkpoint to the last such
 * stripe. r5c_recovery_flush_data_only_stripes() then runs them through
 * the normal write-back state machine so parity is computed and the
 * stripes reach the RAID disks.
 */
2140static int
2141r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log,
2142 struct r5l_recovery_ctx *ctx)
2143{
2144 struct stripe_head *sh;
2145 struct mddev *mddev = log->rdev->mddev;
2146 struct page *page;
2147 sector_t next_checkpoint = MaxSector;
2148
2149 page = alloc_page(GFP_KERNEL);
2150 if (!page) {
2151 pr_err("md/raid:%s: cannot allocate memory to rewrite data only stripes\n",
2152 mdname(mddev));
2153 return -ENOMEM;
2154 }
2155
2156 WARN_ON(list_empty(&ctx->cached_list));
2157
2158 list_for_each_entry(sh, &ctx->cached_list, lru) {
2159 struct r5l_meta_block *mb;
2160 int i;
2161 int offset;
2162 sector_t write_pos;
2163
2164 WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
2165 r5l_recovery_create_empty_meta_block(log, page,
2166 ctx->pos, ctx->seq);
2167 mb = page_address(page);
2168 offset = le32_to_cpu(mb->meta_size);
2169 write_pos = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
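		/* data pages are written right after this stripe's meta block */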
2170
2171 for (i = sh->disks; i--; ) {
2172 struct r5dev *dev = &sh->dev[i];
2173 struct r5l_payload_data_parity *payload;
2174 void *addr;
2175
2176 if (test_bit(R5_InJournal, &dev->flags)) {
2177 payload = (void *)mb + offset;
2178 payload->header.type = cpu_to_le16(
2179 R5LOG_PAYLOAD_DATA);
2180 payload->size = BLOCK_SECTORS;
2181 payload->location = cpu_to_le64(
2182 raid5_compute_blocknr(sh, i, 0));
2183 addr = kmap_atomic(dev->page);
2184 payload->checksum[0] = cpu_to_le32(
2185 crc32c_le(log->uuid_checksum, addr,
2186 PAGE_SIZE));
2187 kunmap_atomic(addr);
2188 sync_page_io(log->rdev, write_pos, PAGE_SIZE,
2189 dev->page, REQ_OP_WRITE, 0, false);
2190 write_pos = r5l_ring_add(log, write_pos,
2191 BLOCK_SECTORS);
2192 offset += sizeof(__le32) +
2193 sizeof(struct r5l_payload_data_parity);
2194
2195 }
2196 }
2197 mb->meta_size = cpu_to_le32(offset);
2198 mb->checksum = cpu_to_le32(crc32c_le(log->uuid_checksum,
2199 mb, PAGE_SIZE));
2200 sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page,
2201 REQ_OP_WRITE, REQ_FUA, false);
2202 sh->log_start = ctx->pos;
2203 list_add_tail(&sh->r5c, &log->stripe_in_journal_list);
2204 atomic_inc(&log->stripe_in_journal_count);
2205 ctx->pos = write_pos;
2206 ctx->seq += 1;
2207 next_checkpoint = sh->log_start;
2208 }
2209 log->next_checkpoint = next_checkpoint;
2210 __free_page(page);
2211 return 0;
2212}
2213
2214static void r5c_recovery_flush_data_only_stripes(struct r5l_log *log,
2215 struct r5l_recovery_ctx *ctx)
2216{
2217 struct mddev *mddev = log->rdev->mddev;
2218 struct r5conf *conf = mddev->private;
2219 struct stripe_head *sh, *next;
2220
2221 if (ctx->data_only_stripes == 0)
2222 return;
2223
2224 log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_BACK;
2225
2226 list_for_each_entry_safe(sh, next, &ctx->cached_list, lru) {
2227 r5c_make_stripe_write_out(sh);
2228 set_bit(STRIPE_HANDLE, &sh->state);
2229 list_del_init(&sh->lru);
2230 raid5_release_stripe(sh);
2231 }
2232
2233 md_wakeup_thread(conf->mddev->thread);
2234
2235 wait_event(conf->wait_for_quiescent,
2236 atomic_read(&conf->active_stripes) == 0);
2237
2238 log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
2239}
2240
2241static int r5l_recovery_log(struct r5l_log *log)
2242{
2243 struct mddev *mddev = log->rdev->mddev;
2244 struct r5l_recovery_ctx ctx;
2245 int ret;
2246 sector_t pos;
2247
2248 ctx.pos = log->last_checkpoint;
2249 ctx.seq = log->last_cp_seq;
2250 ctx.meta_page = alloc_page(GFP_KERNEL);
2251 ctx.data_only_stripes = 0;
2252 ctx.data_parity_stripes = 0;
2253 INIT_LIST_HEAD(&ctx.cached_list);
2254
2255 if (!ctx.meta_page)
2256 return -ENOMEM;
2257
2258 ret = r5c_recovery_flush_log(log, &ctx);
2259 __free_page(ctx.meta_page);
2260
2261 if (ret)
2262 return ret;
2263
2264 pos = ctx.pos;
2265 ctx.seq += 10000;
2266
2267
2268 if ((ctx.data_only_stripes == 0) && (ctx.data_parity_stripes == 0))
2269 pr_debug("md/raid:%s: starting from clean shutdown\n",
2270 mdname(mddev));
2271 else
2272 pr_debug("md/raid:%s: recovering %d data-only stripes and %d data-parity stripes\n",
2273 mdname(mddev), ctx.data_only_stripes,
2274 ctx.data_parity_stripes);
2275
2276 if (ctx.data_only_stripes == 0) {
2277 log->next_checkpoint = ctx.pos;
2278 r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq++);
2279 ctx.pos = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS);
2280 } else if (r5c_recovery_rewrite_data_only_stripes(log, &ctx)) {
2281 pr_err("md/raid:%s: failed to rewrite stripes to journal\n",
2282 mdname(mddev));
2283 return -EIO;
2284 }
2285
2286 log->log_start = ctx.pos;
2287 log->seq = ctx.seq;
2288 log->last_checkpoint = pos;
2289 r5l_write_super(log, pos);
2290
2291 r5c_recovery_flush_data_only_stripes(log, &ctx);
2292 return 0;
2293}
2294
2295static void r5l_write_super(struct r5l_log *log, sector_t cp)
2296{
2297 struct mddev *mddev = log->rdev->mddev;
2298
2299 log->rdev->journal_tail = cp;
2300 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
2301}
2302
2303static ssize_t r5c_journal_mode_show(struct mddev *mddev, char *page)
2304{
2305 struct r5conf *conf = mddev->private;
2306 int ret;
2307
2308 if (!conf->log)
2309 return 0;
2310
2311 switch (conf->log->r5c_journal_mode) {
2312 case R5C_JOURNAL_MODE_WRITE_THROUGH:
2313 ret = snprintf(
2314 page, PAGE_SIZE, "[%s] %s\n",
2315 r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_THROUGH],
2316 r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_BACK]);
2317 break;
2318 case R5C_JOURNAL_MODE_WRITE_BACK:
2319 ret = snprintf(
2320 page, PAGE_SIZE, "%s [%s]\n",
2321 r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_THROUGH],
2322 r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_BACK]);
2323 break;
2324 default:
2325 ret = 0;
2326 }
2327 return ret;
2328}
2329
2330static ssize_t r5c_journal_mode_store(struct mddev *mddev,
2331 const char *page, size_t length)
2332{
2333 struct r5conf *conf = mddev->private;
2334 struct r5l_log *log = conf->log;
2335 int val = -1, i;
2336 int len = length;
2337
2338 if (!log)
2339 return -ENODEV;
2340
2341 if (len && page[len - 1] == '\n')
2342 len -= 1;
2343 for (i = 0; i < ARRAY_SIZE(r5c_journal_mode_str); i++)
2344 if (strlen(r5c_journal_mode_str[i]) == len &&
2345 strncmp(page, r5c_journal_mode_str[i], len) == 0) {
2346 val = i;
2347 break;
2348 }
2349 if (val < R5C_JOURNAL_MODE_WRITE_THROUGH ||
2350 val > R5C_JOURNAL_MODE_WRITE_BACK)
2351 return -EINVAL;
2352
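	/* write-back cache is not allowed while the array is degraded */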
2353 if (raid5_calc_degraded(conf) > 0 &&
2354 val == R5C_JOURNAL_MODE_WRITE_BACK)
2355 return -EINVAL;
2356
2357 mddev_suspend(mddev);
2358 conf->log->r5c_journal_mode = val;
2359 mddev_resume(mddev);
2360
2361 pr_debug("md/raid:%s: setting r5c cache mode to %d: %s\n",
2362 mdname(mddev), val, r5c_journal_mode_str[val]);
2363 return length;
2364}
2365
2366struct md_sysfs_entry
2367r5c_journal_mode = __ATTR(journal_mode, 0644,
2368 r5c_journal_mode_show, r5c_journal_mode_store);

/*
 * Try to handle a write operation in caching phase. This function should only
 * be called in write-back mode.
 *
 * If all outstanding writes can be handled in caching phase, returns 0
 * If writes require the write-out phase, calls r5c_make_stripe_write_out()
 * and returns -EAGAIN
 */
2378int r5c_try_caching_write(struct r5conf *conf,
2379 struct stripe_head *sh,
2380 struct stripe_head_state *s,
2381 int disks)
2382{
2383 struct r5l_log *log = conf->log;
2384 int i;
2385 struct r5dev *dev;
2386 int to_cache = 0;
2387 void **pslot;
2388 sector_t tree_index;
2389 int ret;
2390 uintptr_t refcount;
2391
2392 BUG_ON(!r5c_is_writeback(log));
2393
2394 if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) {
		/*
		 * There are two different scenarios here:
		 *  1. The stripe has some data cached, and it is sent to
		 *     the write-out phase for reclaim
		 *  2. The stripe is clean, and this is the first write
		 *
		 * For 1, return -EAGAIN, so we continue with
		 * handle_stripe_dirtying().
		 *
		 * For 2, set STRIPE_R5C_CACHING and continue with caching
		 * write.
		 */
2409 if (s->injournal > 0 || s->written > 0)
2410 return -EAGAIN;
2411
2412 set_bit(STRIPE_R5C_CACHING, &sh->state);
2413 }

	/*
	 * When run in degraded mode, the array is set to write-through mode.
	 * This check helps drain pending writes safely in the transition to
	 * write-through mode.
	 */
2420 if (s->failed) {
2421 r5c_make_stripe_write_out(sh);
2422 return -EAGAIN;
2423 }
2424
2425 for (i = disks; i--; ) {
2426 dev = &sh->dev[i];
2427
2428 if (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags) &&
2429 !test_bit(R5_InJournal, &dev->flags)) {
2430 r5c_make_stripe_write_out(sh);
2431 return -EAGAIN;
2432 }
2433 }

	/* if the stripe is not counted in big_stripe_tree, add it now */
2436 if (!test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state) &&
2437 !test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) {
2438 tree_index = r5c_tree_index(conf, sh->sector);
2439 spin_lock(&log->tree_lock);
2440 pslot = radix_tree_lookup_slot(&log->big_stripe_tree,
2441 tree_index);
2442 if (pslot) {
2443 refcount = (uintptr_t)radix_tree_deref_slot_protected(
2444 pslot, &log->tree_lock) >>
2445 R5C_RADIX_COUNT_SHIFT;
2446 radix_tree_replace_slot(
2447 &log->big_stripe_tree, pslot,
2448 (void *)((refcount + 1) << R5C_RADIX_COUNT_SHIFT));
2449 } else {
2450
2451
2452
2453
2454 ret = radix_tree_insert(
2455 &log->big_stripe_tree, tree_index,
2456 (void *)(1 << R5C_RADIX_COUNT_SHIFT));
2457 if (ret) {
2458 spin_unlock(&log->tree_lock);
2459 r5c_make_stripe_write_out(sh);
2460 return -EAGAIN;
2461 }
2462 }
2463 spin_unlock(&log->tree_lock);
2464
2465
2466
2467
2468
2469 set_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state);
2470 atomic_inc(&conf->r5c_cached_partial_stripes);
2471 }
2472
2473 for (i = disks; i--; ) {
2474 dev = &sh->dev[i];
2475 if (dev->towrite) {
2476 set_bit(R5_Wantwrite, &dev->flags);
2477 set_bit(R5_Wantdrain, &dev->flags);
2478 set_bit(R5_LOCKED, &dev->flags);
2479 to_cache++;
2480 }
2481 }
2482
2483 if (to_cache) {
2484 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
2485
2486
2487
2488
2489
2490 set_bit(STRIPE_LOG_TRAPPED, &sh->state);
2491 }
2492
2493 return 0;
2494}
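
/*
 * Note on the big_stripe_tree encoding used above and in
 * r5c_finish_stripe_write_out(): each radix tree slot stores a plain
 * reference count, shifted left by R5C_RADIX_COUNT_SHIFT so the two low
 * bits, which the radix tree reserves for its own entry tagging, stay
 * clear. For example, a slot holding
 * (void *)(3 << R5C_RADIX_COUNT_SHIFT) means three stripe_heads of that
 * big stripe (chunk) are currently cached; the slot is deleted when the
 * count drops from 1 to 0.
 */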

/*
 * Detach the per-device orig_page used during write-back handling: point
 * orig_page back at the main data page and release the old page, unless
 * the devices were sharing the per-disk extra_page, in which case mark it
 * free again and wake the raid5d thread.
 */
void r5c_release_extra_page(struct stripe_head *sh)
{
	struct r5conf *conf = sh->raid_conf;
	int i;
	bool using_disk_info_extra_page;

	using_disk_info_extra_page =
		sh->dev[0].orig_page == conf->disks[0].extra_page;

	for (i = sh->disks; i--; )
		if (sh->dev[i].page != sh->dev[i].orig_page) {
			struct page *p = sh->dev[i].orig_page;

			sh->dev[i].orig_page = sh->dev[i].page;
			clear_bit(R5_OrigPageUPTDODATE, &sh->dev[i].flags);

			if (!using_disk_info_extra_page)
				put_page(p);
		}

	if (using_disk_info_extra_page) {
		clear_bit(R5C_EXTRA_PAGE_IN_USE, &conf->cache_state);
		md_wakeup_thread(conf->mddev->thread);
	}
}

/*
 * Switch every device's orig_page to the preallocated per-disk extra_page,
 * dropping any previously allocated orig_page. The caller is expected to
 * own the extra pages (R5C_EXTRA_PAGE_IN_USE).
 */
void r5c_use_extra_page(struct stripe_head *sh)
{
	struct r5conf *conf = sh->raid_conf;
	int i;
	struct r5dev *dev;

	for (i = sh->disks; i--; ) {
		dev = &sh->dev[i];
		if (dev->orig_page != dev->page)
			put_page(dev->orig_page);
		dev->orig_page = conf->disks[i].extra_page;
	}
}
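
/*
 * Sketch of the extra_page protocol implied by the two helpers above
 * (an assumption based on this file alone; the other half of the
 * handshake lives in raid5.c):
 *
 *   if (!test_and_set_bit(R5C_EXTRA_PAGE_IN_USE, &conf->cache_state))
 *           r5c_use_extra_page(sh);    // borrow the shared per-disk pages
 *   ...                                // run the stripe operations
 *   r5c_release_extra_page(sh);        // return them; wakes raid5d so the
 *                                      // next waiting stripe can proceed
 *
 * Only one stripe_head at a time can use the shared extra_page set.
 */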

/*
 * Clean up the stripe (clear R5_InJournal for dev[pd_idx] etc.) after the
 * stripe has been committed to the RAID disks.
 */
void r5c_finish_stripe_write_out(struct r5conf *conf,
				 struct stripe_head *sh,
				 struct stripe_head_state *s)
{
	struct r5l_log *log = conf->log;
	int i;
	int do_wakeup = 0;
	sector_t tree_index;
	void **pslot;
	uintptr_t refcount;

	if (!log || !test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags))
		return;

	WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
	clear_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags);

	if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
		return;

	for (i = sh->disks; i--; ) {
		clear_bit(R5_InJournal, &sh->dev[i].flags);
		if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
			do_wakeup = 1;
	}

	/*
	 * analyse_stripe() ran before R5_InJournal was cleared on every
	 * device, so s->injournal is stale now; reset it.
	 */
	s->injournal = 0;

	if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
		if (atomic_dec_and_test(&conf->pending_full_writes))
			md_wakeup_thread(conf->mddev->thread);

	if (do_wakeup)
		wake_up(&conf->wait_for_overlap);

	spin_lock_irq(&log->stripe_in_journal_lock);
	list_del_init(&sh->r5c);
	spin_unlock_irq(&log->stripe_in_journal_lock);
	sh->log_start = MaxSector;

	atomic_dec(&log->stripe_in_journal_count);
	r5c_update_log_state(log);

	/* stop counting this stripe in big_stripe_tree */
	if (test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state) ||
	    test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) {
		tree_index = r5c_tree_index(conf, sh->sector);
		spin_lock(&log->tree_lock);
		pslot = radix_tree_lookup_slot(&log->big_stripe_tree,
					       tree_index);
		BUG_ON(pslot == NULL);
		refcount = (uintptr_t)radix_tree_deref_slot_protected(
			pslot, &log->tree_lock) >>
			R5C_RADIX_COUNT_SHIFT;
		if (refcount == 1)
			radix_tree_delete(&log->big_stripe_tree, tree_index);
		else
			radix_tree_replace_slot(
				&log->big_stripe_tree, pslot,
				(void *)((refcount - 1) << R5C_RADIX_COUNT_SHIFT));
		spin_unlock(&log->tree_lock);
	}

	if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) {
		BUG_ON(atomic_read(&conf->r5c_cached_partial_stripes) == 0);
		atomic_dec(&conf->r5c_flushing_partial_stripes);
		atomic_dec(&conf->r5c_cached_partial_stripes);
	}

	if (test_and_clear_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) {
		BUG_ON(atomic_read(&conf->r5c_cached_full_stripes) == 0);
		atomic_dec(&conf->r5c_flushing_full_stripes);
		atomic_dec(&conf->r5c_cached_full_stripes);
	}
}

int
r5c_cache_data(struct r5l_log *log, struct stripe_head *sh,
	       struct stripe_head_state *s)
{
	struct r5conf *conf = sh->raid_conf;
	int pages = 0;
	int reserve;
	int i;
	int ret = 0;

	BUG_ON(!log);

	for (i = 0; i < sh->disks; i++) {
		void *addr;

		if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
			continue;
		addr = kmap_atomic(sh->dev[i].page);
		sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum,
						    addr, PAGE_SIZE);
		kunmap_atomic(addr);
		pages++;
	}
	WARN_ON(pages == 0);

	/*
	 * The stripe must enter the state machine again to finish the
	 * write, so don't delay.
	 */
	clear_bit(STRIPE_DELAYED, &sh->state);
	atomic_inc(&sh->count);

	mutex_lock(&log->io_mutex);
	/* meta block + data pages, in 512-byte sectors */
	reserve = (1 + pages) << (PAGE_SHIFT - 9);

	if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) &&
	    sh->log_start == MaxSector)
		r5l_add_no_space_stripe(log, sh);
	else if (!r5l_has_free_space(log, reserve)) {
		if (sh->log_start == log->last_checkpoint)
			BUG();
		else
			r5l_add_no_space_stripe(log, sh);
	} else {
		ret = r5l_log_stripe(log, sh, pages, 0);
		if (ret) {
			spin_lock_irq(&log->io_list_lock);
			list_add_tail(&sh->log_list, &log->no_mem_stripes);
			spin_unlock_irq(&log->io_list_lock);
		}
	}

	mutex_unlock(&log->io_mutex);
	return 0;
}
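
/*
 * Space reservation example for r5c_cache_data() above (illustrative
 * numbers, assuming the enforced 4KB page size): with 4 data pages to
 * cache, reserve = (1 + 4) << (PAGE_SHIFT - 9) = 5 << 3 = 40 sectors,
 * i.e. one 4KB meta block plus four 4KB data blocks expressed in
 * 512-byte sectors.
 */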

/* check whether this big stripe is in the write-back cache */
bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect)
{
	struct r5l_log *log = conf->log;
	sector_t tree_index;
	void *slot;

	if (!log)
		return false;

	WARN_ON_ONCE(!rcu_read_lock_held());
	tree_index = r5c_tree_index(conf, sect);
	slot = radix_tree_lookup(&log->big_stripe_tree, tree_index);
	return slot != NULL;
}
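
/*
 * Callers are expected to hold rcu_read_lock(), as the WARN_ON_ONCE above
 * enforces. A hypothetical read-path check could look like:
 *
 *   rcu_read_lock();
 *   dirty = r5c_big_stripe_cached(conf, bio->bi_iter.bi_sector);
 *   rcu_read_unlock();
 *   if (dirty)
 *           ... go through the stripe cache instead of a direct read ...
 */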

static int r5l_load_log(struct r5l_log *log)
{
	struct md_rdev *rdev = log->rdev;
	struct page *page;
	struct r5l_meta_block *mb;
	sector_t cp = log->rdev->journal_tail;
	u32 stored_crc, expected_crc;
	bool create_super = false;
	int ret = 0;

	/* Make sure the checkpoint recorded in the superblock is valid */
	if (cp >= rdev->sectors || round_down(cp, BLOCK_SECTORS) != cp)
		cp = 0;
	page = alloc_page(GFP_KERNEL);
	if (!page)
		return -ENOMEM;

	if (!sync_page_io(rdev, cp, PAGE_SIZE, page, REQ_OP_READ, 0, false)) {
		ret = -EIO;
		goto ioerr;
	}
	mb = page_address(page);

	if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
	    mb->version != R5LOG_VERSION) {
		create_super = true;
		goto create;
	}
	stored_crc = le32_to_cpu(mb->checksum);
	mb->checksum = 0;
	expected_crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
	if (stored_crc != expected_crc) {
		create_super = true;
		goto create;
	}
	if (le64_to_cpu(mb->position) != cp) {
		create_super = true;
		goto create;
	}
create:
	if (create_super) {
		log->last_cp_seq = prandom_u32();
		cp = 0;
		r5l_log_write_empty_meta_block(log, cp, log->last_cp_seq);
		/*
		 * Make sure the superblock points to the correct address.
		 * The log might receive data very soon; if the superblock
		 * does not hold the correct log tail, recovery cannot find
		 * it later.
		 */
		r5l_write_super(log, cp);
	} else
		log->last_cp_seq = le64_to_cpu(mb->seq);

	log->device_size = round_down(rdev->sectors, BLOCK_SECTORS);
	log->max_free_space = log->device_size >> RECLAIM_MAX_FREE_SPACE_SHIFT;
	if (log->max_free_space > RECLAIM_MAX_FREE_SPACE)
		log->max_free_space = RECLAIM_MAX_FREE_SPACE;
	log->last_checkpoint = cp;

	__free_page(page);

	if (create_super) {
		log->log_start = r5l_ring_add(log, cp, BLOCK_SECTORS);
		log->seq = log->last_cp_seq + 1;
		log->next_checkpoint = cp;
	} else
		ret = r5l_recovery_log(log);

	r5c_update_log_state(log);
	return ret;
ioerr:
	__free_page(page);
	return ret;
}
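
/*
 * Summary of r5l_load_log(): the meta block read at journal_tail is
 * accepted only if its magic, version, checksum (crc32c seeded with the
 * array UUID checksum) and recorded position all match; otherwise a fresh
 * log is created at sector 0 with a random starting sequence number. Only
 * an accepted meta block leads into r5l_recovery_log().
 */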

/* called on rdev failure: a degraded array must not stay in write-back mode */
void r5c_update_on_rdev_error(struct mddev *mddev)
{
	struct r5conf *conf = mddev->private;
	struct r5l_log *log = conf->log;

	if (!log)
		return;

	if (raid5_calc_degraded(conf) > 0 &&
	    conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK)
		schedule_work(&log->disable_writeback_work);
}

int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
{
	struct request_queue *q = bdev_get_queue(rdev->bdev);
	struct r5l_log *log;

	if (PAGE_SIZE != 4096)
		return -EINVAL;

	/*
	 * The PAGE_SIZE must be big enough to hold 1 r5l_meta_block and
	 * raid_disks r5l_payload_data_parity.
	 *
	 * Write journal and cache do not work for very big arrays
	 * (raid_disks > 203).
	 */
	if (sizeof(struct r5l_meta_block) +
	    ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32)) *
	     conf->raid_disks) > PAGE_SIZE) {
		pr_err("md/raid:%s: write journal/cache doesn't work for array with %d disks\n",
		       mdname(conf->mddev), conf->raid_disks);
		return -EINVAL;
	}

	log = kzalloc(sizeof(*log), GFP_KERNEL);
	if (!log)
		return -ENOMEM;
	log->rdev = rdev;

	log->need_cache_flush = test_bit(QUEUE_FLAG_WC, &q->queue_flags) != 0;

	log->uuid_checksum = crc32c_le(~0, rdev->mddev->uuid,
				       sizeof(rdev->mddev->uuid));

	mutex_init(&log->io_mutex);

	spin_lock_init(&log->io_list_lock);
	INIT_LIST_HEAD(&log->running_ios);
	INIT_LIST_HEAD(&log->io_end_ios);
	INIT_LIST_HEAD(&log->flushing_ios);
	INIT_LIST_HEAD(&log->finished_ios);
	bio_init(&log->flush_bio, NULL, 0);

	log->io_kc = KMEM_CACHE(r5l_io_unit, 0);
	if (!log->io_kc)
		goto io_kc;

	log->io_pool = mempool_create_slab_pool(R5L_POOL_SIZE, log->io_kc);
	if (!log->io_pool)
		goto io_pool;

	log->bs = bioset_create(R5L_POOL_SIZE, 0);
	if (!log->bs)
		goto io_bs;

	log->meta_pool = mempool_create_page_pool(R5L_POOL_SIZE, 0);
	if (!log->meta_pool)
		goto out_mempool;

	spin_lock_init(&log->tree_lock);
	INIT_RADIX_TREE(&log->big_stripe_tree, GFP_NOWAIT | __GFP_NOWARN);

	log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
						 log->rdev->mddev, "reclaim");
	if (!log->reclaim_thread)
		goto reclaim_thread;
	log->reclaim_thread->timeout = R5C_RECLAIM_WAKEUP_INTERVAL;

	init_waitqueue_head(&log->iounit_wait);

	INIT_LIST_HEAD(&log->no_mem_stripes);

	INIT_LIST_HEAD(&log->no_space_stripes);
	spin_lock_init(&log->no_space_stripes_lock);

	INIT_WORK(&log->deferred_io_work, r5l_submit_io_async);
	INIT_WORK(&log->disable_writeback_work, r5c_disable_writeback_async);

	log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
	INIT_LIST_HEAD(&log->stripe_in_journal_list);
	spin_lock_init(&log->stripe_in_journal_lock);
	atomic_set(&log->stripe_in_journal_count, 0);

	rcu_assign_pointer(conf->log, log);

	if (r5l_load_log(log))
		goto error;

	set_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
	return 0;

error:
	rcu_assign_pointer(conf->log, NULL);
	md_unregister_thread(&log->reclaim_thread);
reclaim_thread:
	mempool_destroy(log->meta_pool);
out_mempool:
	bioset_free(log->bs);
io_bs:
	mempool_destroy(log->io_pool);
io_pool:
	kmem_cache_destroy(log->io_kc);
io_kc:
	kfree(log);
	return -EINVAL;
}

void r5l_exit_log(struct r5l_log *log)
{
	flush_work(&log->disable_writeback_work);
	md_unregister_thread(&log->reclaim_thread);
	mempool_destroy(log->meta_pool);
	bioset_free(log->bs);
	mempool_destroy(log->io_pool);
	kmem_cache_destroy(log->io_kc);
	kfree(log);
}