// SPDX-License-Identifier: GPL-2.0
/*
 * Partial Parity Log for closing the RAID5 write hole
 * Copyright (c) 2017, Intel Corporation.
 */
#include <linux/kernel.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/crc32c.h>
#include <linux/flex_array.h>
#include <linux/async_tx.h>
#include <linux/raid/md_p.h>
#include "md.h"
#include "raid5.h"

/*
 * This module adds support for writing a partial parity log (PPL) for a
 * raid5 array. Its purpose is to close the RAID5 write hole without a full
 * write-back journal.
 *
 * Partial parity for a write operation is the XOR of the stripe data chunks
 * that are not modified by the write. XORing the partial parity with the
 * modified chunks reproduces the stripe parity as it was before the write,
 * regardless of which chunk writes have actually completed. If a not-modified
 * data disk of the stripe is missing after an unclean shutdown, this restored
 * parity can be used to recover its data.
 *
 * The log is distributed: each member device reserves a PPL area and holds
 * the log for the stripes whose parity it stores, so there is one ppl_log
 * (child log) per member device. Before the data and parity writes of a
 * stripe are submitted, a ppl_io_unit is written to the PPL area with
 * REQ_FUA. It consists of a header page (PPL_HEADER_SIZE) holding up to
 * PPL_HDR_MAX_ENTRIES entries, followed by the partial parity pages of the
 * logged stripes. Entries for full stripe writes carry no partial parity
 * (pp_size == 0), because parity can be recalculated from the data alone.
 *
 * If the PPL area is large enough (at least 2 * (PPL_SPACE_SIZE +
 * PPL_HEADER_SIZE)), consecutive io units are written at increasing offsets
 * and wrap around when the space runs out ("multippl"). Otherwise every io
 * unit is written at the beginning of the PPL area.
 *
 * When a dirty array is assembled, the newest valid PPL on each member is
 * located by checksum, signature and generation number, and its entries are
 * replayed by recalculating and rewriting the affected parity before the
 * array is started.
 */

#define PPL_SPACE_SIZE (128 * 1024)

struct ppl_conf {
	struct mddev *mddev;

	/* array of child logs, one for each raid disk */
	struct ppl_log *child_logs;
	int count;

	int block_size;		/* the logical block size used for data_sector
				 * in ppl_header_entry */
	u32 signature;		/* raid array identifier */
	atomic64_t seq;		/* current log write sequence number */

	struct kmem_cache *io_kc;
	mempool_t io_pool;
	struct bio_set bs;
	struct bio_set flush_bs;

	/* used only for recovery */
	int recovered_entries;
	int mismatch_count;

	/* stripes to retry if failed to allocate io_unit */
	struct list_head no_mem_stripes;
	spinlock_t no_mem_stripes_lock;
};

struct ppl_log {
	struct ppl_conf *ppl_conf;	/* shared between all log instances */

	struct md_rdev *rdev;		/* array member disk associated with
					 * this log instance */

	struct mutex io_mutex;
	struct ppl_io_unit *current_io;	/* current io_unit accepting new
					 * stripes, always at the end of
					 * io_list */

	spinlock_t io_list_lock;
	struct list_head io_list;	/* all io_units of this log */

	sector_t next_io_sector;	/* where the next io_unit is written */
	unsigned int entry_space;	/* max total pp_size of one io_unit */
	bool use_multippl;		/* io_units written at increasing
					 * offsets, wrapping around */
	bool wb_cache_on;		/* member disk has a write-back cache */
	unsigned long disk_flush_bitmap;/* data disks to flush after the
					 * current io_unit is written */
};

#define PPL_IO_INLINE_BVECS 32

struct ppl_io_unit {
	struct ppl_log *log;

	struct page *header_page;	/* for ppl_header */

	unsigned int entries_count;	/* number of entries in ppl_header */
	unsigned int pp_size;		/* total size of partial parity pages */

	u64 seq;			/* sequence number of this log write */
	struct list_head log_sibling;	/* log->io_list */

	struct list_head stripe_list;	/* stripes added to the io_unit */
	atomic_t pending_stripes;	/* how many stripes not written to raid */
	atomic_t pending_flushes;	/* how many disk flushes are in progress */

	bool submitted;			/* true if write to log started */

	/* inline bio and its biovec for submitting the io_unit */
	struct bio bio;
	struct bio_vec biovec[PPL_IO_INLINE_BVECS];
};

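/*
 * Compute the partial parity for a stripe as part of the async_tx chain and
 * store it in sh->ppl_page, so that it can be logged before the stripe is
 * written to the raid disks.
 */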
struct dma_async_tx_descriptor *
ops_run_partial_parity(struct stripe_head *sh, struct raid5_percpu *percpu,
		       struct dma_async_tx_descriptor *tx)
{
	int disks = sh->disks;
	struct page **srcs = flex_array_get(percpu->scribble, 0);
	int count = 0, pd_idx = sh->pd_idx, i;
	struct async_submit_ctl submit;

	pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);

	/*
	 * Partial parity is the XOR of stripe data chunks that are not changed
	 * during the write request. Depending on available data
	 * (read-modify-write vs. reconstruct-write case) we calculate it
	 * differently.
	 */
	if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
		/*
		 * rmw: xor old data and parity from updated disks
		 * This is calculated earlier by ops_run_prexor5() so just copy
		 * the parity dev page.
		 */
		srcs[count++] = sh->dev[pd_idx].page;
	} else if (sh->reconstruct_state == reconstruct_state_drain_run) {
		/* rcw: xor data from all not updated disks */
		for (i = disks; i--;) {
			struct r5dev *dev = &sh->dev[i];
			if (test_bit(R5_UPTODATE, &dev->flags))
				srcs[count++] = dev->page;
		}
	} else {
		return tx;
	}

	init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, tx,
			  NULL, sh, flex_array_get(percpu->scribble, 0)
			  + sizeof(struct page *) * (sh->disks + 2));

	if (count == 1)
		tx = async_memcpy(sh->ppl_page, srcs[0], 0, 0, PAGE_SIZE,
				  &submit);
	else
		tx = async_xor(sh->ppl_page, srcs, 0, count, PAGE_SIZE,
			       &submit);

	return tx;
}

static void *ppl_io_pool_alloc(gfp_t gfp_mask, void *pool_data)
{
	struct kmem_cache *kc = pool_data;
	struct ppl_io_unit *io;

	io = kmem_cache_alloc(kc, gfp_mask);
	if (!io)
		return NULL;

	io->header_page = alloc_page(gfp_mask);
	if (!io->header_page) {
		kmem_cache_free(kc, io);
		return NULL;
	}

	return io;
}

static void ppl_io_pool_free(void *element, void *pool_data)
{
	struct kmem_cache *kc = pool_data;
	struct ppl_io_unit *io = element;

	__free_page(io->header_page);
	kmem_cache_free(kc, io);
}

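/*
 * Allocate and initialize a new io_unit from the mempool: reset all fields
 * but keep the preallocated header page, prepare the ppl_header signature and
 * take the next sequence number.
 */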
static struct ppl_io_unit *ppl_new_iounit(struct ppl_log *log,
					  struct stripe_head *sh)
{
	struct ppl_conf *ppl_conf = log->ppl_conf;
	struct ppl_io_unit *io;
	struct ppl_header *pplhdr;
	struct page *header_page;

	io = mempool_alloc(&ppl_conf->io_pool, GFP_NOWAIT);
	if (!io)
		return NULL;

	header_page = io->header_page;
	memset(io, 0, sizeof(*io));
	io->header_page = header_page;

	io->log = log;
	INIT_LIST_HEAD(&io->log_sibling);
	INIT_LIST_HEAD(&io->stripe_list);
	atomic_set(&io->pending_stripes, 0);
	atomic_set(&io->pending_flushes, 0);
	bio_init(&io->bio, io->biovec, PPL_IO_INLINE_BVECS);

	pplhdr = page_address(io->header_page);
	clear_page(pplhdr);
	memset(pplhdr->reserved, 0xff, PPL_HDR_RESERVED);
	pplhdr->signature = cpu_to_le32(ppl_conf->signature);

	io->seq = atomic64_add_return(1, &ppl_conf->seq);
	pplhdr->generation = cpu_to_le64(io->seq);

	return io;
}

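/*
 * Add a stripe to the current io_unit of the log, either by appending it to
 * the last header entry if it is adjacent and covers the same disks, or by
 * creating a new entry. The stripe's partial parity checksum is accumulated
 * here; a new io_unit is started when the current one is full.
 */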
static int ppl_log_stripe(struct ppl_log *log, struct stripe_head *sh)
{
	struct ppl_io_unit *io = log->current_io;
	struct ppl_header_entry *e = NULL;
	struct ppl_header *pplhdr;
	int i;
	sector_t data_sector = 0;
	int data_disks = 0;
	struct r5conf *conf = sh->raid_conf;

	pr_debug("%s: stripe: %llu\n", __func__, (unsigned long long)sh->sector);

	/* check if current io_unit is full */
	if (io && (io->pp_size == log->entry_space ||
		   io->entries_count == PPL_HDR_MAX_ENTRIES)) {
		pr_debug("%s: add io_unit blocked by seq: %llu\n",
			 __func__, io->seq);
		io = NULL;
	}

	/* add a new unit if there is none or the current is full */
	if (!io) {
		io = ppl_new_iounit(log, sh);
		if (!io)
			return -ENOMEM;
		spin_lock_irq(&log->io_list_lock);
		list_add_tail(&io->log_sibling, &log->io_list);
		spin_unlock_irq(&log->io_list_lock);

		log->current_io = io;
	}

	for (i = 0; i < sh->disks; i++) {
		struct r5dev *dev = &sh->dev[i];

		/* only data disks with a pending write count here */
		if (i != sh->pd_idx && test_bit(R5_Wantwrite, &dev->flags)) {
			if (!data_disks || dev->sector < data_sector)
				data_sector = dev->sector;
			data_disks++;
		}
	}
	BUG_ON(!data_disks);

	pr_debug("%s: seq: %llu data_sector: %llu data_disks: %d\n", __func__,
		 io->seq, (unsigned long long)data_sector, data_disks);

	pplhdr = page_address(io->header_page);

	if (io->entries_count > 0) {
		struct ppl_header_entry *last =
				&pplhdr->entries[io->entries_count - 1];
		struct stripe_head *sh_last = list_last_entry(
				&io->stripe_list, struct stripe_head, log_list);
		u64 data_sector_last = le64_to_cpu(last->data_sector);
		u32 data_size_last = le32_to_cpu(last->data_size);

		/*
		 * Check if we can append the stripe to the last entry. It must
		 * be just after the last logged stripe and write to the same
		 * disks. Use bit shift and logarithm to avoid 64-bit division.
		 */
		if ((sh->sector == sh_last->sector + STRIPE_SECTORS) &&
		    (data_sector >> ilog2(conf->chunk_sectors) ==
		     data_sector_last >> ilog2(conf->chunk_sectors)) &&
		    ((data_sector - data_sector_last) * data_disks ==
		     data_size_last >> 9))
			e = last;
	}

	if (!e) {
		e = &pplhdr->entries[io->entries_count++];
		e->data_sector = cpu_to_le64(data_sector);
		e->parity_disk = cpu_to_le32(sh->pd_idx);
		e->checksum = cpu_to_le32(~0);
	}

	le32_add_cpu(&e->data_size, data_disks << PAGE_SHIFT);

	/* don't write any PP if full stripe write */
	if (!test_bit(STRIPE_FULL_WRITE, &sh->state)) {
		le32_add_cpu(&e->pp_size, PAGE_SIZE);
		io->pp_size += PAGE_SIZE;
		e->checksum = cpu_to_le32(crc32c_le(le32_to_cpu(e->checksum),
						    page_address(sh->ppl_page),
						    PAGE_SIZE));
	}

	list_add_tail(&sh->log_list, &io->stripe_list);
	atomic_inc(&io->pending_stripes);
	sh->ppl_io = io;

	return 0;
}

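/*
 * Log a stripe that is about to be written: decide whether it needs to be
 * logged and, if so, add it to the log of the member disk that holds its
 * parity. Stripes that cannot get an io_unit (allocation failure) are queued
 * on no_mem_stripes to be retried later.
 */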
int ppl_write_stripe(struct r5conf *conf, struct stripe_head *sh)
{
	struct ppl_conf *ppl_conf = conf->log_private;
	struct ppl_io_unit *io = sh->ppl_io;
	struct ppl_log *log;

	if (io || test_bit(STRIPE_SYNCING, &sh->state) || !sh->ppl_page ||
	    !test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) ||
	    !test_bit(R5_Insync, &sh->dev[sh->pd_idx].flags)) {
		clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
		return -EAGAIN;
	}

	log = &ppl_conf->child_logs[sh->pd_idx];

	mutex_lock(&log->io_mutex);

	if (!log->rdev || test_bit(Faulty, &log->rdev->flags)) {
		mutex_unlock(&log->io_mutex);
		return -EAGAIN;
	}

	set_bit(STRIPE_LOG_TRAPPED, &sh->state);
	clear_bit(STRIPE_DELAYED, &sh->state);
	atomic_inc(&sh->count);

	if (ppl_log_stripe(log, sh)) {
		spin_lock_irq(&ppl_conf->no_mem_stripes_lock);
		list_add_tail(&sh->log_list, &ppl_conf->no_mem_stripes);
		spin_unlock_irq(&ppl_conf->no_mem_stripes_lock);
	}

	mutex_unlock(&log->io_mutex);

	return 0;
}

static void ppl_log_endio(struct bio *bio)
{
	struct ppl_io_unit *io = bio->bi_private;
	struct ppl_log *log = io->log;
	struct ppl_conf *ppl_conf = log->ppl_conf;
	struct stripe_head *sh, *next;

	pr_debug("%s: seq: %llu\n", __func__, io->seq);

	if (bio->bi_status)
		md_error(ppl_conf->mddev, log->rdev);

	list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) {
		list_del_init(&sh->log_list);

		set_bit(STRIPE_HANDLE, &sh->state);
		raid5_release_stripe(sh);
	}
}

static void ppl_submit_iounit_bio(struct ppl_io_unit *io, struct bio *bio)
{
	char b[BDEVNAME_SIZE];

	pr_debug("%s: seq: %llu size: %u sector: %llu dev: %s\n",
		 __func__, io->seq, bio->bi_iter.bi_size,
		 (unsigned long long)bio->bi_iter.bi_sector,
		 bio_devname(bio, b));

	submit_bio(bio);
}

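/*
 * Build and submit the bio(s) for an io_unit: finalize the header entries and
 * their checksums, pick the target sector in the PPL area (wrapping around in
 * multippl mode), then write the header page followed by the partial parity
 * pages with REQ_FUA. Additional bios are chained if the inline biovec runs
 * out of slots.
 */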
static void ppl_submit_iounit(struct ppl_io_unit *io)
{
	struct ppl_log *log = io->log;
	struct ppl_conf *ppl_conf = log->ppl_conf;
	struct ppl_header *pplhdr = page_address(io->header_page);
	struct bio *bio = &io->bio;
	struct stripe_head *sh;
	int i;

	bio->bi_private = io;

	if (!log->rdev || test_bit(Faulty, &log->rdev->flags)) {
		ppl_log_endio(bio);
		return;
	}

	for (i = 0; i < io->entries_count; i++) {
		struct ppl_header_entry *e = &pplhdr->entries[i];

		pr_debug("%s: seq: %llu entry: %d data_sector: %llu pp_size: %u data_size: %u\n",
			 __func__, io->seq, i, le64_to_cpu(e->data_sector),
			 le32_to_cpu(e->pp_size), le32_to_cpu(e->data_size));

		e->data_sector = cpu_to_le64(le64_to_cpu(e->data_sector) >>
					     ilog2(ppl_conf->block_size >> 9));
		e->checksum = cpu_to_le32(~le32_to_cpu(e->checksum));
	}

	pplhdr->entries_count = cpu_to_le32(io->entries_count);
	pplhdr->checksum = cpu_to_le32(~crc32c_le(~0, pplhdr, PPL_HEADER_SIZE));

	/* Rewind the buffer if current PPL is larger than remaining space */
	if (log->use_multippl &&
	    log->rdev->ppl.sector + log->rdev->ppl.size - log->next_io_sector <
	    (PPL_HEADER_SIZE + io->pp_size) >> 9)
		log->next_io_sector = log->rdev->ppl.sector;

	bio->bi_end_io = ppl_log_endio;
	bio->bi_opf = REQ_OP_WRITE | REQ_FUA;
	bio_set_dev(bio, log->rdev->bdev);
	bio->bi_iter.bi_sector = log->next_io_sector;
	bio_add_page(bio, io->header_page, PAGE_SIZE, 0);

	pr_debug("%s: log->current_io_sector: %llu\n", __func__,
	    (unsigned long long)log->next_io_sector);

	if (log->use_multippl)
		log->next_io_sector += (PPL_HEADER_SIZE + io->pp_size) >> 9;

	WARN_ON(log->disk_flush_bitmap != 0);

	list_for_each_entry(sh, &io->stripe_list, log_list) {
		for (i = 0; i < sh->disks; i++) {
			struct r5dev *dev = &sh->dev[i];

			if ((ppl_conf->child_logs[i].wb_cache_on) &&
			    (test_bit(R5_Wantwrite, &dev->flags))) {
				set_bit(i, &log->disk_flush_bitmap);
			}
		}

		/* entries for full stripe writes have no partial parity */
		if (test_bit(STRIPE_FULL_WRITE, &sh->state))
			continue;

		if (!bio_add_page(bio, sh->ppl_page, PAGE_SIZE, 0)) {
			struct bio *prev = bio;

			bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES,
					       &ppl_conf->bs);
			bio->bi_opf = prev->bi_opf;
			bio_copy_dev(bio, prev);
			bio->bi_iter.bi_sector = bio_end_sector(prev);
			bio_add_page(bio, sh->ppl_page, PAGE_SIZE, 0);

			bio_chain(bio, prev);
			ppl_submit_iounit_bio(io, prev);
		}
	}

	ppl_submit_iounit_bio(io, bio);
}


static void ppl_submit_current_io(struct ppl_log *log)
{
	struct ppl_io_unit *io;

	spin_lock_irq(&log->io_list_lock);

	io = list_first_entry_or_null(&log->io_list, struct ppl_io_unit,
				      log_sibling);
	if (io && io->submitted)
		io = NULL;

	spin_unlock_irq(&log->io_list_lock);

	if (io) {
		io->submitted = true;

		if (io == log->current_io)
			log->current_io = NULL;

		ppl_submit_iounit(io);
	}
}

void ppl_write_stripe_run(struct r5conf *conf)
{
	struct ppl_conf *ppl_conf = conf->log_private;
	struct ppl_log *log;
	int i;

	for (i = 0; i < ppl_conf->count; i++) {
		log = &ppl_conf->child_logs[i];

		mutex_lock(&log->io_mutex);
		ppl_submit_current_io(log);
		mutex_unlock(&log->io_mutex);
	}
}

static void ppl_io_unit_finished(struct ppl_io_unit *io)
{
	struct ppl_log *log = io->log;
	struct ppl_conf *ppl_conf = log->ppl_conf;
	struct r5conf *conf = ppl_conf->mddev->private;
	unsigned long flags;

	pr_debug("%s: seq: %llu\n", __func__, io->seq);

	local_irq_save(flags);

	spin_lock(&log->io_list_lock);
	list_del(&io->log_sibling);
	spin_unlock(&log->io_list_lock);

	mempool_free(io, &ppl_conf->io_pool);

	spin_lock(&ppl_conf->no_mem_stripes_lock);
	if (!list_empty(&ppl_conf->no_mem_stripes)) {
		struct stripe_head *sh;

		sh = list_first_entry(&ppl_conf->no_mem_stripes,
				      struct stripe_head, log_list);
		list_del_init(&sh->log_list);
		set_bit(STRIPE_HANDLE, &sh->state);
		raid5_release_stripe(sh);
	}
	spin_unlock(&ppl_conf->no_mem_stripes_lock);

	local_irq_restore(flags);

	wake_up(&conf->wait_for_quiescent);
}

static void ppl_flush_endio(struct bio *bio)
{
	struct ppl_io_unit *io = bio->bi_private;
	struct ppl_log *log = io->log;
	struct ppl_conf *ppl_conf = log->ppl_conf;
	struct r5conf *conf = ppl_conf->mddev->private;
	char b[BDEVNAME_SIZE];

	pr_debug("%s: dev: %s\n", __func__, bio_devname(bio, b));

	if (bio->bi_status) {
		struct md_rdev *rdev;

		rcu_read_lock();
		rdev = md_find_rdev_rcu(conf->mddev, bio_dev(bio));
		if (rdev)
			md_error(rdev->mddev, rdev);
		rcu_read_unlock();
	}

	bio_put(bio);

	if (atomic_dec_and_test(&io->pending_flushes)) {
		ppl_io_unit_finished(io);
		md_wakeup_thread(conf->mddev->thread);
	}
}

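/*
 * Issue a cache flush to every data disk that was written as part of this
 * io_unit and has a volatile write-back cache enabled. The io_unit is
 * finished once all outstanding flushes have completed.
 */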
static void ppl_do_flush(struct ppl_io_unit *io)
{
	struct ppl_log *log = io->log;
	struct ppl_conf *ppl_conf = log->ppl_conf;
	struct r5conf *conf = ppl_conf->mddev->private;
	int raid_disks = conf->raid_disks;
	int flushed_disks = 0;
	int i;

	atomic_set(&io->pending_flushes, raid_disks);

	for_each_set_bit(i, &log->disk_flush_bitmap, raid_disks) {
		struct md_rdev *rdev;
		struct block_device *bdev = NULL;

		rcu_read_lock();
		rdev = rcu_dereference(conf->disks[i].rdev);
		if (rdev && !test_bit(Faulty, &rdev->flags))
			bdev = rdev->bdev;
		rcu_read_unlock();

		if (bdev) {
			struct bio *bio;
			char b[BDEVNAME_SIZE];

			bio = bio_alloc_bioset(GFP_NOIO, 0, &ppl_conf->flush_bs);
			bio_set_dev(bio, bdev);
			bio->bi_private = io;
			bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
			bio->bi_end_io = ppl_flush_endio;

			pr_debug("%s: dev: %s\n", __func__,
				 bio_devname(bio, b));

			submit_bio(bio);
			flushed_disks++;
		}
	}

	log->disk_flush_bitmap = 0;

	for (i = flushed_disks ; i < raid_disks; i++) {
		if (atomic_dec_and_test(&io->pending_flushes))
			ppl_io_unit_finished(io);
	}
}

static inline bool ppl_no_io_unit_submitted(struct r5conf *conf,
					    struct ppl_log *log)
{
	struct ppl_io_unit *io;

	io = list_first_entry_or_null(&log->io_list, struct ppl_io_unit,
				      log_sibling);

	return !io || !io->submitted;
}

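/*
 * Wait for all submitted io_units to complete when the array is being
 * quiesced, so that no PPL writes are in flight while the raid5 state machine
 * is frozen.
 */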
void ppl_quiesce(struct r5conf *conf, int quiesce)
{
	struct ppl_conf *ppl_conf = conf->log_private;
	int i;

	if (quiesce) {
		for (i = 0; i < ppl_conf->count; i++) {
			struct ppl_log *log = &ppl_conf->child_logs[i];

			spin_lock_irq(&log->io_list_lock);
			wait_event_lock_irq(conf->wait_for_quiescent,
					    ppl_no_io_unit_submitted(conf, log),
					    log->io_list_lock);
			spin_unlock_irq(&log->io_list_lock);
		}
	}
}

int ppl_handle_flush_request(struct r5l_log *log, struct bio *bio)
{
	if (bio->bi_iter.bi_size == 0) {
		bio_endio(bio);
		return 0;
	}
	bio->bi_opf &= ~REQ_PREFLUSH;
	return -EAGAIN;
}

void ppl_stripe_write_finished(struct stripe_head *sh)
{
	struct ppl_io_unit *io;

	io = sh->ppl_io;
	sh->ppl_io = NULL;

	if (io && atomic_dec_and_test(&io->pending_stripes)) {
		if (io->log->disk_flush_bitmap)
			ppl_do_flush(io);
		else
			ppl_io_unit_finished(io);
	}
}

static void ppl_xor(int size, struct page *page1, struct page *page2)
{
	struct async_submit_ctl submit;
	struct dma_async_tx_descriptor *tx;
	struct page *xor_srcs[] = { page1, page2 };

	init_async_submit(&submit, ASYNC_TX_ACK|ASYNC_TX_XOR_DROP_DST,
			  NULL, NULL, NULL, NULL);
	tx = async_xor(page1, xor_srcs, 0, 2, size, &submit);

	async_tx_quiesce(&tx);
}

/*
 * PPL recovery strategy: xor the partial parity and the data from all the
 * modified data disks within a stripe and write the result out as the new
 * stripe parity. If all data disks of a stripe were modified (full stripe
 * write), no partial parity was logged (pp_size == 0) and only the data
 * disks are xored together.
 *
 * An entry is recovered only if all the data disks it covers are operational
 * and reading from all of them succeeds; otherwise the parity of that stripe
 * is left untouched. The pp_size and data_size fields of the entry determine
 * how many data disks and how many sectors per disk it describes.
 */

static int ppl_recover_entry(struct ppl_log *log, struct ppl_header_entry *e,
			     sector_t ppl_sector)
{
	struct ppl_conf *ppl_conf = log->ppl_conf;
	struct mddev *mddev = ppl_conf->mddev;
	struct r5conf *conf = mddev->private;
	int block_size = ppl_conf->block_size;
	struct page *page1;
	struct page *page2;
	sector_t r_sector_first;
	sector_t r_sector_last;
	int strip_sectors;
	int data_disks;
	int i;
	int ret = 0;
	char b[BDEVNAME_SIZE];
	unsigned int pp_size = le32_to_cpu(e->pp_size);
	unsigned int data_size = le32_to_cpu(e->data_size);

	page1 = alloc_page(GFP_KERNEL);
	page2 = alloc_page(GFP_KERNEL);

	if (!page1 || !page2) {
		ret = -ENOMEM;
		goto out;
	}

	r_sector_first = le64_to_cpu(e->data_sector) * (block_size >> 9);

	if ((pp_size >> 9) < conf->chunk_sectors) {
		if (pp_size > 0) {
			data_disks = data_size / pp_size;
			strip_sectors = pp_size >> 9;
		} else {
			data_disks = conf->raid_disks - conf->max_degraded;
			strip_sectors = (data_size >> 9) / data_disks;
		}
		r_sector_last = r_sector_first +
				(data_disks - 1) * conf->chunk_sectors +
				strip_sectors;
	} else {
		data_disks = conf->raid_disks - conf->max_degraded;
		strip_sectors = conf->chunk_sectors;
		r_sector_last = r_sector_first + (data_size >> 9);
	}

	pr_debug("%s: array sector first: %llu last: %llu\n", __func__,
		 (unsigned long long)r_sector_first,
		 (unsigned long long)r_sector_last);

	/* if start and end is 4k aligned, use a 4k block */
	if (block_size == 512 &&
	    (r_sector_first & (STRIPE_SECTORS - 1)) == 0 &&
	    (r_sector_last & (STRIPE_SECTORS - 1)) == 0)
		block_size = STRIPE_SIZE;

	/* iterate through blocks in strip */
	for (i = 0; i < strip_sectors; i += (block_size >> 9)) {
		bool update_parity = false;
		sector_t parity_sector;
		struct md_rdev *parity_rdev;
		struct stripe_head sh;
		int disk;
		int indent = 0;

		pr_debug("%s:%*s iter %d start\n", __func__, indent, "", i);
		indent += 2;

		memset(page_address(page1), 0, PAGE_SIZE);

		/* iterate through data member disks */
		for (disk = 0; disk < data_disks; disk++) {
			int dd_idx;
			struct md_rdev *rdev;
			sector_t sector;
			sector_t r_sector = r_sector_first + i +
					    (disk * conf->chunk_sectors);

			pr_debug("%s:%*s data member disk %d start\n",
				 __func__, indent, "", disk);
			indent += 2;

			if (r_sector >= r_sector_last) {
				pr_debug("%s:%*s array sector %llu doesn't need parity update\n",
					 __func__, indent, "",
					 (unsigned long long)r_sector);
				indent -= 2;
				continue;
			}

			update_parity = true;

			/* map raid sector to member disk */
			sector = raid5_compute_sector(conf, r_sector, 0,
						      &dd_idx, NULL);
			pr_debug("%s:%*s processing array sector %llu => data member disk %d, sector %llu\n",
				 __func__, indent, "",
				 (unsigned long long)r_sector, dd_idx,
				 (unsigned long long)sector);

			rdev = conf->disks[dd_idx].rdev;
			if (!rdev || (!test_bit(In_sync, &rdev->flags) &&
				      sector >= rdev->recovery_offset)) {
				pr_debug("%s:%*s data member disk %d missing\n",
					 __func__, indent, "", dd_idx);
				update_parity = false;
				break;
			}

			pr_debug("%s:%*s reading data member disk %s sector %llu\n",
				 __func__, indent, "", bdevname(rdev->bdev, b),
				 (unsigned long long)sector);
			if (!sync_page_io(rdev, sector, block_size, page2,
					REQ_OP_READ, 0, false)) {
				md_error(mddev, rdev);
				pr_debug("%s:%*s read failed!\n", __func__,
					 indent, "");
				ret = -EIO;
				goto out;
			}

			ppl_xor(block_size, page1, page2);

			indent -= 2;
		}

		if (!update_parity)
			continue;

		if (pp_size > 0) {
			pr_debug("%s:%*s reading pp disk sector %llu\n",
				 __func__, indent, "",
				 (unsigned long long)(ppl_sector + i));
			if (!sync_page_io(log->rdev,
					ppl_sector - log->rdev->data_offset + i,
					block_size, page2, REQ_OP_READ, 0,
					false)) {
				pr_debug("%s:%*s read failed!\n", __func__,
					 indent, "");
				md_error(mddev, log->rdev);
				ret = -EIO;
				goto out;
			}

			ppl_xor(block_size, page1, page2);
		}

		/* map raid sector to parity disk */
		parity_sector = raid5_compute_sector(conf, r_sector_first + i,
				0, &disk, &sh);
		BUG_ON(sh.pd_idx != le32_to_cpu(e->parity_disk));
		parity_rdev = conf->disks[sh.pd_idx].rdev;

		BUG_ON(parity_rdev->bdev->bd_dev != log->rdev->bdev->bd_dev);
		pr_debug("%s:%*s write parity at sector %llu, disk %s\n",
			 __func__, indent, "",
			 (unsigned long long)parity_sector,
			 bdevname(parity_rdev->bdev, b));
		if (!sync_page_io(parity_rdev, parity_sector, block_size,
				page1, REQ_OP_WRITE, 0, false)) {
			pr_debug("%s:%*s parity write error!\n", __func__,
				 indent, "");
			md_error(mddev, parity_rdev);
			ret = -EIO;
			goto out;
		}
	}
out:
	if (page1)
		__free_page(page1);
	if (page2)
		__free_page(page2);
	return ret;
}

static int ppl_recover(struct ppl_log *log, struct ppl_header *pplhdr,
		       sector_t offset)
{
	struct ppl_conf *ppl_conf = log->ppl_conf;
	struct md_rdev *rdev = log->rdev;
	struct mddev *mddev = rdev->mddev;
	sector_t ppl_sector = rdev->ppl.sector + offset +
			      (PPL_HEADER_SIZE >> 9);
	struct page *page;
	int i;
	int ret = 0;

	page = alloc_page(GFP_KERNEL);
	if (!page)
		return -ENOMEM;

	/* iterate through all PPL entries saved */
	for (i = 0; i < le32_to_cpu(pplhdr->entries_count); i++) {
		struct ppl_header_entry *e = &pplhdr->entries[i];
		u32 pp_size = le32_to_cpu(e->pp_size);
		sector_t sector = ppl_sector;
		int ppl_entry_sectors = pp_size >> 9;
		u32 crc, crc_stored;

		pr_debug("%s: disk: %d entry: %d ppl_sector: %llu pp_size: %u\n",
			 __func__, rdev->raid_disk, i,
			 (unsigned long long)ppl_sector, pp_size);

		crc = ~0;
		crc_stored = le32_to_cpu(e->checksum);

		/* read partial parity for the entry and calculate its checksum */
		while (pp_size) {
			int s = pp_size > PAGE_SIZE ? PAGE_SIZE : pp_size;

			if (!sync_page_io(rdev, sector - rdev->data_offset,
					s, page, REQ_OP_READ, 0, false)) {
				md_error(mddev, rdev);
				ret = -EIO;
				goto out;
			}

			crc = crc32c_le(crc, page_address(page), s);

			pp_size -= s;
			sector += s >> 9;
		}

		crc = ~crc;

		if (crc != crc_stored) {
			/*
			 * Don't recover this entry if the checksum does not
			 * match, but keep going and try to recover other
			 * entries.
			 */
			pr_debug("%s: ppl entry crc does not match: stored: 0x%x calculated: 0x%x\n",
				 __func__, crc_stored, crc);
			ppl_conf->mismatch_count++;
		} else {
			ret = ppl_recover_entry(log, e, ppl_sector);
			if (ret)
				goto out;
			ppl_conf->recovered_entries++;
		}

		ppl_sector += ppl_entry_sectors;
	}

	/* flush the disk cache after recovery if necessary */
	ret = blkdev_issue_flush(rdev->bdev, GFP_KERNEL, NULL);
out:
	__free_page(page);
	return ret;
}

static int ppl_write_empty_header(struct ppl_log *log)
{
	struct page *page;
	struct ppl_header *pplhdr;
	struct md_rdev *rdev = log->rdev;
	int ret = 0;

	pr_debug("%s: disk: %d ppl_sector: %llu\n", __func__,
		 rdev->raid_disk, (unsigned long long)rdev->ppl.sector);

	page = alloc_page(GFP_NOIO | __GFP_ZERO);
	if (!page)
		return -ENOMEM;

	pplhdr = page_address(page);
	/* zero out PPL space to avoid collision with old PPLs */
	blkdev_issue_zeroout(rdev->bdev, rdev->ppl.sector,
			    log->rdev->ppl.size, GFP_NOIO, 0);
	memset(pplhdr->reserved, 0xff, PPL_HDR_RESERVED);
	pplhdr->signature = cpu_to_le32(log->ppl_conf->signature);
	pplhdr->checksum = cpu_to_le32(~crc32c_le(~0, pplhdr, PAGE_SIZE));

	if (!sync_page_io(rdev, rdev->ppl.sector - rdev->data_offset,
			  PPL_HEADER_SIZE, page, REQ_OP_WRITE | REQ_SYNC |
			  REQ_FUA, 0, false)) {
		md_error(rdev->mddev, rdev);
		ret = -EIO;
	}

	__free_page(page);
	return ret;
}

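/*
 * Scan the PPL area of a member disk for the most recent valid PPL header
 * (checked by checksum, signature and generation number), run recovery from
 * it if the array is being started dirty, and finally write an empty header
 * so that stale PPLs are not reused.
 */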
static int ppl_load_distributed(struct ppl_log *log)
{
	struct ppl_conf *ppl_conf = log->ppl_conf;
	struct md_rdev *rdev = log->rdev;
	struct mddev *mddev = rdev->mddev;
	struct page *page, *page2, *tmp;
	struct ppl_header *pplhdr = NULL, *prev_pplhdr = NULL;
	u32 crc, crc_stored;
	u32 signature;
	int ret = 0, i;
	sector_t pplhdr_offset = 0, prev_pplhdr_offset = 0;

	pr_debug("%s: disk: %d\n", __func__, rdev->raid_disk);

	page = alloc_page(GFP_KERNEL);
	if (!page)
		return -ENOMEM;

	page2 = alloc_page(GFP_KERNEL);
	if (!page2) {
		__free_page(page);
		return -ENOMEM;
	}

	/* searching ppl area for latest ppl */
	while (pplhdr_offset < rdev->ppl.size - (PPL_HEADER_SIZE >> 9)) {
		if (!sync_page_io(rdev,
				  rdev->ppl.sector - rdev->data_offset +
				  pplhdr_offset, PAGE_SIZE, page, REQ_OP_READ,
				  0, false)) {
			md_error(mddev, rdev);
			ret = -EIO;
			/* if not able to read - don't recover any PPL */
			pplhdr = NULL;
			break;
		}
		pplhdr = page_address(page);

		/* check header validity */
		crc_stored = le32_to_cpu(pplhdr->checksum);
		pplhdr->checksum = 0;
		crc = ~crc32c_le(~0, pplhdr, PAGE_SIZE);

		if (crc_stored != crc) {
			pr_debug("%s: ppl header crc does not match: stored: 0x%x calculated: 0x%x (offset: %llu)\n",
				 __func__, crc_stored, crc,
				 (unsigned long long)pplhdr_offset);
			pplhdr = prev_pplhdr;
			pplhdr_offset = prev_pplhdr_offset;
			break;
		}

		signature = le32_to_cpu(pplhdr->signature);

		if (mddev->external) {
			/*
			 * For external metadata take the signature from the
			 * header; consistency across member drives is checked
			 * later in ppl_load().
			 */
			ppl_conf->signature = signature;
		} else if (ppl_conf->signature != signature) {
			pr_debug("%s: ppl header signature does not match: stored: 0x%x configured: 0x%x (offset: %llu)\n",
				 __func__, signature, ppl_conf->signature,
				 (unsigned long long)pplhdr_offset);
			pplhdr = prev_pplhdr;
			pplhdr_offset = prev_pplhdr_offset;
			break;
		}

		if (prev_pplhdr && le64_to_cpu(prev_pplhdr->generation) >
		    le64_to_cpu(pplhdr->generation)) {
			/* previous was newest */
			pplhdr = prev_pplhdr;
			pplhdr_offset = prev_pplhdr_offset;
			break;
		}

		prev_pplhdr_offset = pplhdr_offset;
		prev_pplhdr = pplhdr;

		tmp = page;
		page = page2;
		page2 = tmp;

		/* calculate next potential ppl offset */
		for (i = 0; i < le32_to_cpu(pplhdr->entries_count); i++)
			pplhdr_offset +=
			    le32_to_cpu(pplhdr->entries[i].pp_size) >> 9;
		pplhdr_offset += PPL_HEADER_SIZE >> 9;
	}

	/* no valid ppl found */
	if (!pplhdr)
		ppl_conf->mismatch_count++;
	else
		pr_debug("%s: latest PPL found at offset: %llu, with generation: %llu\n",
			 __func__, (unsigned long long)pplhdr_offset,
			 le64_to_cpu(pplhdr->generation));

	/* attempt to recover from log if we are starting a dirty array */
	if (pplhdr && !mddev->pers && mddev->recovery_cp != MaxSector)
		ret = ppl_recover(log, pplhdr, pplhdr_offset);

	/* write empty header if we are starting the array */
	if (!ret && !mddev->pers)
		ret = ppl_write_empty_header(log);

	__free_page(page);
	__free_page(page2);

	pr_debug("%s: return: %d mismatch_count: %d recovered_entries: %d\n",
		 __func__, ret, ppl_conf->mismatch_count,
		 ppl_conf->recovered_entries);
	return ret;
}

static int ppl_load(struct ppl_conf *ppl_conf)
{
	int ret = 0;
	u32 signature = 0;
	bool signature_set = false;
	int i;

	for (i = 0; i < ppl_conf->count; i++) {
		struct ppl_log *log = &ppl_conf->child_logs[i];

		/* skip missing drive */
		if (!log->rdev)
			continue;

		ret = ppl_load_distributed(log);
		if (ret)
			break;

		/*
		 * For external metadata we can't check if the signature is
		 * correct on a single drive, but we can check if it is the
		 * same for all drives.
		 */
		if (ppl_conf->mddev->external) {
			if (!signature_set) {
				signature = ppl_conf->signature;
				signature_set = true;
			} else if (signature != ppl_conf->signature) {
				pr_warn("md/raid:%s: PPL header signature does not match on all member drives\n",
					mdname(ppl_conf->mddev));
				ret = -EINVAL;
				break;
			}
		}
	}

	pr_debug("%s: return: %d mismatch_count: %d recovered_entries: %d\n",
		 __func__, ret, ppl_conf->mismatch_count,
		 ppl_conf->recovered_entries);
	return ret;
}

static void __ppl_exit_log(struct ppl_conf *ppl_conf)
{
	clear_bit(MD_HAS_PPL, &ppl_conf->mddev->flags);
	clear_bit(MD_HAS_MULTIPLE_PPLS, &ppl_conf->mddev->flags);

	kfree(ppl_conf->child_logs);

	bioset_exit(&ppl_conf->bs);
	bioset_exit(&ppl_conf->flush_bs);
	mempool_exit(&ppl_conf->io_pool);
	kmem_cache_destroy(ppl_conf->io_kc);

	kfree(ppl_conf);
}

void ppl_exit_log(struct r5conf *conf)
{
	struct ppl_conf *ppl_conf = conf->log_private;

	if (ppl_conf) {
		__ppl_exit_log(ppl_conf);
		conf->log_private = NULL;
	}
}

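/*
 * Check that the PPL area reported by the rdev is usable: large enough for
 * the header plus partial parity for at least one stripe, and not overlapping
 * the data area or the superblock. On success, rdev->ppl.size is rounded down
 * so that the data space is a multiple of the stripe size.
 */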
static int ppl_validate_rdev(struct md_rdev *rdev)
{
	char b[BDEVNAME_SIZE];
	int ppl_data_sectors;
	int ppl_size_new;

	/*
	 * The configured PPL size must be enough to store the header and,
	 * at the very least, partial parity for one stripe. Round it down
	 * to ensure the data space is cleanly divisible by stripe size.
	 */
	ppl_data_sectors = rdev->ppl.size - (PPL_HEADER_SIZE >> 9);

	if (ppl_data_sectors > 0)
		ppl_data_sectors = rounddown(ppl_data_sectors, STRIPE_SECTORS);

	if (ppl_data_sectors <= 0) {
		pr_warn("md/raid:%s: PPL space too small on %s\n",
			mdname(rdev->mddev), bdevname(rdev->bdev, b));
		return -ENOSPC;
	}

	ppl_size_new = ppl_data_sectors + (PPL_HEADER_SIZE >> 9);

	if ((rdev->ppl.sector < rdev->data_offset &&
	     rdev->ppl.sector + ppl_size_new > rdev->data_offset) ||
	    (rdev->ppl.sector >= rdev->data_offset &&
	     rdev->data_offset + rdev->sectors > rdev->ppl.sector)) {
		pr_warn("md/raid:%s: PPL space overlaps with data on %s\n",
			mdname(rdev->mddev), bdevname(rdev->bdev, b));
		return -EINVAL;
	}

	if (!rdev->mddev->external &&
	    ((rdev->ppl.offset > 0 && rdev->ppl.offset < (rdev->sb_size >> 9)) ||
	     (rdev->ppl.offset <= 0 && rdev->ppl.offset + ppl_size_new > 0))) {
		pr_warn("md/raid:%s: PPL space overlaps with superblock on %s\n",
			mdname(rdev->mddev), bdevname(rdev->bdev, b));
		return -EINVAL;
	}

	rdev->ppl.size = ppl_size_new;

	return 0;
}

static void ppl_init_child_log(struct ppl_log *log, struct md_rdev *rdev)
{
	struct request_queue *q;

	if ((rdev->ppl.size << 9) >= (PPL_SPACE_SIZE +
				      PPL_HEADER_SIZE) * 2) {
		log->use_multippl = true;
		set_bit(MD_HAS_MULTIPLE_PPLS,
			&log->ppl_conf->mddev->flags);
		log->entry_space = PPL_SPACE_SIZE;
	} else {
		log->use_multippl = false;
		log->entry_space = (log->rdev->ppl.size << 9) -
				   PPL_HEADER_SIZE;
	}
	log->next_io_sector = rdev->ppl.sector;

	q = bdev_get_queue(rdev->bdev);
	if (test_bit(QUEUE_FLAG_WC, &q->queue_flags))
		log->wb_cache_on = true;
}

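/*
 * Set up the PPL for an array: allocate the ppl_conf with its io_unit cache,
 * mempool and bio sets, create one child log per raid disk, validate each
 * member's PPL space and load (and possibly recover) the existing logs.
 */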
int ppl_init_log(struct r5conf *conf)
{
	struct ppl_conf *ppl_conf;
	struct mddev *mddev = conf->mddev;
	int ret = 0;
	int max_disks;
	int i;

	pr_debug("md/raid:%s: enabling distributed Partial Parity Log\n",
		 mdname(conf->mddev));

	if (PAGE_SIZE != 4096)
		return -EINVAL;

	if (mddev->level != 5) {
		pr_warn("md/raid:%s PPL is not compatible with raid level %d\n",
			mdname(mddev), mddev->level);
		return -EINVAL;
	}

	if (mddev->bitmap_info.file || mddev->bitmap_info.offset) {
		pr_warn("md/raid:%s PPL is not compatible with bitmap\n",
			mdname(mddev));
		return -EINVAL;
	}

	if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
		pr_warn("md/raid:%s PPL is not compatible with journal\n",
			mdname(mddev));
		return -EINVAL;
	}

	max_disks = FIELD_SIZEOF(struct ppl_log, disk_flush_bitmap) *
		BITS_PER_BYTE;
	if (conf->raid_disks > max_disks) {
		pr_warn("md/raid:%s PPL doesn't support over %d disks in the array\n",
			mdname(mddev), max_disks);
		return -EINVAL;
	}

	ppl_conf = kzalloc(sizeof(struct ppl_conf), GFP_KERNEL);
	if (!ppl_conf)
		return -ENOMEM;

	ppl_conf->mddev = mddev;

	ppl_conf->io_kc = KMEM_CACHE(ppl_io_unit, 0);
	if (!ppl_conf->io_kc) {
		ret = -ENOMEM;
		goto err;
	}

	ret = mempool_init(&ppl_conf->io_pool, conf->raid_disks, ppl_io_pool_alloc,
			   ppl_io_pool_free, ppl_conf->io_kc);
	if (ret)
		goto err;

	ret = bioset_init(&ppl_conf->bs, conf->raid_disks, 0, BIOSET_NEED_BVECS);
	if (ret)
		goto err;

	ret = bioset_init(&ppl_conf->flush_bs, conf->raid_disks, 0, 0);
	if (ret)
		goto err;

	ppl_conf->count = conf->raid_disks;
	ppl_conf->child_logs = kcalloc(ppl_conf->count, sizeof(struct ppl_log),
				       GFP_KERNEL);
	if (!ppl_conf->child_logs) {
		ret = -ENOMEM;
		goto err;
	}

	atomic64_set(&ppl_conf->seq, 0);
	INIT_LIST_HEAD(&ppl_conf->no_mem_stripes);
	spin_lock_init(&ppl_conf->no_mem_stripes_lock);

	if (!mddev->external) {
		ppl_conf->signature = ~crc32c_le(~0, mddev->uuid, sizeof(mddev->uuid));
		ppl_conf->block_size = 512;
	} else {
		ppl_conf->block_size = queue_logical_block_size(mddev->queue);
	}

	for (i = 0; i < ppl_conf->count; i++) {
		struct ppl_log *log = &ppl_conf->child_logs[i];
		struct md_rdev *rdev = conf->disks[i].rdev;

		mutex_init(&log->io_mutex);
		spin_lock_init(&log->io_list_lock);
		INIT_LIST_HEAD(&log->io_list);

		log->ppl_conf = ppl_conf;
		log->rdev = rdev;

		if (rdev) {
			ret = ppl_validate_rdev(rdev);
			if (ret)
				goto err;

			ppl_init_child_log(log, rdev);
		}
	}

	/* load and possibly recover the logs from the member disks */
	ret = ppl_load(ppl_conf);

	if (ret) {
		goto err;
	} else if (!mddev->pers && mddev->recovery_cp == 0 &&
		   ppl_conf->recovered_entries > 0 &&
		   ppl_conf->mismatch_count == 0) {
		/*
		 * If everything is recovered when the array is assembled,
		 * then the recovery does not need to be run.
		 */
		mddev->recovery_cp = MaxSector;
		set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
	} else if (mddev->pers && ppl_conf->mismatch_count > 0) {
		/* no recovery was performed if there was a mismatch */
		ret = -EINVAL;
		goto err;
	}

	conf->log_private = ppl_conf;
	set_bit(MD_HAS_PPL, &ppl_conf->mddev->flags);

	return 0;
err:
	__ppl_exit_log(ppl_conf);
	return ret;
}

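/*
 * Enable or disable the PPL on a single member disk when it is added to or
 * removed from the array.
 */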
int ppl_modify_log(struct r5conf *conf, struct md_rdev *rdev, bool add)
{
	struct ppl_conf *ppl_conf = conf->log_private;
	struct ppl_log *log;
	int ret = 0;
	char b[BDEVNAME_SIZE];

	if (!rdev)
		return -EINVAL;

	pr_debug("%s: disk: %d operation: %s dev: %s\n",
		 __func__, rdev->raid_disk, add ? "add" : "remove",
		 bdevname(rdev->bdev, b));

	if (rdev->raid_disk < 0)
		return 0;

	if (rdev->raid_disk >= ppl_conf->count)
		return -ENODEV;

	log = &ppl_conf->child_logs[rdev->raid_disk];

	mutex_lock(&log->io_mutex);
	if (add) {
		ret = ppl_validate_rdev(rdev);
		if (!ret) {
			log->rdev = rdev;
			ret = ppl_write_empty_header(log);
			ppl_init_child_log(log, rdev);
		}
	} else {
		log->rdev = NULL;
	}
	mutex_unlock(&log->io_mutex);

	return ret;
}