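/*
 * Partial Parity Log (PPL) for RAID5 arrays.
 *
 * The PPL closes the RAID5 write hole: for every write to a non-full
 * stripe, the partial parity (the XOR of the stripe's data chunks that are
 * not being modified) is written to a reserved area on the member drive
 * holding that stripe's parity, before the data and parity updates are
 * issued.  After an unclean shutdown of a degraded array, the log is
 * replayed to recompute parity for the stripes it describes.
 */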
#include <linux/kernel.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/crc32c.h>
#include <linux/flex_array.h>
#include <linux/async_tx.h>
#include <linux/raid/md_p.h>
#include "md.h"
#include "raid5.h"

#define STRIPE_SIZE PAGE_SIZE
#define STRIPE_SECTORS (STRIPE_SIZE >> 9)

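/*
 * Partial parity is computed in ops_run_partial_parity() and kept in
 * sh->ppl_page.  Each member drive has its own PPL instance (struct ppl_log)
 * located in the rdev's dedicated PPL space (rdev->ppl.sector/size).  Logged
 * writes are batched into io units (struct ppl_io_unit): a header page
 * (struct ppl_header) with one entry per logged write, followed by the
 * partial parity pages of the affected stripes.  The io unit is written with
 * FUA, and a stripe is only released for writing to the array once its PPL
 * write has completed (STRIPE_LOG_TRAPPED).
 *
 * If the PPL space is large enough, consecutive io units are placed one
 * after another and the log wraps around (use_multippl); each io unit is
 * then limited to PPL_SPACE_SIZE bytes of partial parity data.
 */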
#define PPL_SPACE_SIZE (128 * 1024)

struct ppl_conf {
	struct mddev *mddev;

	/* array of child logs, one for each raid disk */
	struct ppl_log *child_logs;
	int count;

	/* the logical block size used for data_sector in ppl_header_entry */
	int block_size;

	/* raid array identifier */
	u32 signature;
	/* current log write sequence number */
	atomic64_t seq;

	struct kmem_cache *io_kc;
	mempool_t *io_pool;
	struct bio_set *bs;

	/* used only for recovery */
	int recovered_entries;
	int mismatch_count;

	/* stripes to retry if failed to allocate io_unit */
	struct list_head no_mem_stripes;
	spinlock_t no_mem_stripes_lock;
};

struct ppl_log {
	/* shared structure, global for all logs of the array */
	struct ppl_conf *ppl_conf;

	/* array member disk associated with this log instance */
	struct md_rdev *rdev;

	struct mutex io_mutex;
	/* current io_unit accepting new stripes, always at the end of io_list */
	struct ppl_io_unit *current_io;

	spinlock_t io_list_lock;
	/* all io_units of this log */
	struct list_head io_list;

	/* next sector in the PPL space where the next io_unit is written */
	sector_t next_io_sector;
	/* maximum size of partial parity data per io_unit */
	unsigned int entry_space;
	/* true if the PPL space can hold more than one io_unit */
	bool use_multippl;
};

#define PPL_IO_INLINE_BVECS 32

struct ppl_io_unit {
	struct ppl_log *log;

	/* for ppl_header */
	struct page *header_page;

	/* number of entries in ppl_header */
	unsigned int entries_count;
	/* total size of partial parity in this io_unit, in bytes */
	unsigned int pp_size;

	/* sequence number of this log write */
	u64 seq;
	/* member of ppl_log->io_list */
	struct list_head log_sibling;

	/* stripes added to this io_unit */
	struct list_head stripe_list;
	/* how many stripes are not yet written out to the array */
	atomic_t pending_stripes;

	/* true after the write to the log has been submitted */
	bool submitted;

	/* inline bio and its bvecs for the header and partial parity pages */
	struct bio bio;
	struct bio_aux bio_aux;
	struct bio_vec biovec[PPL_IO_INLINE_BVECS];
};

struct dma_async_tx_descriptor *
ops_run_partial_parity(struct stripe_head *sh, struct raid5_percpu *percpu,
		       struct dma_async_tx_descriptor *tx)
{
	int disks = sh->disks;
	struct page **srcs = flex_array_get(percpu->scribble, 0);
	int count = 0, pd_idx = sh->pd_idx, i;
	struct async_submit_ctl submit;

	pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);

	/*
	 * Partial parity is the XOR of the stripe's data chunks that are not
	 * changed by this write request.  Depending on the available data
	 * (read-modify-write vs. reconstruct-write case) it is calculated
	 * differently.
	 */
	if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
		/*
		 * rmw: xor old data and parity from updated disks.
		 * This was calculated earlier by ops_run_prexor5(), so just
		 * copy the parity dev page.
		 */
		srcs[count++] = sh->dev[pd_idx].page;
	} else if (sh->reconstruct_state == reconstruct_state_drain_run) {
		/* rcw: xor data from all not updated disks */
		for (i = disks; i--;) {
			struct r5dev *dev = &sh->dev[i];
			if (test_bit(R5_UPTODATE, &dev->flags))
				srcs[count++] = dev->page;
		}
	} else {
		return tx;
	}

	init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, tx,
			  NULL, sh, flex_array_get(percpu->scribble, 0)
			  + sizeof(struct page *) * (sh->disks + 2));

	if (count == 1)
		tx = async_memcpy(sh->ppl_page, srcs[0], 0, 0, PAGE_SIZE,
				  &submit);
	else
		tx = async_xor(sh->ppl_page, srcs, 0, count, PAGE_SIZE,
			       &submit);

	return tx;
}

static void *ppl_io_pool_alloc(gfp_t gfp_mask, void *pool_data)
{
	struct kmem_cache *kc = pool_data;
	struct ppl_io_unit *io;

	io = kmem_cache_alloc(kc, gfp_mask);
	if (!io)
		return NULL;

	io->header_page = alloc_page(gfp_mask);
	if (!io->header_page) {
		kmem_cache_free(kc, io);
		return NULL;
	}

	return io;
}

static void ppl_io_pool_free(void *element, void *pool_data)
{
	struct kmem_cache *kc = pool_data;
	struct ppl_io_unit *io = element;

	__free_page(io->header_page);
	kmem_cache_free(kc, io);
}

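/*
 * Allocate an io_unit from the mempool and initialize its header page with
 * the log signature and the next sequence number.  Allocation uses
 * GFP_NOWAIT, so it may fail under memory pressure; the caller then parks
 * the stripe on the no_mem_stripes list and retries later.
 */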
static struct ppl_io_unit *ppl_new_iounit(struct ppl_log *log,
					  struct stripe_head *sh)
{
	struct ppl_conf *ppl_conf = log->ppl_conf;
	struct ppl_io_unit *io;
	struct ppl_header *pplhdr;
	struct page *header_page;

	io = mempool_alloc(ppl_conf->io_pool, GFP_NOWAIT);
	if (!io)
		return NULL;

	header_page = io->header_page;
	memset(io, 0, sizeof(*io));
	io->header_page = header_page;

	io->log = log;
	INIT_LIST_HEAD(&io->log_sibling);
	INIT_LIST_HEAD(&io->stripe_list);
	atomic_set(&io->pending_stripes, 0);
	bio_init(&io->bio);
	io->bio.bio_aux = &io->bio_aux;
	atomic_set(&io->bio.bio_aux->__bi_remaining, 1);
	io->bio.bi_io_vec = io->biovec;
	io->bio.bi_max_vecs = PPL_IO_INLINE_BVECS;

	pplhdr = page_address(io->header_page);
	clear_page(pplhdr);
	memset(pplhdr->reserved, 0xff, PPL_HDR_RESERVED);
	pplhdr->signature = cpu_to_le32(ppl_conf->signature);

	io->seq = atomic64_add_return(1, &ppl_conf->seq);
	pplhdr->generation = cpu_to_le64(io->seq);

	return io;
}

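/*
 * Add a stripe to the current io_unit of this log, starting a new io_unit
 * if there is none or the current one is full (entry_space or
 * PPL_HDR_MAX_ENTRIES reached).  A write that directly follows the previous
 * stripe in the same chunk is merged into the last header entry; otherwise a
 * new entry is created.  Full stripe writes carry no partial parity, so only
 * their data size is recorded.
 */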
static int ppl_log_stripe(struct ppl_log *log, struct stripe_head *sh)
{
	struct ppl_io_unit *io = log->current_io;
	struct ppl_header_entry *e = NULL;
	struct ppl_header *pplhdr;
	int i;
	sector_t data_sector = 0;
	int data_disks = 0;
	struct r5conf *conf = sh->raid_conf;

	pr_debug("%s: stripe: %llu\n", __func__, (unsigned long long)sh->sector);

	/* check if current io_unit is full */
	if (io && (io->pp_size == log->entry_space ||
		   io->entries_count == PPL_HDR_MAX_ENTRIES)) {
		pr_debug("%s: add io_unit blocked by seq: %llu\n",
			 __func__, io->seq);
		io = NULL;
	}

	/* add a new unit if there is none or the current one is full */
	if (!io) {
		io = ppl_new_iounit(log, sh);
		if (!io)
			return -ENOMEM;
		spin_lock_irq(&log->io_list_lock);
		list_add_tail(&io->log_sibling, &log->io_list);
		spin_unlock_irq(&log->io_list_lock);

		log->current_io = io;
	}

	for (i = 0; i < sh->disks; i++) {
		struct r5dev *dev = &sh->dev[i];

		if (i != sh->pd_idx && test_bit(R5_Wantwrite, &dev->flags)) {
			if (!data_disks || dev->sector < data_sector)
				data_sector = dev->sector;
			data_disks++;
		}
	}
	BUG_ON(!data_disks);

	pr_debug("%s: seq: %llu data_sector: %llu data_disks: %d\n", __func__,
		 io->seq, (unsigned long long)data_sector, data_disks);

	pplhdr = page_address(io->header_page);

	if (io->entries_count > 0) {
		struct ppl_header_entry *last =
				&pplhdr->entries[io->entries_count - 1];
		struct stripe_head *sh_last = list_last_entry(
				&io->stripe_list, struct stripe_head, log_list);
		u64 data_sector_last = le64_to_cpu(last->data_sector);
		u32 data_size_last = le32_to_cpu(last->data_size);

		/*
		 * Check if we can append the stripe to the last entry. It must
		 * be just after the last logged stripe and write to the same
		 * disks. Use bit shift and logarithm to avoid 64-bit division.
		 */
		if ((sh->sector == sh_last->sector + STRIPE_SECTORS) &&
		    (data_sector >> ilog2(conf->chunk_sectors) ==
		     data_sector_last >> ilog2(conf->chunk_sectors)) &&
		    ((data_sector - data_sector_last) * data_disks ==
		     data_size_last >> 9))
			e = last;
	}

	if (!e) {
		e = &pplhdr->entries[io->entries_count++];
		e->data_sector = cpu_to_le64(data_sector);
		e->parity_disk = cpu_to_le32(sh->pd_idx);
		e->checksum = cpu_to_le32(~0);
	}

	le32_add_cpu(&e->data_size, data_disks << PAGE_SHIFT);

	/* don't write any PP if full stripe write */
	if (!test_bit(STRIPE_FULL_WRITE, &sh->state)) {
		le32_add_cpu(&e->pp_size, PAGE_SIZE);
		io->pp_size += PAGE_SIZE;
		e->checksum = cpu_to_le32(crc32c_le(le32_to_cpu(e->checksum),
						    page_address(sh->ppl_page),
						    PAGE_SIZE));
	}

	list_add_tail(&sh->log_list, &io->stripe_list);
	atomic_inc(&io->pending_stripes);
	sh->ppl_io = io;

	return 0;
}

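/*
 * Log the partial parity for a stripe before its data and parity are written
 * to the array.  Returns -EAGAIN if the stripe cannot or need not be logged
 * (already logged, under sync, no partial parity page, or the parity disk is
 * not being written / not in sync); the caller then handles the stripe the
 * usual way.  On success, STRIPE_LOG_TRAPPED keeps the stripe from being
 * written until the PPL write completes.
 */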
int ppl_write_stripe(struct r5conf *conf, struct stripe_head *sh)
{
	struct ppl_conf *ppl_conf = conf->log_private;
	struct ppl_io_unit *io = sh->ppl_io;
	struct ppl_log *log;

	if (io || test_bit(STRIPE_SYNCING, &sh->state) || !sh->ppl_page ||
	    !test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) ||
	    !test_bit(R5_Insync, &sh->dev[sh->pd_idx].flags)) {
		clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
		return -EAGAIN;
	}

	log = &ppl_conf->child_logs[sh->pd_idx];

	mutex_lock(&log->io_mutex);

	if (!log->rdev || test_bit(Faulty, &log->rdev->flags)) {
		mutex_unlock(&log->io_mutex);
		return -EAGAIN;
	}

	set_bit(STRIPE_LOG_TRAPPED, &sh->state);
	clear_bit(STRIPE_DELAYED, &sh->state);
	atomic_inc(&sh->count);

	if (ppl_log_stripe(log, sh)) {
		spin_lock_irq(&ppl_conf->no_mem_stripes_lock);
		list_add_tail(&sh->log_list, &ppl_conf->no_mem_stripes);
		spin_unlock_irq(&ppl_conf->no_mem_stripes_lock);
	}

	mutex_unlock(&log->io_mutex);

	return 0;
}

/* called when the write of an io_unit to the PPL area completes */
static void ppl_log_endio(struct bio *bio, int error)
{
	struct ppl_io_unit *io = bio->bi_private;
	struct ppl_log *log = io->log;
	struct ppl_conf *ppl_conf = log->ppl_conf;
	struct stripe_head *sh, *next;

	pr_debug("%s: seq: %llu\n", __func__, io->seq);

	if (error)
		md_error(ppl_conf->mddev, log->rdev);

	/* release the logged stripes so their data can be written */
	list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) {
		list_del_init(&sh->log_list);

		set_bit(STRIPE_HANDLE, &sh->state);
		raid5_release_stripe(sh);
	}
}

static void ppl_submit_iounit_bio(struct ppl_io_unit *io, struct bio *bio)
{
	char b[BDEVNAME_SIZE];

	pr_debug("%s: seq: %llu size: %u sector: %llu dev: %s\n",
		 __func__, io->seq, bio->bi_size,
		 (unsigned long long)bio->bi_sector,
		 bdevname(bio->bi_bdev, b));

	submit_bio(bio->bi_rw, bio);
}

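/*
 * Finalize the header (entry count, sector scaling to the configured block
 * size, checksums) and submit the io_unit: one bio carrying the header page
 * followed by the partial parity pages of each logged stripe.  If the pages
 * do not fit in a single bio, additional bios are allocated from the bio_set
 * and chained to the previous one.
 */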
static void ppl_submit_iounit(struct ppl_io_unit *io)
{
	struct ppl_log *log = io->log;
	struct ppl_conf *ppl_conf = log->ppl_conf;
	struct ppl_header *pplhdr = page_address(io->header_page);
	struct bio *bio = &io->bio;
	struct stripe_head *sh;
	int i;

	bio->bi_private = io;

	if (!log->rdev || test_bit(Faulty, &log->rdev->flags)) {
		ppl_log_endio(bio, 0);
		return;
	}

	for (i = 0; i < io->entries_count; i++) {
		struct ppl_header_entry *e = &pplhdr->entries[i];

		pr_debug("%s: seq: %llu entry: %d data_sector: %llu pp_size: %u data_size: %u\n",
			 __func__, io->seq, i, le64_to_cpu(e->data_sector),
			 le32_to_cpu(e->pp_size), le32_to_cpu(e->data_size));

		/* convert data_sector to the configured logical block size */
		e->data_sector = cpu_to_le64(le64_to_cpu(e->data_sector) >>
					     ilog2(ppl_conf->block_size >> 9));
		e->checksum = cpu_to_le32(~le32_to_cpu(e->checksum));
	}

	pplhdr->entries_count = cpu_to_le32(io->entries_count);
	pplhdr->checksum = cpu_to_le32(~crc32c_le(~0, pplhdr, PPL_HEADER_SIZE));

	/* rewind to the start of the PPL space if this io_unit won't fit */
	if (log->use_multippl &&
	    log->rdev->ppl.sector + log->rdev->ppl.size - log->next_io_sector <
	    (PPL_HEADER_SIZE + io->pp_size) >> 9)
		log->next_io_sector = log->rdev->ppl.sector;

	bio->bi_end_io = ppl_log_endio;
	bio->bi_rw = WRITE_FUA;
	bio->bi_bdev = log->rdev->bdev;
	bio->bi_sector = log->next_io_sector;
	bio_add_page(bio, io->header_page, PAGE_SIZE, 0);

	pr_debug("%s: log->current_io_sector: %llu\n", __func__,
		 (unsigned long long)log->next_io_sector);

	if (log->use_multippl)
		log->next_io_sector += (PPL_HEADER_SIZE + io->pp_size) >> 9;

	list_for_each_entry(sh, &io->stripe_list, log_list) {
		/* entries for full stripe writes have no partial parity */
		if (test_bit(STRIPE_FULL_WRITE, &sh->state))
			continue;

		if (!bio_add_page(bio, sh->ppl_page, PAGE_SIZE, 0)) {
			struct bio *prev = bio;

			bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES,
					       ppl_conf->bs);
			bio->bi_rw = prev->bi_rw;
			bio->bi_bdev = prev->bi_bdev;
			bio->bi_sector = bio_end_sector(prev);
			bio_add_page(bio, sh->ppl_page, PAGE_SIZE, 0);

			bio_chain(bio, prev);
			ppl_submit_iounit_bio(io, prev);
		}
	}

	ppl_submit_iounit_bio(io, bio);
}

static void ppl_submit_current_io(struct ppl_log *log)
{
	struct ppl_io_unit *io;

	spin_lock_irq(&log->io_list_lock);

	io = list_first_entry_or_null(&log->io_list, struct ppl_io_unit,
				      log_sibling);
	if (io && io->submitted)
		io = NULL;

	spin_unlock_irq(&log->io_list_lock);

	if (io) {
		io->submitted = true;

		if (io == log->current_io)
			log->current_io = NULL;

		ppl_submit_iounit(io);
	}
}

void ppl_write_stripe_run(struct r5conf *conf)
{
	struct ppl_conf *ppl_conf = conf->log_private;
	struct ppl_log *log;
	int i;

	for (i = 0; i < ppl_conf->count; i++) {
		log = &ppl_conf->child_logs[i];

		mutex_lock(&log->io_mutex);
		ppl_submit_current_io(log);
		mutex_unlock(&log->io_mutex);
	}
}

static void ppl_io_unit_finished(struct ppl_io_unit *io)
{
	struct ppl_log *log = io->log;
	struct ppl_conf *ppl_conf = log->ppl_conf;
	unsigned long flags;

	pr_debug("%s: seq: %llu\n", __func__, io->seq);

	local_irq_save(flags);

	spin_lock(&log->io_list_lock);
	list_del(&io->log_sibling);
	spin_unlock(&log->io_list_lock);

	mempool_free(io, ppl_conf->io_pool);

	spin_lock(&ppl_conf->no_mem_stripes_lock);
	if (!list_empty(&ppl_conf->no_mem_stripes)) {
		struct stripe_head *sh;

		/* retry a stripe that previously failed io_unit allocation */
		sh = list_first_entry(&ppl_conf->no_mem_stripes,
				      struct stripe_head, log_list);
		list_del_init(&sh->log_list);
		set_bit(STRIPE_HANDLE, &sh->state);
		raid5_release_stripe(sh);
	}
	spin_unlock(&ppl_conf->no_mem_stripes_lock);

	local_irq_restore(flags);
}

void ppl_stripe_write_finished(struct stripe_head *sh)
{
	struct ppl_io_unit *io;

	io = sh->ppl_io;
	sh->ppl_io = NULL;

	if (io && atomic_dec_and_test(&io->pending_stripes))
		ppl_io_unit_finished(io);
}

static void ppl_xor(int size, struct page *page1, struct page *page2)
{
	struct async_submit_ctl submit;
	struct dma_async_tx_descriptor *tx;
	struct page *xor_srcs[] = { page1, page2 };

	init_async_submit(&submit, ASYNC_TX_ACK|ASYNC_TX_XOR_DROP_DST,
			  NULL, NULL, NULL, NULL);
	tx = async_xor(page1, xor_srcs, 0, 2, size, &submit);

	async_tx_quiesce(&tx);
}

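/*
 * Replay a single PPL entry: for each block of the strips covered by the
 * entry, read the corresponding data from the member disks, xor it with the
 * partial parity read from the PPL area, and write the result to the parity
 * disk.  If pp_size is 0 the entry describes a full stripe write and parity
 * is recomputed from data alone.  A block whose data member disk is missing
 * or not yet recovered up to the needed sector is skipped.
 */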
static int ppl_recover_entry(struct ppl_log *log, struct ppl_header_entry *e,
			     sector_t ppl_sector)
{
	struct ppl_conf *ppl_conf = log->ppl_conf;
	struct mddev *mddev = ppl_conf->mddev;
	struct r5conf *conf = mddev->private;
	int block_size = ppl_conf->block_size;
	struct page *page1;
	struct page *page2;
	sector_t r_sector_first;
	sector_t r_sector_last;
	int strip_sectors;
	int data_disks;
	int i;
	int ret = 0;
	char b[BDEVNAME_SIZE];
	unsigned int pp_size = le32_to_cpu(e->pp_size);
	unsigned int data_size = le32_to_cpu(e->data_size);

	page1 = alloc_page(GFP_KERNEL);
	page2 = alloc_page(GFP_KERNEL);

	if (!page1 || !page2) {
		ret = -ENOMEM;
		goto out;
	}

	r_sector_first = le64_to_cpu(e->data_sector) * (block_size >> 9);

	/* determine the range of array sectors described by this entry */
	if ((pp_size >> 9) < conf->chunk_sectors) {
		if (pp_size > 0) {
			data_disks = data_size / pp_size;
			strip_sectors = pp_size >> 9;
		} else {
			data_disks = conf->raid_disks - conf->max_degraded;
			strip_sectors = (data_size >> 9) / data_disks;
		}
		r_sector_last = r_sector_first +
				(data_disks - 1) * conf->chunk_sectors +
				strip_sectors;
	} else {
		data_disks = conf->raid_disks - conf->max_degraded;
		strip_sectors = conf->chunk_sectors;
		r_sector_last = r_sector_first + (data_size >> 9);
	}

	pr_debug("%s: array sector first: %llu last: %llu\n", __func__,
		 (unsigned long long)r_sector_first,
		 (unsigned long long)r_sector_last);

	/* if start and end are 4k aligned, use a 4k block for the xor loop */
	if (block_size == 512 &&
	    (r_sector_first & (STRIPE_SECTORS - 1)) == 0 &&
	    (r_sector_last & (STRIPE_SECTORS - 1)) == 0)
		block_size = STRIPE_SIZE;

	/* iterate through blocks in strip */
	for (i = 0; i < strip_sectors; i += (block_size >> 9)) {
		bool update_parity = false;
		sector_t parity_sector;
		struct md_rdev *parity_rdev;
		struct stripe_head sh;
		int disk;
		int indent = 0;

		pr_debug("%s:%*s iter %d start\n", __func__, indent, "", i);
		indent += 2;

		memset(page_address(page1), 0, PAGE_SIZE);

		/* iterate through data member disks */
		for (disk = 0; disk < data_disks; disk++) {
			int dd_idx;
			struct md_rdev *rdev;
			sector_t sector;
			sector_t r_sector = r_sector_first + i +
					    (disk * conf->chunk_sectors);

			pr_debug("%s:%*s data member disk %d start\n",
				 __func__, indent, "", disk);
			indent += 2;

			if (r_sector >= r_sector_last) {
				pr_debug("%s:%*s array sector %llu doesn't need parity update\n",
					 __func__, indent, "",
					 (unsigned long long)r_sector);
				indent -= 2;
				continue;
			}

			update_parity = true;

			/* map raid sector to member disk */
			sector = raid5_compute_sector(conf, r_sector, 0,
						      &dd_idx, NULL);
			pr_debug("%s:%*s processing array sector %llu => data member disk %d, sector %llu\n",
				 __func__, indent, "",
				 (unsigned long long)r_sector, dd_idx,
				 (unsigned long long)sector);

			rdev = conf->disks[dd_idx].rdev;
			if (!rdev || (!test_bit(In_sync, &rdev->flags) &&
				      sector >= rdev->recovery_offset)) {
				pr_debug("%s:%*s data member disk %d missing\n",
					 __func__, indent, "", dd_idx);
				update_parity = false;
				break;
			}

			pr_debug("%s:%*s reading data member disk %s sector %llu\n",
				 __func__, indent, "", bdevname(rdev->bdev, b),
				 (unsigned long long)sector);
			if (!sync_page_io(rdev, sector, block_size, page2,
					  READ, false)) {
				md_error(mddev, rdev);
				pr_debug("%s:%*s read failed!\n", __func__,
					 indent, "");
				ret = -EIO;
				goto out;
			}

			ppl_xor(block_size, page1, page2);

			indent -= 2;
		}

		if (!update_parity)
			continue;

		if (pp_size > 0) {
			pr_debug("%s:%*s reading pp disk sector %llu\n",
				 __func__, indent, "",
				 (unsigned long long)(ppl_sector + i));
			if (!sync_page_io(log->rdev,
					  ppl_sector - log->rdev->data_offset + i,
					  block_size, page2, READ, false)) {
				pr_debug("%s:%*s read failed!\n", __func__,
					 indent, "");
				md_error(mddev, log->rdev);
				ret = -EIO;
				goto out;
			}

			ppl_xor(block_size, page1, page2);
		}

		/* map raid sector to parity disk */
		parity_sector = raid5_compute_sector(conf, r_sector_first + i,
						     0, &disk, &sh);
		BUG_ON(sh.pd_idx != le32_to_cpu(e->parity_disk));
		parity_rdev = conf->disks[sh.pd_idx].rdev;

		BUG_ON(parity_rdev->bdev->bd_dev != log->rdev->bdev->bd_dev);
		pr_debug("%s:%*s write parity at sector %llu, disk %s\n",
			 __func__, indent, "",
			 (unsigned long long)parity_sector,
			 bdevname(parity_rdev->bdev, b));
		if (!sync_page_io(parity_rdev, parity_sector, block_size,
				  page1, WRITE, false)) {
			pr_debug("%s:%*s parity write error!\n", __func__,
				 indent, "");
			md_error(mddev, parity_rdev);
			ret = -EIO;
			goto out;
		}
	}
out:
	if (page1)
		__free_page(page1);
	if (page2)
		__free_page(page2);
	return ret;
}

static int ppl_recover(struct ppl_log *log, struct ppl_header *pplhdr,
		       sector_t offset)
{
	struct ppl_conf *ppl_conf = log->ppl_conf;
	struct md_rdev *rdev = log->rdev;
	struct mddev *mddev = rdev->mddev;
	sector_t ppl_sector = rdev->ppl.sector + offset +
			      (PPL_HEADER_SIZE >> 9);
	struct page *page;
	int i;
	int ret = 0;

	page = alloc_page(GFP_KERNEL);
	if (!page)
		return -ENOMEM;

	/* iterate through all PPL entries saved in this log */
	for (i = 0; i < le32_to_cpu(pplhdr->entries_count); i++) {
		struct ppl_header_entry *e = &pplhdr->entries[i];
		u32 pp_size = le32_to_cpu(e->pp_size);
		sector_t sector = ppl_sector;
		int ppl_entry_sectors = pp_size >> 9;
		u32 crc, crc_stored;

		pr_debug("%s: disk: %d entry: %d ppl_sector: %llu pp_size: %u\n",
			 __func__, rdev->raid_disk, i,
			 (unsigned long long)ppl_sector, pp_size);

		crc = ~0;
		crc_stored = le32_to_cpu(e->checksum);

		/* calculate checksum of the partial parity for this entry */
		while (pp_size) {
			int s = pp_size > PAGE_SIZE ? PAGE_SIZE : pp_size;

			if (!sync_page_io(rdev, sector - rdev->data_offset,
					  s, page, READ, false)) {
				md_error(mddev, rdev);
				ret = -EIO;
				goto out;
			}

			crc = crc32c_le(crc, page_address(page), s);

			pp_size -= s;
			sector += s >> 9;
		}

		crc = ~crc;

		if (crc != crc_stored) {
			/*
			 * Don't recover this entry if the checksum does not
			 * match, but keep going and try to recover other
			 * entries.
			 */
			pr_debug("%s: ppl entry crc does not match: stored: 0x%x calculated: 0x%x\n",
				 __func__, crc_stored, crc);
			ppl_conf->mismatch_count++;
		} else {
			ret = ppl_recover_entry(log, e, ppl_sector);
			if (ret)
				goto out;
			ppl_conf->recovered_entries++;
		}

		ppl_sector += ppl_entry_sectors;
	}

	/* flush the disk cache after recovery */
	ret = blkdev_issue_flush(rdev->bdev, GFP_KERNEL, NULL);
out:
	__free_page(page);
	return ret;
}

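/*
 * Zero out the PPL space and write an empty header (signature and checksum
 * only) so that stale log contents are never replayed.  Used when a log is
 * (re)initialized for a member drive.
 */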
static int ppl_write_empty_header(struct ppl_log *log)
{
	struct page *page;
	struct ppl_header *pplhdr;
	struct md_rdev *rdev = log->rdev;
	int ret = 0;

	pr_debug("%s: disk: %d ppl_sector: %llu\n", __func__,
		 rdev->raid_disk, (unsigned long long)rdev->ppl.sector);

	page = alloc_page(GFP_NOIO | __GFP_ZERO);
	if (!page)
		return -ENOMEM;

	pplhdr = page_address(page);

	blkdev_issue_zeroout(rdev->bdev, rdev->ppl.sector,
			     log->rdev->ppl.size, GFP_NOIO);
	memset(pplhdr->reserved, 0xff, PPL_HDR_RESERVED);
	pplhdr->signature = cpu_to_le32(log->ppl_conf->signature);
	pplhdr->checksum = cpu_to_le32(~crc32c_le(~0, pplhdr, PAGE_SIZE));

	if (!sync_page_io(rdev, rdev->ppl.sector - rdev->data_offset,
			  PPL_HEADER_SIZE, page, WRITE_FUA, false)) {
		md_error(rdev->mddev, rdev);
		ret = -EIO;
	}

	__free_page(page);
	return ret;
}

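/*
 * Scan the PPL space of a member drive and find the header with the highest
 * generation, walking from one header to the next using the sizes of the
 * recorded entries.  The scan stops at the first header that fails its
 * checksum or signature check, or whose generation is older than the
 * previous one.  If a valid header is found and the array is being started
 * dirty, the log is replayed; an empty header is then written before the
 * array starts.
 */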
static int ppl_load_distributed(struct ppl_log *log)
{
	struct ppl_conf *ppl_conf = log->ppl_conf;
	struct md_rdev *rdev = log->rdev;
	struct mddev *mddev = rdev->mddev;
	struct page *page, *page2, *tmp;
	struct ppl_header *pplhdr = NULL, *prev_pplhdr = NULL;
	u32 crc, crc_stored;
	u32 signature;
	int ret = 0, i;
	sector_t pplhdr_offset = 0, prev_pplhdr_offset = 0;

	pr_debug("%s: disk: %d\n", __func__, rdev->raid_disk);

	page = alloc_page(GFP_KERNEL);
	if (!page)
		return -ENOMEM;

	page2 = alloc_page(GFP_KERNEL);
	if (!page2) {
		__free_page(page);
		return -ENOMEM;
	}

	/* searching ppl area for latest ppl */
	while (pplhdr_offset < rdev->ppl.size - (PPL_HEADER_SIZE >> 9)) {
		if (!sync_page_io(rdev,
				  rdev->ppl.sector - rdev->data_offset +
				  pplhdr_offset, PAGE_SIZE, page, READ,
				  false)) {
			md_error(mddev, rdev);
			ret = -EIO;
			/* if not able to read - don't recover any PPL */
			pplhdr = NULL;
			break;
		}
		pplhdr = page_address(page);

		/* check header validity */
		crc_stored = le32_to_cpu(pplhdr->checksum);
		pplhdr->checksum = 0;
		crc = ~crc32c_le(~0, pplhdr, PAGE_SIZE);

		if (crc_stored != crc) {
			pr_debug("%s: ppl header crc does not match: stored: 0x%x calculated: 0x%x (offset: %llu)\n",
				 __func__, crc_stored, crc,
				 (unsigned long long)pplhdr_offset);
			pplhdr = prev_pplhdr;
			pplhdr_offset = prev_pplhdr_offset;
			break;
		}

		signature = le32_to_cpu(pplhdr->signature);

		if (mddev->external) {
			/*
			 * With external metadata there is no signature
			 * configured in the kernel, so take it from the
			 * header; consistency across all member drives is
			 * checked later in ppl_load().
			 */
			ppl_conf->signature = signature;
		} else if (ppl_conf->signature != signature) {
			pr_debug("%s: ppl header signature does not match: stored: 0x%x configured: 0x%x (offset: %llu)\n",
				 __func__, signature, ppl_conf->signature,
				 (unsigned long long)pplhdr_offset);
			pplhdr = prev_pplhdr;
			pplhdr_offset = prev_pplhdr_offset;
			break;
		}

		if (prev_pplhdr && le64_to_cpu(prev_pplhdr->generation) >
		    le64_to_cpu(pplhdr->generation)) {
			/* previous was newest */
			pplhdr = prev_pplhdr;
			pplhdr_offset = prev_pplhdr_offset;
			break;
		}

		prev_pplhdr_offset = pplhdr_offset;
		prev_pplhdr = pplhdr;

		tmp = page;
		page = page2;
		page2 = tmp;

		/* calculate next potential ppl offset */
		for (i = 0; i < le32_to_cpu(pplhdr->entries_count); i++)
			pplhdr_offset +=
			    le32_to_cpu(pplhdr->entries[i].pp_size) >> 9;
		pplhdr_offset += PPL_HEADER_SIZE >> 9;
	}

	/* no valid ppl found */
	if (!pplhdr)
		ppl_conf->mismatch_count++;
	else
		pr_debug("%s: latest PPL found at offset: %llu, with generation: %llu\n",
			 __func__, (unsigned long long)pplhdr_offset,
			 le64_to_cpu(pplhdr->generation));

	/* attempt to recover from log if we are starting a dirty array */
	if (pplhdr && !mddev->pers && mddev->recovery_cp != MaxSector)
		ret = ppl_recover(log, pplhdr, pplhdr_offset);

	/* write empty header if we are starting the array */
	if (!ret && !mddev->pers)
		ret = ppl_write_empty_header(log);

	__free_page(page);
	__free_page(page2);

	pr_debug("%s: return: %d mismatch_count: %d recovered_entries: %d\n",
		 __func__, ret, ppl_conf->mismatch_count,
		 ppl_conf->recovered_entries);
	return ret;
}

static int ppl_load(struct ppl_conf *ppl_conf)
{
	int ret = 0;
	u32 signature = 0;
	bool signature_set = false;
	int i;

	for (i = 0; i < ppl_conf->count; i++) {
		struct ppl_log *log = &ppl_conf->child_logs[i];

		/* skip missing drive */
		if (!log->rdev)
			continue;

		ret = ppl_load_distributed(log);
		if (ret)
			break;

		/*
		 * For external metadata we can't check if the signature is
		 * correct on a single drive, but we can check if it is the
		 * same for all drives.
		 */
		if (ppl_conf->mddev->external) {
			if (!signature_set) {
				signature = ppl_conf->signature;
				signature_set = true;
			} else if (signature != ppl_conf->signature) {
				pr_warn("md/raid:%s: PPL header signature does not match on all member drives\n",
					mdname(ppl_conf->mddev));
				ret = -EINVAL;
				break;
			}
		}
	}

	pr_debug("%s: return: %d mismatch_count: %d recovered_entries: %d\n",
		 __func__, ret, ppl_conf->mismatch_count,
		 ppl_conf->recovered_entries);
	return ret;
}

static void __ppl_exit_log(struct ppl_conf *ppl_conf)
{
	clear_bit(MD_HAS_PPL, &ppl_conf->mddev->flags);
	clear_bit(MD_HAS_MULTIPLE_PPLS, &ppl_conf->mddev->flags);

	kfree(ppl_conf->child_logs);

	if (ppl_conf->bs)
		bioset_free(ppl_conf->bs);
	mempool_destroy(ppl_conf->io_pool);
	kmem_cache_destroy(ppl_conf->io_kc);

	kfree(ppl_conf);
}

void ppl_exit_log(struct r5conf *conf)
{
	struct ppl_conf *ppl_conf = conf->log_private;

	if (ppl_conf) {
		__ppl_exit_log(ppl_conf);
		conf->log_private = NULL;
	}
}

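/*
 * Check that the PPL area configured for a member drive is large enough
 * (header plus at least one stripe of partial parity, rounded down to a
 * multiple of the stripe size) and that it does not overlap the data area
 * or, for arrays with native metadata, the superblock.  On success,
 * rdev->ppl.size is trimmed to the usable size.
 */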
static int ppl_validate_rdev(struct md_rdev *rdev)
{
	char b[BDEVNAME_SIZE];
	int ppl_data_sectors;
	int ppl_size_new;

	/*
	 * The configured PPL size must be enough to store the header and,
	 * at the very least, partial parity for one full stripe.  Round the
	 * data space down to a multiple of the stripe size.
	 */
	ppl_data_sectors = rdev->ppl.size - (PPL_HEADER_SIZE >> 9);

	if (ppl_data_sectors > 0)
		ppl_data_sectors = rounddown(ppl_data_sectors, STRIPE_SECTORS);

	if (ppl_data_sectors <= 0) {
		pr_warn("md/raid:%s: PPL space too small on %s\n",
			mdname(rdev->mddev), bdevname(rdev->bdev, b));
		return -ENOSPC;
	}

	ppl_size_new = ppl_data_sectors + (PPL_HEADER_SIZE >> 9);

	if ((rdev->ppl.sector < rdev->data_offset &&
	     rdev->ppl.sector + ppl_size_new > rdev->data_offset) ||
	    (rdev->ppl.sector >= rdev->data_offset &&
	     rdev->data_offset + rdev->sectors > rdev->ppl.sector)) {
		pr_warn("md/raid:%s: PPL space overlaps with data on %s\n",
			mdname(rdev->mddev), bdevname(rdev->bdev, b));
		return -EINVAL;
	}

	if (!rdev->mddev->external &&
	    ((rdev->ppl.offset > 0 && rdev->ppl.offset < (rdev->sb_size >> 9)) ||
	     (rdev->ppl.offset <= 0 && rdev->ppl.offset + ppl_size_new > 0))) {
		pr_warn("md/raid:%s: PPL space overlaps with superblock on %s\n",
			mdname(rdev->mddev), bdevname(rdev->bdev, b));
		return -EINVAL;
	}

	rdev->ppl.size = ppl_size_new;

	return 0;
}

static void ppl_init_child_log(struct ppl_log *log, struct md_rdev *rdev)
{
	if ((rdev->ppl.size << 9) >= (PPL_SPACE_SIZE +
				      PPL_HEADER_SIZE) * 2) {
		log->use_multippl = true;
		set_bit(MD_HAS_MULTIPLE_PPLS,
			&log->ppl_conf->mddev->flags);
		log->entry_space = PPL_SPACE_SIZE;
	} else {
		log->use_multippl = false;
		log->entry_space = (log->rdev->ppl.size << 9) -
				   PPL_HEADER_SIZE;
	}
	log->next_io_sector = rdev->ppl.sector;
}

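/*
 * Set up the PPL for an array: allocate the ppl_conf, the io_unit cache,
 * mempool and bio_set, create one child log per raid disk, validate each
 * member's PPL space, and load (and possibly replay) the existing logs.
 * PPL is only supported for RAID5 with a 4k PAGE_SIZE and cannot be combined
 * with a write-intent bitmap or a journal device.
 */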
int ppl_init_log(struct r5conf *conf)
{
	struct ppl_conf *ppl_conf;
	struct mddev *mddev = conf->mddev;
	int ret = 0;
	int i;
	bool need_cache_flush = false;

	pr_debug("md/raid:%s: enabling distributed Partial Parity Log\n",
		 mdname(conf->mddev));

	if (PAGE_SIZE != 4096)
		return -EINVAL;

	if (mddev->level != 5) {
		pr_warn("md/raid:%s PPL is not compatible with raid level %d\n",
			mdname(mddev), mddev->level);
		return -EINVAL;
	}

	if (mddev->bitmap_info.file || mddev->bitmap_info.offset) {
		pr_warn("md/raid:%s PPL is not compatible with bitmap\n",
			mdname(mddev));
		return -EINVAL;
	}

	if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
		pr_warn("md/raid:%s PPL is not compatible with journal\n",
			mdname(mddev));
		return -EINVAL;
	}

	ppl_conf = kzalloc(sizeof(struct ppl_conf), GFP_KERNEL);
	if (!ppl_conf)
		return -ENOMEM;

	ppl_conf->mddev = mddev;

	ppl_conf->io_kc = KMEM_CACHE(ppl_io_unit, 0);
	if (!ppl_conf->io_kc) {
		ret = -ENOMEM;
		goto err;
	}

	ppl_conf->io_pool = mempool_create(conf->raid_disks, ppl_io_pool_alloc,
					   ppl_io_pool_free, ppl_conf->io_kc);
	if (!ppl_conf->io_pool) {
		ret = -ENOMEM;
		goto err;
	}

	ppl_conf->bs = bioset_create(conf->raid_disks, 0);
	if (!ppl_conf->bs) {
		ret = -ENOMEM;
		goto err;
	}

	ppl_conf->count = conf->raid_disks;
	ppl_conf->child_logs = kcalloc(ppl_conf->count, sizeof(struct ppl_log),
				       GFP_KERNEL);
	if (!ppl_conf->child_logs) {
		ret = -ENOMEM;
		goto err;
	}

	atomic64_set(&ppl_conf->seq, 0);
	INIT_LIST_HEAD(&ppl_conf->no_mem_stripes);
	spin_lock_init(&ppl_conf->no_mem_stripes_lock);

	if (!mddev->external) {
		ppl_conf->signature = ~crc32c_le(~0, mddev->uuid, sizeof(mddev->uuid));
		ppl_conf->block_size = 512;
	} else {
		ppl_conf->block_size = queue_logical_block_size(mddev->queue);
	}

	for (i = 0; i < ppl_conf->count; i++) {
		struct ppl_log *log = &ppl_conf->child_logs[i];
		struct md_rdev *rdev = conf->disks[i].rdev;

		mutex_init(&log->io_mutex);
		spin_lock_init(&log->io_list_lock);
		INIT_LIST_HEAD(&log->io_list);

		log->ppl_conf = ppl_conf;
		log->rdev = rdev;

		if (rdev) {
			struct request_queue *q;

			ret = ppl_validate_rdev(rdev);
			if (ret)
				goto err;

			q = bdev_get_queue(rdev->bdev);
			if (q->flush_flags)
				need_cache_flush = true;
			ppl_init_child_log(log, rdev);
		}
	}

	if (need_cache_flush)
		pr_warn("md/raid:%s: Volatile write-back cache should be disabled on all member drives when using PPL!\n",
			mdname(mddev));

	/* load and possibly recover the logs from the member disks */
	ret = ppl_load(ppl_conf);

	if (ret) {
		goto err;
	} else if (!mddev->pers && mddev->recovery_cp == 0 &&
		   ppl_conf->recovered_entries > 0 &&
		   ppl_conf->mismatch_count == 0) {
		/*
		 * If we are starting a dirty array and the recovery succeeds
		 * without any issues, set the array as clean.
		 */
		mddev->recovery_cp = MaxSector;
		set_bit(MD_SB_CHANGE_CLEAN, &mddev->flags);
	} else if (mddev->pers && ppl_conf->mismatch_count > 0) {
		/* don't enable PPL for a running array with invalid logs */
		ret = -EINVAL;
		goto err;
	}

	conf->log_private = ppl_conf;
	set_bit(MD_HAS_PPL, &ppl_conf->mddev->flags);

	return 0;
err:
	__ppl_exit_log(ppl_conf);
	return ret;
}

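/*
 * Update the child log when a member drive is added to or removed from the
 * array.  Adding a drive validates its PPL space and writes an empty header
 * before the log is attached; removing a drive simply detaches it from the
 * log.
 */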
int ppl_modify_log(struct r5conf *conf, struct md_rdev *rdev, bool add)
{
	struct ppl_conf *ppl_conf = conf->log_private;
	struct ppl_log *log;
	int ret = 0;
	char b[BDEVNAME_SIZE];

	if (!rdev)
		return -EINVAL;

	pr_debug("%s: disk: %d operation: %s dev: %s\n",
		 __func__, rdev->raid_disk, add ? "add" : "remove",
		 bdevname(rdev->bdev, b));

	if (rdev->raid_disk < 0)
		return 0;

	if (rdev->raid_disk >= ppl_conf->count)
		return -ENODEV;

	log = &ppl_conf->child_logs[rdev->raid_disk];

	mutex_lock(&log->io_mutex);
	if (add) {
		ret = ppl_validate_rdev(rdev);
		if (!ret) {
			log->rdev = rdev;
			ret = ppl_write_empty_header(log);
			ppl_init_child_log(log, rdev);
		}
	} else {
		log->rdev = NULL;
	}
	mutex_unlock(&log->io_mutex);

	return ret;
}