// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2008 Oracle.  All rights reserved.
 */

#include <linux/kernel.h>
#include <linux/bio.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/time.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/writeback.h>
#include <linux/slab.h>
#include <linux/sched/mm.h>
#include <linux/log2.h>
#include <crypto/hash.h>
#include "misc.h"
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "volumes.h"
#include "ordered-data.h"
#include "compression.h"
#include "extent_io.h"
#include "extent_map.h"
#include "zoned.h"

static const char *const btrfs_compress_types[] = { "", "zlib", "lzo", "zstd" };

const char *btrfs_compress_type2str(enum btrfs_compression_type type)
{
	switch (type) {
	case BTRFS_COMPRESS_ZLIB:
	case BTRFS_COMPRESS_LZO:
	case BTRFS_COMPRESS_ZSTD:
	case BTRFS_COMPRESS_NONE:
		return btrfs_compress_types[type];
	default:
		break;
	}

	return NULL;
}

bool btrfs_compress_is_valid_type(const char *str, size_t len)
{
	int i;

	for (i = 1; i < ARRAY_SIZE(btrfs_compress_types); i++) {
		size_t comp_len = strlen(btrfs_compress_types[i]);

		if (len < comp_len)
			continue;

		if (!strncmp(btrfs_compress_types[i], str, comp_len))
			return true;
	}
	return false;
}

static int compression_compress_pages(int type, struct list_head *ws,
		struct address_space *mapping, u64 start, struct page **pages,
		unsigned long *out_pages, unsigned long *total_in,
		unsigned long *total_out)
{
	switch (type) {
	case BTRFS_COMPRESS_ZLIB:
		return zlib_compress_pages(ws, mapping, start, pages,
				out_pages, total_in, total_out);
	case BTRFS_COMPRESS_LZO:
		return lzo_compress_pages(ws, mapping, start, pages,
				out_pages, total_in, total_out);
	case BTRFS_COMPRESS_ZSTD:
		return zstd_compress_pages(ws, mapping, start, pages,
				out_pages, total_in, total_out);
	case BTRFS_COMPRESS_NONE:
	default:
		/*
		 * This can happen when compression races with remount setting
		 * it to 'no compress', while caller doesn't call
		 * inode_need_compress() to check if we really need to
		 * compress.
		 *
		 * Not a big deal, just need to inform the caller that we
		 * haven't allocated any pages yet.
		 */
		*out_pages = 0;
		return -E2BIG;
	}
}

static int compression_decompress_bio(int type, struct list_head *ws,
				      struct compressed_bio *cb)
{
	switch (type) {
	case BTRFS_COMPRESS_ZLIB: return zlib_decompress_bio(ws, cb);
	case BTRFS_COMPRESS_LZO:  return lzo_decompress_bio(ws, cb);
	case BTRFS_COMPRESS_ZSTD: return zstd_decompress_bio(ws, cb);
	case BTRFS_COMPRESS_NONE:
	default:
		/*
		 * This can't happen, the type is validated several times
		 * before we get here.
		 */
		BUG();
	}
}

static int compression_decompress(int type, struct list_head *ws,
		unsigned char *data_in, struct page *dest_page,
		unsigned long start_byte, size_t srclen, size_t destlen)
{
	switch (type) {
	case BTRFS_COMPRESS_ZLIB: return zlib_decompress(ws, data_in, dest_page,
						start_byte, srclen, destlen);
	case BTRFS_COMPRESS_LZO:  return lzo_decompress(ws, data_in, dest_page,
						start_byte, srclen, destlen);
	case BTRFS_COMPRESS_ZSTD: return zstd_decompress(ws, data_in, dest_page,
						start_byte, srclen, destlen);
	case BTRFS_COMPRESS_NONE:
	default:
		/*
		 * This can't happen, the type is validated several times
		 * before we get here.
		 */
		BUG();
	}
}

static int btrfs_decompress_bio(struct compressed_bio *cb);

static inline int compressed_bio_size(struct btrfs_fs_info *fs_info,
				      unsigned long disk_size)
{
	return sizeof(struct compressed_bio) +
		(DIV_ROUND_UP(disk_size, fs_info->sectorsize)) * fs_info->csum_size;
}

static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio,
				 u64 disk_start)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
	const u32 csum_size = fs_info->csum_size;
	const u32 sectorsize = fs_info->sectorsize;
	struct page *page;
	unsigned int i;
	char *kaddr;
	u8 csum[BTRFS_CSUM_SIZE];
	struct compressed_bio *cb = bio->bi_private;
	u8 *cb_sum = cb->sums;

	if (!fs_info->csum_root || (inode->flags & BTRFS_INODE_NODATASUM))
		return 0;

	shash->tfm = fs_info->csum_shash;

	for (i = 0; i < cb->nr_pages; i++) {
		u32 pg_offset;
		u32 bytes_left = PAGE_SIZE;
		page = cb->compressed_pages[i];

		/* Determine the remaining bytes inside the page of the compressed extent */
		if (i == cb->nr_pages - 1)
			bytes_left = cb->compressed_len - i * PAGE_SIZE;

		/* Hash through the page sector by sector */
		for (pg_offset = 0; pg_offset < bytes_left;
		     pg_offset += sectorsize) {
			kaddr = kmap_atomic(page);
			crypto_shash_digest(shash, kaddr + pg_offset,
					    sectorsize, csum);
			kunmap_atomic(kaddr);

			if (memcmp(&csum, cb_sum, csum_size) != 0) {
				btrfs_print_data_csum_error(inode, disk_start,
						csum, cb_sum, cb->mirror_num);
				if (btrfs_io_bio(bio)->device)
					btrfs_dev_stat_inc_and_print(
						btrfs_io_bio(bio)->device,
						BTRFS_DEV_STAT_CORRUPTION_ERRS);
				return -EIO;
			}
			cb_sum += csum_size;
			disk_start += sectorsize;
		}
	}
	return 0;
}
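
/*
 * When we finish reading compressed pages from the disk, we decompress them
 * and then run the bio end_io routines on the decompressed pages (in the
 * inode address space).
 *
 * This allows the checksumming and other IO error handling routines to work
 * normally.
 *
 * The compressed pages are freed here, and it must be run in process context.
 */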
static void end_compressed_bio_read(struct bio *bio)
{
	struct compressed_bio *cb = bio->bi_private;
	struct inode *inode;
	struct page *page;
	unsigned int index;
	unsigned int mirror = btrfs_io_bio(bio)->mirror_num;
	int ret = 0;

	if (bio->bi_status)
		cb->errors = 1;

	/*
	 * If there are still pending bios for this compressed extent, just
	 * exit; the last one to complete does the cleanup below.
	 */
	if (!refcount_dec_and_test(&cb->pending_bios))
		goto out;

	/*
	 * Record the correct mirror_num in cb->orig_bio so that read-repair
	 * can work properly.
	 */
	btrfs_io_bio(cb->orig_bio)->mirror_num = mirror;
	cb->mirror_num = mirror;

	/*
	 * Some IO in this cb has failed, just skip the checksum as there is
	 * no way it could be correct.
	 */
	if (cb->errors == 1)
		goto csum_failed;

	inode = cb->inode;
	ret = check_compressed_csum(BTRFS_I(inode), bio,
				    (u64)bio->bi_iter.bi_sector << 9);
	if (ret)
		goto csum_failed;

	/*
	 * OK, we're the last bio for this extent, let's start the
	 * decompression.
	 */
	ret = btrfs_decompress_bio(cb);

csum_failed:
	if (ret)
		cb->errors = 1;

	/* Release the compressed pages */
	for (index = 0; index < cb->nr_pages; index++) {
		page = cb->compressed_pages[index];
		page->mapping = NULL;
		put_page(page);
	}

	/* Do io completion on the original bio */
	if (cb->errors) {
		bio_io_error(cb->orig_bio);
	} else {
		struct bio_vec *bvec;
		struct bvec_iter_all iter_all;

		/*
		 * We have verified the checksum already, set page checked so
		 * the end_io handlers know about it.
		 */
		ASSERT(!bio_flagged(bio, BIO_CLONED));
		bio_for_each_segment_all(bvec, cb->orig_bio, iter_all)
			SetPageChecked(bvec->bv_page);

		bio_endio(cb->orig_bio);
	}

	/* Finally free the cb struct */
	kfree(cb->compressed_pages);
	kfree(cb);
out:
	bio_put(bio);
}
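
/*
 * Clear the writeback bits on all of the file pages for a compressed write.
 */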
static noinline void end_compressed_writeback(struct inode *inode,
					      const struct compressed_bio *cb)
{
	unsigned long index = cb->start >> PAGE_SHIFT;
	unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_SHIFT;
	struct page *pages[16];
	unsigned long nr_pages = end_index - index + 1;
	int i;
	int ret;

	if (cb->errors)
		mapping_set_error(inode->i_mapping, -EIO);

	while (nr_pages > 0) {
		ret = find_get_pages_contig(inode->i_mapping, index,
				     min_t(unsigned long,
				     nr_pages, ARRAY_SIZE(pages)), pages);
		if (ret == 0) {
			nr_pages -= 1;
			index += 1;
			continue;
		}
		for (i = 0; i < ret; i++) {
			if (cb->errors)
				SetPageError(pages[i]);
			end_page_writeback(pages[i]);
			put_page(pages[i]);
		}
		nr_pages -= ret;
		index += ret;
	}
	/* The inode may be gone now */
}
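
/*
 * Do the cleanup once all the compressed pages hit the disk.  This will clear
 * writeback on the file pages and free the compressed pages.
 *
 * This also calls the writeback end hooks for the file pages so that metadata
 * and checksums can be updated in the file.
 */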
static void end_compressed_bio_write(struct bio *bio)
{
	struct compressed_bio *cb = bio->bi_private;
	struct inode *inode;
	struct page *page;
	unsigned int index;

	if (bio->bi_status)
		cb->errors = 1;

	/*
	 * If there are still pending bios for this compressed extent, just
	 * exit; the last one to complete does the cleanup below.
	 */
	if (!refcount_dec_and_test(&cb->pending_bios))
		goto out;

	/*
	 * OK, we're the last bio for this extent, step one is to call back
	 * into the FS and do all the end_io operations.
	 */
	inode = cb->inode;
	btrfs_record_physical_zoned(inode, cb->start, bio);
	btrfs_writepage_endio_finish_ordered(BTRFS_I(inode), NULL,
			cb->start, cb->start + cb->len - 1,
			!cb->errors);

	end_compressed_writeback(inode, cb);
	/* Note, our inode could be gone now */

	/*
	 * Release the compressed pages, these came from alloc_page and are
	 * not attached to the inode at all.
	 */
	for (index = 0; index < cb->nr_pages; index++) {
		page = cb->compressed_pages[index];
		page->mapping = NULL;
		put_page(page);
	}

	/* Finally free the cb struct */
	kfree(cb->compressed_pages);
	kfree(cb);
out:
	bio_put(bio);
}
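
/*
 * Worker function to build and submit bios for previously compressed pages.
 * The corresponding pages in the inode should be marked for writeback and the
 * compressed pages should have a reference on them for dropping when the IO
 * is complete.
 *
 * This also checksums the file bytes and gets things ready for the end io
 * hooks.
 */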
blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
				 unsigned int len, u64 disk_start,
				 unsigned int compressed_len,
				 struct page **compressed_pages,
				 unsigned int nr_pages,
				 unsigned int write_flags,
				 struct cgroup_subsys_state *blkcg_css)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct bio *bio = NULL;
	struct compressed_bio *cb;
	unsigned long bytes_left;
	int pg_index = 0;
	struct page *page;
	u64 first_byte = disk_start;
	blk_status_t ret;
	int skip_sum = inode->flags & BTRFS_INODE_NODATASUM;
	const bool use_append = btrfs_use_zone_append(inode, disk_start);
	const unsigned int bio_op = use_append ? REQ_OP_ZONE_APPEND : REQ_OP_WRITE;

	WARN_ON(!PAGE_ALIGNED(start));
	cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS);
	if (!cb)
		return BLK_STS_RESOURCE;
	refcount_set(&cb->pending_bios, 0);
	cb->errors = 0;
	cb->inode = &inode->vfs_inode;
	cb->start = start;
	cb->len = len;
	cb->mirror_num = 0;
	cb->compressed_pages = compressed_pages;
	cb->compressed_len = compressed_len;
	cb->orig_bio = NULL;
	cb->nr_pages = nr_pages;

	bio = btrfs_bio_alloc(first_byte);
	bio->bi_opf = bio_op | write_flags;
	bio->bi_private = cb;
	bio->bi_end_io = end_compressed_bio_write;

	if (use_append) {
		struct btrfs_device *device;

		device = btrfs_zoned_get_device(fs_info, disk_start, PAGE_SIZE);
		if (IS_ERR(device)) {
			kfree(cb);
			bio_put(bio);
			return BLK_STS_NOTSUPP;
		}

		bio_set_dev(bio, device->bdev);
	}

	if (blkcg_css) {
		bio->bi_opf |= REQ_CGROUP_PUNT;
		kthread_associate_blkcg(blkcg_css);
	}
	refcount_set(&cb->pending_bios, 1);

	/* Create and submit bios for the compressed pages */
	bytes_left = compressed_len;
	for (pg_index = 0; pg_index < cb->nr_pages; pg_index++) {
		int submit = 0;
		int len = 0;

		page = compressed_pages[pg_index];
		page->mapping = inode->vfs_inode.i_mapping;
		if (bio->bi_iter.bi_size)
			submit = btrfs_bio_fits_in_stripe(page, PAGE_SIZE, bio,
							  0);

		/*
		 * The page can only be added to the bio if the current bio
		 * fits in the stripe.
		 */
		if (!submit) {
			if (pg_index == 0 && use_append)
				len = bio_add_zone_append_page(bio, page,
							       PAGE_SIZE, 0);
			else
				len = bio_add_page(bio, page, PAGE_SIZE, 0);
		}

		page->mapping = NULL;
		if (submit || len < PAGE_SIZE) {
			/*
			 * Increment the count before we submit the bio so we
			 * know the end IO handler won't happen before we
			 * increment the count.  Otherwise, the cb might get
			 * freed before we're done setting it up.
			 */
			refcount_inc(&cb->pending_bios);
			ret = btrfs_bio_wq_end_io(fs_info, bio,
						  BTRFS_WQ_ENDIO_DATA);
			BUG_ON(ret);

			if (!skip_sum) {
				ret = btrfs_csum_one_bio(inode, bio, start, 1);
				BUG_ON(ret);
			}

			ret = btrfs_map_bio(fs_info, bio, 0);
			if (ret) {
				bio->bi_status = ret;
				bio_endio(bio);
			}

			bio = btrfs_bio_alloc(first_byte);
			bio->bi_opf = bio_op | write_flags;
			bio->bi_private = cb;
			bio->bi_end_io = end_compressed_bio_write;
			if (blkcg_css)
				bio->bi_opf |= REQ_CGROUP_PUNT;
			/*
			 * Use bio_add_page() to ensure the bio has at least
			 * one page.
			 */
			bio_add_page(bio, page, PAGE_SIZE, 0);
		}
		if (bytes_left < PAGE_SIZE) {
			btrfs_info(fs_info,
					"bytes left %lu compress len %u nr %u",
			       bytes_left, cb->compressed_len, cb->nr_pages);
		}
		bytes_left -= PAGE_SIZE;
		first_byte += PAGE_SIZE;
		cond_resched();
	}

	ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
	BUG_ON(ret);

	if (!skip_sum) {
		ret = btrfs_csum_one_bio(inode, bio, start, 1);
		BUG_ON(ret);
	}

	ret = btrfs_map_bio(fs_info, bio, 0);
	if (ret) {
		bio->bi_status = ret;
		bio_endio(bio);
	}

	if (blkcg_css)
		kthread_associate_blkcg(NULL);

	return 0;
}

static u64 bio_end_offset(struct bio *bio)
{
	struct bio_vec *last = bio_last_bvec_all(bio);

	return page_offset(last->bv_page) + last->bv_len + last->bv_offset;
}

static noinline int add_ra_bio_pages(struct inode *inode,
				     u64 compressed_end,
				     struct compressed_bio *cb)
{
	unsigned long end_index;
	unsigned long pg_index;
	u64 last_offset;
	u64 isize = i_size_read(inode);
	int ret;
	struct page *page;
	unsigned long nr_pages = 0;
	struct extent_map *em;
	struct address_space *mapping = inode->i_mapping;
	struct extent_map_tree *em_tree;
	struct extent_io_tree *tree;
	u64 end;
	int misses = 0;

	last_offset = bio_end_offset(cb->orig_bio);
	em_tree = &BTRFS_I(inode)->extent_tree;
	tree = &BTRFS_I(inode)->io_tree;

	if (isize == 0)
		return 0;

	/*
	 * For current subpage support, we only support 64K page size, which
	 * means maximum compressed extent size (128K) is just 2x page size.
	 * This makes readahead less effective, so here disable readahead for
	 * subpage for now, until full compressed write is supported.
	 */
	if (btrfs_sb(inode->i_sb)->sectorsize < PAGE_SIZE)
		return 0;

	end_index = (i_size_read(inode) - 1) >> PAGE_SHIFT;

	while (last_offset < compressed_end) {
		pg_index = last_offset >> PAGE_SHIFT;

		if (pg_index > end_index)
			break;

		page = xa_load(&mapping->i_pages, pg_index);
		if (page && !xa_is_value(page)) {
			misses++;
			if (misses > 4)
				break;
			goto next;
		}

		page = __page_cache_alloc(mapping_gfp_constraint(mapping,
								 ~__GFP_FS));
		if (!page)
			break;

		if (add_to_page_cache_lru(page, mapping, pg_index, GFP_NOFS)) {
			put_page(page);
			goto next;
		}

		/*
		 * At this point, we have a locked page in the page cache for
		 * these bytes in the file.  But, we have to make sure they map
		 * to this compressed extent on disk.
		 */
		ret = set_page_extent_mapped(page);
		if (ret < 0) {
			unlock_page(page);
			put_page(page);
			break;
		}

		end = last_offset + PAGE_SIZE - 1;
		lock_extent(tree, last_offset, end);
		read_lock(&em_tree->lock);
		em = lookup_extent_mapping(em_tree, last_offset,
					   PAGE_SIZE);
		read_unlock(&em_tree->lock);

		if (!em || last_offset < em->start ||
		    (last_offset + PAGE_SIZE > extent_map_end(em)) ||
		    (em->block_start >> 9) != cb->orig_bio->bi_iter.bi_sector) {
			free_extent_map(em);
			unlock_extent(tree, last_offset, end);
			unlock_page(page);
			put_page(page);
			break;
		}
		free_extent_map(em);

		if (page->index == end_index) {
			size_t zero_offset = offset_in_page(isize);

			if (zero_offset) {
				int zeros;
				zeros = PAGE_SIZE - zero_offset;
				memzero_page(page, zero_offset, zeros);
				flush_dcache_page(page);
			}
		}

		ret = bio_add_page(cb->orig_bio, page,
				   PAGE_SIZE, 0);

		if (ret == PAGE_SIZE) {
			nr_pages++;
			put_page(page);
		} else {
			unlock_extent(tree, last_offset, end);
			unlock_page(page);
			put_page(page);
			break;
		}
next:
		last_offset += PAGE_SIZE;
	}
	return 0;
}
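
/*
 * For a compressed read, the bio we get passed has all the inode pages in it.
 * We don't actually do IO on those pages but allocate new ones to hold the
 * compressed pages on disk.
 *
 * bio->bi_iter.bi_sector points to the compressed extent on disk
 * bio->bi_io_vec points to all of the inode pages
 *
 * After the disk bio is submitted, the pages are handed off to the
 * decompression code in end_compressed_bio_read().
 */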
blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
					  int mirror_num, unsigned long bio_flags)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct extent_map_tree *em_tree;
	struct compressed_bio *cb;
	unsigned int compressed_len;
	unsigned int nr_pages;
	unsigned int pg_index;
	struct page *page;
	struct bio *comp_bio;
	u64 cur_disk_byte = (u64)bio->bi_iter.bi_sector << 9;
	u64 file_offset;
	u64 em_len;
	u64 em_start;
	struct extent_map *em;
	blk_status_t ret = BLK_STS_RESOURCE;
	int faili = 0;
	u8 *sums;

	em_tree = &BTRFS_I(inode)->extent_tree;

	file_offset = bio_first_bvec_all(bio)->bv_offset +
		      page_offset(bio_first_page_all(bio));

	/* We need the actual starting offset of this extent in the file */
	read_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, file_offset, fs_info->sectorsize);
	read_unlock(&em_tree->lock);
	if (!em)
		return BLK_STS_IOERR;

	ASSERT(em->compress_type != BTRFS_COMPRESS_NONE);
	compressed_len = em->block_len;
	cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS);
	if (!cb)
		goto out;

	refcount_set(&cb->pending_bios, 0);
	cb->errors = 0;
	cb->inode = inode;
	cb->mirror_num = mirror_num;
	sums = cb->sums;

	cb->start = em->orig_start;
	em_len = em->len;
	em_start = em->start;

	free_extent_map(em);
	em = NULL;

	cb->len = bio->bi_iter.bi_size;
	cb->compressed_len = compressed_len;
	cb->compress_type = extent_compress_type(bio_flags);
	cb->orig_bio = bio;

	nr_pages = DIV_ROUND_UP(compressed_len, PAGE_SIZE);
	cb->compressed_pages = kcalloc(nr_pages, sizeof(struct page *),
				       GFP_NOFS);
	if (!cb->compressed_pages)
		goto fail1;

	for (pg_index = 0; pg_index < nr_pages; pg_index++) {
		cb->compressed_pages[pg_index] = alloc_page(GFP_NOFS);
		if (!cb->compressed_pages[pg_index]) {
			faili = pg_index - 1;
			ret = BLK_STS_RESOURCE;
			goto fail2;
		}
	}
	faili = nr_pages - 1;
	cb->nr_pages = nr_pages;

	add_ra_bio_pages(inode, em_start + em_len, cb);

	/* Include any pages we added in add_ra_bio_pages */
	cb->len = bio->bi_iter.bi_size;

	comp_bio = btrfs_bio_alloc(cur_disk_byte);
	comp_bio->bi_opf = REQ_OP_READ;
	comp_bio->bi_private = cb;
	comp_bio->bi_end_io = end_compressed_bio_read;
	refcount_set(&cb->pending_bios, 1);

	for (pg_index = 0; pg_index < nr_pages; pg_index++) {
		u32 pg_len = PAGE_SIZE;
		int submit = 0;

		/*
		 * To handle subpage case, we need to make sure the bio only
		 * covers the range we need.
		 *
		 * If we're at the last page, truncate the length to only
		 * cover the remaining part.
		 */
		if (pg_index == nr_pages - 1)
			pg_len = min_t(u32, PAGE_SIZE,
					compressed_len - pg_index * PAGE_SIZE);

		page = cb->compressed_pages[pg_index];
		page->mapping = inode->i_mapping;
		page->index = em_start >> PAGE_SHIFT;

		if (comp_bio->bi_iter.bi_size)
			submit = btrfs_bio_fits_in_stripe(page, pg_len,
							  comp_bio, 0);

		page->mapping = NULL;
		if (submit || bio_add_page(comp_bio, page, pg_len, 0) < pg_len) {
			unsigned int nr_sectors;

			ret = btrfs_bio_wq_end_io(fs_info, comp_bio,
						  BTRFS_WQ_ENDIO_DATA);
			BUG_ON(ret);

			/*
			 * Increment the count before we submit the bio so we
			 * know the end IO handler won't happen before we
			 * increment the count.  Otherwise, the cb might get
			 * freed before we're done setting it up.
			 */
			refcount_inc(&cb->pending_bios);

			ret = btrfs_lookup_bio_sums(inode, comp_bio, sums);
			BUG_ON(ret);

			nr_sectors = DIV_ROUND_UP(comp_bio->bi_iter.bi_size,
						  fs_info->sectorsize);
			sums += fs_info->csum_size * nr_sectors;

			ret = btrfs_map_bio(fs_info, comp_bio, mirror_num);
			if (ret) {
				comp_bio->bi_status = ret;
				bio_endio(comp_bio);
			}

			comp_bio = btrfs_bio_alloc(cur_disk_byte);
			comp_bio->bi_opf = REQ_OP_READ;
			comp_bio->bi_private = cb;
			comp_bio->bi_end_io = end_compressed_bio_read;

			bio_add_page(comp_bio, page, pg_len, 0);
		}
		cur_disk_byte += pg_len;
	}

	ret = btrfs_bio_wq_end_io(fs_info, comp_bio, BTRFS_WQ_ENDIO_DATA);
	BUG_ON(ret);

	ret = btrfs_lookup_bio_sums(inode, comp_bio, sums);
	BUG_ON(ret);

	ret = btrfs_map_bio(fs_info, comp_bio, mirror_num);
	if (ret) {
		comp_bio->bi_status = ret;
		bio_endio(comp_bio);
	}

	return 0;

fail2:
	while (faili >= 0) {
		__free_page(cb->compressed_pages[faili]);
		faili--;
	}

	kfree(cb->compressed_pages);
fail1:
	kfree(cb);
out:
	free_extent_map(em);
	return ret;
}
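
/*
 * Heuristic uses systematic sampling to collect data from the input data
 * range, the logic can be tuned by the following constants:
 *
 * @SAMPLING_READ_SIZE - how many bytes will be copied from for each sample
 * @SAMPLING_INTERVAL  - range from which the sampled data can be collected
 */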
#define SAMPLING_READ_SIZE	(16)
#define SAMPLING_INTERVAL	(256)
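
/*
 * For statistical analysis of the input data we consider bytes that form a
 * Galois Field of 256 objects.  Each object has an attribute count, ie. how
 * many times the object appeared in the sample.
 */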
#define BUCKET_SIZE		(256)
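
/*
 * The size of the sample is based on a statistical sampling rule of thumb.
 * The common way is to perform sampling tests as long as the number of
 * elements in each cell is at least 5.
 *
 * Instead of 5, we choose 32 to obtain more accurate results.
 * If the data contain the maximum number of symbols, which is 256, we obtain a
 * sample size bound by 8192.
 *
 * For a sample of at most 8KB of data per data range: 16 consecutive bytes
 * from up to 512 locations.
 */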
#define MAX_SAMPLE_SIZE		(BTRFS_MAX_UNCOMPRESSED *		\
				 SAMPLING_READ_SIZE / SAMPLING_INTERVAL)

struct bucket_item {
	u32 count;
};

struct heuristic_ws {
	/* Partial copy of input data */
	u8 *sample;
	u32 sample_size;
	/* Buckets store counters for each byte value */
	struct bucket_item *bucket;
	/* Sorting buffer */
	struct bucket_item *bucket_b;
	struct list_head list;
};

static struct workspace_manager heuristic_wsm;

static void free_heuristic_ws(struct list_head *ws)
{
	struct heuristic_ws *workspace;

	workspace = list_entry(ws, struct heuristic_ws, list);

	kvfree(workspace->sample);
	kfree(workspace->bucket);
	kfree(workspace->bucket_b);
	kfree(workspace);
}

static struct list_head *alloc_heuristic_ws(unsigned int level)
{
	struct heuristic_ws *ws;

	ws = kzalloc(sizeof(*ws), GFP_KERNEL);
	if (!ws)
		return ERR_PTR(-ENOMEM);

	ws->sample = kvmalloc(MAX_SAMPLE_SIZE, GFP_KERNEL);
	if (!ws->sample)
		goto fail;

	ws->bucket = kcalloc(BUCKET_SIZE, sizeof(*ws->bucket), GFP_KERNEL);
	if (!ws->bucket)
		goto fail;

	ws->bucket_b = kcalloc(BUCKET_SIZE, sizeof(*ws->bucket_b), GFP_KERNEL);
	if (!ws->bucket_b)
		goto fail;

	INIT_LIST_HEAD(&ws->list);
	return &ws->list;
fail:
	free_heuristic_ws(&ws->list);
	return ERR_PTR(-ENOMEM);
}

const struct btrfs_compress_op btrfs_heuristic_compress = {
	.workspace_manager = &heuristic_wsm,
};

static const struct btrfs_compress_op * const btrfs_compress_op[] = {
	/* The heuristic is represented as compression type 0 */
	&btrfs_heuristic_compress,
	&btrfs_zlib_compress,
	&btrfs_lzo_compress,
	&btrfs_zstd_compress,
};

static struct list_head *alloc_workspace(int type, unsigned int level)
{
	switch (type) {
	case BTRFS_COMPRESS_NONE: return alloc_heuristic_ws(level);
	case BTRFS_COMPRESS_ZLIB: return zlib_alloc_workspace(level);
	case BTRFS_COMPRESS_LZO:  return lzo_alloc_workspace(level);
	case BTRFS_COMPRESS_ZSTD: return zstd_alloc_workspace(level);
	default:
		/*
		 * This can't happen, the type is validated several times
		 * before we get here.
		 */
		BUG();
	}
}

static void free_workspace(int type, struct list_head *ws)
{
	switch (type) {
	case BTRFS_COMPRESS_NONE: return free_heuristic_ws(ws);
	case BTRFS_COMPRESS_ZLIB: return zlib_free_workspace(ws);
	case BTRFS_COMPRESS_LZO:  return lzo_free_workspace(ws);
	case BTRFS_COMPRESS_ZSTD: return zstd_free_workspace(ws);
	default:
		/*
		 * This can't happen, the type is validated several times
		 * before we get here.
		 */
		BUG();
	}
}

static void btrfs_init_workspace_manager(int type)
{
	struct workspace_manager *wsm;
	struct list_head *workspace;

	wsm = btrfs_compress_op[type]->workspace_manager;
	INIT_LIST_HEAD(&wsm->idle_ws);
	spin_lock_init(&wsm->ws_lock);
	atomic_set(&wsm->total_ws, 0);
	init_waitqueue_head(&wsm->ws_wait);

	/*
	 * Preallocate one workspace for each compression type so we can
	 * guarantee forward progress in the worst case.
	 */
	workspace = alloc_workspace(type, 0);
	if (IS_ERR(workspace)) {
		pr_warn(
	"BTRFS: cannot preallocate compression workspace, will try later\n");
	} else {
		atomic_set(&wsm->total_ws, 1);
		wsm->free_ws = 1;
		list_add(workspace, &wsm->idle_ws);
	}
}

static void btrfs_cleanup_workspace_manager(int type)
{
	struct workspace_manager *wsman;
	struct list_head *ws;

	wsman = btrfs_compress_op[type]->workspace_manager;
	while (!list_empty(&wsman->idle_ws)) {
		ws = wsman->idle_ws.next;
		list_del(ws);
		free_workspace(type, ws);
		atomic_dec(&wsman->total_ws);
	}
}
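
/*
 * This finds an available workspace or allocates a new one.
 * If it's not possible to allocate a new one, waits until there's one.
 * Preallocation makes a forward progress guarantee and we do not return
 * errors.
 */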
struct list_head *btrfs_get_workspace(int type, unsigned int level)
{
	struct workspace_manager *wsm;
	struct list_head *workspace;
	int cpus = num_online_cpus();
	unsigned nofs_flag;
	struct list_head *idle_ws;
	spinlock_t *ws_lock;
	atomic_t *total_ws;
	wait_queue_head_t *ws_wait;
	int *free_ws;

	wsm = btrfs_compress_op[type]->workspace_manager;
	idle_ws = &wsm->idle_ws;
	ws_lock = &wsm->ws_lock;
	total_ws = &wsm->total_ws;
	ws_wait = &wsm->ws_wait;
	free_ws = &wsm->free_ws;

again:
	spin_lock(ws_lock);
	if (!list_empty(idle_ws)) {
		workspace = idle_ws->next;
		list_del(workspace);
		(*free_ws)--;
		spin_unlock(ws_lock);
		return workspace;
	}
	if (atomic_read(total_ws) > cpus) {
		DEFINE_WAIT(wait);

		spin_unlock(ws_lock);
		prepare_to_wait(ws_wait, &wait, TASK_UNINTERRUPTIBLE);
		if (atomic_read(total_ws) > cpus && !*free_ws)
			schedule();
		finish_wait(ws_wait, &wait);
		goto again;
	}
	atomic_inc(total_ws);
	spin_unlock(ws_lock);

	/*
	 * Allocation helpers call vmalloc that can't use GFP_NOFS, so we have
	 * to turn it off here because we might get called from the restricted
	 * context of btrfs_compress_bio/btrfs_compress_pages.
	 */
	nofs_flag = memalloc_nofs_save();
	workspace = alloc_workspace(type, level);
	memalloc_nofs_restore(nofs_flag);

	if (IS_ERR(workspace)) {
		atomic_dec(total_ws);
		wake_up(ws_wait);

		/*
		 * Do not return the error but go back to waiting.  There's a
		 * workspace preallocated for each type and the compression
		 * time is bounded so we get to a workspace eventually.  This
		 * makes our caller's life easier.
		 *
		 * To prevent silent and low-probability deadlocks (when the
		 * initial preallocation fails), check if there are any
		 * workspaces at all.
		 */
		if (atomic_read(total_ws) == 0) {
			static DEFINE_RATELIMIT_STATE(_rs,
					/* once per minute */ 60 * HZ,
					/* no burst */ 1);

			if (__ratelimit(&_rs)) {
				pr_warn("BTRFS: no compression workspaces, low memory, retrying\n");
			}
		}
		goto again;
	}
	return workspace;
}

static struct list_head *get_workspace(int type, int level)
{
	switch (type) {
	case BTRFS_COMPRESS_NONE: return btrfs_get_workspace(type, level);
	case BTRFS_COMPRESS_ZLIB: return zlib_get_workspace(level);
	case BTRFS_COMPRESS_LZO:  return btrfs_get_workspace(type, level);
	case BTRFS_COMPRESS_ZSTD: return zstd_get_workspace(level);
	default:
		/*
		 * This can't happen, the type is validated several times
		 * before we get here.
		 */
		BUG();
	}
}
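
/*
 * Put a workspace struct back on the list or free it if we have enough
 * idle ones sitting around.
 */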
void btrfs_put_workspace(int type, struct list_head *ws)
{
	struct workspace_manager *wsm;
	struct list_head *idle_ws;
	spinlock_t *ws_lock;
	atomic_t *total_ws;
	wait_queue_head_t *ws_wait;
	int *free_ws;

	wsm = btrfs_compress_op[type]->workspace_manager;
	idle_ws = &wsm->idle_ws;
	ws_lock = &wsm->ws_lock;
	total_ws = &wsm->total_ws;
	ws_wait = &wsm->ws_wait;
	free_ws = &wsm->free_ws;

	spin_lock(ws_lock);
	if (*free_ws <= num_online_cpus()) {
		list_add(ws, idle_ws);
		(*free_ws)++;
		spin_unlock(ws_lock);
		goto wake;
	}
	spin_unlock(ws_lock);

	free_workspace(type, ws);
	atomic_dec(total_ws);
wake:
	cond_wake_up(ws_wait);
}

static void put_workspace(int type, struct list_head *ws)
{
	switch (type) {
	case BTRFS_COMPRESS_NONE: return btrfs_put_workspace(type, ws);
	case BTRFS_COMPRESS_ZLIB: return btrfs_put_workspace(type, ws);
	case BTRFS_COMPRESS_LZO:  return btrfs_put_workspace(type, ws);
	case BTRFS_COMPRESS_ZSTD: return zstd_put_workspace(ws);
	default:
		/*
		 * This can't happen, the type is validated several times
		 * before we get here.
		 */
		BUG();
	}
}
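
/*
 * Adjust @level according to the limits of the compression algorithm or
 * fallback to default.
 */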
static unsigned int btrfs_compress_set_level(int type, unsigned level)
{
	const struct btrfs_compress_op *ops = btrfs_compress_op[type];

	if (level == 0)
		level = ops->default_level;
	else
		level = min(level, ops->max_level);

	return level;
}
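
/*
 * Given an address space and start and length, compress the bytes into @pages
 * that are allocated on demand.
 *
 * @type_level is encoded algorithm and level, where level 0 means whatever
 * default the algorithm chooses and is opaque here;
 * - compression algo are 0-3
 * - the level are bits 4-7
 *
 * @out_pages is an in/out parameter, holds maximum number of pages to allocate
 * and returns number of actually allocated pages
 *
 * @total_in is used to return the number of bytes actually read.  It may be
 * smaller than the input length if we had to exit early because we ran out of
 * room in the pages array or because we cross the max_out threshold.
 *
 * @total_out is an in/out parameter, must be set to the input length and will
 * be also used to return the total number of compressed bytes
 */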
int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping,
			 u64 start, struct page **pages,
			 unsigned long *out_pages,
			 unsigned long *total_in,
			 unsigned long *total_out)
{
	int type = btrfs_compress_type(type_level);
	int level = btrfs_compress_level(type_level);
	struct list_head *workspace;
	int ret;

	level = btrfs_compress_set_level(type, level);
	workspace = get_workspace(type, level);
	ret = compression_compress_pages(type, workspace, mapping, start, pages,
					 out_pages, total_in, total_out);
	put_workspace(type, workspace);
	return ret;
}

static int btrfs_decompress_bio(struct compressed_bio *cb)
{
	struct list_head *workspace;
	int ret;
	int type = cb->compress_type;

	workspace = get_workspace(type, 0);
	ret = compression_decompress_bio(type, workspace, cb);
	put_workspace(type, workspace);

	return ret;
}
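
/*
 * A less complex decompression routine.  Our compressed data fits in a single
 * page, and we want to read a single page out of it.  start_byte tells us the
 * offset into the compressed data we're interested in.
 */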
int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
		     unsigned long start_byte, size_t srclen, size_t destlen)
{
	struct list_head *workspace;
	int ret;

	workspace = get_workspace(type, 0);
	ret = compression_decompress(type, workspace, data_in, dest_page,
				     start_byte, srclen, destlen);
	put_workspace(type, workspace);

	return ret;
}

void __init btrfs_init_compress(void)
{
	btrfs_init_workspace_manager(BTRFS_COMPRESS_NONE);
	btrfs_init_workspace_manager(BTRFS_COMPRESS_ZLIB);
	btrfs_init_workspace_manager(BTRFS_COMPRESS_LZO);
	zstd_init_workspace_manager();
}

void __cold btrfs_exit_compress(void)
{
	btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_NONE);
	btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_ZLIB);
	btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_LZO);
	zstd_cleanup_workspace_manager();
}
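
/*
 * Copy decompressed data from working buffer to pages.
 *
 * @buf:		The decompressed data buffer
 * @buf_len:		The decompressed data length
 * @cb:			The compressed extent descriptor
 * @decompressed:	Number of bytes that are already decompressed inside the
 *			compressed extent
 *
 * An easier to understand graph is like below:
 *
 *		|<- orig_bio ->|     |<- orig_bio ->|
 *	|<-------      full decompressed extent      ----->|
 *	|<-----------    @cb range   ---->|
 *	|			|<-- @buf_len -->|
 *	|<--- @decompressed --->|
 *
 * Note that @cb can be a subpage of the full decompressed extent, but
 * @cb->orig_bio must be inside the @cb range.
 *
 * Return 0 if we have copied all needed contents for @cb->orig_bio.
 * Return >0 if the caller needs to continue decompressing.
 */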
int btrfs_decompress_buf2page(const char *buf, u32 buf_len,
			      struct compressed_bio *cb, u32 decompressed)
{
	struct bio *orig_bio = cb->orig_bio;
	/* Offset inside the full decompressed extent */
	u32 cur_offset;

	cur_offset = decompressed;
	/* The main loop to do the copy */
	while (cur_offset < decompressed + buf_len) {
		struct bio_vec bvec;
		size_t copy_len;
		u32 copy_start;
		u32 bvec_offset;

		bvec = bio_iter_iovec(orig_bio, orig_bio->bi_iter);
		/*
		 * The offset of the current bvec inside the full decompressed
		 * extent.
		 */
		bvec_offset = page_offset(bvec.bv_page) + bvec.bv_offset - cb->start;

		/* Haven't reached the bvec range, exit */
		if (decompressed + buf_len <= bvec_offset)
			return 1;

		copy_start = max(cur_offset, bvec_offset);
		copy_len = min(bvec_offset + bvec.bv_len,
			       decompressed + buf_len) - copy_start;
		ASSERT(copy_len);

		/*
		 * Extra range check to ensure we didn't go beyond
		 * @buf + @buf_len.
		 */
		ASSERT(copy_start - decompressed < buf_len);
		memcpy_to_page(bvec.bv_page, bvec.bv_offset,
			       buf + copy_start - decompressed, copy_len);
		flush_dcache_page(bvec.bv_page);
		cur_offset += copy_len;

		bio_advance(orig_bio, copy_len);
		/* Finished the bio */
		if (!orig_bio->bi_iter.bi_size)
			return 0;
	}
	return 1;
}
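
/*
 * Shannon Entropy calculation.
 *
 * Pure byte distribution analysis fails to determine compressibility of data.
 * Try calculating entropy to estimate the average minimum number of bits
 * needed to encode the sampled data.
 *
 * For convenience, return the percentage of needed bits, instead of amount of
 * bits directly.
 *
 * @ENTROPY_LVL_ACCEPTABLE - below that threshold, sample has low byte entropy
 *			     and can be compressible with high probability
 *
 * @ENTROPY_LVL_HIGH - data are not compressible with high probability
 */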
#define ENTROPY_LVL_ACCEPTABLE		(65)
#define ENTROPY_LVL_HIGH		(80)
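
/*
 * For increased precision in shannon_entropy calculation,
 * let's do pow(n, M) to save more digits after comma:
 *
 * - maximum int bit length is 64
 * - ilog2(MAX_SAMPLE_SIZE)	-> 13
 * - 13 * 4 = 52 < 64		-> M = 4
 *
 * So use pow(n, 4).
 */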
static inline u32 ilog2_w(u64 n)
{
	return ilog2(n * n * n * n);
}

static u32 shannon_entropy(struct heuristic_ws *ws)
{
	const u32 entropy_max = 8 * ilog2_w(2);
	u32 entropy_sum = 0;
	u32 p, p_base, sz_base;
	u32 i;

	sz_base = ilog2_w(ws->sample_size);
	for (i = 0; i < BUCKET_SIZE && ws->bucket[i].count > 0; i++) {
		p = ws->bucket[i].count;
		p_base = ilog2_w(p);
		entropy_sum += p * (sz_base - p_base);
	}

	entropy_sum /= ws->sample_size;
	return entropy_sum * 100 / entropy_max;
}

#define RADIX_BASE		4U
#define COUNTERS_SIZE		(1U << RADIX_BASE)

static u8 get4bits(u64 num, int shift)
{
	u8 low4bits;

	num >>= shift;
	/* Reverse order */
	low4bits = (COUNTERS_SIZE - 1) - (num % COUNTERS_SIZE);
	return low4bits;
}
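
/*
 * Use 4 bits as radix base
 * Use 16 u32 counters for calculating new position in buf array
 *
 * @array     - array that will be sorted
 * @array_buf - buffer array to store sorting results
 * @num       - array size
 */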
static void radix_sort(struct bucket_item *array, struct bucket_item *array_buf,
		       int num)
{
	u64 max_num;
	u64 buf_num;
	u32 counters[COUNTERS_SIZE];
	u32 new_addr;
	u32 addr;
	int bitlen;
	int shift;
	int i;

	/*
	 * Try avoid useless loop iterations for small numbers stored in big
	 * counters.  Example: 48 33 4 ... in 64bit array
	 */
	max_num = array[0].count;
	for (i = 1; i < num; i++) {
		buf_num = array[i].count;
		if (buf_num > max_num)
			max_num = buf_num;
	}

	buf_num = ilog2(max_num);
	bitlen = ALIGN(buf_num, RADIX_BASE * 2);

	shift = 0;
	while (shift < bitlen) {
		memset(counters, 0, sizeof(counters));

		for (i = 0; i < num; i++) {
			buf_num = array[i].count;
			addr = get4bits(buf_num, shift);
			counters[addr]++;
		}

		for (i = 1; i < COUNTERS_SIZE; i++)
			counters[i] += counters[i - 1];

		for (i = num - 1; i >= 0; i--) {
			buf_num = array[i].count;
			addr = get4bits(buf_num, shift);
			counters[addr]--;
			new_addr = counters[addr];
			array_buf[new_addr] = array[i];
		}

		shift += RADIX_BASE;

		/*
		 * Normal radix expects to move data from a temporary array, to
		 * the main one.  But that requires some CPU time.  Avoid that
		 * by doing another sort iteration to the original array
		 * instead of memcpy().
		 */
		memset(counters, 0, sizeof(counters));

		for (i = 0; i < num; i++) {
			buf_num = array_buf[i].count;
			addr = get4bits(buf_num, shift);
			counters[addr]++;
		}

		for (i = 1; i < COUNTERS_SIZE; i++)
			counters[i] += counters[i - 1];

		for (i = num - 1; i >= 0; i--) {
			buf_num = array_buf[i].count;
			addr = get4bits(buf_num, shift);
			counters[addr]--;
			new_addr = counters[addr];
			array[new_addr] = array_buf[i];
		}

		shift += RADIX_BASE;
	}
}
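
/*
 * Size of the core byte set - how many bytes cover 90% of the sample.
 *
 * There are several types of structured binary data that use nearly all byte
 * values.  The distribution can be uniform and counts in all buckets will be
 * nearly the same (eg. encrypted data).  Unlikely to be compressible.
 *
 * Other possibility is normal (Gaussian) distribution, where the data could
 * be potentially compressible, but we have to take a few more steps to decide
 * how much.
 *
 * @BYTE_CORE_SET_LOW  - main part of byte values repeated frequently,
 *                       compression algo can easily fix that
 * @BYTE_CORE_SET_HIGH - data have uniform distribution and with high
 *                       probability are not compressible
 */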
#define BYTE_CORE_SET_LOW		(64)
#define BYTE_CORE_SET_HIGH		(200)

static int byte_core_set_size(struct heuristic_ws *ws)
{
	u32 i;
	u32 coreset_sum = 0;
	const u32 core_set_threshold = ws->sample_size * 90 / 100;
	struct bucket_item *bucket = ws->bucket;

	/* Sort in reverse order */
	radix_sort(ws->bucket, ws->bucket_b, BUCKET_SIZE);

	for (i = 0; i < BYTE_CORE_SET_LOW; i++)
		coreset_sum += bucket[i].count;

	if (coreset_sum > core_set_threshold)
		return i;

	for (; i < BYTE_CORE_SET_HIGH && bucket[i].count > 0; i++) {
		coreset_sum += bucket[i].count;
		if (coreset_sum > core_set_threshold)
			break;
	}

	return i;
}
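
/*
 * Count byte values in buckets.
 * This heuristic can detect textual data (configs, xml, json, html, etc).
 * Because in most text-like data byte set is restricted to a limited number
 * of possible characters, and that restriction in most cases makes data easy
 * to compress.
 *
 * @BYTE_SET_THRESHOLD - consider all data within this byte set size:
 *	less - compressible
 *	more - need additional analysis
 */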
#define BYTE_SET_THRESHOLD		(64)

static u32 byte_set_size(const struct heuristic_ws *ws)
{
	u32 i;
	u32 byte_set_size = 0;

	for (i = 0; i < BYTE_SET_THRESHOLD; i++) {
		if (ws->bucket[i].count > 0)
			byte_set_size++;
	}

	/*
	 * Continue collecting count of byte values in buckets.  If the byte
	 * set size is bigger than the threshold, it's pointless to continue,
	 * the detection technique would fail for this type of data.
	 */
	for (; i < BUCKET_SIZE; i++) {
		if (ws->bucket[i].count > 0) {
			byte_set_size++;
			if (byte_set_size > BYTE_SET_THRESHOLD)
				return byte_set_size;
		}
	}

	return byte_set_size;
}

static bool sample_repeated_patterns(struct heuristic_ws *ws)
{
	const u32 half_of_sample = ws->sample_size / 2;
	const u8 *data = ws->sample;

	return memcmp(&data[0], &data[half_of_sample], half_of_sample) == 0;
}

static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end,
				     struct heuristic_ws *ws)
{
	struct page *page;
	u64 index, index_end;
	u32 i, curr_sample_pos;
	u8 *in_data;

	/*
	 * Compression handles the input data by chunks of 128KiB
	 * (defined by BTRFS_MAX_UNCOMPRESSED)
	 *
	 * We do the same for the heuristic and loop over the whole range.
	 *
	 * MAX_SAMPLE_SIZE - calculated under assumption that heuristic will
	 * process no more than BTRFS_MAX_UNCOMPRESSED at a time.
	 */
	if (end - start > BTRFS_MAX_UNCOMPRESSED)
		end = start + BTRFS_MAX_UNCOMPRESSED;

	index = start >> PAGE_SHIFT;
	index_end = end >> PAGE_SHIFT;

	/* Don't miss unaligned end */
	if (!IS_ALIGNED(end, PAGE_SIZE))
		index_end++;

	curr_sample_pos = 0;
	while (index < index_end) {
		page = find_get_page(inode->i_mapping, index);
		in_data = kmap_local_page(page);
		/* Handle case where the start is not aligned to PAGE_SIZE */
		i = start % PAGE_SIZE;
		while (i < PAGE_SIZE - SAMPLING_READ_SIZE) {
			/* Don't sample any garbage from the last page */
			if (start > end - SAMPLING_READ_SIZE)
				break;
			memcpy(&ws->sample[curr_sample_pos], &in_data[i],
			       SAMPLING_READ_SIZE);
			i += SAMPLING_INTERVAL;
			start += SAMPLING_INTERVAL;
			curr_sample_pos += SAMPLING_READ_SIZE;
		}
		kunmap_local(in_data);
		put_page(page);

		index++;
	}

	ws->sample_size = curr_sample_pos;
}
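
/*
 * Compression heuristic.
 *
 * The heuristic aims to quickly (compared to direct compression) detect data
 * characteristics (compressible/incompressible) to avoid wasting CPU time on
 * incompressible data.
 *
 * The following types of analysis are performed:
 * - detect mostly zero data
 * - detect data with low "byte set" size (text, etc)
 * - detect data with low/high "core byte" set
 *
 * Return non-zero if the compression should be done, 0 otherwise.
 */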
int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end)
{
	struct list_head *ws_list = get_workspace(0, 0);
	struct heuristic_ws *ws;
	u32 i;
	u8 byte;
	int ret = 0;

	ws = list_entry(ws_list, struct heuristic_ws, list);

	heuristic_collect_sample(inode, start, end, ws);

	if (sample_repeated_patterns(ws)) {
		ret = 1;
		goto out;
	}

	memset(ws->bucket, 0, sizeof(*ws->bucket) * BUCKET_SIZE);

	for (i = 0; i < ws->sample_size; i++) {
		byte = ws->sample[i];
		ws->bucket[byte].count++;
	}

	i = byte_set_size(ws);
	if (i < BYTE_SET_THRESHOLD) {
		ret = 2;
		goto out;
	}

	i = byte_core_set_size(ws);
	if (i <= BYTE_CORE_SET_LOW) {
		ret = 3;
		goto out;
	}

	if (i >= BYTE_CORE_SET_HIGH) {
		ret = 0;
		goto out;
	}

	i = shannon_entropy(ws);
	if (i <= ENTROPY_LVL_ACCEPTABLE) {
		ret = 4;
		goto out;
	}

	/*
	 * For the levels below ENTROPY_LVL_HIGH, additional analysis would be
	 * needed to give the green light to compression.
	 *
	 * For now just assume that compression at that level is not worth the
	 * resources because:
	 *
	 * 1. it is possible to defrag the data later
	 *
	 * 2. the data would likely turn out to be hardly compressible and the
	 *    CPU time would be wasted anyway
	 */
	if (i < ENTROPY_LVL_HIGH) {
		ret = 5;
		goto out;
	} else {
		ret = 0;
		goto out;
	}

out:
	put_workspace(0, ws_list);
	return ret;
}
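
/*
 * Convert the compression suffix (eg. after "zlib" starting with ":") to
 * level, unrecognized string will set the default level.
 */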
unsigned int btrfs_compress_str2level(unsigned int type, const char *str)
{
	unsigned int level = 0;
	int ret;

	if (!type)
		return 0;

	if (str[0] == ':') {
		ret = kstrtouint(str + 1, 10, &level);
		if (ret)
			level = 0;
	}

	level = btrfs_compress_set_level(type, level);

	return level;
}