/*
 * Ext4 extents status tree core functions.
 */
#include <linux/list_sort.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include "ext4.h"

#include <trace/events/ext4.h>
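
/*
 * Extent status tree
 *
 * The extent status tree is a per-inode rbtree that caches the status
 * of logical block ranges: written, unwritten, delayed or hole.  Each
 * node (struct extent_status) records a starting logical block, a
 * length, and a physical block field whose high bits carry the status
 * flags.  Keeping this information in core lets ext4 answer "what is
 * the state of block X?" without re-walking the on-disk extent or
 * indirect block trees.
 *
 * Locking: each tree is protected by the owning inode's i_es_lock
 * rwlock.  Inodes that hold reclaimable (i.e. non-delayed) entries are
 * additionally linked on the superblock's s_es_list under s_es_lock,
 * which is what the registered shrinker walks under memory pressure.
 * Delayed entries are never reclaimed, since they are the only record
 * of blocks that have been promised but not yet allocated.
 */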
static struct kmem_cache *ext4_es_cachep;

static int __es_insert_extent(struct inode *inode, struct extent_status *newes);
static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
			      ext4_lblk_t end);
static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan);
static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
		       struct ext4_inode_info *locked_ei);

int __init ext4_init_es(void)
{
	ext4_es_cachep = kmem_cache_create("ext4_extent_status",
					   sizeof(struct extent_status),
					   0, (SLAB_RECLAIM_ACCOUNT), NULL);
	if (ext4_es_cachep == NULL)
		return -ENOMEM;
	return 0;
}

void ext4_exit_es(void)
{
	if (ext4_es_cachep)
		kmem_cache_destroy(ext4_es_cachep);
}

void ext4_es_init_tree(struct ext4_es_tree *tree)
{
	tree->root = RB_ROOT;
	tree->cache_es = NULL;
}

#ifdef ES_DEBUG__
static void ext4_es_print_tree(struct inode *inode)
{
	struct ext4_es_tree *tree;
	struct rb_node *node;

	printk(KERN_DEBUG "status extents for inode %lu:", inode->i_ino);
	tree = &EXT4_I(inode)->i_es_tree;
	node = rb_first(&tree->root);
	while (node) {
		struct extent_status *es;

		es = rb_entry(node, struct extent_status, rb_node);
		printk(KERN_DEBUG " [%u/%u) %llu %x",
		       es->es_lblk, es->es_len,
		       ext4_es_pblock(es), ext4_es_status(es));
		node = rb_next(node);
	}
	printk(KERN_DEBUG "\n");
}
#else
#define ext4_es_print_tree(inode)
#endif

static inline ext4_lblk_t ext4_es_end(struct extent_status *es)
{
	BUG_ON(es->es_lblk + es->es_len < es->es_lblk);
	return es->es_lblk + es->es_len - 1;
}
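
/*
 * Search the extent status tree for the extent containing @lblk.  If no
 * extent contains it, return the next extent to the right of @lblk, or
 * NULL if the tree is empty or @lblk lies beyond the last extent.
 */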
static struct extent_status *__es_tree_search(struct rb_root *root,
					      ext4_lblk_t lblk)
{
	struct rb_node *node = root->rb_node;
	struct extent_status *es = NULL;

	while (node) {
		es = rb_entry(node, struct extent_status, rb_node);
		if (lblk < es->es_lblk)
			node = node->rb_left;
		else if (lblk > ext4_es_end(es))
			node = node->rb_right;
		else
			return es;
	}

	if (es && lblk < es->es_lblk)
		return es;

	if (es && lblk > ext4_es_end(es)) {
		node = rb_next(&es->rb_node);
		return node ? rb_entry(node, struct extent_status, rb_node) :
			      NULL;
	}

	return NULL;
}
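
/*
 * ext4_es_find_delayed_extent_range: find the first delayed extent
 * covering @lblk, or failing that, the next delayed extent after @lblk
 * that starts no later than @end.
 *
 * @inode: the inode which owns the delayed extents
 * @lblk:  the offset where the search starts
 * @end:   the offset where the search stops
 * @es:    filled with the delayed extent found, or zeroed if none was
 */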
void ext4_es_find_delayed_extent_range(struct inode *inode,
				       ext4_lblk_t lblk, ext4_lblk_t end,
				       struct extent_status *es)
{
	struct ext4_es_tree *tree = NULL;
	struct extent_status *es1 = NULL;
	struct rb_node *node;

	BUG_ON(es == NULL);
	BUG_ON(end < lblk);
	trace_ext4_es_find_delayed_extent_range_enter(inode, lblk);

	read_lock(&EXT4_I(inode)->i_es_lock);
	tree = &EXT4_I(inode)->i_es_tree;

	/* see if the extent has been cached */
	es->es_lblk = es->es_len = es->es_pblk = 0;
	if (tree->cache_es) {
		es1 = tree->cache_es;
		if (in_range(lblk, es1->es_lblk, es1->es_len)) {
			es_debug("%u cached by [%u/%u) %llu %x\n",
				 lblk, es1->es_lblk, es1->es_len,
				 ext4_es_pblock(es1), ext4_es_status(es1));
			goto out;
		}
	}

	es1 = __es_tree_search(&tree->root, lblk);

out:
	if (es1 && !ext4_es_is_delayed(es1)) {
		while ((node = rb_next(&es1->rb_node)) != NULL) {
			es1 = rb_entry(node, struct extent_status, rb_node);
			if (es1->es_lblk > end) {
				es1 = NULL;
				break;
			}
			if (ext4_es_is_delayed(es1))
				break;
		}
	}

	if (es1 && ext4_es_is_delayed(es1)) {
		tree->cache_es = es1;
		es->es_lblk = es1->es_lblk;
		es->es_len = es1->es_len;
		es->es_pblk = es1->es_pblk;
	}

	read_unlock(&EXT4_I(inode)->i_es_lock);

	trace_ext4_es_find_delayed_extent_range_exit(inode, es);
}

static void ext4_es_list_add(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

	/* Racy check is fine here; it is redone under s_es_lock below. */
	if (!list_empty(&ei->i_es_list))
		return;

	spin_lock(&sbi->s_es_lock);
	if (list_empty(&ei->i_es_list)) {
		list_add_tail(&ei->i_es_list, &sbi->s_es_list);
		sbi->s_es_nr_inode++;
	}
	spin_unlock(&sbi->s_es_lock);
}

static void ext4_es_list_del(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

	spin_lock(&sbi->s_es_lock);
	if (!list_empty(&ei->i_es_list)) {
		list_del_init(&ei->i_es_list);
		sbi->s_es_nr_inode--;
		WARN_ON_ONCE(sbi->s_es_nr_inode < 0);
	}
	spin_unlock(&sbi->s_es_lock);
}

static struct extent_status *
ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len,
		     ext4_fsblk_t pblk)
{
	struct extent_status *es;

	es = kmem_cache_alloc(ext4_es_cachep, GFP_ATOMIC);
	if (es == NULL)
		return NULL;
	es->es_lblk = lblk;
	es->es_len = len;
	es->es_pblk = pblk;

	/*
	 * We don't count delayed extents as reclaimable, so they don't
	 * contribute to i_es_shk_nr or put the inode on the shrinker list.
	 */
	if (!ext4_es_is_delayed(es)) {
		if (!EXT4_I(inode)->i_es_shk_nr++)
			ext4_es_list_add(inode);
		percpu_counter_inc(&EXT4_SB(inode->i_sb)->
					s_es_stats.es_stats_shk_cnt);
	}

	EXT4_I(inode)->i_es_all_nr++;
	percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt);

	return es;
}

static void ext4_es_free_extent(struct inode *inode, struct extent_status *es)
{
	EXT4_I(inode)->i_es_all_nr--;
	percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt);

	/* Decrease the shrink counter when this es is not delayed */
	if (!ext4_es_is_delayed(es)) {
		BUG_ON(EXT4_I(inode)->i_es_shk_nr == 0);
		if (!--EXT4_I(inode)->i_es_shk_nr)
			ext4_es_list_del(inode);
		percpu_counter_dec(&EXT4_SB(inode->i_sb)->
					s_es_stats.es_stats_shk_cnt);
	}

	kmem_cache_free(ext4_es_cachep, es);
}
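
/*
 * Check whether or not two extents can be merged.
 * Conditions:
 *  - logical block numbers are contiguous
 *  - physical block numbers are contiguous (for written/unwritten)
 *  - status bits are equal
 */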
static int ext4_es_can_be_merged(struct extent_status *es1,
				 struct extent_status *es2)
{
	if (ext4_es_type(es1) != ext4_es_type(es2))
		return 0;

	if (((__u64) es1->es_len) + es2->es_len > EXT_MAX_BLOCKS) {
		pr_warn("ES assertion failed when merging extents. "
			"The sum of lengths of es1 (%d) and es2 (%d) "
			"is bigger than allowed file size (%d)\n",
			es1->es_len, es2->es_len, EXT_MAX_BLOCKS);
		WARN_ON(1);
		return 0;
	}

	if (((__u64) es1->es_lblk) + es1->es_len != es2->es_lblk)
		return 0;

	if ((ext4_es_is_written(es1) || ext4_es_is_unwritten(es1)) &&
	    (ext4_es_pblock(es1) + es1->es_len == ext4_es_pblock(es2)))
		return 1;

	if (ext4_es_is_hole(es1))
		return 1;

	/* a purely delayed extent (one not also unwritten) is mergeable */
	if (ext4_es_is_delayed(es1) && !ext4_es_is_unwritten(es1))
		return 1;

	return 0;
}

static struct extent_status *
ext4_es_try_to_merge_left(struct inode *inode, struct extent_status *es)
{
	struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
	struct extent_status *es1;
	struct rb_node *node;

	node = rb_prev(&es->rb_node);
	if (!node)
		return es;

	es1 = rb_entry(node, struct extent_status, rb_node);
	if (ext4_es_can_be_merged(es1, es)) {
		es1->es_len += es->es_len;
		if (ext4_es_is_referenced(es))
			ext4_es_set_referenced(es1);
		rb_erase(&es->rb_node, &tree->root);
		ext4_es_free_extent(inode, es);
		es = es1;
	}

	return es;
}

static struct extent_status *
ext4_es_try_to_merge_right(struct inode *inode, struct extent_status *es)
{
	struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
	struct extent_status *es1;
	struct rb_node *node;

	node = rb_next(&es->rb_node);
	if (!node)
		return es;

	es1 = rb_entry(node, struct extent_status, rb_node);
	if (ext4_es_can_be_merged(es, es1)) {
		es->es_len += es1->es_len;
		if (ext4_es_is_referenced(es1))
			ext4_es_set_referenced(es);
		rb_erase(node, &tree->root);
		ext4_es_free_extent(inode, es1);
	}

	return es;
}

#ifdef ES_AGGRESSIVE_TEST
#include "ext4_extents.h"

static void ext4_es_insert_extent_ext_check(struct inode *inode,
					    struct extent_status *es)
{
	struct ext4_ext_path *path = NULL;
	struct ext4_extent *ex;
	ext4_lblk_t ee_block;
	ext4_fsblk_t ee_start;
	unsigned short ee_len;
	int depth, ee_status, es_status;

	path = ext4_find_extent(inode, es->es_lblk, NULL, EXT4_EX_NOCACHE);
	if (IS_ERR(path))
		return;

	depth = ext_depth(inode);
	ex = path[depth].p_ext;

	if (ex) {
		ee_block = le32_to_cpu(ex->ee_block);
		ee_start = ext4_ext_pblock(ex);
		ee_len = ext4_ext_get_actual_len(ex);

		ee_status = ext4_ext_is_unwritten(ex) ? 1 : 0;
		es_status = ext4_es_is_unwritten(es) ? 1 : 0;

		/*
		 * Make sure ex and es do not overlap when we try to insert
		 * a delayed/hole extent.
		 */
		if (!ext4_es_is_written(es) && !ext4_es_is_unwritten(es)) {
			if (in_range(es->es_lblk, ee_block, ee_len)) {
				pr_warn("ES insert assertion failed for "
					"inode: %lu we can find an extent "
					"at block [%d/%d/%llu/%c], but we "
					"want to add a delayed/hole extent "
					"[%d/%d/%llu/%x]\n",
					inode->i_ino, ee_block, ee_len,
					ee_start, ee_status ? 'u' : 'w',
					es->es_lblk, es->es_len,
					ext4_es_pblock(es), ext4_es_status(es));
			}
			goto out;
		}

		/*
		 * We don't check ee_block == es->es_lblk, etc. because es
		 * might cover only part of the on-disk extent, or vice versa.
		 */
		if (es->es_lblk < ee_block ||
		    ext4_es_pblock(es) != ee_start + es->es_lblk - ee_block) {
			pr_warn("ES insert assertion failed for inode: %lu "
				"ex_status [%d/%d/%llu/%c] != "
				"es_status [%d/%d/%llu/%c]\n", inode->i_ino,
				ee_block, ee_len, ee_start,
				ee_status ? 'u' : 'w', es->es_lblk, es->es_len,
				ext4_es_pblock(es), es_status ? 'u' : 'w');
			goto out;
		}

		if (ee_status ^ es_status) {
			pr_warn("ES insert assertion failed for inode: %lu "
				"ex_status [%d/%d/%llu/%c] != "
				"es_status [%d/%d/%llu/%c]\n", inode->i_ino,
				ee_block, ee_len, ee_start,
				ee_status ? 'u' : 'w', es->es_lblk, es->es_len,
				ext4_es_pblock(es), es_status ? 'u' : 'w');
		}
	} else {
		/*
		 * We can't find an extent on disk, so we need to make sure
		 * that we don't want to add a written/unwritten extent.
		 */
		if (!ext4_es_is_delayed(es) && !ext4_es_is_hole(es)) {
			pr_warn("ES insert assertion failed for inode: %lu "
				"can't find an extent at block %d but we want "
				"to add a written/unwritten extent "
				"[%d/%d/%llu/%x]\n", inode->i_ino,
				es->es_lblk, es->es_lblk, es->es_len,
				ext4_es_pblock(es), ext4_es_status(es));
		}
	}
out:
	ext4_ext_drop_refs(path);
	kfree(path);
}

static void ext4_es_insert_extent_ind_check(struct inode *inode,
					    struct extent_status *es)
{
	struct ext4_map_blocks map;
	int retval;

	/*
	 * Here we call ext4_ind_map_blocks to look up the block mapping
	 * because the 'Indirect' structure is defined in indirect.c, so
	 * we can't walk the indirect tree directly from here.
	 */
	map.m_lblk = es->es_lblk;
	map.m_len = es->es_len;

	retval = ext4_ind_map_blocks(NULL, inode, &map, 0);
	if (retval > 0) {
		if (ext4_es_is_delayed(es) || ext4_es_is_hole(es)) {
			/*
			 * We want to add a delayed/hole extent but this
			 * block has been allocated.
			 */
			pr_warn("ES insert assertion failed for inode: %lu "
				"We can find blocks but we want to add a "
				"delayed/hole extent [%d/%d/%llu/%x]\n",
				inode->i_ino, es->es_lblk, es->es_len,
				ext4_es_pblock(es), ext4_es_status(es));
			return;
		} else if (ext4_es_is_written(es)) {
			if (retval != es->es_len) {
				pr_warn("ES insert assertion failed for "
					"inode: %lu retval %d != es_len %d\n",
					inode->i_ino, retval, es->es_len);
				return;
			}
			if (map.m_pblk != ext4_es_pblock(es)) {
				pr_warn("ES insert assertion failed for "
					"inode: %lu m_pblk %llu != "
					"es_pblk %llu\n",
					inode->i_ino, map.m_pblk,
					ext4_es_pblock(es));
				return;
			}
		} else {
			/*
			 * We don't need to check unwritten extents because
			 * indirect-based files don't have them.
			 */
			BUG_ON(1);
		}
	} else if (retval == 0) {
		if (ext4_es_is_written(es)) {
			pr_warn("ES insert assertion failed for inode: %lu "
				"We can't find the block but we want to add "
				"a written extent [%d/%d/%llu/%x]\n",
				inode->i_ino, es->es_lblk, es->es_len,
				ext4_es_pblock(es), ext4_es_status(es));
			return;
		}
	}
}

static inline void ext4_es_insert_extent_check(struct inode *inode,
					       struct extent_status *es)
{
	/*
	 * We don't need to worry about races here because the caller
	 * must hold i_data_sem.
	 */
	BUG_ON(!rwsem_is_locked(&EXT4_I(inode)->i_data_sem));
	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
		ext4_es_insert_extent_ext_check(inode, es);
	else
		ext4_es_insert_extent_ind_check(inode, es);
}
#else
static inline void ext4_es_insert_extent_check(struct inode *inode,
					       struct extent_status *es)
{
}
#endif

static int __es_insert_extent(struct inode *inode, struct extent_status *newes)
{
	struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
	struct rb_node **p = &tree->root.rb_node;
	struct rb_node *parent = NULL;
	struct extent_status *es;

	while (*p) {
		parent = *p;
		es = rb_entry(parent, struct extent_status, rb_node);

		if (newes->es_lblk < es->es_lblk) {
			if (ext4_es_can_be_merged(newes, es)) {
				/*
				 * Here we can modify es_lblk directly
				 * because it isn't overlapped.
				 */
				es->es_lblk = newes->es_lblk;
				es->es_len += newes->es_len;
				if (ext4_es_is_written(es) ||
				    ext4_es_is_unwritten(es))
					ext4_es_store_pblock(es,
							     newes->es_pblk);
				es = ext4_es_try_to_merge_left(inode, es);
				goto out;
			}
			p = &(*p)->rb_left;
		} else if (newes->es_lblk > ext4_es_end(es)) {
			if (ext4_es_can_be_merged(es, newes)) {
				es->es_len += newes->es_len;
				es = ext4_es_try_to_merge_right(inode, es);
				goto out;
			}
			p = &(*p)->rb_right;
		} else {
			BUG_ON(1);
			return -EINVAL;
		}
	}

	es = ext4_es_alloc_extent(inode, newes->es_lblk, newes->es_len,
				  newes->es_pblk);
	if (!es)
		return -ENOMEM;
	rb_link_node(&es->rb_node, parent, p);
	rb_insert_color(&es->rb_node, &tree->root);

out:
	tree->cache_es = es;
	return 0;
}
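
/*
 * ext4_es_insert_extent() adds information to an inode's extent
 * status tree.  Any existing status for the range is removed first,
 * so this also serves to update an already-cached range.
 *
 * Return 0 on success, error code on failure.
 */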
int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
			  ext4_lblk_t len, ext4_fsblk_t pblk,
			  unsigned int status)
{
	struct extent_status newes;
	ext4_lblk_t end = lblk + len - 1;
	int err = 0;

	es_debug("add [%u/%u) %llu %x to extent status tree of inode %lu\n",
		 lblk, len, pblk, status, inode->i_ino);

	if (!len)
		return 0;

	BUG_ON(end < lblk);

	if ((status & EXTENT_STATUS_DELAYED) &&
	    (status & EXTENT_STATUS_WRITTEN)) {
		ext4_warning(inode->i_sb, "Inserting extent [%u/%u] as "
				"delayed and written, which can potentially "
				"cause data loss.", lblk, len);
		WARN_ON(1);
	}

	newes.es_lblk = lblk;
	newes.es_len = len;
	ext4_es_store_pblock_status(&newes, pblk, status);
	trace_ext4_es_insert_extent(inode, &newes);

	ext4_es_insert_extent_check(inode, &newes);

	write_lock(&EXT4_I(inode)->i_es_lock);
	err = __es_remove_extent(inode, lblk, end);
	if (err != 0)
		goto error;
retry:
	err = __es_insert_extent(inode, &newes);
	if (err == -ENOMEM && __es_shrink(EXT4_SB(inode->i_sb),
					  128, EXT4_I(inode)))
		goto retry;
	if (err == -ENOMEM && !ext4_es_is_delayed(&newes))
		err = 0;

error:
	write_unlock(&EXT4_I(inode)->i_es_lock);

	ext4_es_print_tree(inode);

	return err;
}
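
/*
 * ext4_es_cache_extent() inserts information into the extent status
 * tree if and only if there isn't information about the range in
 * question already.
 */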
void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,
			  ext4_lblk_t len, ext4_fsblk_t pblk,
			  unsigned int status)
{
	struct extent_status *es;
	struct extent_status newes;
	ext4_lblk_t end = lblk + len - 1;

	newes.es_lblk = lblk;
	newes.es_len = len;
	ext4_es_store_pblock_status(&newes, pblk, status);
	trace_ext4_es_cache_extent(inode, &newes);

	if (!len)
		return;

	BUG_ON(end < lblk);

	write_lock(&EXT4_I(inode)->i_es_lock);

	es = __es_tree_search(&EXT4_I(inode)->i_es_tree.root, lblk);
	if (!es || es->es_lblk > end)
		__es_insert_extent(inode, &newes);
	write_unlock(&EXT4_I(inode)->i_es_lock);
}
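
/*
 * ext4_es_lookup_extent() looks up an extent in the extent status tree.
 *
 * Return: 1 on found, 0 on not found
 */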
int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
			  struct extent_status *es)
{
	struct ext4_es_tree *tree;
	struct ext4_es_stats *stats;
	struct extent_status *es1 = NULL;
	struct rb_node *node;
	int found = 0;

	trace_ext4_es_lookup_extent_enter(inode, lblk);
	es_debug("lookup extent in block %u\n", lblk);

	tree = &EXT4_I(inode)->i_es_tree;
	read_lock(&EXT4_I(inode)->i_es_lock);

	/* find extent in cache firstly */
	es->es_lblk = es->es_len = es->es_pblk = 0;
	if (tree->cache_es) {
		es1 = tree->cache_es;
		if (in_range(lblk, es1->es_lblk, es1->es_len)) {
			es_debug("%u cached by [%u/%u)\n",
				 lblk, es1->es_lblk, es1->es_len);
			found = 1;
			goto out;
		}
	}

	node = tree->root.rb_node;
	while (node) {
		es1 = rb_entry(node, struct extent_status, rb_node);
		if (lblk < es1->es_lblk)
			node = node->rb_left;
		else if (lblk > ext4_es_end(es1))
			node = node->rb_right;
		else {
			found = 1;
			break;
		}
	}

out:
	stats = &EXT4_SB(inode->i_sb)->s_es_stats;
	if (found) {
		BUG_ON(!es1);
		es->es_lblk = es1->es_lblk;
		es->es_len = es1->es_len;
		es->es_pblk = es1->es_pblk;
		/*
		 * Mark the tree node (not the caller's copy) referenced so
		 * the shrinker's aging sees this hit.
		 */
		if (!ext4_es_is_referenced(es1))
			ext4_es_set_referenced(es1);
		stats->es_stats_cache_hits++;
	} else {
		stats->es_stats_cache_misses++;
	}

	read_unlock(&EXT4_I(inode)->i_es_lock);

	trace_ext4_es_lookup_extent_exit(inode, es, found);
	return found;
}

static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
			      ext4_lblk_t end)
{
	struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
	struct rb_node *node;
	struct extent_status *es;
	struct extent_status orig_es;
	ext4_lblk_t len1, len2;
	ext4_fsblk_t block;
	int err;

retry:
	err = 0;
	es = __es_tree_search(&tree->root, lblk);
	if (!es)
		goto out;
	if (es->es_lblk > end)
		goto out;

	/* Simply invalidate cache_es. */
	tree->cache_es = NULL;

	orig_es.es_lblk = es->es_lblk;
	orig_es.es_len = es->es_len;
	orig_es.es_pblk = es->es_pblk;

	len1 = lblk > es->es_lblk ? lblk - es->es_lblk : 0;
	len2 = ext4_es_end(es) > end ? ext4_es_end(es) - end : 0;
	if (len1 > 0)
		es->es_len = len1;
	if (len2 > 0) {
		if (len1 > 0) {
			struct extent_status newes;

			newes.es_lblk = end + 1;
			newes.es_len = len2;
			block = 0x7FDEADBEEFULL;
			if (ext4_es_is_written(&orig_es) ||
			    ext4_es_is_unwritten(&orig_es))
				block = ext4_es_pblock(&orig_es) +
					orig_es.es_len - len2;
			ext4_es_store_pblock_status(&newes, block,
						    ext4_es_status(&orig_es));
			err = __es_insert_extent(inode, &newes);
			if (err) {
				es->es_lblk = orig_es.es_lblk;
				es->es_len = orig_es.es_len;
				if ((err == -ENOMEM) &&
				    __es_shrink(EXT4_SB(inode->i_sb),
						128, EXT4_I(inode)))
					goto retry;
				goto out;
			}
		} else {
			es->es_lblk = end + 1;
			es->es_len = len2;
			if (ext4_es_is_written(es) ||
			    ext4_es_is_unwritten(es)) {
				block = orig_es.es_pblk + orig_es.es_len - len2;
				ext4_es_store_pblock(es, block);
			}
		}
		goto out;
	}

	if (len1 > 0) {
		node = rb_next(&es->rb_node);
		if (node)
			es = rb_entry(node, struct extent_status, rb_node);
		else
			es = NULL;
	}

	while (es && ext4_es_end(es) <= end) {
		node = rb_next(&es->rb_node);
		rb_erase(&es->rb_node, &tree->root);
		ext4_es_free_extent(inode, es);
		if (!node) {
			es = NULL;
			break;
		}
		es = rb_entry(node, struct extent_status, rb_node);
	}

	if (es && es->es_lblk < end + 1) {
		ext4_lblk_t orig_len = es->es_len;

		len1 = ext4_es_end(es) - end;
		es->es_lblk = end + 1;
		es->es_len = len1;
		if (ext4_es_is_written(es) || ext4_es_is_unwritten(es)) {
			block = es->es_pblk + orig_len - len1;
			ext4_es_store_pblock(es, block);
		}
	}

out:
	return err;
}
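
/*
 * ext4_es_remove_extent() removes a space from an inode's extent
 * status tree.
 *
 * Return 0 on success, error code on failure.
 */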
int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
			  ext4_lblk_t len)
{
	ext4_lblk_t end;
	int err = 0;

	trace_ext4_es_remove_extent(inode, lblk, len);
	es_debug("remove [%u/%u) from extent status tree of inode %lu\n",
		 lblk, len, inode->i_ino);

	if (!len)
		return err;

	end = lblk + len - 1;
	BUG_ON(end < lblk);

	/*
	 * ext4_clear_inode() depends on us taking i_es_lock unconditionally
	 * so that we are sure __es_shrink() is done with the inode before it
	 * is reclaimed.
	 */
	write_lock(&EXT4_I(inode)->i_es_lock);
	err = __es_remove_extent(inode, lblk, end);
	write_unlock(&EXT4_I(inode)->i_es_lock);
	ext4_es_print_tree(inode);
	return err;
}
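
/*
 * Walk the inodes on s_es_list, scanning up to @nr_to_scan extent
 * status entries and freeing the reclaimable ones.  @locked_ei (whose
 * i_es_lock the caller already holds) is skipped during the walk; if
 * nothing else could be reclaimed, it is shrunk directly as a last
 * resort.  Returns the number of entries freed.
 */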
static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
		       struct ext4_inode_info *locked_ei)
{
	struct ext4_inode_info *ei;
	struct ext4_es_stats *es_stats;
	ktime_t start_time;
	u64 scan_time;
	int nr_to_walk;
	int nr_shrunk = 0;
	int retried = 0, nr_skipped = 0;

	es_stats = &sbi->s_es_stats;
	start_time = ktime_get();

retry:
	spin_lock(&sbi->s_es_lock);
	nr_to_walk = sbi->s_es_nr_inode;
	while (nr_to_walk-- > 0) {
		if (list_empty(&sbi->s_es_list)) {
			spin_unlock(&sbi->s_es_lock);
			goto out;
		}
		ei = list_first_entry(&sbi->s_es_list, struct ext4_inode_info,
				      i_es_list);
		/* Move the inode to the tail */
		list_move_tail(&ei->i_es_list, &sbi->s_es_list);

		/*
		 * Normally we try hard to avoid shrinking precached inodes,
		 * but we will as a last resort.
		 */
		if (!retried && ext4_test_inode_state(&ei->vfs_inode,
						      EXT4_STATE_EXT_PRECACHED)) {
			nr_skipped++;
			continue;
		}

		if (ei == locked_ei || !write_trylock(&ei->i_es_lock)) {
			nr_skipped++;
			continue;
		}

		/*
		 * Now we hold i_es_lock, which protects us from inode
		 * reclaim.  It is safe to drop s_es_lock.
		 */
		spin_unlock(&sbi->s_es_lock);

		nr_shrunk += es_reclaim_extents(ei, &nr_to_scan);
		write_unlock(&ei->i_es_lock);

		if (nr_to_scan <= 0)
			goto out;
		spin_lock(&sbi->s_es_lock);
	}
	spin_unlock(&sbi->s_es_lock);

	/*
	 * If we skipped any inodes, and we weren't able to make any
	 * forward progress, try again, this time shrinking precached
	 * inodes as well.
	 */
	if ((nr_shrunk == 0) && nr_skipped && !retried) {
		retried++;
		goto retry;
	}

	if (locked_ei && nr_shrunk == 0)
		nr_shrunk = es_reclaim_extents(locked_ei, &nr_to_scan);

out:
	/* Fold this run into the moving averages (3/4 old, 1/4 new). */
	scan_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
	if (likely(es_stats->es_stats_scan_time))
		es_stats->es_stats_scan_time = (scan_time +
				es_stats->es_stats_scan_time*3) / 4;
	else
		es_stats->es_stats_scan_time = scan_time;
	if (scan_time > es_stats->es_stats_max_scan_time)
		es_stats->es_stats_max_scan_time = scan_time;
	if (likely(es_stats->es_stats_shrunk))
		es_stats->es_stats_shrunk = (nr_shrunk +
				es_stats->es_stats_shrunk*3) / 4;
	else
		es_stats->es_stats_shrunk = nr_shrunk;

	trace_ext4_es_shrink(sbi->s_sb, nr_shrunk, scan_time,
			     nr_skipped, retried);
	return nr_shrunk;
}

static unsigned long ext4_es_count(struct shrinker *shrink,
				   struct shrink_control *sc)
{
	unsigned long nr;
	struct ext4_sb_info *sbi;

	sbi = container_of(shrink, struct ext4_sb_info, s_es_shrinker);
	nr = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt);
	trace_ext4_es_shrink_count(sbi->s_sb, sc->nr_to_scan, nr);
	return nr;
}

static unsigned long ext4_es_scan(struct shrinker *shrink,
				  struct shrink_control *sc)
{
	struct ext4_sb_info *sbi = container_of(shrink,
					struct ext4_sb_info, s_es_shrinker);
	int nr_to_scan = sc->nr_to_scan;
	int ret, nr_shrunk;

	ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt);
	trace_ext4_es_shrink_scan_enter(sbi->s_sb, nr_to_scan, ret);

	if (!nr_to_scan)
		return ret;

	nr_shrunk = __es_shrink(sbi, nr_to_scan, NULL);

	trace_ext4_es_shrink_scan_exit(sbi->s_sb, nr_shrunk, ret);
	return nr_shrunk;
}

static void *ext4_es_seq_shrinker_info_start(struct seq_file *seq, loff_t *pos)
{
	return *pos ? NULL : SEQ_START_TOKEN;
}

static void *
ext4_es_seq_shrinker_info_next(struct seq_file *seq, void *v, loff_t *pos)
{
	return NULL;
}

static int ext4_es_seq_shrinker_info_show(struct seq_file *seq, void *v)
{
	struct ext4_sb_info *sbi = seq->private;
	struct ext4_es_stats *es_stats = &sbi->s_es_stats;
	struct ext4_inode_info *ei, *max = NULL;
	unsigned int inode_cnt = 0;

	if (v != SEQ_START_TOKEN)
		return 0;

	/* here we just find an inode that has the max nr. of objects */
	spin_lock(&sbi->s_es_lock);
	list_for_each_entry(ei, &sbi->s_es_list, i_es_list) {
		inode_cnt++;
		if (max && max->i_es_all_nr < ei->i_es_all_nr)
			max = ei;
		else if (!max)
			max = ei;
	}
	spin_unlock(&sbi->s_es_lock);

	seq_printf(seq, "stats:\n  %lld objects\n  %lld reclaimable objects\n",
		   percpu_counter_sum_positive(&es_stats->es_stats_all_cnt),
		   percpu_counter_sum_positive(&es_stats->es_stats_shk_cnt));
	seq_printf(seq, "  %lu/%lu cache hits/misses\n",
		   es_stats->es_stats_cache_hits,
		   es_stats->es_stats_cache_misses);
	if (inode_cnt)
		seq_printf(seq, "  %d inodes on list\n", inode_cnt);

	seq_printf(seq, "average:\n  %llu us scan time\n",
		   div_u64(es_stats->es_stats_scan_time, 1000));
	seq_printf(seq, "  %lu shrunk objects\n", es_stats->es_stats_shrunk);
	if (inode_cnt)
		seq_printf(seq,
			"maximum:\n  %lu inode (%u objects, %u reclaimable)\n"
			"  %llu us max scan time\n",
			max->vfs_inode.i_ino, max->i_es_all_nr, max->i_es_shk_nr,
			div_u64(es_stats->es_stats_max_scan_time, 1000));

	return 0;
}

static void ext4_es_seq_shrinker_info_stop(struct seq_file *seq, void *v)
{
}

static const struct seq_operations ext4_es_seq_shrinker_info_ops = {
	.start = ext4_es_seq_shrinker_info_start,
	.next = ext4_es_seq_shrinker_info_next,
	.stop = ext4_es_seq_shrinker_info_stop,
	.show = ext4_es_seq_shrinker_info_show,
};

static int
ext4_es_seq_shrinker_info_open(struct inode *inode, struct file *file)
{
	int ret;

	ret = seq_open(file, &ext4_es_seq_shrinker_info_ops);
	if (!ret) {
		struct seq_file *m = file->private_data;
		m->private = PDE_DATA(inode);
	}

	return ret;
}

static int
ext4_es_seq_shrinker_info_release(struct inode *inode, struct file *file)
{
	return seq_release(inode, file);
}

static const struct file_operations ext4_es_seq_shrinker_info_fops = {
	.owner = THIS_MODULE,
	.open = ext4_es_seq_shrinker_info_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = ext4_es_seq_shrinker_info_release,
};

int ext4_es_register_shrinker(struct ext4_sb_info *sbi)
{
	int err;

	/* Make sure we have enough bits for physical block number */
	BUILD_BUG_ON(ES_SHIFT < 48);
	INIT_LIST_HEAD(&sbi->s_es_list);
	sbi->s_es_nr_inode = 0;
	spin_lock_init(&sbi->s_es_lock);
	sbi->s_es_stats.es_stats_shrunk = 0;
	sbi->s_es_stats.es_stats_cache_hits = 0;
	sbi->s_es_stats.es_stats_cache_misses = 0;
	sbi->s_es_stats.es_stats_scan_time = 0;
	sbi->s_es_stats.es_stats_max_scan_time = 0;
	err = percpu_counter_init(&sbi->s_es_stats.es_stats_all_cnt, 0, GFP_KERNEL);
	if (err)
		return err;
	err = percpu_counter_init(&sbi->s_es_stats.es_stats_shk_cnt, 0, GFP_KERNEL);
	if (err)
		goto err1;

	sbi->s_es_shrinker.scan_objects = ext4_es_scan;
	sbi->s_es_shrinker.count_objects = ext4_es_count;
	sbi->s_es_shrinker.seeks = DEFAULT_SEEKS;
	err = register_shrinker(&sbi->s_es_shrinker);
	if (err)
		goto err2;

	if (sbi->s_proc)
		proc_create_data("es_shrinker_info", S_IRUGO, sbi->s_proc,
				 &ext4_es_seq_shrinker_info_fops, sbi);

	return 0;

err2:
	percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt);
err1:
	percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
	return err;
}

void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi)
{
	if (sbi->s_proc)
		remove_proc_entry("es_shrinker_info", sbi->s_proc);
	percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
	percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt);
	unregister_shrinker(&sbi->s_es_shrinker);
}
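
/*
 * Shrink extents in the given inode from ei->i_es_shrink_lblk until
 * @end.  Scan at most *nr_to_scan extents, updating *nr_to_scan and
 * *nr_shrunk as it goes.
 *
 * Return 0 if we hit the end of the tree or the scan interval, 1 if
 * *nr_to_scan was exhausted first.
 */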
static int es_do_reclaim_extents(struct ext4_inode_info *ei, ext4_lblk_t end,
				 int *nr_to_scan, int *nr_shrunk)
{
	struct inode *inode = &ei->vfs_inode;
	struct ext4_es_tree *tree = &ei->i_es_tree;
	struct extent_status *es;
	struct rb_node *node;

	es = __es_tree_search(&tree->root, ei->i_es_shrink_lblk);
	if (!es)
		goto out_wrap;
	node = &es->rb_node;
	while (*nr_to_scan > 0) {
		if (es->es_lblk > end) {
			ei->i_es_shrink_lblk = end + 1;
			return 0;
		}

		(*nr_to_scan)--;
		node = rb_next(&es->rb_node);
		/*
		 * We can't reclaim delayed extents from the status tree
		 * because fiemap, bigalloc, and seek_data/hole need them.
		 */
		if (ext4_es_is_delayed(es))
			goto next;
		if (ext4_es_is_referenced(es)) {
			ext4_es_clear_referenced(es);
			goto next;
		}

		rb_erase(&es->rb_node, &tree->root);
		ext4_es_free_extent(inode, es);
		(*nr_shrunk)++;
next:
		if (!node)
			goto out_wrap;
		es = rb_entry(node, struct extent_status, rb_node);
	}
	ei->i_es_shrink_lblk = es->es_lblk;
	return 1;
out_wrap:
	ei->i_es_shrink_lblk = 0;
	return 0;
}

static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan)
{
	struct inode *inode = &ei->vfs_inode;
	int nr_shrunk = 0;
	ext4_lblk_t start = ei->i_es_shrink_lblk;
	static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
				      DEFAULT_RATELIMIT_BURST);

	if (ei->i_es_shk_nr == 0)
		return 0;

	if (ext4_test_inode_state(inode, EXT4_STATE_EXT_PRECACHED) &&
	    __ratelimit(&_rs))
		ext4_warning(inode->i_sb, "forced shrink of precached extents");

	if (!es_do_reclaim_extents(ei, EXT_MAX_BLOCKS, nr_to_scan, &nr_shrunk) &&
	    start != 0)
		es_do_reclaim_extents(ei, start - 1, nr_to_scan, &nr_shrunk);

	ei->i_es_tree.cache_es = NULL;
	return nr_shrunk;
}