/*
 *  fs/ext4/extents_status.c
 *
 * Ext4 extents status tree core functions.
 */
#include <linux/list_sort.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include "ext4.h"

#include <trace/events/ext4.h>

/*
 * Extent status tree implementation.
 *
 * Each inode carries an in-memory red-black tree (i_es_tree) whose nodes,
 * 'struct extent_status', describe the state of a contiguous range of
 * logical blocks: written, unwritten, delayed (delalloc, no blocks
 * allocated yet) or hole.  Nodes are keyed by their starting logical
 * block, never overlap, and are merged with their neighbours whenever
 * the ranges are contiguous and of the same status.
 *
 * The tree acts as a cache in front of the on-disk extent tree or the
 * indirect block map, and it is the only record of delayed extents.
 * Because of that, delayed entries are never reclaimed, while all other
 * entries are counted as shrinkable and may be freed under memory
 * pressure by the shrinker registered in ext4_es_register_shrinker().
 * A one-entry cache (tree->cache_es) remembers the most recently used
 * node to speed up repeated lookups in the same area of the file.
 */
static struct kmem_cache *ext4_es_cachep;

static int __es_insert_extent(struct inode *inode, struct extent_status *newes);
static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
			      ext4_lblk_t end);
static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan);
static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
		       struct ext4_inode_info *locked_ei);

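/* Create the slab cache used to allocate extent_status nodes. */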
int __init ext4_init_es(void)
{
	ext4_es_cachep = kmem_cache_create("ext4_extent_status",
					   sizeof(struct extent_status),
					   0, (SLAB_RECLAIM_ACCOUNT), NULL);
	if (ext4_es_cachep == NULL)
		return -ENOMEM;
	return 0;
}

void ext4_exit_es(void)
{
	kmem_cache_destroy(ext4_es_cachep);
}

void ext4_es_init_tree(struct ext4_es_tree *tree)
{
	tree->root = RB_ROOT;
	tree->cache_es = NULL;
}

#ifdef ES_DEBUG__
static void ext4_es_print_tree(struct inode *inode)
{
	struct ext4_es_tree *tree;
	struct rb_node *node;

	printk(KERN_DEBUG "status extents for inode %lu:", inode->i_ino);
	tree = &EXT4_I(inode)->i_es_tree;
	node = rb_first(&tree->root);
	while (node) {
		struct extent_status *es;
		es = rb_entry(node, struct extent_status, rb_node);
		printk(KERN_DEBUG " [%u/%u) %llu %x",
		       es->es_lblk, es->es_len,
		       ext4_es_pblock(es), ext4_es_status(es));
		node = rb_next(node);
	}
	printk(KERN_DEBUG "\n");
}
#else
#define ext4_es_print_tree(inode)
#endif

static inline ext4_lblk_t ext4_es_end(struct extent_status *es)
{
	BUG_ON(es->es_lblk + es->es_len < es->es_lblk);
	return es->es_lblk + es->es_len - 1;
}

/*
 * Search the tree for an extent containing @lblk.  If no extent contains
 * it, return the next extent to the right (or NULL if there is none).
 */
static struct extent_status *__es_tree_search(struct rb_root *root,
					      ext4_lblk_t lblk)
{
	struct rb_node *node = root->rb_node;
	struct extent_status *es = NULL;

	while (node) {
		es = rb_entry(node, struct extent_status, rb_node);
		if (lblk < es->es_lblk)
			node = node->rb_left;
		else if (lblk > ext4_es_end(es))
			node = node->rb_right;
		else
			return es;
	}

	if (es && lblk < es->es_lblk)
		return es;

	if (es && lblk > ext4_es_end(es)) {
		node = rb_next(&es->rb_node);
		return node ? rb_entry(node, struct extent_status, rb_node) :
			      NULL;
	}

	return NULL;
}

/*
 * ext4_es_find_delayed_extent_range - find the first delayed extent in the
 * range [@lblk, @end].  On return *@es describes the extent found; if no
 * delayed extent exists in the range, es->es_len is left at zero.
 */
void ext4_es_find_delayed_extent_range(struct inode *inode,
				 ext4_lblk_t lblk, ext4_lblk_t end,
				 struct extent_status *es)
{
	struct ext4_es_tree *tree = NULL;
	struct extent_status *es1 = NULL;
	struct rb_node *node;

	BUG_ON(es == NULL);
	BUG_ON(end < lblk);
	trace_ext4_es_find_delayed_extent_range_enter(inode, lblk);

	read_lock(&EXT4_I(inode)->i_es_lock);
	tree = &EXT4_I(inode)->i_es_tree;

	/* check the one-entry cache first */
	es->es_lblk = es->es_len = es->es_pblk = 0;
	if (tree->cache_es) {
		es1 = tree->cache_es;
		if (in_range(lblk, es1->es_lblk, es1->es_len)) {
			es_debug("%u cached by [%u/%u) %llu %x\n",
				 lblk, es1->es_lblk, es1->es_len,
				 ext4_es_pblock(es1), ext4_es_status(es1));
			goto out;
		}
	}

	es1 = __es_tree_search(&tree->root, lblk);

out:
	if (es1 && !ext4_es_is_delayed(es1)) {
		while ((node = rb_next(&es1->rb_node)) != NULL) {
			es1 = rb_entry(node, struct extent_status, rb_node);
			if (es1->es_lblk > end) {
				es1 = NULL;
				break;
			}
			if (ext4_es_is_delayed(es1))
				break;
		}
	}

	if (es1 && ext4_es_is_delayed(es1)) {
		tree->cache_es = es1;
		es->es_lblk = es1->es_lblk;
		es->es_len = es1->es_len;
		es->es_pblk = es1->es_pblk;
	}

	read_unlock(&EXT4_I(inode)->i_es_lock);

	trace_ext4_es_find_delayed_extent_range_exit(inode, es);
}

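/*
 * Add the inode to the per-superblock list of inodes whose extent status
 * trees hold reclaimable (non-delayed) entries, so the shrinker can find it.
 */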
static void ext4_es_list_add(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

	if (!list_empty(&ei->i_es_list))
		return;

	spin_lock(&sbi->s_es_lock);
	if (list_empty(&ei->i_es_list)) {
		list_add_tail(&ei->i_es_list, &sbi->s_es_list);
		sbi->s_es_nr_inode++;
	}
	spin_unlock(&sbi->s_es_lock);
}

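/* Remove the inode from the shrinker's list once it has no reclaimable entries. */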
static void ext4_es_list_del(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

	spin_lock(&sbi->s_es_lock);
	if (!list_empty(&ei->i_es_list)) {
		list_del_init(&ei->i_es_list);
		sbi->s_es_nr_inode--;
		WARN_ON_ONCE(sbi->s_es_nr_inode < 0);
	}
	spin_unlock(&sbi->s_es_lock);
}

static struct extent_status *
ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len,
		     ext4_fsblk_t pblk)
{
	struct extent_status *es;
	es = kmem_cache_alloc(ext4_es_cachep, GFP_ATOMIC);
	if (es == NULL)
		return NULL;
	es->es_lblk = lblk;
	es->es_len = len;
	es->es_pblk = pblk;

	/*
	 * Delayed extents are not counted as shrinkable because we never
	 * try to reclaim them.
	 */
	if (!ext4_es_is_delayed(es)) {
		if (!EXT4_I(inode)->i_es_shk_nr++)
			ext4_es_list_add(inode);
		percpu_counter_inc(&EXT4_SB(inode->i_sb)->
					s_es_stats.es_stats_shk_cnt);
	}

	EXT4_I(inode)->i_es_all_nr++;
	percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt);

	return es;
}

static void ext4_es_free_extent(struct inode *inode, struct extent_status *es)
{
	EXT4_I(inode)->i_es_all_nr--;
	percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt);

	/* Decrease the shrink counter when this es is not delayed */
	if (!ext4_es_is_delayed(es)) {
		BUG_ON(EXT4_I(inode)->i_es_shk_nr == 0);
		if (!--EXT4_I(inode)->i_es_shk_nr)
			ext4_es_list_del(inode);
		percpu_counter_dec(&EXT4_SB(inode->i_sb)->
					s_es_stats.es_stats_shk_cnt);
	}

	kmem_cache_free(ext4_es_cachep, es);
}

/*
 * Check whether two extents can be merged.  They must be logically
 * contiguous, physically contiguous (for mapped extents) and have the
 * same status.
 */
static int ext4_es_can_be_merged(struct extent_status *es1,
				 struct extent_status *es2)
{
	if (ext4_es_type(es1) != ext4_es_type(es2))
		return 0;

	if (((__u64) es1->es_len) + es2->es_len > EXT_MAX_BLOCKS) {
		pr_warn("ES assertion failed when merging extents. "
			"The sum of lengths of es1 (%d) and es2 (%d) "
			"is bigger than allowed file size (%d)\n",
			es1->es_len, es2->es_len, EXT_MAX_BLOCKS);
		WARN_ON(1);
		return 0;
	}

	if (((__u64) es1->es_lblk) + es1->es_len != es2->es_lblk)
		return 0;

	if ((ext4_es_is_written(es1) || ext4_es_is_unwritten(es1)) &&
	    (ext4_es_pblock(es1) + es1->es_len == ext4_es_pblock(es2)))
		return 1;

	if (ext4_es_is_hole(es1))
		return 1;

	/* we need to check that the delayed extent is without unwritten status */
	if (ext4_es_is_delayed(es1) && !ext4_es_is_unwritten(es1))
		return 1;

	return 0;
}

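/*
 * Try to merge @es with its predecessor in the tree.  On a merge the
 * predecessor absorbs @es, which is freed; the surviving node is returned.
 */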
static struct extent_status *
ext4_es_try_to_merge_left(struct inode *inode, struct extent_status *es)
{
	struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
	struct extent_status *es1;
	struct rb_node *node;

	node = rb_prev(&es->rb_node);
	if (!node)
		return es;

	es1 = rb_entry(node, struct extent_status, rb_node);
	if (ext4_es_can_be_merged(es1, es)) {
		es1->es_len += es->es_len;
		if (ext4_es_is_referenced(es))
			ext4_es_set_referenced(es1);
		rb_erase(&es->rb_node, &tree->root);
		ext4_es_free_extent(inode, es);
		es = es1;
	}

	return es;
}

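/* As above, but try to merge @es with its successor in the tree. */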
static struct extent_status *
ext4_es_try_to_merge_right(struct inode *inode, struct extent_status *es)
{
	struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
	struct extent_status *es1;
	struct rb_node *node;

	node = rb_next(&es->rb_node);
	if (!node)
		return es;

	es1 = rb_entry(node, struct extent_status, rb_node);
	if (ext4_es_can_be_merged(es, es1)) {
		es->es_len += es1->es_len;
		if (ext4_es_is_referenced(es1))
			ext4_es_set_referenced(es);
		rb_erase(node, &tree->root);
		ext4_es_free_extent(inode, es1);
	}

	return es;
}

#ifdef ES_AGGRESSIVE_TEST
#include "ext4_extents.h"

static void ext4_es_insert_extent_ext_check(struct inode *inode,
					    struct extent_status *es)
{
	struct ext4_ext_path *path = NULL;
	struct ext4_extent *ex;
	ext4_lblk_t ee_block;
	ext4_fsblk_t ee_start;
	unsigned short ee_len;
	int depth, ee_status, es_status;

	path = ext4_find_extent(inode, es->es_lblk, NULL, EXT4_EX_NOCACHE);
	if (IS_ERR(path))
		return;

	depth = ext_depth(inode);
	ex = path[depth].p_ext;

	if (ex) {

		ee_block = le32_to_cpu(ex->ee_block);
		ee_start = ext4_ext_pblock(ex);
		ee_len = ext4_ext_get_actual_len(ex);

		ee_status = ext4_ext_is_unwritten(ex) ? 1 : 0;
		es_status = ext4_es_is_unwritten(es) ? 1 : 0;

		/*
		 * Make sure ex and es do not overlap when we try to insert
		 * a delayed/hole extent.
		 */
		if (!ext4_es_is_written(es) && !ext4_es_is_unwritten(es)) {
			if (in_range(es->es_lblk, ee_block, ee_len)) {
				pr_warn("ES insert assertion failed for "
					"inode: %lu we can find an extent "
					"at block [%d/%d/%llu/%c], but we "
					"want to add a delayed/hole extent "
					"[%d/%d/%llu/%x]\n",
					inode->i_ino, ee_block, ee_len,
					ee_start, ee_status ? 'u' : 'w',
					es->es_lblk, es->es_len,
					ext4_es_pblock(es), ext4_es_status(es));
			}
			goto out;
		}

		/*
		 * We don't check ee_block == es->es_lblk etc. because es
		 * might cover only part of the on-disk extent, or vice
		 * versa.
		 */
		if (es->es_lblk < ee_block ||
		    ext4_es_pblock(es) != ee_start + es->es_lblk - ee_block) {
			pr_warn("ES insert assertion failed for inode: %lu "
				"ex_status [%d/%d/%llu/%c] != "
				"es_status [%d/%d/%llu/%c]\n", inode->i_ino,
				ee_block, ee_len, ee_start,
				ee_status ? 'u' : 'w', es->es_lblk, es->es_len,
				ext4_es_pblock(es), es_status ? 'u' : 'w');
			goto out;
		}

		if (ee_status ^ es_status) {
			pr_warn("ES insert assertion failed for inode: %lu "
				"ex_status [%d/%d/%llu/%c] != "
				"es_status [%d/%d/%llu/%c]\n", inode->i_ino,
				ee_block, ee_len, ee_start,
				ee_status ? 'u' : 'w', es->es_lblk, es->es_len,
				ext4_es_pblock(es), es_status ? 'u' : 'w');
		}
	} else {
		/*
		 * We can't find an extent on disk, so make sure we are not
		 * trying to add a written/unwritten extent.
		 */
		if (!ext4_es_is_delayed(es) && !ext4_es_is_hole(es)) {
			pr_warn("ES insert assertion failed for inode: %lu "
				"can't find an extent at block %d but we want "
				"to add a written/unwritten extent "
				"[%d/%d/%llu/%x]\n", inode->i_ino,
				es->es_lblk, es->es_lblk, es->es_len,
				ext4_es_pblock(es), ext4_es_status(es));
		}
	}
out:
	ext4_ext_drop_refs(path);
	kfree(path);
}

static void ext4_es_insert_extent_ind_check(struct inode *inode,
					    struct extent_status *es)
{
	struct ext4_map_blocks map;
	int retval;

	/*
	 * Here we call ext4_ind_map_blocks() to look up the block mapping,
	 * because the 'Indirect' structure is defined in indirect.c and
	 * the direct/indirect tree cannot be accessed from outside it.
	 */
	map.m_lblk = es->es_lblk;
	map.m_len = es->es_len;

	retval = ext4_ind_map_blocks(NULL, inode, &map, 0);
	if (retval > 0) {
		if (ext4_es_is_delayed(es) || ext4_es_is_hole(es)) {
			/*
			 * We want to add a delayed/hole extent but this
			 * block has already been allocated.
			 */
			pr_warn("ES insert assertion failed for inode: %lu "
				"We can find blocks but we want to add a "
				"delayed/hole extent [%d/%d/%llu/%x]\n",
				inode->i_ino, es->es_lblk, es->es_len,
				ext4_es_pblock(es), ext4_es_status(es));
			return;
		} else if (ext4_es_is_written(es)) {
			if (retval != es->es_len) {
				pr_warn("ES insert assertion failed for "
					"inode: %lu retval %d != es_len %d\n",
					inode->i_ino, retval, es->es_len);
				return;
			}
			if (map.m_pblk != ext4_es_pblock(es)) {
				pr_warn("ES insert assertion failed for "
					"inode: %lu m_pblk %llu != "
					"es_pblk %llu\n",
					inode->i_ino, map.m_pblk,
					ext4_es_pblock(es));
				return;
			}
		} else {
			/*
			 * We don't need to check unwritten extents because
			 * indirect-based files don't have them.
			 */
			BUG_ON(1);
		}
	} else if (retval == 0) {
		if (ext4_es_is_written(es)) {
			pr_warn("ES insert assertion failed for inode: %lu "
				"We can't find the block but we want to add "
				"a written extent [%d/%d/%llu/%x]\n",
				inode->i_ino, es->es_lblk, es->es_len,
				ext4_es_pblock(es), ext4_es_status(es));
			return;
		}
	}
}

static inline void ext4_es_insert_extent_check(struct inode *inode,
					       struct extent_status *es)
{
	/*
	 * We don't need to worry about races because the caller holds
	 * i_data_sem.
	 */
	BUG_ON(!rwsem_is_locked(&EXT4_I(inode)->i_data_sem));
	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
		ext4_es_insert_extent_ext_check(inode, es);
	else
		ext4_es_insert_extent_ind_check(inode, es);
}
#else
static inline void ext4_es_insert_extent_check(struct inode *inode,
					       struct extent_status *es)
{
}
#endif

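/*
 * Insert @newes into the tree, merging it with a logically and physically
 * adjacent neighbour of the same status when possible.  The caller must
 * hold i_es_lock for writing.
 */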
static int __es_insert_extent(struct inode *inode, struct extent_status *newes)
{
	struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
	struct rb_node **p = &tree->root.rb_node;
	struct rb_node *parent = NULL;
	struct extent_status *es;

	while (*p) {
		parent = *p;
		es = rb_entry(parent, struct extent_status, rb_node);

		if (newes->es_lblk < es->es_lblk) {
			if (ext4_es_can_be_merged(newes, es)) {
				/*
				 * Here we can modify es_lblk directly
				 * because it isn't overlapped.
				 */
				es->es_lblk = newes->es_lblk;
				es->es_len += newes->es_len;
				if (ext4_es_is_written(es) ||
				    ext4_es_is_unwritten(es))
					ext4_es_store_pblock(es,
							     newes->es_pblk);
				es = ext4_es_try_to_merge_left(inode, es);
				goto out;
			}
			p = &(*p)->rb_left;
		} else if (newes->es_lblk > ext4_es_end(es)) {
			if (ext4_es_can_be_merged(es, newes)) {
				es->es_len += newes->es_len;
				es = ext4_es_try_to_merge_right(inode, es);
				goto out;
			}
			p = &(*p)->rb_right;
		} else {
			BUG_ON(1);
			return -EINVAL;
		}
	}

	es = ext4_es_alloc_extent(inode, newes->es_lblk, newes->es_len,
				  newes->es_pblk);
	if (!es)
		return -ENOMEM;
	rb_link_node(&es->rb_node, parent, p);
	rb_insert_color(&es->rb_node, &tree->root);

out:
	tree->cache_es = es;
	return 0;
}

/*
 * ext4_es_insert_extent() adds information to an inode's extent status
 * tree.  Any existing entries covering the range are trimmed or removed
 * first.
 *
 * Return 0 on success, error code on failure.
 */
int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
			  ext4_lblk_t len, ext4_fsblk_t pblk,
			  unsigned int status)
{
	struct extent_status newes;
	ext4_lblk_t end = lblk + len - 1;
	int err = 0;

	es_debug("add [%u/%u) %llu %x to extent status tree of inode %lu\n",
		 lblk, len, pblk, status, inode->i_ino);

	if (!len)
		return 0;

	BUG_ON(end < lblk);

	if ((status & EXTENT_STATUS_DELAYED) &&
	    (status & EXTENT_STATUS_WRITTEN)) {
		ext4_warning(inode->i_sb, "Inserting extent [%u/%u] as "
				"delayed and written, which can potentially "
				"cause data loss.", lblk, len);
		WARN_ON(1);
	}

	newes.es_lblk = lblk;
	newes.es_len = len;
	ext4_es_store_pblock_status(&newes, pblk, status);
	trace_ext4_es_insert_extent(inode, &newes);

	ext4_es_insert_extent_check(inode, &newes);

	write_lock(&EXT4_I(inode)->i_es_lock);
	err = __es_remove_extent(inode, lblk, end);
	if (err != 0)
		goto error;
	/*
	 * On -ENOMEM, try to reclaim entries from other inodes and retry.
	 * Failing to insert a non-delayed extent is harmless because, for
	 * those, the tree is only a cache.
	 */
retry:
	err = __es_insert_extent(inode, &newes);
	if (err == -ENOMEM && __es_shrink(EXT4_SB(inode->i_sb),
					  128, EXT4_I(inode)))
		goto retry;
	if (err == -ENOMEM && !ext4_es_is_delayed(&newes))
		err = 0;

error:
	write_unlock(&EXT4_I(inode)->i_es_lock);

	ext4_es_print_tree(inode);

	return err;
}

/*
 * ext4_es_cache_extent() inserts information into the extent status tree
 * only if no entry for the range exists yet, so existing state is never
 * disturbed.
 */
void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,
			  ext4_lblk_t len, ext4_fsblk_t pblk,
			  unsigned int status)
{
	struct extent_status *es;
	struct extent_status newes;
	ext4_lblk_t end = lblk + len - 1;

	newes.es_lblk = lblk;
	newes.es_len = len;
	ext4_es_store_pblock_status(&newes, pblk, status);
	trace_ext4_es_cache_extent(inode, &newes);

	if (!len)
		return;

	BUG_ON(end < lblk);

	write_lock(&EXT4_I(inode)->i_es_lock);

	es = __es_tree_search(&EXT4_I(inode)->i_es_tree.root, lblk);
	if (!es || es->es_lblk > end)
		__es_insert_extent(inode, &newes);
	write_unlock(&EXT4_I(inode)->i_es_lock);
}

/*
 * ext4_es_lookup_extent() looks up an extent in the extent status tree.
 *
 * Return: 1 if found, 0 if not.
 */
int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
			  struct extent_status *es)
{
	struct ext4_es_tree *tree;
	struct ext4_es_stats *stats;
	struct extent_status *es1 = NULL;
	struct rb_node *node;
	int found = 0;

	trace_ext4_es_lookup_extent_enter(inode, lblk);
	es_debug("lookup extent in block %u\n", lblk);

	tree = &EXT4_I(inode)->i_es_tree;
	read_lock(&EXT4_I(inode)->i_es_lock);

	/* check the one-entry cache first */
	es->es_lblk = es->es_len = es->es_pblk = 0;
	if (tree->cache_es) {
		es1 = tree->cache_es;
		if (in_range(lblk, es1->es_lblk, es1->es_len)) {
			es_debug("%u cached by [%u/%u)\n",
				 lblk, es1->es_lblk, es1->es_len);
			found = 1;
			goto out;
		}
	}

	node = tree->root.rb_node;
	while (node) {
		es1 = rb_entry(node, struct extent_status, rb_node);
		if (lblk < es1->es_lblk)
			node = node->rb_left;
		else if (lblk > ext4_es_end(es1))
			node = node->rb_right;
		else {
			found = 1;
			break;
		}
	}

out:
	stats = &EXT4_SB(inode->i_sb)->s_es_stats;
	if (found) {
		BUG_ON(!es1);
		es->es_lblk = es1->es_lblk;
		es->es_len = es1->es_len;
		es->es_pblk = es1->es_pblk;
		if (!ext4_es_is_referenced(es1))
			ext4_es_set_referenced(es1);
		stats->es_stats_cache_hits++;
	} else {
		stats->es_stats_cache_misses++;
	}

	read_unlock(&EXT4_I(inode)->i_es_lock);

	trace_ext4_es_lookup_extent_exit(inode, es, found);
	return found;
}

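/*
 * Remove all entries (or parts of entries) in [@lblk, @end] from the tree,
 * splitting an entry that straddles either boundary.  The caller must hold
 * i_es_lock for writing.
 */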
static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
			      ext4_lblk_t end)
{
	struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
	struct rb_node *node;
	struct extent_status *es;
	struct extent_status orig_es;
	ext4_lblk_t len1, len2;
	ext4_fsblk_t block;
	int err;

retry:
	err = 0;
	es = __es_tree_search(&tree->root, lblk);
	if (!es)
		goto out;
	if (es->es_lblk > end)
		goto out;

	/* Simply invalidate cache_es. */
	tree->cache_es = NULL;

	orig_es.es_lblk = es->es_lblk;
	orig_es.es_len = es->es_len;
	orig_es.es_pblk = es->es_pblk;

	len1 = lblk > es->es_lblk ? lblk - es->es_lblk : 0;
	len2 = ext4_es_end(es) > end ? ext4_es_end(es) - end : 0;
	if (len1 > 0)
		es->es_len = len1;
	if (len2 > 0) {
		if (len1 > 0) {
			/*
			 * The removed range is in the middle of this
			 * extent, so it must be split in two.
			 */
			struct extent_status newes;

			newes.es_lblk = end + 1;
			newes.es_len = len2;
			block = 0x7FDEADBEEFULL;	/* poison value */
			if (ext4_es_is_written(&orig_es) ||
			    ext4_es_is_unwritten(&orig_es))
				block = ext4_es_pblock(&orig_es) +
					orig_es.es_len - len2;
			ext4_es_store_pblock_status(&newes, block,
						    ext4_es_status(&orig_es));
			err = __es_insert_extent(inode, &newes);
			if (err) {
				es->es_lblk = orig_es.es_lblk;
				es->es_len = orig_es.es_len;
				if ((err == -ENOMEM) &&
				    __es_shrink(EXT4_SB(inode->i_sb),
							128, EXT4_I(inode)))
					goto retry;
				goto out;
			}
		} else {
			es->es_lblk = end + 1;
			es->es_len = len2;
			if (ext4_es_is_written(es) ||
			    ext4_es_is_unwritten(es)) {
				block = orig_es.es_pblk + orig_es.es_len - len2;
				ext4_es_store_pblock(es, block);
			}
		}
		goto out;
	}

	if (len1 > 0) {
		node = rb_next(&es->rb_node);
		if (node)
			es = rb_entry(node, struct extent_status, rb_node);
		else
			es = NULL;
	}

	while (es && ext4_es_end(es) <= end) {
		node = rb_next(&es->rb_node);
		rb_erase(&es->rb_node, &tree->root);
		ext4_es_free_extent(inode, es);
		if (!node) {
			es = NULL;
			break;
		}
		es = rb_entry(node, struct extent_status, rb_node);
	}

	if (es && es->es_lblk < end + 1) {
		ext4_lblk_t orig_len = es->es_len;

		len1 = ext4_es_end(es) - end;
		es->es_lblk = end + 1;
		es->es_len = len1;
		if (ext4_es_is_written(es) || ext4_es_is_unwritten(es)) {
			block = es->es_pblk + orig_len - len1;
			ext4_es_store_pblock(es, block);
		}
	}

out:
	return err;
}

/*
 * ext4_es_remove_extent() removes a range from the extent status tree.
 *
 * Return 0 on success, error code on failure.
 */
int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
			  ext4_lblk_t len)
{
	ext4_lblk_t end;
	int err = 0;

	trace_ext4_es_remove_extent(inode, lblk, len);
	es_debug("remove [%u/%u) from extent status tree of inode %lu\n",
		 lblk, len, inode->i_ino);

	if (!len)
		return err;

	end = lblk + len - 1;
	BUG_ON(end < lblk);

	/*
	 * i_es_lock is taken unconditionally so that other code (e.g. inode
	 * eviction) can rely on it to synchronize against a concurrent
	 * __es_shrink() walking this inode's tree.
	 */
	write_lock(&EXT4_I(inode)->i_es_lock);
	err = __es_remove_extent(inode, lblk, end);
	write_unlock(&EXT4_I(inode)->i_es_lock);
	ext4_es_print_tree(inode);
	return err;
}

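/*
 * Walk the superblock's list of inodes with reclaimable entries and reclaim
 * up to @nr_to_scan extents.  @locked_ei, if set, is an inode whose
 * i_es_lock the caller already holds; it is skipped during the walk and
 * reclaimed from directly only as a last resort.  Returns the number of
 * extents freed.
 */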
static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
		       struct ext4_inode_info *locked_ei)
{
	struct ext4_inode_info *ei;
	struct ext4_es_stats *es_stats;
	ktime_t start_time;
	u64 scan_time;
	int nr_to_walk;
	int nr_shrunk = 0;
	int retried = 0, nr_skipped = 0;

	es_stats = &sbi->s_es_stats;
	start_time = ktime_get();

retry:
	spin_lock(&sbi->s_es_lock);
	nr_to_walk = sbi->s_es_nr_inode;
	while (nr_to_walk-- > 0) {
		if (list_empty(&sbi->s_es_list)) {
			spin_unlock(&sbi->s_es_lock);
			goto out;
		}
		ei = list_first_entry(&sbi->s_es_list, struct ext4_inode_info,
				      i_es_list);
		/* Move the inode to the tail */
		list_move_tail(&ei->i_es_list, &sbi->s_es_list);

		/*
		 * Normally we try hard to avoid shrinking precached inodes,
		 * but we will as a last resort.
		 */
		if (!retried && ext4_test_inode_state(&ei->vfs_inode,
						EXT4_STATE_EXT_PRECACHED)) {
			nr_skipped++;
			continue;
		}

		if (ei == locked_ei || !write_trylock(&ei->i_es_lock)) {
			nr_skipped++;
			continue;
		}
		/*
		 * Now we hold i_es_lock which protects us from inode
		 * reclaim freeing the inode under us.
		 */
		spin_unlock(&sbi->s_es_lock);

		nr_shrunk += es_reclaim_extents(ei, &nr_to_scan);
		write_unlock(&ei->i_es_lock);

		if (nr_to_scan <= 0)
			goto out;
		spin_lock(&sbi->s_es_lock);
	}
	spin_unlock(&sbi->s_es_lock);

	/*
	 * If we skipped any inodes, and we weren't able to make any
	 * forward progress, try again to scan precached inodes.
	 */
	if ((nr_shrunk == 0) && nr_skipped && !retried) {
		retried++;
		goto retry;
	}

	if (locked_ei && nr_shrunk == 0)
		nr_shrunk = es_reclaim_extents(locked_ei, &nr_to_scan);

out:
	scan_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
	/* Keep an exponential moving average of scan time and extents shrunk. */
	if (likely(es_stats->es_stats_scan_time))
		es_stats->es_stats_scan_time = (scan_time +
				es_stats->es_stats_scan_time*3) / 4;
	else
		es_stats->es_stats_scan_time = scan_time;
	if (scan_time > es_stats->es_stats_max_scan_time)
		es_stats->es_stats_max_scan_time = scan_time;
	if (likely(es_stats->es_stats_shrunk))
		es_stats->es_stats_shrunk = (nr_shrunk +
				es_stats->es_stats_shrunk*3) / 4;
	else
		es_stats->es_stats_shrunk = nr_shrunk;

	trace_ext4_es_shrink(sbi->s_sb, nr_shrunk, scan_time,
			     nr_skipped, retried);
	return nr_shrunk;
}

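/* Shrinker callback: report how many reclaimable (non-delayed) entries exist. */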
static unsigned long ext4_es_count(struct shrinker *shrink,
				   struct shrink_control *sc)
{
	unsigned long nr;
	struct ext4_sb_info *sbi;

	sbi = container_of(shrink, struct ext4_sb_info, s_es_shrinker);
	nr = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt);
	trace_ext4_es_shrink_count(sbi->s_sb, sc->nr_to_scan, nr);
	return nr;
}

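/* Shrinker callback: try to reclaim up to sc->nr_to_scan entries. */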
static unsigned long ext4_es_scan(struct shrinker *shrink,
				  struct shrink_control *sc)
{
	struct ext4_sb_info *sbi = container_of(shrink,
					struct ext4_sb_info, s_es_shrinker);
	int nr_to_scan = sc->nr_to_scan;
	int ret, nr_shrunk;

	ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt);
	trace_ext4_es_shrink_scan_enter(sbi->s_sb, nr_to_scan, ret);

	if (!nr_to_scan)
		return ret;

	nr_shrunk = __es_shrink(sbi, nr_to_scan, NULL);

	trace_ext4_es_shrink_scan_exit(sbi->s_sb, nr_shrunk, ret);
	return nr_shrunk;
}

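/* seq_file show handler that dumps extent status shrinker statistics. */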
int ext4_seq_es_shrinker_info_show(struct seq_file *seq, void *v)
{
	struct ext4_sb_info *sbi = EXT4_SB((struct super_block *) seq->private);
	struct ext4_es_stats *es_stats = &sbi->s_es_stats;
	struct ext4_inode_info *ei, *max = NULL;
	unsigned int inode_cnt = 0;

	if (v != SEQ_START_TOKEN)
		return 0;

	/* here we just find the inode that has the max nr. of objects */
	spin_lock(&sbi->s_es_lock);
	list_for_each_entry(ei, &sbi->s_es_list, i_es_list) {
		inode_cnt++;
		if (max && max->i_es_all_nr < ei->i_es_all_nr)
			max = ei;
		else if (!max)
			max = ei;
	}
	spin_unlock(&sbi->s_es_lock);

	seq_printf(seq, "stats:\n  %lld objects\n  %lld reclaimable objects\n",
		   percpu_counter_sum_positive(&es_stats->es_stats_all_cnt),
		   percpu_counter_sum_positive(&es_stats->es_stats_shk_cnt));
	seq_printf(seq, "  %lu/%lu cache hits/misses\n",
		   es_stats->es_stats_cache_hits,
		   es_stats->es_stats_cache_misses);
	if (inode_cnt)
		seq_printf(seq, "  %d inodes on list\n", inode_cnt);

	seq_printf(seq, "average:\n  %llu us scan time\n",
		   div_u64(es_stats->es_stats_scan_time, 1000));
	seq_printf(seq, "  %lu shrunk objects\n", es_stats->es_stats_shrunk);
	if (inode_cnt)
		seq_printf(seq,
			"maximum:\n  %lu inode (%u objects, %u reclaimable)\n"
			"  %llu us max scan time\n",
			max->vfs_inode.i_ino, max->i_es_all_nr, max->i_es_shk_nr,
			div_u64(es_stats->es_stats_max_scan_time, 1000));

	return 0;
}

int ext4_es_register_shrinker(struct ext4_sb_info *sbi)
{
	int err;

	/* Make sure we have enough bits for the physical block number */
	BUILD_BUG_ON(ES_SHIFT < 48);
	INIT_LIST_HEAD(&sbi->s_es_list);
	sbi->s_es_nr_inode = 0;
	spin_lock_init(&sbi->s_es_lock);
	sbi->s_es_stats.es_stats_shrunk = 0;
	sbi->s_es_stats.es_stats_cache_hits = 0;
	sbi->s_es_stats.es_stats_cache_misses = 0;
	sbi->s_es_stats.es_stats_scan_time = 0;
	sbi->s_es_stats.es_stats_max_scan_time = 0;
	err = percpu_counter_init(&sbi->s_es_stats.es_stats_all_cnt, 0, GFP_KERNEL);
	if (err)
		return err;
	err = percpu_counter_init(&sbi->s_es_stats.es_stats_shk_cnt, 0, GFP_KERNEL);
	if (err)
		goto err1;

	sbi->s_es_shrinker.scan_objects = ext4_es_scan;
	sbi->s_es_shrinker.count_objects = ext4_es_count;
	sbi->s_es_shrinker.seeks = DEFAULT_SEEKS;
	err = register_shrinker(&sbi->s_es_shrinker);
	if (err)
		goto err2;

	return 0;

err2:
	percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt);
err1:
	percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
	return err;
}

void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi)
{
	percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
	percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt);
	unregister_shrinker(&sbi->s_es_shrinker);
}

/*
 * Shrink extents in the given inode from ei->i_es_shrink_lblk till @end.
 * Scan at most *nr_to_scan extents, updating *nr_to_scan accordingly.
 *
 * Return 0 if we hit the end of the tree / interval, 1 if nr_to_scan was
 * exhausted.  *nr_shrunk is incremented by the number of reclaimed extents
 * and ei->i_es_shrink_lblk is updated to where scanning should continue.
 */
static int es_do_reclaim_extents(struct ext4_inode_info *ei, ext4_lblk_t end,
				 int *nr_to_scan, int *nr_shrunk)
{
	struct inode *inode = &ei->vfs_inode;
	struct ext4_es_tree *tree = &ei->i_es_tree;
	struct extent_status *es;
	struct rb_node *node;

	es = __es_tree_search(&tree->root, ei->i_es_shrink_lblk);
	if (!es)
		goto out_wrap;
	node = &es->rb_node;
	while (*nr_to_scan > 0) {
		if (es->es_lblk > end) {
			ei->i_es_shrink_lblk = end + 1;
			return 0;
		}

		(*nr_to_scan)--;
		node = rb_next(&es->rb_node);
		/*
		 * Delayed extents cannot be reclaimed: the tree is the only
		 * record of the inode's delalloc blocks.
		 */
		if (ext4_es_is_delayed(es))
			goto next;
		if (ext4_es_is_referenced(es)) {
			ext4_es_clear_referenced(es);
			goto next;
		}

		rb_erase(&es->rb_node, &tree->root);
		ext4_es_free_extent(inode, es);
		(*nr_shrunk)++;
next:
		if (!node)
			goto out_wrap;
		es = rb_entry(node, struct extent_status, rb_node);
	}
	ei->i_es_shrink_lblk = es->es_lblk;
	return 1;
out_wrap:
	ei->i_es_shrink_lblk = 0;
	return 0;
}

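/*
 * Reclaim extents from the inode's status tree, starting at the saved scan
 * position and wrapping around to the beginning of the file if the end is
 * reached first.  Returns the number of extents reclaimed.
 */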
static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan)
{
	struct inode *inode = &ei->vfs_inode;
	int nr_shrunk = 0;
	ext4_lblk_t start = ei->i_es_shrink_lblk;
	static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
				      DEFAULT_RATELIMIT_BURST);

	if (ei->i_es_shk_nr == 0)
		return 0;

	if (ext4_test_inode_state(inode, EXT4_STATE_EXT_PRECACHED) &&
	    __ratelimit(&_rs))
		ext4_warning(inode->i_sb, "forced shrink of precached extents");

	if (!es_do_reclaim_extents(ei, EXT_MAX_BLOCKS, nr_to_scan, &nr_shrunk) &&
	    start != 0)
		es_do_reclaim_extents(ei, start - 1, nr_to_scan, &nr_shrunk);

	ei->i_es_tree.cache_es = NULL;
	return nr_shrunk;
}