linux/fs/ext4/extents_status.c
<<
>>
Prefs
   1/*
   2 *  fs/ext4/extents_status.c
   3 *
   4 * Written by Yongqiang Yang <xiaoqiangnk@gmail.com>
   5 * Modified by
   6 *      Allison Henderson <achender@linux.vnet.ibm.com>
   7 *      Hugh Dickins <hughd@google.com>
   8 *      Zheng Liu <wenqing.lz@taobao.com>
   9 *
  10 * Ext4 extents status tree core functions.
  11 */
  12#include <linux/rbtree.h>
  13#include <linux/list_sort.h>
  14#include "ext4.h"
  15#include "extents_status.h"
  16#include "ext4_extents.h"
  17
  18#include <trace/events/ext4.h>
  19
  20/*
  21 * According to previous discussion in Ext4 Developer Workshop, we
  22 * will introduce a new structure called io tree to track all extent
  23 * status in order to solve some problems that we have met
  24 * (e.g. Reservation space warning), and provide extent-level locking.
  25 * Delay extent tree is the first step to achieve this goal.  It is
  26 * original built by Yongqiang Yang.  At that time it is called delay
  27 * extent tree, whose goal is only track delayed extents in memory to
  28 * simplify the implementation of fiemap and bigalloc, and introduce
  29 * lseek SEEK_DATA/SEEK_HOLE support.  That is why it is still called
  30 * delay extent tree at the first commit.  But for better understand
  31 * what it does, it has been rename to extent status tree.
  32 *
  33 * Step1:
  34 * Currently the first step has been done.  All delayed extents are
  35 * tracked in the tree.  It maintains the delayed extent when a delayed
  36 * allocation is issued, and the delayed extent is written out or
  37 * invalidated.  Therefore the implementation of fiemap and bigalloc
  38 * are simplified, and SEEK_DATA/SEEK_HOLE are introduced.
  39 *
  40 * The following comment describes the implemenmtation of extent
  41 * status tree and future works.
  42 *
  43 * Step2:
  44 * In this step all extent status are tracked by extent status tree.
  45 * Thus, we can first try to lookup a block mapping in this tree before
  46 * finding it in extent tree.  Hence, single extent cache can be removed
  47 * because extent status tree can do a better job.  Extents in status
  48 * tree are loaded on-demand.  Therefore, the extent status tree may not
  49 * contain all of the extents in a file.  Meanwhile we define a shrinker
  50 * to reclaim memory from extent status tree because fragmented extent
  51 * tree will make status tree cost too much memory.  written/unwritten/-
  52 * hole extents in the tree will be reclaimed by this shrinker when we
  53 * are under high memory pressure.  Delayed extents will not be
  54 * reclimed because fiemap, bigalloc, and seek_data/hole need it.
  55 */
  56
  57/*
  58 * Extent status tree implementation for ext4.
  59 *
  60 *
  61 * ==========================================================================
  62 * Extent status tree tracks all extent status.
  63 *
  64 * 1. Why we need to implement extent status tree?
  65 *
  66 * Without extent status tree, ext4 identifies a delayed extent by looking
  67 * up page cache, this has several deficiencies - complicated, buggy,
  68 * and inefficient code.
  69 *
  70 * FIEMAP, SEEK_HOLE/DATA, bigalloc, and writeout all need to know if a
  71 * block or a range of blocks are belonged to a delayed extent.
  72 *
  73 * Let us have a look at how they do without extent status tree.
  74 *   -- FIEMAP
  75 *      FIEMAP looks up page cache to identify delayed allocations from holes.
  76 *
  77 *   -- SEEK_HOLE/DATA
  78 *      SEEK_HOLE/DATA has the same problem as FIEMAP.
  79 *
  80 *   -- bigalloc
  81 *      bigalloc looks up page cache to figure out if a block is
  82 *      already under delayed allocation or not to determine whether
  83 *      quota reserving is needed for the cluster.
  84 *
  85 *   -- writeout
  86 *      Writeout looks up whole page cache to see if a buffer is
  87 *      mapped, If there are not very many delayed buffers, then it is
  88 *      time comsuming.
  89 *
  90 * With extent status tree implementation, FIEMAP, SEEK_HOLE/DATA,
  91 * bigalloc and writeout can figure out if a block or a range of
  92 * blocks is under delayed allocation(belonged to a delayed extent) or
  93 * not by searching the extent tree.
  94 *
  95 *
  96 * ==========================================================================
  97 * 2. Ext4 extent status tree impelmentation
  98 *
  99 *   -- extent
 100 *      A extent is a range of blocks which are contiguous logically and
 101 *      physically.  Unlike extent in extent tree, this extent in ext4 is
 102 *      a in-memory struct, there is no corresponding on-disk data.  There
 103 *      is no limit on length of extent, so an extent can contain as many
 104 *      blocks as they are contiguous logically and physically.
 105 *
 106 *   -- extent status tree
 107 *      Every inode has an extent status tree and all allocation blocks
 108 *      are added to the tree with different status.  The extent in the
 109 *      tree are ordered by logical block no.
 110 *
 111 *   -- operations on a extent status tree
 112 *      There are three important operations on a delayed extent tree: find
 113 *      next extent, adding a extent(a range of blocks) and removing a extent.
 114 *
 115 *   -- race on a extent status tree
 116 *      Extent status tree is protected by inode->i_es_lock.
 117 *
 118 *   -- memory consumption
 119 *      Fragmented extent tree will make extent status tree cost too much
 120 *      memory.  Hence, we will reclaim written/unwritten/hole extents from
 121 *      the tree under a heavy memory pressure.
 122 *
 123 *
 124 * ==========================================================================
 125 * 3. Performance analysis
 126 *
 127 *   -- overhead
 128 *      1. There is a cache extent for write access, so if writes are
 129 *      not very random, adding space operaions are in O(1) time.
 130 *
 131 *   -- gain
 132 *      2. Code is much simpler, more readable, more maintainable and
 133 *      more efficient.
 134 *
 135 *
 136 * ==========================================================================
 137 * 4. TODO list
 138 *
 139 *   -- Refactor delayed space reservation
 140 *
 141 *   -- Extent-level locking
 142 */
 143
 144static struct kmem_cache *ext4_es_cachep;
 145
 146static int __es_insert_extent(struct inode *inode, struct extent_status *newes);
 147static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
 148                              ext4_lblk_t end);
 149static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei,
 150                                       int nr_to_scan);
 151static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
 152                            struct ext4_inode_info *locked_ei);
 153
 154int __init ext4_init_es(void)
 155{
 156        ext4_es_cachep = kmem_cache_create("ext4_extent_status",
 157                                           sizeof(struct extent_status),
 158                                           0, (SLAB_RECLAIM_ACCOUNT), NULL);
 159        if (ext4_es_cachep == NULL)
 160                return -ENOMEM;
 161        return 0;
 162}
 163
 164void ext4_exit_es(void)
 165{
 166        if (ext4_es_cachep)
 167                kmem_cache_destroy(ext4_es_cachep);
 168}
 169
 170void ext4_es_init_tree(struct ext4_es_tree *tree)
 171{
 172        tree->root = RB_ROOT;
 173        tree->cache_es = NULL;
 174}
 175
 176#ifdef ES_DEBUG__
 177static void ext4_es_print_tree(struct inode *inode)
 178{
 179        struct ext4_es_tree *tree;
 180        struct rb_node *node;
 181
 182        printk(KERN_DEBUG "status extents for inode %lu:", inode->i_ino);
 183        tree = &EXT4_I(inode)->i_es_tree;
 184        node = rb_first(&tree->root);
 185        while (node) {
 186                struct extent_status *es;
 187                es = rb_entry(node, struct extent_status, rb_node);
 188                printk(KERN_DEBUG " [%u/%u) %llu %llx",
 189                       es->es_lblk, es->es_len,
 190                       ext4_es_pblock(es), ext4_es_status(es));
 191                node = rb_next(node);
 192        }
 193        printk(KERN_DEBUG "\n");
 194}
 195#else
 196#define ext4_es_print_tree(inode)
 197#endif
 198
 199static inline ext4_lblk_t ext4_es_end(struct extent_status *es)
 200{
 201        BUG_ON(es->es_lblk + es->es_len < es->es_lblk);
 202        return es->es_lblk + es->es_len - 1;
 203}
 204
 205/*
 206 * search through the tree for an delayed extent with a given offset.  If
 207 * it can't be found, try to find next extent.
 208 */
 209static struct extent_status *__es_tree_search(struct rb_root *root,
 210                                              ext4_lblk_t lblk)
 211{
 212        struct rb_node *node = root->rb_node;
 213        struct extent_status *es = NULL;
 214
 215        while (node) {
 216                es = rb_entry(node, struct extent_status, rb_node);
 217                if (lblk < es->es_lblk)
 218                        node = node->rb_left;
 219                else if (lblk > ext4_es_end(es))
 220                        node = node->rb_right;
 221                else
 222                        return es;
 223        }
 224
 225        if (es && lblk < es->es_lblk)
 226                return es;
 227
 228        if (es && lblk > ext4_es_end(es)) {
 229                node = rb_next(&es->rb_node);
 230                return node ? rb_entry(node, struct extent_status, rb_node) :
 231                              NULL;
 232        }
 233
 234        return NULL;
 235}
 236
 237/*
 238 * ext4_es_find_delayed_extent_range: find the 1st delayed extent covering
 239 * @es->lblk if it exists, otherwise, the next extent after @es->lblk.
 240 *
 241 * @inode: the inode which owns delayed extents
 242 * @lblk: the offset where we start to search
 243 * @end: the offset where we stop to search
 244 * @es: delayed extent that we found
 245 */
 246void ext4_es_find_delayed_extent_range(struct inode *inode,
 247                                 ext4_lblk_t lblk, ext4_lblk_t end,
 248                                 struct extent_status *es)
 249{
 250        struct ext4_es_tree *tree = NULL;
 251        struct extent_status *es1 = NULL;
 252        struct rb_node *node;
 253
 254        BUG_ON(es == NULL);
 255        BUG_ON(end < lblk);
 256        trace_ext4_es_find_delayed_extent_range_enter(inode, lblk);
 257
 258        read_lock(&EXT4_I(inode)->i_es_lock);
 259        tree = &EXT4_I(inode)->i_es_tree;
 260
 261        /* find extent in cache firstly */
 262        es->es_lblk = es->es_len = es->es_pblk = 0;
 263        if (tree->cache_es) {
 264                es1 = tree->cache_es;
 265                if (in_range(lblk, es1->es_lblk, es1->es_len)) {
 266                        es_debug("%u cached by [%u/%u) %llu %llx\n",
 267                                 lblk, es1->es_lblk, es1->es_len,
 268                                 ext4_es_pblock(es1), ext4_es_status(es1));
 269                        goto out;
 270                }
 271        }
 272
 273        es1 = __es_tree_search(&tree->root, lblk);
 274
 275out:
 276        if (es1 && !ext4_es_is_delayed(es1)) {
 277                while ((node = rb_next(&es1->rb_node)) != NULL) {
 278                        es1 = rb_entry(node, struct extent_status, rb_node);
 279                        if (es1->es_lblk > end) {
 280                                es1 = NULL;
 281                                break;
 282                        }
 283                        if (ext4_es_is_delayed(es1))
 284                                break;
 285                }
 286        }
 287
 288        if (es1 && ext4_es_is_delayed(es1)) {
 289                tree->cache_es = es1;
 290                es->es_lblk = es1->es_lblk;
 291                es->es_len = es1->es_len;
 292                es->es_pblk = es1->es_pblk;
 293        }
 294
 295        read_unlock(&EXT4_I(inode)->i_es_lock);
 296
 297        trace_ext4_es_find_delayed_extent_range_exit(inode, es);
 298}
 299
 300static struct extent_status *
 301ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len,
 302                     ext4_fsblk_t pblk)
 303{
 304        struct extent_status *es;
 305        es = kmem_cache_alloc(ext4_es_cachep, GFP_ATOMIC);
 306        if (es == NULL)
 307                return NULL;
 308        es->es_lblk = lblk;
 309        es->es_len = len;
 310        es->es_pblk = pblk;
 311
 312        /*
 313         * We don't count delayed extent because we never try to reclaim them
 314         */
 315        if (!ext4_es_is_delayed(es)) {
 316                EXT4_I(inode)->i_es_lru_nr++;
 317                percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_extent_cache_cnt);
 318        }
 319
 320        return es;
 321}
 322
 323static void ext4_es_free_extent(struct inode *inode, struct extent_status *es)
 324{
 325        /* Decrease the lru counter when this es is not delayed */
 326        if (!ext4_es_is_delayed(es)) {
 327                BUG_ON(EXT4_I(inode)->i_es_lru_nr == 0);
 328                EXT4_I(inode)->i_es_lru_nr--;
 329                percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_extent_cache_cnt);
 330        }
 331
 332        kmem_cache_free(ext4_es_cachep, es);
 333}
 334
 335/*
 336 * Check whether or not two extents can be merged
 337 * Condition:
 338 *  - logical block number is contiguous
 339 *  - physical block number is contiguous
 340 *  - status is equal
 341 */
 342static int ext4_es_can_be_merged(struct extent_status *es1,
 343                                 struct extent_status *es2)
 344{
 345        if (ext4_es_status(es1) != ext4_es_status(es2))
 346                return 0;
 347
 348        if (((__u64) es1->es_len) + es2->es_len > 0xFFFFFFFFULL)
 349                return 0;
 350
 351        if (((__u64) es1->es_lblk) + es1->es_len != es2->es_lblk)
 352                return 0;
 353
 354        if ((ext4_es_is_written(es1) || ext4_es_is_unwritten(es1)) &&
 355            (ext4_es_pblock(es1) + es1->es_len == ext4_es_pblock(es2)))
 356                return 1;
 357
 358        if (ext4_es_is_hole(es1))
 359                return 1;
 360
 361        /* we need to check delayed extent is without unwritten status */
 362        if (ext4_es_is_delayed(es1) && !ext4_es_is_unwritten(es1))
 363                return 1;
 364
 365        return 0;
 366}
 367
 368static struct extent_status *
 369ext4_es_try_to_merge_left(struct inode *inode, struct extent_status *es)
 370{
 371        struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
 372        struct extent_status *es1;
 373        struct rb_node *node;
 374
 375        node = rb_prev(&es->rb_node);
 376        if (!node)
 377                return es;
 378
 379        es1 = rb_entry(node, struct extent_status, rb_node);
 380        if (ext4_es_can_be_merged(es1, es)) {
 381                es1->es_len += es->es_len;
 382                rb_erase(&es->rb_node, &tree->root);
 383                ext4_es_free_extent(inode, es);
 384                es = es1;
 385        }
 386
 387        return es;
 388}
 389
 390static struct extent_status *
 391ext4_es_try_to_merge_right(struct inode *inode, struct extent_status *es)
 392{
 393        struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
 394        struct extent_status *es1;
 395        struct rb_node *node;
 396
 397        node = rb_next(&es->rb_node);
 398        if (!node)
 399                return es;
 400
 401        es1 = rb_entry(node, struct extent_status, rb_node);
 402        if (ext4_es_can_be_merged(es, es1)) {
 403                es->es_len += es1->es_len;
 404                rb_erase(node, &tree->root);
 405                ext4_es_free_extent(inode, es1);
 406        }
 407
 408        return es;
 409}
 410
 411#ifdef ES_AGGRESSIVE_TEST
 412static void ext4_es_insert_extent_ext_check(struct inode *inode,
 413                                            struct extent_status *es)
 414{
 415        struct ext4_ext_path *path = NULL;
 416        struct ext4_extent *ex;
 417        ext4_lblk_t ee_block;
 418        ext4_fsblk_t ee_start;
 419        unsigned short ee_len;
 420        int depth, ee_status, es_status;
 421
 422        path = ext4_ext_find_extent(inode, es->es_lblk, NULL);
 423        if (IS_ERR(path))
 424                return;
 425
 426        depth = ext_depth(inode);
 427        ex = path[depth].p_ext;
 428
 429        if (ex) {
 430
 431                ee_block = le32_to_cpu(ex->ee_block);
 432                ee_start = ext4_ext_pblock(ex);
 433                ee_len = ext4_ext_get_actual_len(ex);
 434
 435                ee_status = ext4_ext_is_uninitialized(ex) ? 1 : 0;
 436                es_status = ext4_es_is_unwritten(es) ? 1 : 0;
 437
 438                /*
 439                 * Make sure ex and es are not overlap when we try to insert
 440                 * a delayed/hole extent.
 441                 */
 442                if (!ext4_es_is_written(es) && !ext4_es_is_unwritten(es)) {
 443                        if (in_range(es->es_lblk, ee_block, ee_len)) {
 444                                pr_warn("ES insert assertion failed for "
 445                                        "inode: %lu we can find an extent "
 446                                        "at block [%d/%d/%llu/%c], but we "
 447                                        "want to add an delayed/hole extent "
 448                                        "[%d/%d/%llu/%llx]\n",
 449                                        inode->i_ino, ee_block, ee_len,
 450                                        ee_start, ee_status ? 'u' : 'w',
 451                                        es->es_lblk, es->es_len,
 452                                        ext4_es_pblock(es), ext4_es_status(es));
 453                        }
 454                        goto out;
 455                }
 456
 457                /*
 458                 * We don't check ee_block == es->es_lblk, etc. because es
 459                 * might be a part of whole extent, vice versa.
 460                 */
 461                if (es->es_lblk < ee_block ||
 462                    ext4_es_pblock(es) != ee_start + es->es_lblk - ee_block) {
 463                        pr_warn("ES insert assertion failed for inode: %lu "
 464                                "ex_status [%d/%d/%llu/%c] != "
 465                                "es_status [%d/%d/%llu/%c]\n", inode->i_ino,
 466                                ee_block, ee_len, ee_start,
 467                                ee_status ? 'u' : 'w', es->es_lblk, es->es_len,
 468                                ext4_es_pblock(es), es_status ? 'u' : 'w');
 469                        goto out;
 470                }
 471
 472                if (ee_status ^ es_status) {
 473                        pr_warn("ES insert assertion failed for inode: %lu "
 474                                "ex_status [%d/%d/%llu/%c] != "
 475                                "es_status [%d/%d/%llu/%c]\n", inode->i_ino,
 476                                ee_block, ee_len, ee_start,
 477                                ee_status ? 'u' : 'w', es->es_lblk, es->es_len,
 478                                ext4_es_pblock(es), es_status ? 'u' : 'w');
 479                }
 480        } else {
 481                /*
 482                 * We can't find an extent on disk.  So we need to make sure
 483                 * that we don't want to add an written/unwritten extent.
 484                 */
 485                if (!ext4_es_is_delayed(es) && !ext4_es_is_hole(es)) {
 486                        pr_warn("ES insert assertion failed for inode: %lu "
 487                                "can't find an extent at block %d but we want "
 488                                "to add an written/unwritten extent "
 489                                "[%d/%d/%llu/%llx]\n", inode->i_ino,
 490                                es->es_lblk, es->es_lblk, es->es_len,
 491                                ext4_es_pblock(es), ext4_es_status(es));
 492                }
 493        }
 494out:
 495        if (path) {
 496                ext4_ext_drop_refs(path);
 497                kfree(path);
 498        }
 499}
 500
 501static void ext4_es_insert_extent_ind_check(struct inode *inode,
 502                                            struct extent_status *es)
 503{
 504        struct ext4_map_blocks map;
 505        int retval;
 506
 507        /*
 508         * Here we call ext4_ind_map_blocks to lookup a block mapping because
 509         * 'Indirect' structure is defined in indirect.c.  So we couldn't
 510         * access direct/indirect tree from outside.  It is too dirty to define
 511         * this function in indirect.c file.
 512         */
 513
 514        map.m_lblk = es->es_lblk;
 515        map.m_len = es->es_len;
 516
 517        retval = ext4_ind_map_blocks(NULL, inode, &map, 0);
 518        if (retval > 0) {
 519                if (ext4_es_is_delayed(es) || ext4_es_is_hole(es)) {
 520                        /*
 521                         * We want to add a delayed/hole extent but this
 522                         * block has been allocated.
 523                         */
 524                        pr_warn("ES insert assertion failed for inode: %lu "
 525                                "We can find blocks but we want to add a "
 526                                "delayed/hole extent [%d/%d/%llu/%llx]\n",
 527                                inode->i_ino, es->es_lblk, es->es_len,
 528                                ext4_es_pblock(es), ext4_es_status(es));
 529                        return;
 530                } else if (ext4_es_is_written(es)) {
 531                        if (retval != es->es_len) {
 532                                pr_warn("ES insert assertion failed for "
 533                                        "inode: %lu retval %d != es_len %d\n",
 534                                        inode->i_ino, retval, es->es_len);
 535                                return;
 536                        }
 537                        if (map.m_pblk != ext4_es_pblock(es)) {
 538                                pr_warn("ES insert assertion failed for "
 539                                        "inode: %lu m_pblk %llu != "
 540                                        "es_pblk %llu\n",
 541                                        inode->i_ino, map.m_pblk,
 542                                        ext4_es_pblock(es));
 543                                return;
 544                        }
 545                } else {
 546                        /*
 547                         * We don't need to check unwritten extent because
 548                         * indirect-based file doesn't have it.
 549                         */
 550                        BUG_ON(1);
 551                }
 552        } else if (retval == 0) {
 553                if (ext4_es_is_written(es)) {
 554                        pr_warn("ES insert assertion failed for inode: %lu "
 555                                "We can't find the block but we want to add "
 556                                "an written extent [%d/%d/%llu/%llx]\n",
 557                                inode->i_ino, es->es_lblk, es->es_len,
 558                                ext4_es_pblock(es), ext4_es_status(es));
 559                        return;
 560                }
 561        }
 562}
 563
 564static inline void ext4_es_insert_extent_check(struct inode *inode,
 565                                               struct extent_status *es)
 566{
 567        /*
 568         * We don't need to worry about the race condition because
 569         * caller takes i_data_sem locking.
 570         */
 571        BUG_ON(!rwsem_is_locked(&EXT4_I(inode)->i_data_sem));
 572        if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
 573                ext4_es_insert_extent_ext_check(inode, es);
 574        else
 575                ext4_es_insert_extent_ind_check(inode, es);
 576}
 577#else
 578static inline void ext4_es_insert_extent_check(struct inode *inode,
 579                                               struct extent_status *es)
 580{
 581}
 582#endif
 583
 584static int __es_insert_extent(struct inode *inode, struct extent_status *newes)
 585{
 586        struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
 587        struct rb_node **p = &tree->root.rb_node;
 588        struct rb_node *parent = NULL;
 589        struct extent_status *es;
 590
 591        while (*p) {
 592                parent = *p;
 593                es = rb_entry(parent, struct extent_status, rb_node);
 594
 595                if (newes->es_lblk < es->es_lblk) {
 596                        if (ext4_es_can_be_merged(newes, es)) {
 597                                /*
 598                                 * Here we can modify es_lblk directly
 599                                 * because it isn't overlapped.
 600                                 */
 601                                es->es_lblk = newes->es_lblk;
 602                                es->es_len += newes->es_len;
 603                                if (ext4_es_is_written(es) ||
 604                                    ext4_es_is_unwritten(es))
 605                                        ext4_es_store_pblock(es,
 606                                                             newes->es_pblk);
 607                                es = ext4_es_try_to_merge_left(inode, es);
 608                                goto out;
 609                        }
 610                        p = &(*p)->rb_left;
 611                } else if (newes->es_lblk > ext4_es_end(es)) {
 612                        if (ext4_es_can_be_merged(es, newes)) {
 613                                es->es_len += newes->es_len;
 614                                es = ext4_es_try_to_merge_right(inode, es);
 615                                goto out;
 616                        }
 617                        p = &(*p)->rb_right;
 618                } else {
 619                        BUG_ON(1);
 620                        return -EINVAL;
 621                }
 622        }
 623
 624        es = ext4_es_alloc_extent(inode, newes->es_lblk, newes->es_len,
 625                                  newes->es_pblk);
 626        if (!es)
 627                return -ENOMEM;
 628        rb_link_node(&es->rb_node, parent, p);
 629        rb_insert_color(&es->rb_node, &tree->root);
 630
 631out:
 632        tree->cache_es = es;
 633        return 0;
 634}
 635
 636/*
 637 * ext4_es_insert_extent() adds information to an inode's extent
 638 * status tree.
 639 *
 640 * Return 0 on success, error code on failure.
 641 */
 642int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
 643                          ext4_lblk_t len, ext4_fsblk_t pblk,
 644                          unsigned long long status)
 645{
 646        struct extent_status newes;
 647        ext4_lblk_t end = lblk + len - 1;
 648        int err = 0;
 649
 650        es_debug("add [%u/%u) %llu %llx to extent status tree of inode %lu\n",
 651                 lblk, len, pblk, status, inode->i_ino);
 652
 653        if (!len)
 654                return 0;
 655
 656        BUG_ON(end < lblk);
 657
 658        newes.es_lblk = lblk;
 659        newes.es_len = len;
 660        ext4_es_store_pblock(&newes, pblk);
 661        ext4_es_store_status(&newes, status);
 662        trace_ext4_es_insert_extent(inode, &newes);
 663
 664        ext4_es_insert_extent_check(inode, &newes);
 665
 666        write_lock(&EXT4_I(inode)->i_es_lock);
 667        err = __es_remove_extent(inode, lblk, end);
 668        if (err != 0)
 669                goto error;
 670retry:
 671        err = __es_insert_extent(inode, &newes);
 672        if (err == -ENOMEM && __ext4_es_shrink(EXT4_SB(inode->i_sb), 1,
 673                                               EXT4_I(inode)))
 674                goto retry;
 675        if (err == -ENOMEM && !ext4_es_is_delayed(&newes))
 676                err = 0;
 677
 678error:
 679        write_unlock(&EXT4_I(inode)->i_es_lock);
 680
 681        ext4_es_print_tree(inode);
 682
 683        return err;
 684}
 685
 686/*
 687 * ext4_es_lookup_extent() looks up an extent in extent status tree.
 688 *
 689 * ext4_es_lookup_extent is called by ext4_map_blocks/ext4_da_map_blocks.
 690 *
 691 * Return: 1 on found, 0 on not
 692 */
 693int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
 694                          struct extent_status *es)
 695{
 696        struct ext4_es_tree *tree;
 697        struct extent_status *es1 = NULL;
 698        struct rb_node *node;
 699        int found = 0;
 700
 701        trace_ext4_es_lookup_extent_enter(inode, lblk);
 702        es_debug("lookup extent in block %u\n", lblk);
 703
 704        tree = &EXT4_I(inode)->i_es_tree;
 705        read_lock(&EXT4_I(inode)->i_es_lock);
 706
 707        /* find extent in cache firstly */
 708        es->es_lblk = es->es_len = es->es_pblk = 0;
 709        if (tree->cache_es) {
 710                es1 = tree->cache_es;
 711                if (in_range(lblk, es1->es_lblk, es1->es_len)) {
 712                        es_debug("%u cached by [%u/%u)\n",
 713                                 lblk, es1->es_lblk, es1->es_len);
 714                        found = 1;
 715                        goto out;
 716                }
 717        }
 718
 719        node = tree->root.rb_node;
 720        while (node) {
 721                es1 = rb_entry(node, struct extent_status, rb_node);
 722                if (lblk < es1->es_lblk)
 723                        node = node->rb_left;
 724                else if (lblk > ext4_es_end(es1))
 725                        node = node->rb_right;
 726                else {
 727                        found = 1;
 728                        break;
 729                }
 730        }
 731
 732out:
 733        if (found) {
 734                BUG_ON(!es1);
 735                es->es_lblk = es1->es_lblk;
 736                es->es_len = es1->es_len;
 737                es->es_pblk = es1->es_pblk;
 738        }
 739
 740        read_unlock(&EXT4_I(inode)->i_es_lock);
 741
 742        trace_ext4_es_lookup_extent_exit(inode, es, found);
 743        return found;
 744}
 745
 746static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
 747                              ext4_lblk_t end)
 748{
 749        struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
 750        struct rb_node *node;
 751        struct extent_status *es;
 752        struct extent_status orig_es;
 753        ext4_lblk_t len1, len2;
 754        ext4_fsblk_t block;
 755        int err;
 756
 757retry:
 758        err = 0;
 759        es = __es_tree_search(&tree->root, lblk);
 760        if (!es)
 761                goto out;
 762        if (es->es_lblk > end)
 763                goto out;
 764
 765        /* Simply invalidate cache_es. */
 766        tree->cache_es = NULL;
 767
 768        orig_es.es_lblk = es->es_lblk;
 769        orig_es.es_len = es->es_len;
 770        orig_es.es_pblk = es->es_pblk;
 771
 772        len1 = lblk > es->es_lblk ? lblk - es->es_lblk : 0;
 773        len2 = ext4_es_end(es) > end ? ext4_es_end(es) - end : 0;
 774        if (len1 > 0)
 775                es->es_len = len1;
 776        if (len2 > 0) {
 777                if (len1 > 0) {
 778                        struct extent_status newes;
 779
 780                        newes.es_lblk = end + 1;
 781                        newes.es_len = len2;
 782                        if (ext4_es_is_written(&orig_es) ||
 783                            ext4_es_is_unwritten(&orig_es)) {
 784                                block = ext4_es_pblock(&orig_es) +
 785                                        orig_es.es_len - len2;
 786                                ext4_es_store_pblock(&newes, block);
 787                        }
 788                        ext4_es_store_status(&newes, ext4_es_status(&orig_es));
 789                        err = __es_insert_extent(inode, &newes);
 790                        if (err) {
 791                                es->es_lblk = orig_es.es_lblk;
 792                                es->es_len = orig_es.es_len;
 793                                if ((err == -ENOMEM) &&
 794                                    __ext4_es_shrink(EXT4_SB(inode->i_sb), 1,
 795                                                     EXT4_I(inode)))
 796                                        goto retry;
 797                                goto out;
 798                        }
 799                } else {
 800                        es->es_lblk = end + 1;
 801                        es->es_len = len2;
 802                        if (ext4_es_is_written(es) ||
 803                            ext4_es_is_unwritten(es)) {
 804                                block = orig_es.es_pblk + orig_es.es_len - len2;
 805                                ext4_es_store_pblock(es, block);
 806                        }
 807                }
 808                goto out;
 809        }
 810
 811        if (len1 > 0) {
 812                node = rb_next(&es->rb_node);
 813                if (node)
 814                        es = rb_entry(node, struct extent_status, rb_node);
 815                else
 816                        es = NULL;
 817        }
 818
 819        while (es && ext4_es_end(es) <= end) {
 820                node = rb_next(&es->rb_node);
 821                rb_erase(&es->rb_node, &tree->root);
 822                ext4_es_free_extent(inode, es);
 823                if (!node) {
 824                        es = NULL;
 825                        break;
 826                }
 827                es = rb_entry(node, struct extent_status, rb_node);
 828        }
 829
 830        if (es && es->es_lblk < end + 1) {
 831                ext4_lblk_t orig_len = es->es_len;
 832
 833                len1 = ext4_es_end(es) - end;
 834                es->es_lblk = end + 1;
 835                es->es_len = len1;
 836                if (ext4_es_is_written(es) || ext4_es_is_unwritten(es)) {
 837                        block = es->es_pblk + orig_len - len1;
 838                        ext4_es_store_pblock(es, block);
 839                }
 840        }
 841
 842out:
 843        return err;
 844}
 845
 846/*
 847 * ext4_es_remove_extent() removes a space from a extent status tree.
 848 *
 849 * Return 0 on success, error code on failure.
 850 */
 851int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
 852                          ext4_lblk_t len)
 853{
 854        ext4_lblk_t end;
 855        int err = 0;
 856
 857        trace_ext4_es_remove_extent(inode, lblk, len);
 858        es_debug("remove [%u/%u) from extent status tree of inode %lu\n",
 859                 lblk, len, inode->i_ino);
 860
 861        if (!len)
 862                return err;
 863
 864        end = lblk + len - 1;
 865        BUG_ON(end < lblk);
 866
 867        write_lock(&EXT4_I(inode)->i_es_lock);
 868        err = __es_remove_extent(inode, lblk, end);
 869        write_unlock(&EXT4_I(inode)->i_es_lock);
 870        ext4_es_print_tree(inode);
 871        return err;
 872}
 873
 874int ext4_es_zeroout(struct inode *inode, struct ext4_extent *ex)
 875{
 876        ext4_lblk_t  ee_block;
 877        ext4_fsblk_t ee_pblock;
 878        unsigned int ee_len;
 879
 880        ee_block  = le32_to_cpu(ex->ee_block);
 881        ee_len    = ext4_ext_get_actual_len(ex);
 882        ee_pblock = ext4_ext_pblock(ex);
 883
 884        if (ee_len == 0)
 885                return 0;
 886
 887        return ext4_es_insert_extent(inode, ee_block, ee_len, ee_pblock,
 888                                     EXTENT_STATUS_WRITTEN);
 889}
 890
 891static int ext4_inode_touch_time_cmp(void *priv, struct list_head *a,
 892                                     struct list_head *b)
 893{
 894        struct ext4_inode_info *eia, *eib;
 895        eia = list_entry(a, struct ext4_inode_info, i_es_lru);
 896        eib = list_entry(b, struct ext4_inode_info, i_es_lru);
 897
 898        if (eia->i_touch_when == eib->i_touch_when)
 899                return 0;
 900        if (time_after(eia->i_touch_when, eib->i_touch_when))
 901                return 1;
 902        else
 903                return -1;
 904}
 905
 906static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
 907                            struct ext4_inode_info *locked_ei)
 908{
 909        struct ext4_inode_info *ei;
 910        struct list_head *cur, *tmp;
 911        LIST_HEAD(skiped);
 912        int ret, nr_shrunk = 0;
 913
 914        spin_lock(&sbi->s_es_lru_lock);
 915
 916        /*
 917         * If the inode that is at the head of LRU list is newer than
 918         * last_sorted time, that means that we need to sort this list.
 919         */
 920        ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info, i_es_lru);
 921        if (sbi->s_es_last_sorted < ei->i_touch_when) {
 922                list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp);
 923                sbi->s_es_last_sorted = jiffies;
 924        }
 925
 926        list_for_each_safe(cur, tmp, &sbi->s_es_lru) {
 927                /*
 928                 * If we have already reclaimed all extents from extent
 929                 * status tree, just stop the loop immediately.
 930                 */
 931                if (percpu_counter_read_positive(&sbi->s_extent_cache_cnt) == 0)
 932                        break;
 933
 934                ei = list_entry(cur, struct ext4_inode_info, i_es_lru);
 935
 936                /* Skip the inode that is newer than the last_sorted time */
 937                if (sbi->s_es_last_sorted < ei->i_touch_when) {
 938                        list_move_tail(cur, &skiped);
 939                        continue;
 940                }
 941
 942                if (ei->i_es_lru_nr == 0 || ei == locked_ei)
 943                        continue;
 944
 945                write_lock(&ei->i_es_lock);
 946                ret = __es_try_to_reclaim_extents(ei, nr_to_scan);
 947                if (ei->i_es_lru_nr == 0)
 948                        list_del_init(&ei->i_es_lru);
 949                write_unlock(&ei->i_es_lock);
 950
 951                nr_shrunk += ret;
 952                nr_to_scan -= ret;
 953                if (nr_to_scan == 0)
 954                        break;
 955        }
 956
 957        /* Move the newer inodes into the tail of the LRU list. */
 958        list_splice_tail(&skiped, &sbi->s_es_lru);
 959        spin_unlock(&sbi->s_es_lru_lock);
 960
 961        if (locked_ei && nr_shrunk == 0)
 962                nr_shrunk = __es_try_to_reclaim_extents(ei, nr_to_scan);
 963
 964        return nr_shrunk;
 965}
 966
 967static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
 968{
 969        struct ext4_sb_info *sbi = container_of(shrink,
 970                                        struct ext4_sb_info, s_es_shrinker);
 971        int nr_to_scan = sc->nr_to_scan;
 972        int ret, nr_shrunk;
 973
 974        ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt);
 975        trace_ext4_es_shrink_enter(sbi->s_sb, nr_to_scan, ret);
 976
 977        if (!nr_to_scan)
 978                return ret;
 979
 980        nr_shrunk = __ext4_es_shrink(sbi, nr_to_scan, NULL);
 981
 982        ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt);
 983        trace_ext4_es_shrink_exit(sbi->s_sb, nr_shrunk, ret);
 984        return ret;
 985}
 986
 987void ext4_es_register_shrinker(struct ext4_sb_info *sbi)
 988{
 989        INIT_LIST_HEAD(&sbi->s_es_lru);
 990        spin_lock_init(&sbi->s_es_lru_lock);
 991        sbi->s_es_last_sorted = 0;
 992        sbi->s_es_shrinker.shrink = ext4_es_shrink;
 993        sbi->s_es_shrinker.seeks = DEFAULT_SEEKS;
 994        register_shrinker(&sbi->s_es_shrinker);
 995}
 996
 997void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi)
 998{
 999        unregister_shrinker(&sbi->s_es_shrinker);
1000}
1001
1002void ext4_es_lru_add(struct inode *inode)
1003{
1004        struct ext4_inode_info *ei = EXT4_I(inode);
1005        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1006
1007        ei->i_touch_when = jiffies;
1008
1009        if (!list_empty(&ei->i_es_lru))
1010                return;
1011
1012        spin_lock(&sbi->s_es_lru_lock);
1013        if (list_empty(&ei->i_es_lru))
1014                list_add_tail(&ei->i_es_lru, &sbi->s_es_lru);
1015        spin_unlock(&sbi->s_es_lru_lock);
1016}
1017
1018void ext4_es_lru_del(struct inode *inode)
1019{
1020        struct ext4_inode_info *ei = EXT4_I(inode);
1021        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1022
1023        spin_lock(&sbi->s_es_lru_lock);
1024        if (!list_empty(&ei->i_es_lru))
1025                list_del_init(&ei->i_es_lru);
1026        spin_unlock(&sbi->s_es_lru_lock);
1027}
1028
1029static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei,
1030                                       int nr_to_scan)
1031{
1032        struct inode *inode = &ei->vfs_inode;
1033        struct ext4_es_tree *tree = &ei->i_es_tree;
1034        struct rb_node *node;
1035        struct extent_status *es;
1036        int nr_shrunk = 0;
1037
1038        if (ei->i_es_lru_nr == 0)
1039                return 0;
1040
1041        node = rb_first(&tree->root);
1042        while (node != NULL) {
1043                es = rb_entry(node, struct extent_status, rb_node);
1044                node = rb_next(&es->rb_node);
1045                /*
1046                 * We can't reclaim delayed extent from status tree because
1047                 * fiemap, bigallic, and seek_data/hole need to use it.
1048                 */
1049                if (!ext4_es_is_delayed(es)) {
1050                        rb_erase(&es->rb_node, &tree->root);
1051                        ext4_es_free_extent(inode, es);
1052                        nr_shrunk++;
1053                        if (--nr_to_scan == 0)
1054                                break;
1055                }
1056        }
1057        tree->cache_es = NULL;
1058        return nr_shrunk;
1059}
1060