linux/fs/ext4/extents.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
   4 * Written by Alex Tomas <alex@clusterfs.com>
   5 *
   6 * Architecture independence:
   7 *   Copyright (c) 2005, Bull S.A.
   8 *   Written by Pierre Peiffer <pierre.peiffer@bull.net>
   9 */
  10
  11/*
  12 * Extents support for EXT4
  13 *
  14 * TODO:
  15 *   - ext4*_error() should be used in some situations
  16 *   - analyze all BUG()/BUG_ON(), use -EIO where appropriate
  17 *   - smart tree reduction
  18 */
  19
  20#include <linux/fs.h>
  21#include <linux/time.h>
  22#include <linux/jbd2.h>
  23#include <linux/highuid.h>
  24#include <linux/pagemap.h>
  25#include <linux/quotaops.h>
  26#include <linux/string.h>
  27#include <linux/slab.h>
  28#include <linux/uaccess.h>
  29#include <linux/fiemap.h>
  30#include <linux/backing-dev.h>
  31#include "ext4_jbd2.h"
  32#include "ext4_extents.h"
  33#include "xattr.h"
  34
  35#include <trace/events/ext4.h>
  36
  37/*
  38 * used by extent splitting.
  39 */
  40#define EXT4_EXT_MAY_ZEROOUT    0x1  /* safe to zeroout if split fails \
  41                                        due to ENOSPC */
  42#define EXT4_EXT_MARK_UNWRIT1   0x2  /* mark first half unwritten */
  43#define EXT4_EXT_MARK_UNWRIT2   0x4  /* mark second half unwritten */
  44
  45#define EXT4_EXT_DATA_VALID1    0x8  /* first half contains valid data */
  46#define EXT4_EXT_DATA_VALID2    0x10 /* second half contains valid data */
  47
  48static __le32 ext4_extent_block_csum(struct inode *inode,
  49                                     struct ext4_extent_header *eh)
  50{
  51        struct ext4_inode_info *ei = EXT4_I(inode);
  52        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
  53        __u32 csum;
  54
  55        csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)eh,
  56                           EXT4_EXTENT_TAIL_OFFSET(eh));
  57        return cpu_to_le32(csum);
  58}
  59
  60static int ext4_extent_block_csum_verify(struct inode *inode,
  61                                         struct ext4_extent_header *eh)
  62{
  63        struct ext4_extent_tail *et;
  64
  65        if (!ext4_has_metadata_csum(inode->i_sb))
  66                return 1;
  67
  68        et = find_ext4_extent_tail(eh);
  69        if (et->et_checksum != ext4_extent_block_csum(inode, eh))
  70                return 0;
  71        return 1;
  72}
  73
  74static void ext4_extent_block_csum_set(struct inode *inode,
  75                                       struct ext4_extent_header *eh)
  76{
  77        struct ext4_extent_tail *et;
  78
  79        if (!ext4_has_metadata_csum(inode->i_sb))
  80                return;
  81
  82        et = find_ext4_extent_tail(eh);
  83        et->et_checksum = ext4_extent_block_csum(inode, eh);
  84}
  85
  86static int ext4_split_extent(handle_t *handle,
  87                                struct inode *inode,
  88                                struct ext4_ext_path **ppath,
  89                                struct ext4_map_blocks *map,
  90                                int split_flag,
  91                                int flags);
  92
  93static int ext4_split_extent_at(handle_t *handle,
  94                             struct inode *inode,
  95                             struct ext4_ext_path **ppath,
  96                             ext4_lblk_t split,
  97                             int split_flag,
  98                             int flags);
  99
 100static int ext4_find_delayed_extent(struct inode *inode,
 101                                    struct extent_status *newes);
 102
 103static int ext4_ext_trunc_restart_fn(struct inode *inode, int *dropped)
 104{
 105        /*
 106         * Drop i_data_sem to avoid deadlock with ext4_map_blocks.  At this
 107         * moment, get_block can be called only for blocks inside i_size since
 108         * page cache has been already dropped and writes are blocked by
 109         * i_mutex. So we can safely drop the i_data_sem here.
 110         */
 111        BUG_ON(EXT4_JOURNAL(inode) == NULL);
 112        ext4_discard_preallocations(inode);
 113        up_write(&EXT4_I(inode)->i_data_sem);
 114        *dropped = 1;
 115        return 0;
 116}
 117
 118/*
 119 * Make sure 'handle' has at least 'check_cred' credits. If not, restart
 120 * transaction with 'restart_cred' credits. The function drops i_data_sem
 121 * when restarting transaction and gets it after transaction is restarted.
 122 *
 123 * The function returns 0 on success, 1 if transaction had to be restarted,
 124 * and < 0 in case of fatal error.
 125 */
 126int ext4_datasem_ensure_credits(handle_t *handle, struct inode *inode,
 127                                int check_cred, int restart_cred,
 128                                int revoke_cred)
 129{
 130        int ret;
 131        int dropped = 0;
 132
 133        ret = ext4_journal_ensure_credits_fn(handle, check_cred, restart_cred,
 134                revoke_cred, ext4_ext_trunc_restart_fn(inode, &dropped));
 135        if (dropped)
 136                down_write(&EXT4_I(inode)->i_data_sem);
 137        return ret;
 138}
 139
 140/*
 141 * could return:
 142 *  - EROFS
 143 *  - ENOMEM
 144 */
 145static int ext4_ext_get_access(handle_t *handle, struct inode *inode,
 146                                struct ext4_ext_path *path)
 147{
 148        if (path->p_bh) {
 149                /* path points to block */
 150                BUFFER_TRACE(path->p_bh, "get_write_access");
 151                return ext4_journal_get_write_access(handle, path->p_bh);
 152        }
 153        /* path points to leaf/index in inode body */
 154        /* we use in-core data, no need to protect them */
 155        return 0;
 156}
 157
 158/*
 159 * could return:
 160 *  - EROFS
 161 *  - ENOMEM
 162 *  - EIO
 163 */
 164int __ext4_ext_dirty(const char *where, unsigned int line, handle_t *handle,
 165                     struct inode *inode, struct ext4_ext_path *path)
 166{
 167        int err;
 168
 169        WARN_ON(!rwsem_is_locked(&EXT4_I(inode)->i_data_sem));
 170        if (path->p_bh) {
 171                ext4_extent_block_csum_set(inode, ext_block_hdr(path->p_bh));
 172                /* path points to block */
 173                err = __ext4_handle_dirty_metadata(where, line, handle,
 174                                                   inode, path->p_bh);
 175        } else {
 176                /* path points to leaf/index in inode body */
 177                err = ext4_mark_inode_dirty(handle, inode);
 178        }
 179        return err;
 180}
 181
 182static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
 183                              struct ext4_ext_path *path,
 184                              ext4_lblk_t block)
 185{
 186        if (path) {
 187                int depth = path->p_depth;
 188                struct ext4_extent *ex;
 189
 190                /*
 191                 * Try to predict block placement assuming that we are
 192                 * filling in a file which will eventually be
 193                 * non-sparse --- i.e., in the case of libbfd writing
 194                 * an ELF object sections out-of-order but in a way
 195                 * the eventually results in a contiguous object or
 196                 * executable file, or some database extending a table
 197                 * space file.  However, this is actually somewhat
 198                 * non-ideal if we are writing a sparse file such as
 199                 * qemu or KVM writing a raw image file that is going
 200                 * to stay fairly sparse, since it will end up
 201                 * fragmenting the file system's free space.  Maybe we
 202                 * should have some hueristics or some way to allow
 203                 * userspace to pass a hint to file system,
 204                 * especially if the latter case turns out to be
 205                 * common.
 206                 */
 207                ex = path[depth].p_ext;
 208                if (ex) {
 209                        ext4_fsblk_t ext_pblk = ext4_ext_pblock(ex);
 210                        ext4_lblk_t ext_block = le32_to_cpu(ex->ee_block);
 211
 212                        if (block > ext_block)
 213                                return ext_pblk + (block - ext_block);
 214                        else
 215                                return ext_pblk - (ext_block - block);
 216                }
 217
 218                /* it looks like index is empty;
 219                 * try to find starting block from index itself */
 220                if (path[depth].p_bh)
 221                        return path[depth].p_bh->b_blocknr;
 222        }
 223
 224        /* OK. use inode's group */
 225        return ext4_inode_to_goal_block(inode);
 226}
 227
 228/*
 229 * Allocation for a meta data block
 230 */
 231static ext4_fsblk_t
 232ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
 233                        struct ext4_ext_path *path,
 234                        struct ext4_extent *ex, int *err, unsigned int flags)
 235{
 236        ext4_fsblk_t goal, newblock;
 237
 238        goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
 239        newblock = ext4_new_meta_blocks(handle, inode, goal, flags,
 240                                        NULL, err);
 241        return newblock;
 242}
 243
 244static inline int ext4_ext_space_block(struct inode *inode, int check)
 245{
 246        int size;
 247
 248        size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
 249                        / sizeof(struct ext4_extent);
 250#ifdef AGGRESSIVE_TEST
 251        if (!check && size > 6)
 252                size = 6;
 253#endif
 254        return size;
 255}
 256
 257static inline int ext4_ext_space_block_idx(struct inode *inode, int check)
 258{
 259        int size;
 260
 261        size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
 262                        / sizeof(struct ext4_extent_idx);
 263#ifdef AGGRESSIVE_TEST
 264        if (!check && size > 5)
 265                size = 5;
 266#endif
 267        return size;
 268}
 269
 270static inline int ext4_ext_space_root(struct inode *inode, int check)
 271{
 272        int size;
 273
 274        size = sizeof(EXT4_I(inode)->i_data);
 275        size -= sizeof(struct ext4_extent_header);
 276        size /= sizeof(struct ext4_extent);
 277#ifdef AGGRESSIVE_TEST
 278        if (!check && size > 3)
 279                size = 3;
 280#endif
 281        return size;
 282}
 283
 284static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
 285{
 286        int size;
 287
 288        size = sizeof(EXT4_I(inode)->i_data);
 289        size -= sizeof(struct ext4_extent_header);
 290        size /= sizeof(struct ext4_extent_idx);
 291#ifdef AGGRESSIVE_TEST
 292        if (!check && size > 4)
 293                size = 4;
 294#endif
 295        return size;
 296}
 297
 298static inline int
 299ext4_force_split_extent_at(handle_t *handle, struct inode *inode,
 300                           struct ext4_ext_path **ppath, ext4_lblk_t lblk,
 301                           int nofail)
 302{
 303        struct ext4_ext_path *path = *ppath;
 304        int unwritten = ext4_ext_is_unwritten(path[path->p_depth].p_ext);
 305        int flags = EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_PRE_IO;
 306
 307        if (nofail)
 308                flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL | EXT4_EX_NOFAIL;
 309
 310        return ext4_split_extent_at(handle, inode, ppath, lblk, unwritten ?
 311                        EXT4_EXT_MARK_UNWRIT1|EXT4_EXT_MARK_UNWRIT2 : 0,
 312                        flags);
 313}
 314
 315/*
 316 * Calculate the number of metadata blocks needed
 317 * to allocate @blocks
 318 * Worse case is one block per extent
 319 */
 320int ext4_ext_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)
 321{
 322        struct ext4_inode_info *ei = EXT4_I(inode);
 323        int idxs;
 324
 325        idxs = ((inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
 326                / sizeof(struct ext4_extent_idx));
 327
 328        /*
 329         * If the new delayed allocation block is contiguous with the
 330         * previous da block, it can share index blocks with the
 331         * previous block, so we only need to allocate a new index
 332         * block every idxs leaf blocks.  At ldxs**2 blocks, we need
 333         * an additional index block, and at ldxs**3 blocks, yet
 334         * another index blocks.
 335         */
 336        if (ei->i_da_metadata_calc_len &&
 337            ei->i_da_metadata_calc_last_lblock+1 == lblock) {
 338                int num = 0;
 339
 340                if ((ei->i_da_metadata_calc_len % idxs) == 0)
 341                        num++;
 342                if ((ei->i_da_metadata_calc_len % (idxs*idxs)) == 0)
 343                        num++;
 344                if ((ei->i_da_metadata_calc_len % (idxs*idxs*idxs)) == 0) {
 345                        num++;
 346                        ei->i_da_metadata_calc_len = 0;
 347                } else
 348                        ei->i_da_metadata_calc_len++;
 349                ei->i_da_metadata_calc_last_lblock++;
 350                return num;
 351        }
 352
 353        /*
 354         * In the worst case we need a new set of index blocks at
 355         * every level of the inode's extent tree.
 356         */
 357        ei->i_da_metadata_calc_len = 1;
 358        ei->i_da_metadata_calc_last_lblock = lblock;
 359        return ext_depth(inode) + 1;
 360}
 361
 362static int
 363ext4_ext_max_entries(struct inode *inode, int depth)
 364{
 365        int max;
 366
 367        if (depth == ext_depth(inode)) {
 368                if (depth == 0)
 369                        max = ext4_ext_space_root(inode, 1);
 370                else
 371                        max = ext4_ext_space_root_idx(inode, 1);
 372        } else {
 373                if (depth == 0)
 374                        max = ext4_ext_space_block(inode, 1);
 375                else
 376                        max = ext4_ext_space_block_idx(inode, 1);
 377        }
 378
 379        return max;
 380}
 381
 382static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
 383{
 384        ext4_fsblk_t block = ext4_ext_pblock(ext);
 385        int len = ext4_ext_get_actual_len(ext);
 386        ext4_lblk_t lblock = le32_to_cpu(ext->ee_block);
 387
 388        /*
 389         * We allow neither:
 390         *  - zero length
 391         *  - overflow/wrap-around
 392         */
 393        if (lblock + len <= lblock)
 394                return 0;
 395        return ext4_inode_block_valid(inode, block, len);
 396}
 397
 398static int ext4_valid_extent_idx(struct inode *inode,
 399                                struct ext4_extent_idx *ext_idx)
 400{
 401        ext4_fsblk_t block = ext4_idx_pblock(ext_idx);
 402
 403        return ext4_inode_block_valid(inode, block, 1);
 404}
 405
 406static int ext4_valid_extent_entries(struct inode *inode,
 407                                     struct ext4_extent_header *eh,
 408                                     ext4_fsblk_t *pblk, int depth)
 409{
 410        unsigned short entries;
 411        if (eh->eh_entries == 0)
 412                return 1;
 413
 414        entries = le16_to_cpu(eh->eh_entries);
 415
 416        if (depth == 0) {
 417                /* leaf entries */
 418                struct ext4_extent *ext = EXT_FIRST_EXTENT(eh);
 419                ext4_lblk_t lblock = 0;
 420                ext4_lblk_t prev = 0;
 421                int len = 0;
 422                while (entries) {
 423                        if (!ext4_valid_extent(inode, ext))
 424                                return 0;
 425
 426                        /* Check for overlapping extents */
 427                        lblock = le32_to_cpu(ext->ee_block);
 428                        len = ext4_ext_get_actual_len(ext);
 429                        if ((lblock <= prev) && prev) {
 430                                *pblk = ext4_ext_pblock(ext);
 431                                return 0;
 432                        }
 433                        ext++;
 434                        entries--;
 435                        prev = lblock + len - 1;
 436                }
 437        } else {
 438                struct ext4_extent_idx *ext_idx = EXT_FIRST_INDEX(eh);
 439                while (entries) {
 440                        if (!ext4_valid_extent_idx(inode, ext_idx))
 441                                return 0;
 442                        ext_idx++;
 443                        entries--;
 444                }
 445        }
 446        return 1;
 447}
 448
 449static int __ext4_ext_check(const char *function, unsigned int line,
 450                            struct inode *inode, struct ext4_extent_header *eh,
 451                            int depth, ext4_fsblk_t pblk)
 452{
 453        const char *error_msg;
 454        int max = 0, err = -EFSCORRUPTED;
 455
 456        if (unlikely(eh->eh_magic != EXT4_EXT_MAGIC)) {
 457                error_msg = "invalid magic";
 458                goto corrupted;
 459        }
 460        if (unlikely(le16_to_cpu(eh->eh_depth) != depth)) {
 461                error_msg = "unexpected eh_depth";
 462                goto corrupted;
 463        }
 464        if (unlikely(eh->eh_max == 0)) {
 465                error_msg = "invalid eh_max";
 466                goto corrupted;
 467        }
 468        max = ext4_ext_max_entries(inode, depth);
 469        if (unlikely(le16_to_cpu(eh->eh_max) > max)) {
 470                error_msg = "too large eh_max";
 471                goto corrupted;
 472        }
 473        if (unlikely(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max))) {
 474                error_msg = "invalid eh_entries";
 475                goto corrupted;
 476        }
 477        if (!ext4_valid_extent_entries(inode, eh, &pblk, depth)) {
 478                error_msg = "invalid extent entries";
 479                goto corrupted;
 480        }
 481        if (unlikely(depth > 32)) {
 482                error_msg = "too large eh_depth";
 483                goto corrupted;
 484        }
 485        /* Verify checksum on non-root extent tree nodes */
 486        if (ext_depth(inode) != depth &&
 487            !ext4_extent_block_csum_verify(inode, eh)) {
 488                error_msg = "extent tree corrupted";
 489                err = -EFSBADCRC;
 490                goto corrupted;
 491        }
 492        return 0;
 493
 494corrupted:
 495        ext4_error_inode_err(inode, function, line, 0, -err,
 496                             "pblk %llu bad header/extent: %s - magic %x, "
 497                             "entries %u, max %u(%u), depth %u(%u)",
 498                             (unsigned long long) pblk, error_msg,
 499                             le16_to_cpu(eh->eh_magic),
 500                             le16_to_cpu(eh->eh_entries),
 501                             le16_to_cpu(eh->eh_max),
 502                             max, le16_to_cpu(eh->eh_depth), depth);
 503        return err;
 504}
 505
 506#define ext4_ext_check(inode, eh, depth, pblk)                  \
 507        __ext4_ext_check(__func__, __LINE__, (inode), (eh), (depth), (pblk))
 508
 509int ext4_ext_check_inode(struct inode *inode)
 510{
 511        return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode), 0);
 512}
 513
 514static void ext4_cache_extents(struct inode *inode,
 515                               struct ext4_extent_header *eh)
 516{
 517        struct ext4_extent *ex = EXT_FIRST_EXTENT(eh);
 518        ext4_lblk_t prev = 0;
 519        int i;
 520
 521        for (i = le16_to_cpu(eh->eh_entries); i > 0; i--, ex++) {
 522                unsigned int status = EXTENT_STATUS_WRITTEN;
 523                ext4_lblk_t lblk = le32_to_cpu(ex->ee_block);
 524                int len = ext4_ext_get_actual_len(ex);
 525
 526                if (prev && (prev != lblk))
 527                        ext4_es_cache_extent(inode, prev, lblk - prev, ~0,
 528                                             EXTENT_STATUS_HOLE);
 529
 530                if (ext4_ext_is_unwritten(ex))
 531                        status = EXTENT_STATUS_UNWRITTEN;
 532                ext4_es_cache_extent(inode, lblk, len,
 533                                     ext4_ext_pblock(ex), status);
 534                prev = lblk + len;
 535        }
 536}
 537
 538static struct buffer_head *
 539__read_extent_tree_block(const char *function, unsigned int line,
 540                         struct inode *inode, ext4_fsblk_t pblk, int depth,
 541                         int flags)
 542{
 543        struct buffer_head              *bh;
 544        int                             err;
 545        gfp_t                           gfp_flags = __GFP_MOVABLE | GFP_NOFS;
 546
 547        if (flags & EXT4_EX_NOFAIL)
 548                gfp_flags |= __GFP_NOFAIL;
 549
 550        bh = sb_getblk_gfp(inode->i_sb, pblk, gfp_flags);
 551        if (unlikely(!bh))
 552                return ERR_PTR(-ENOMEM);
 553
 554        if (!bh_uptodate_or_lock(bh)) {
 555                trace_ext4_ext_load_extent(inode, pblk, _RET_IP_);
 556                clear_buffer_verified(bh);
 557                err = bh_submit_read(bh);
 558                if (err < 0)
 559                        goto errout;
 560        }
 561        if (buffer_verified(bh) && !(flags & EXT4_EX_FORCE_CACHE))
 562                return bh;
 563        err = __ext4_ext_check(function, line, inode,
 564                               ext_block_hdr(bh), depth, pblk);
 565        if (err)
 566                goto errout;
 567        set_buffer_verified(bh);
 568        /*
 569         * If this is a leaf block, cache all of its entries
 570         */
 571        if (!(flags & EXT4_EX_NOCACHE) && depth == 0) {
 572                struct ext4_extent_header *eh = ext_block_hdr(bh);
 573                ext4_cache_extents(inode, eh);
 574        }
 575        return bh;
 576errout:
 577        put_bh(bh);
 578        return ERR_PTR(err);
 579
 580}
 581
 582#define read_extent_tree_block(inode, pblk, depth, flags)               \
 583        __read_extent_tree_block(__func__, __LINE__, (inode), (pblk),   \
 584                                 (depth), (flags))
 585
 586/*
 587 * This function is called to cache a file's extent information in the
 588 * extent status tree
 589 */
 590int ext4_ext_precache(struct inode *inode)
 591{
 592        struct ext4_inode_info *ei = EXT4_I(inode);
 593        struct ext4_ext_path *path = NULL;
 594        struct buffer_head *bh;
 595        int i = 0, depth, ret = 0;
 596
 597        if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
 598                return 0;       /* not an extent-mapped inode */
 599
 600        down_read(&ei->i_data_sem);
 601        depth = ext_depth(inode);
 602
 603        path = kcalloc(depth + 1, sizeof(struct ext4_ext_path),
 604                       GFP_NOFS);
 605        if (path == NULL) {
 606                up_read(&ei->i_data_sem);
 607                return -ENOMEM;
 608        }
 609
 610        /* Don't cache anything if there are no external extent blocks */
 611        if (depth == 0)
 612                goto out;
 613        path[0].p_hdr = ext_inode_hdr(inode);
 614        ret = ext4_ext_check(inode, path[0].p_hdr, depth, 0);
 615        if (ret)
 616                goto out;
 617        path[0].p_idx = EXT_FIRST_INDEX(path[0].p_hdr);
 618        while (i >= 0) {
 619                /*
 620                 * If this is a leaf block or we've reached the end of
 621                 * the index block, go up
 622                 */
 623                if ((i == depth) ||
 624                    path[i].p_idx > EXT_LAST_INDEX(path[i].p_hdr)) {
 625                        brelse(path[i].p_bh);
 626                        path[i].p_bh = NULL;
 627                        i--;
 628                        continue;
 629                }
 630                bh = read_extent_tree_block(inode,
 631                                            ext4_idx_pblock(path[i].p_idx++),
 632                                            depth - i - 1,
 633                                            EXT4_EX_FORCE_CACHE);
 634                if (IS_ERR(bh)) {
 635                        ret = PTR_ERR(bh);
 636                        break;
 637                }
 638                i++;
 639                path[i].p_bh = bh;
 640                path[i].p_hdr = ext_block_hdr(bh);
 641                path[i].p_idx = EXT_FIRST_INDEX(path[i].p_hdr);
 642        }
 643        ext4_set_inode_state(inode, EXT4_STATE_EXT_PRECACHED);
 644out:
 645        up_read(&ei->i_data_sem);
 646        ext4_ext_drop_refs(path);
 647        kfree(path);
 648        return ret;
 649}
 650
 651#ifdef EXT_DEBUG
 652static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
 653{
 654        int k, l = path->p_depth;
 655
 656        ext_debug("path:");
 657        for (k = 0; k <= l; k++, path++) {
 658                if (path->p_idx) {
 659                        ext_debug("  %d->%llu",
 660                                  le32_to_cpu(path->p_idx->ei_block),
 661                                  ext4_idx_pblock(path->p_idx));
 662                } else if (path->p_ext) {
 663                        ext_debug("  %d:[%d]%d:%llu ",
 664                                  le32_to_cpu(path->p_ext->ee_block),
 665                                  ext4_ext_is_unwritten(path->p_ext),
 666                                  ext4_ext_get_actual_len(path->p_ext),
 667                                  ext4_ext_pblock(path->p_ext));
 668                } else
 669                        ext_debug("  []");
 670        }
 671        ext_debug("\n");
 672}
 673
 674static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
 675{
 676        int depth = ext_depth(inode);
 677        struct ext4_extent_header *eh;
 678        struct ext4_extent *ex;
 679        int i;
 680
 681        if (!path)
 682                return;
 683
 684        eh = path[depth].p_hdr;
 685        ex = EXT_FIRST_EXTENT(eh);
 686
 687        ext_debug("Displaying leaf extents for inode %lu\n", inode->i_ino);
 688
 689        for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) {
 690                ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block),
 691                          ext4_ext_is_unwritten(ex),
 692                          ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex));
 693        }
 694        ext_debug("\n");
 695}
 696
 697static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path,
 698                        ext4_fsblk_t newblock, int level)
 699{
 700        int depth = ext_depth(inode);
 701        struct ext4_extent *ex;
 702
 703        if (depth != level) {
 704                struct ext4_extent_idx *idx;
 705                idx = path[level].p_idx;
 706                while (idx <= EXT_MAX_INDEX(path[level].p_hdr)) {
 707                        ext_debug("%d: move %d:%llu in new index %llu\n", level,
 708                                        le32_to_cpu(idx->ei_block),
 709                                        ext4_idx_pblock(idx),
 710                                        newblock);
 711                        idx++;
 712                }
 713
 714                return;
 715        }
 716
 717        ex = path[depth].p_ext;
 718        while (ex <= EXT_MAX_EXTENT(path[depth].p_hdr)) {
 719                ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n",
 720                                le32_to_cpu(ex->ee_block),
 721                                ext4_ext_pblock(ex),
 722                                ext4_ext_is_unwritten(ex),
 723                                ext4_ext_get_actual_len(ex),
 724                                newblock);
 725                ex++;
 726        }
 727}
 728
 729#else
 730#define ext4_ext_show_path(inode, path)
 731#define ext4_ext_show_leaf(inode, path)
 732#define ext4_ext_show_move(inode, path, newblock, level)
 733#endif
 734
 735void ext4_ext_drop_refs(struct ext4_ext_path *path)
 736{
 737        int depth, i;
 738
 739        if (!path)
 740                return;
 741        depth = path->p_depth;
 742        for (i = 0; i <= depth; i++, path++)
 743                if (path->p_bh) {
 744                        brelse(path->p_bh);
 745                        path->p_bh = NULL;
 746                }
 747}
 748
 749/*
 750 * ext4_ext_binsearch_idx:
 751 * binary search for the closest index of the given block
 752 * the header must be checked before calling this
 753 */
 754static void
 755ext4_ext_binsearch_idx(struct inode *inode,
 756                        struct ext4_ext_path *path, ext4_lblk_t block)
 757{
 758        struct ext4_extent_header *eh = path->p_hdr;
 759        struct ext4_extent_idx *r, *l, *m;
 760
 761
 762        ext_debug("binsearch for %u(idx):  ", block);
 763
 764        l = EXT_FIRST_INDEX(eh) + 1;
 765        r = EXT_LAST_INDEX(eh);
 766        while (l <= r) {
 767                m = l + (r - l) / 2;
 768                if (block < le32_to_cpu(m->ei_block))
 769                        r = m - 1;
 770                else
 771                        l = m + 1;
 772                ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ei_block),
 773                                m, le32_to_cpu(m->ei_block),
 774                                r, le32_to_cpu(r->ei_block));
 775        }
 776
 777        path->p_idx = l - 1;
 778        ext_debug("  -> %u->%lld ", le32_to_cpu(path->p_idx->ei_block),
 779                  ext4_idx_pblock(path->p_idx));
 780
 781#ifdef CHECK_BINSEARCH
 782        {
 783                struct ext4_extent_idx *chix, *ix;
 784                int k;
 785
 786                chix = ix = EXT_FIRST_INDEX(eh);
 787                for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) {
 788                        if (k != 0 && le32_to_cpu(ix->ei_block) <=
 789                            le32_to_cpu(ix[-1].ei_block)) {
 790                                printk(KERN_DEBUG "k=%d, ix=0x%p, "
 791                                       "first=0x%p\n", k,
 792                                       ix, EXT_FIRST_INDEX(eh));
 793                                printk(KERN_DEBUG "%u <= %u\n",
 794                                       le32_to_cpu(ix->ei_block),
 795                                       le32_to_cpu(ix[-1].ei_block));
 796                        }
 797                        BUG_ON(k && le32_to_cpu(ix->ei_block)
 798                                           <= le32_to_cpu(ix[-1].ei_block));
 799                        if (block < le32_to_cpu(ix->ei_block))
 800                                break;
 801                        chix = ix;
 802                }
 803                BUG_ON(chix != path->p_idx);
 804        }
 805#endif
 806
 807}
 808
 809/*
 810 * ext4_ext_binsearch:
 811 * binary search for closest extent of the given block
 812 * the header must be checked before calling this
 813 */
 814static void
 815ext4_ext_binsearch(struct inode *inode,
 816                struct ext4_ext_path *path, ext4_lblk_t block)
 817{
 818        struct ext4_extent_header *eh = path->p_hdr;
 819        struct ext4_extent *r, *l, *m;
 820
 821        if (eh->eh_entries == 0) {
 822                /*
 823                 * this leaf is empty:
 824                 * we get such a leaf in split/add case
 825                 */
 826                return;
 827        }
 828
 829        ext_debug("binsearch for %u:  ", block);
 830
 831        l = EXT_FIRST_EXTENT(eh) + 1;
 832        r = EXT_LAST_EXTENT(eh);
 833
 834        while (l <= r) {
 835                m = l + (r - l) / 2;
 836                if (block < le32_to_cpu(m->ee_block))
 837                        r = m - 1;
 838                else
 839                        l = m + 1;
 840                ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ee_block),
 841                                m, le32_to_cpu(m->ee_block),
 842                                r, le32_to_cpu(r->ee_block));
 843        }
 844
 845        path->p_ext = l - 1;
 846        ext_debug("  -> %d:%llu:[%d]%d ",
 847                        le32_to_cpu(path->p_ext->ee_block),
 848                        ext4_ext_pblock(path->p_ext),
 849                        ext4_ext_is_unwritten(path->p_ext),
 850                        ext4_ext_get_actual_len(path->p_ext));
 851
 852#ifdef CHECK_BINSEARCH
 853        {
 854                struct ext4_extent *chex, *ex;
 855                int k;
 856
 857                chex = ex = EXT_FIRST_EXTENT(eh);
 858                for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ex++) {
 859                        BUG_ON(k && le32_to_cpu(ex->ee_block)
 860                                          <= le32_to_cpu(ex[-1].ee_block));
 861                        if (block < le32_to_cpu(ex->ee_block))
 862                                break;
 863                        chex = ex;
 864                }
 865                BUG_ON(chex != path->p_ext);
 866        }
 867#endif
 868
 869}
 870
 871int ext4_ext_tree_init(handle_t *handle, struct inode *inode)
 872{
 873        struct ext4_extent_header *eh;
 874
 875        eh = ext_inode_hdr(inode);
 876        eh->eh_depth = 0;
 877        eh->eh_entries = 0;
 878        eh->eh_magic = EXT4_EXT_MAGIC;
 879        eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0));
 880        ext4_mark_inode_dirty(handle, inode);
 881        return 0;
 882}
 883
 884struct ext4_ext_path *
 885ext4_find_extent(struct inode *inode, ext4_lblk_t block,
 886                 struct ext4_ext_path **orig_path, int flags)
 887{
 888        struct ext4_extent_header *eh;
 889        struct buffer_head *bh;
 890        struct ext4_ext_path *path = orig_path ? *orig_path : NULL;
 891        short int depth, i, ppos = 0;
 892        int ret;
 893        gfp_t gfp_flags = GFP_NOFS;
 894
 895        if (flags & EXT4_EX_NOFAIL)
 896                gfp_flags |= __GFP_NOFAIL;
 897
 898        eh = ext_inode_hdr(inode);
 899        depth = ext_depth(inode);
 900        if (depth < 0 || depth > EXT4_MAX_EXTENT_DEPTH) {
 901                EXT4_ERROR_INODE(inode, "inode has invalid extent depth: %d",
 902                                 depth);
 903                ret = -EFSCORRUPTED;
 904                goto err;
 905        }
 906
 907        if (path) {
 908                ext4_ext_drop_refs(path);
 909                if (depth > path[0].p_maxdepth) {
 910                        kfree(path);
 911                        *orig_path = path = NULL;
 912                }
 913        }
 914        if (!path) {
 915                /* account possible depth increase */
 916                path = kcalloc(depth + 2, sizeof(struct ext4_ext_path),
 917                                gfp_flags);
 918                if (unlikely(!path))
 919                        return ERR_PTR(-ENOMEM);
 920                path[0].p_maxdepth = depth + 1;
 921        }
 922        path[0].p_hdr = eh;
 923        path[0].p_bh = NULL;
 924
 925        i = depth;
 926        if (!(flags & EXT4_EX_NOCACHE) && depth == 0)
 927                ext4_cache_extents(inode, eh);
 928        /* walk through the tree */
 929        while (i) {
 930                ext_debug("depth %d: num %d, max %d\n",
 931                          ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
 932
 933                ext4_ext_binsearch_idx(inode, path + ppos, block);
 934                path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx);
 935                path[ppos].p_depth = i;
 936                path[ppos].p_ext = NULL;
 937
 938                bh = read_extent_tree_block(inode, path[ppos].p_block, --i,
 939                                            flags);
 940                if (IS_ERR(bh)) {
 941                        ret = PTR_ERR(bh);
 942                        goto err;
 943                }
 944
 945                eh = ext_block_hdr(bh);
 946                ppos++;
 947                path[ppos].p_bh = bh;
 948                path[ppos].p_hdr = eh;
 949        }
 950
 951        path[ppos].p_depth = i;
 952        path[ppos].p_ext = NULL;
 953        path[ppos].p_idx = NULL;
 954
 955        /* find extent */
 956        ext4_ext_binsearch(inode, path + ppos, block);
 957        /* if not an empty leaf */
 958        if (path[ppos].p_ext)
 959                path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext);
 960
 961        ext4_ext_show_path(inode, path);
 962
 963        return path;
 964
 965err:
 966        ext4_ext_drop_refs(path);
 967        kfree(path);
 968        if (orig_path)
 969                *orig_path = NULL;
 970        return ERR_PTR(ret);
 971}
 972
 973/*
 974 * ext4_ext_insert_index:
 975 * insert new index [@logical;@ptr] into the block at @curp;
 976 * check where to insert: before @curp or after @curp
 977 */
 978static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
 979                                 struct ext4_ext_path *curp,
 980                                 int logical, ext4_fsblk_t ptr)
 981{
 982        struct ext4_extent_idx *ix;
 983        int len, err;
 984
 985        err = ext4_ext_get_access(handle, inode, curp);
 986        if (err)
 987                return err;
 988
 989        if (unlikely(logical == le32_to_cpu(curp->p_idx->ei_block))) {
 990                EXT4_ERROR_INODE(inode,
 991                                 "logical %d == ei_block %d!",
 992                                 logical, le32_to_cpu(curp->p_idx->ei_block));
 993                return -EFSCORRUPTED;
 994        }
 995
 996        if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries)
 997                             >= le16_to_cpu(curp->p_hdr->eh_max))) {
 998                EXT4_ERROR_INODE(inode,
 999                                 "eh_entries %d >= eh_max %d!",
1000                                 le16_to_cpu(curp->p_hdr->eh_entries),
1001                                 le16_to_cpu(curp->p_hdr->eh_max));
1002                return -EFSCORRUPTED;
1003        }
1004
1005        if (logical > le32_to_cpu(curp->p_idx->ei_block)) {
1006                /* insert after */
1007                ext_debug("insert new index %d after: %llu\n", logical, ptr);
1008                ix = curp->p_idx + 1;
1009        } else {
1010                /* insert before */
1011                ext_debug("insert new index %d before: %llu\n", logical, ptr);
1012                ix = curp->p_idx;
1013        }
1014
1015        len = EXT_LAST_INDEX(curp->p_hdr) - ix + 1;
1016        BUG_ON(len < 0);
1017        if (len > 0) {
1018                ext_debug("insert new index %d: "
1019                                "move %d indices from 0x%p to 0x%p\n",
1020                                logical, len, ix, ix + 1);
1021                memmove(ix + 1, ix, len * sizeof(struct ext4_extent_idx));
1022        }
1023
1024        if (unlikely(ix > EXT_MAX_INDEX(curp->p_hdr))) {
1025                EXT4_ERROR_INODE(inode, "ix > EXT_MAX_INDEX!");
1026                return -EFSCORRUPTED;
1027        }
1028
1029        ix->ei_block = cpu_to_le32(logical);
1030        ext4_idx_store_pblock(ix, ptr);
1031        le16_add_cpu(&curp->p_hdr->eh_entries, 1);
1032
1033        if (unlikely(ix > EXT_LAST_INDEX(curp->p_hdr))) {
1034                EXT4_ERROR_INODE(inode, "ix > EXT_LAST_INDEX!");
1035                return -EFSCORRUPTED;
1036        }
1037
1038        err = ext4_ext_dirty(handle, inode, curp);
1039        ext4_std_error(inode->i_sb, err);
1040
1041        return err;
1042}
1043
1044/*
1045 * ext4_ext_split:
1046 * inserts new subtree into the path, using free index entry
1047 * at depth @at:
1048 * - allocates all needed blocks (new leaf and all intermediate index blocks)
1049 * - makes decision where to split
1050 * - moves remaining extents and index entries (right to the split point)
1051 *   into the newly allocated blocks
1052 * - initializes subtree
1053 */
1054static int ext4_ext_split(handle_t *handle, struct inode *inode,
1055                          unsigned int flags,
1056                          struct ext4_ext_path *path,
1057                          struct ext4_extent *newext, int at)
1058{
1059        struct buffer_head *bh = NULL;
1060        int depth = ext_depth(inode);
1061        struct ext4_extent_header *neh;
1062        struct ext4_extent_idx *fidx;
1063        int i = at, k, m, a;
1064        ext4_fsblk_t newblock, oldblock;
1065        __le32 border;
1066        ext4_fsblk_t *ablocks = NULL; /* array of allocated blocks */
1067        gfp_t gfp_flags = GFP_NOFS;
1068        int err = 0;
1069        size_t ext_size = 0;
1070
1071        if (flags & EXT4_EX_NOFAIL)
1072                gfp_flags |= __GFP_NOFAIL;
1073
1074        /* make decision: where to split? */
1075        /* FIXME: now decision is simplest: at current extent */
1076
1077        /* if current leaf will be split, then we should use
1078         * border from split point */
1079        if (unlikely(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr))) {
1080                EXT4_ERROR_INODE(inode, "p_ext > EXT_MAX_EXTENT!");
1081                return -EFSCORRUPTED;
1082        }
1083        if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) {
1084                border = path[depth].p_ext[1].ee_block;
1085                ext_debug("leaf will be split."
1086                                " next leaf starts at %d\n",
1087                                  le32_to_cpu(border));
1088        } else {
1089                border = newext->ee_block;
1090                ext_debug("leaf will be added."
1091                                " next leaf starts at %d\n",
1092                                le32_to_cpu(border));
1093        }
1094
1095        /*
1096         * If error occurs, then we break processing
1097         * and mark filesystem read-only. index won't
1098         * be inserted and tree will be in consistent
1099         * state. Next mount will repair buffers too.
1100         */
1101
1102        /*
1103         * Get array to track all allocated blocks.
1104         * We need this to handle errors and free blocks
1105         * upon them.
1106         */
1107        ablocks = kcalloc(depth, sizeof(ext4_fsblk_t), gfp_flags);
1108        if (!ablocks)
1109                return -ENOMEM;
1110
1111        /* allocate all needed blocks */
1112        ext_debug("allocate %d blocks for indexes/leaf\n", depth - at);
1113        for (a = 0; a < depth - at; a++) {
1114                newblock = ext4_ext_new_meta_block(handle, inode, path,
1115                                                   newext, &err, flags);
1116                if (newblock == 0)
1117                        goto cleanup;
1118                ablocks[a] = newblock;
1119        }
1120
1121        /* initialize new leaf */
1122        newblock = ablocks[--a];
1123        if (unlikely(newblock == 0)) {
1124                EXT4_ERROR_INODE(inode, "newblock == 0!");
1125                err = -EFSCORRUPTED;
1126                goto cleanup;
1127        }
1128        bh = sb_getblk_gfp(inode->i_sb, newblock, __GFP_MOVABLE | GFP_NOFS);
1129        if (unlikely(!bh)) {
1130                err = -ENOMEM;
1131                goto cleanup;
1132        }
1133        lock_buffer(bh);
1134
1135        err = ext4_journal_get_create_access(handle, bh);
1136        if (err)
1137                goto cleanup;
1138
1139        neh = ext_block_hdr(bh);
1140        neh->eh_entries = 0;
1141        neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
1142        neh->eh_magic = EXT4_EXT_MAGIC;
1143        neh->eh_depth = 0;
1144
1145        /* move remainder of path[depth] to the new leaf */
1146        if (unlikely(path[depth].p_hdr->eh_entries !=
1147                     path[depth].p_hdr->eh_max)) {
1148                EXT4_ERROR_INODE(inode, "eh_entries %d != eh_max %d!",
1149                                 path[depth].p_hdr->eh_entries,
1150                                 path[depth].p_hdr->eh_max);
1151                err = -EFSCORRUPTED;
1152                goto cleanup;
1153        }
1154        /* start copy from next extent */
1155        m = EXT_MAX_EXTENT(path[depth].p_hdr) - path[depth].p_ext++;
1156        ext4_ext_show_move(inode, path, newblock, depth);
1157        if (m) {
1158                struct ext4_extent *ex;
1159                ex = EXT_FIRST_EXTENT(neh);
1160                memmove(ex, path[depth].p_ext, sizeof(struct ext4_extent) * m);
1161                le16_add_cpu(&neh->eh_entries, m);
1162        }
1163
1164        /* zero out unused area in the extent block */
1165        ext_size = sizeof(struct ext4_extent_header) +
1166                sizeof(struct ext4_extent) * le16_to_cpu(neh->eh_entries);
1167        memset(bh->b_data + ext_size, 0, inode->i_sb->s_blocksize - ext_size);
1168        ext4_extent_block_csum_set(inode, neh);
1169        set_buffer_uptodate(bh);
1170        unlock_buffer(bh);
1171
1172        err = ext4_handle_dirty_metadata(handle, inode, bh);
1173        if (err)
1174                goto cleanup;
1175        brelse(bh);
1176        bh = NULL;
1177
1178        /* correct old leaf */
1179        if (m) {
1180                err = ext4_ext_get_access(handle, inode, path + depth);
1181                if (err)
1182                        goto cleanup;
1183                le16_add_cpu(&path[depth].p_hdr->eh_entries, -m);
1184                err = ext4_ext_dirty(handle, inode, path + depth);
1185                if (err)
1186                        goto cleanup;
1187
1188        }
1189
1190        /* create intermediate indexes */
1191        k = depth - at - 1;
1192        if (unlikely(k < 0)) {
1193                EXT4_ERROR_INODE(inode, "k %d < 0!", k);
1194                err = -EFSCORRUPTED;
1195                goto cleanup;
1196        }
1197        if (k)
1198                ext_debug("create %d intermediate indices\n", k);
1199        /* insert new index into current index block */
1200        /* current depth stored in i var */
1201        i = depth - 1;
1202        while (k--) {
1203                oldblock = newblock;
1204                newblock = ablocks[--a];
1205                bh = sb_getblk(inode->i_sb, newblock);
1206                if (unlikely(!bh)) {
1207                        err = -ENOMEM;
1208                        goto cleanup;
1209                }
1210                lock_buffer(bh);
1211
1212                err = ext4_journal_get_create_access(handle, bh);
1213                if (err)
1214                        goto cleanup;
1215
1216                neh = ext_block_hdr(bh);
1217                neh->eh_entries = cpu_to_le16(1);
1218                neh->eh_magic = EXT4_EXT_MAGIC;
1219                neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
1220                neh->eh_depth = cpu_to_le16(depth - i);
1221                fidx = EXT_FIRST_INDEX(neh);
1222                fidx->ei_block = border;
1223                ext4_idx_store_pblock(fidx, oldblock);
1224
1225                ext_debug("int.index at %d (block %llu): %u -> %llu\n",
1226                                i, newblock, le32_to_cpu(border), oldblock);
1227
1228                /* move remainder of path[i] to the new index block */
1229                if (unlikely(EXT_MAX_INDEX(path[i].p_hdr) !=
1230                                        EXT_LAST_INDEX(path[i].p_hdr))) {
1231                        EXT4_ERROR_INODE(inode,
1232                                         "EXT_MAX_INDEX != EXT_LAST_INDEX ee_block %d!",
1233                                         le32_to_cpu(path[i].p_ext->ee_block));
1234                        err = -EFSCORRUPTED;
1235                        goto cleanup;
1236                }
1237                /* start copy indexes */
1238                m = EXT_MAX_INDEX(path[i].p_hdr) - path[i].p_idx++;
1239                ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx,
1240                                EXT_MAX_INDEX(path[i].p_hdr));
1241                ext4_ext_show_move(inode, path, newblock, i);
1242                if (m) {
1243                        memmove(++fidx, path[i].p_idx,
1244                                sizeof(struct ext4_extent_idx) * m);
1245                        le16_add_cpu(&neh->eh_entries, m);
1246                }
1247                /* zero out unused area in the extent block */
1248                ext_size = sizeof(struct ext4_extent_header) +
1249                   (sizeof(struct ext4_extent) * le16_to_cpu(neh->eh_entries));
1250                memset(bh->b_data + ext_size, 0,
1251                        inode->i_sb->s_blocksize - ext_size);
1252                ext4_extent_block_csum_set(inode, neh);
1253                set_buffer_uptodate(bh);
1254                unlock_buffer(bh);
1255
1256                err = ext4_handle_dirty_metadata(handle, inode, bh);
1257                if (err)
1258                        goto cleanup;
1259                brelse(bh);
1260                bh = NULL;
1261
1262                /* correct old index */
1263                if (m) {
1264                        err = ext4_ext_get_access(handle, inode, path + i);
1265                        if (err)
1266                                goto cleanup;
1267                        le16_add_cpu(&path[i].p_hdr->eh_entries, -m);
1268                        err = ext4_ext_dirty(handle, inode, path + i);
1269                        if (err)
1270                                goto cleanup;
1271                }
1272
1273                i--;
1274        }
1275
1276        /* insert new index */
1277        err = ext4_ext_insert_index(handle, inode, path + at,
1278                                    le32_to_cpu(border), newblock);
1279
1280cleanup:
1281        if (bh) {
1282                if (buffer_locked(bh))
1283                        unlock_buffer(bh);
1284                brelse(bh);
1285        }
1286
1287        if (err) {
1288                /* free all allocated blocks in error case */
1289                for (i = 0; i < depth; i++) {
1290                        if (!ablocks[i])
1291                                continue;
1292                        ext4_free_blocks(handle, inode, NULL, ablocks[i], 1,
1293                                         EXT4_FREE_BLOCKS_METADATA);
1294                }
1295        }
1296        kfree(ablocks);
1297
1298        return err;
1299}
1300
1301/*
1302 * ext4_ext_grow_indepth:
1303 * implements tree growing procedure:
1304 * - allocates new block
1305 * - moves top-level data (index block or leaf) into the new block
1306 * - initializes new top-level, creating index that points to the
1307 *   just created block
1308 */
1309static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
1310                                 unsigned int flags)
1311{
1312        struct ext4_extent_header *neh;
1313        struct buffer_head *bh;
1314        ext4_fsblk_t newblock, goal = 0;
1315        struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
1316        int err = 0;
1317        size_t ext_size = 0;
1318
1319        /* Try to prepend new index to old one */
1320        if (ext_depth(inode))
1321                goal = ext4_idx_pblock(EXT_FIRST_INDEX(ext_inode_hdr(inode)));
1322        if (goal > le32_to_cpu(es->s_first_data_block)) {
1323                flags |= EXT4_MB_HINT_TRY_GOAL;
1324                goal--;
1325        } else
1326                goal = ext4_inode_to_goal_block(inode);
1327        newblock = ext4_new_meta_blocks(handle, inode, goal, flags,
1328                                        NULL, &err);
1329        if (newblock == 0)
1330                return err;
1331
1332        bh = sb_getblk_gfp(inode->i_sb, newblock, __GFP_MOVABLE | GFP_NOFS);
1333        if (unlikely(!bh))
1334                return -ENOMEM;
1335        lock_buffer(bh);
1336
1337        err = ext4_journal_get_create_access(handle, bh);
1338        if (err) {
1339                unlock_buffer(bh);
1340                goto out;
1341        }
1342
1343        ext_size = sizeof(EXT4_I(inode)->i_data);
1344        /* move top-level index/leaf into new block */
1345        memmove(bh->b_data, EXT4_I(inode)->i_data, ext_size);
1346        /* zero out unused area in the extent block */
1347        memset(bh->b_data + ext_size, 0, inode->i_sb->s_blocksize - ext_size);
1348
1349        /* set size of new block */
1350        neh = ext_block_hdr(bh);
1351        /* old root could have indexes or leaves
1352         * so calculate e_max right way */
1353        if (ext_depth(inode))
1354                neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
1355        else
1356                neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
1357        neh->eh_magic = EXT4_EXT_MAGIC;
1358        ext4_extent_block_csum_set(inode, neh);
1359        set_buffer_uptodate(bh);
1360        unlock_buffer(bh);
1361
1362        err = ext4_handle_dirty_metadata(handle, inode, bh);
1363        if (err)
1364                goto out;
1365
1366        /* Update top-level index: num,max,pointer */
1367        neh = ext_inode_hdr(inode);
1368        neh->eh_entries = cpu_to_le16(1);
1369        ext4_idx_store_pblock(EXT_FIRST_INDEX(neh), newblock);
1370        if (neh->eh_depth == 0) {
1371                /* Root extent block becomes index block */
1372                neh->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0));
1373                EXT_FIRST_INDEX(neh)->ei_block =
1374                        EXT_FIRST_EXTENT(neh)->ee_block;
1375        }
1376        ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n",
1377                  le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max),
1378                  le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block),
1379                  ext4_idx_pblock(EXT_FIRST_INDEX(neh)));
1380
1381        le16_add_cpu(&neh->eh_depth, 1);
1382        ext4_mark_inode_dirty(handle, inode);
1383out:
1384        brelse(bh);
1385
1386        return err;
1387}
1388
1389/*
1390 * ext4_ext_create_new_leaf:
1391 * finds empty index and adds new leaf.
1392 * if no free index is found, then it requests in-depth growing.
1393 */
1394static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode,
1395                                    unsigned int mb_flags,
1396                                    unsigned int gb_flags,
1397                                    struct ext4_ext_path **ppath,
1398                                    struct ext4_extent *newext)
1399{
1400        struct ext4_ext_path *path = *ppath;
1401        struct ext4_ext_path *curp;
1402        int depth, i, err = 0;
1403
1404repeat:
1405        i = depth = ext_depth(inode);
1406
1407        /* walk up to the tree and look for free index entry */
1408        curp = path + depth;
1409        while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) {
1410                i--;
1411                curp--;
1412        }
1413
1414        /* we use already allocated block for index block,
1415         * so subsequent data blocks should be contiguous */
1416        if (EXT_HAS_FREE_INDEX(curp)) {
1417                /* if we found index with free entry, then use that
1418                 * entry: create all needed subtree and add new leaf */
1419                err = ext4_ext_split(handle, inode, mb_flags, path, newext, i);
1420                if (err)
1421                        goto out;
1422
1423                /* refill path */
1424                path = ext4_find_extent(inode,
1425                                    (ext4_lblk_t)le32_to_cpu(newext->ee_block),
1426                                    ppath, gb_flags);
1427                if (IS_ERR(path))
1428                        err = PTR_ERR(path);
1429        } else {
1430                /* tree is full, time to grow in depth */
1431                err = ext4_ext_grow_indepth(handle, inode, mb_flags);
1432                if (err)
1433                        goto out;
1434
1435                /* refill path */
1436                path = ext4_find_extent(inode,
1437                                   (ext4_lblk_t)le32_to_cpu(newext->ee_block),
1438                                    ppath, gb_flags);
1439                if (IS_ERR(path)) {
1440                        err = PTR_ERR(path);
1441                        goto out;
1442                }
1443
1444                /*
1445                 * only first (depth 0 -> 1) produces free space;
1446                 * in all other cases we have to split the grown tree
1447                 */
1448                depth = ext_depth(inode);
1449                if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) {
1450                        /* now we need to split */
1451                        goto repeat;
1452                }
1453        }
1454
1455out:
1456        return err;
1457}
1458
1459/*
1460 * search the closest allocated block to the left for *logical
1461 * and returns it at @logical + it's physical address at @phys
1462 * if *logical is the smallest allocated block, the function
1463 * returns 0 at @phys
1464 * return value contains 0 (success) or error code
1465 */
1466static int ext4_ext_search_left(struct inode *inode,
1467                                struct ext4_ext_path *path,
1468                                ext4_lblk_t *logical, ext4_fsblk_t *phys)
1469{
1470        struct ext4_extent_idx *ix;
1471        struct ext4_extent *ex;
1472        int depth, ee_len;
1473
1474        if (unlikely(path == NULL)) {
1475                EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
1476                return -EFSCORRUPTED;
1477        }
1478        depth = path->p_depth;
1479        *phys = 0;
1480
1481        if (depth == 0 && path->p_ext == NULL)
1482                return 0;
1483
1484        /* usually extent in the path covers blocks smaller
1485         * then *logical, but it can be that extent is the
1486         * first one in the file */
1487
1488        ex = path[depth].p_ext;
1489        ee_len = ext4_ext_get_actual_len(ex);
1490        if (*logical < le32_to_cpu(ex->ee_block)) {
1491                if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
1492                        EXT4_ERROR_INODE(inode,
1493                                         "EXT_FIRST_EXTENT != ex *logical %d ee_block %d!",
1494                                         *logical, le32_to_cpu(ex->ee_block));
1495                        return -EFSCORRUPTED;
1496                }
1497                while (--depth >= 0) {
1498                        ix = path[depth].p_idx;
1499                        if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
1500                                EXT4_ERROR_INODE(inode,
1501                                  "ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!",
1502                                  ix != NULL ? le32_to_cpu(ix->ei_block) : 0,
1503                                  EXT_FIRST_INDEX(path[depth].p_hdr) != NULL ?
1504                le32_to_cpu(EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block) : 0,
1505                                  depth);
1506                                return -EFSCORRUPTED;
1507                        }
1508                }
1509                return 0;
1510        }
1511
1512        if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
1513                EXT4_ERROR_INODE(inode,
1514                                 "logical %d < ee_block %d + ee_len %d!",
1515                                 *logical, le32_to_cpu(ex->ee_block), ee_len);
1516                return -EFSCORRUPTED;
1517        }
1518
1519        *logical = le32_to_cpu(ex->ee_block) + ee_len - 1;
1520        *phys = ext4_ext_pblock(ex) + ee_len - 1;
1521        return 0;
1522}
1523
1524/*
1525 * Search the closest allocated block to the right for *logical
1526 * and returns it at @logical + it's physical address at @phys.
1527 * If not exists, return 0 and @phys is set to 0. We will return
1528 * 1 which means we found an allocated block and ret_ex is valid.
1529 * Or return a (< 0) error code.
1530 */
1531static int ext4_ext_search_right(struct inode *inode,
1532                                 struct ext4_ext_path *path,
1533                                 ext4_lblk_t *logical, ext4_fsblk_t *phys,
1534                                 struct ext4_extent *ret_ex)
1535{
1536        struct buffer_head *bh = NULL;
1537        struct ext4_extent_header *eh;
1538        struct ext4_extent_idx *ix;
1539        struct ext4_extent *ex;
1540        ext4_fsblk_t block;
1541        int depth;      /* Note, NOT eh_depth; depth from top of tree */
1542        int ee_len;
1543
1544        if (unlikely(path == NULL)) {
1545                EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
1546                return -EFSCORRUPTED;
1547        }
1548        depth = path->p_depth;
1549        *phys = 0;
1550
1551        if (depth == 0 && path->p_ext == NULL)
1552                return 0;
1553
1554        /* usually extent in the path covers blocks smaller
1555         * then *logical, but it can be that extent is the
1556         * first one in the file */
1557
1558        ex = path[depth].p_ext;
1559        ee_len = ext4_ext_get_actual_len(ex);
1560        if (*logical < le32_to_cpu(ex->ee_block)) {
1561                if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
1562                        EXT4_ERROR_INODE(inode,
1563                                         "first_extent(path[%d].p_hdr) != ex",
1564                                         depth);
1565                        return -EFSCORRUPTED;
1566                }
1567                while (--depth >= 0) {
1568                        ix = path[depth].p_idx;
1569                        if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
1570                                EXT4_ERROR_INODE(inode,
1571                                                 "ix != EXT_FIRST_INDEX *logical %d!",
1572                                                 *logical);
1573                                return -EFSCORRUPTED;
1574                        }
1575                }
1576                goto found_extent;
1577        }
1578
1579        if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
1580                EXT4_ERROR_INODE(inode,
1581                                 "logical %d < ee_block %d + ee_len %d!",
1582                                 *logical, le32_to_cpu(ex->ee_block), ee_len);
1583                return -EFSCORRUPTED;
1584        }
1585
1586        if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) {
1587                /* next allocated block in this leaf */
1588                ex++;
1589                goto found_extent;
1590        }
1591
1592        /* go up and search for index to the right */
1593        while (--depth >= 0) {
1594                ix = path[depth].p_idx;
1595                if (ix != EXT_LAST_INDEX(path[depth].p_hdr))
1596                        goto got_index;
1597        }
1598
1599        /* we've gone up to the root and found no index to the right */
1600        return 0;
1601
1602got_index:
1603        /* we've found index to the right, let's
1604         * follow it and find the closest allocated
1605         * block to the right */
1606        ix++;
1607        block = ext4_idx_pblock(ix);
1608        while (++depth < path->p_depth) {
1609                /* subtract from p_depth to get proper eh_depth */
1610                bh = read_extent_tree_block(inode, block,
1611                                            path->p_depth - depth, 0);
1612                if (IS_ERR(bh))
1613                        return PTR_ERR(bh);
1614                eh = ext_block_hdr(bh);
1615                ix = EXT_FIRST_INDEX(eh);
1616                block = ext4_idx_pblock(ix);
1617                put_bh(bh);
1618        }
1619
1620        bh = read_extent_tree_block(inode, block, path->p_depth - depth, 0);
1621        if (IS_ERR(bh))
1622                return PTR_ERR(bh);
1623        eh = ext_block_hdr(bh);
1624        ex = EXT_FIRST_EXTENT(eh);
1625found_extent:
1626        *logical = le32_to_cpu(ex->ee_block);
1627        *phys = ext4_ext_pblock(ex);
1628        if (ret_ex)
1629                *ret_ex = *ex;
1630        if (bh)
1631                put_bh(bh);
1632        return 1;
1633}
1634
1635/*
1636 * ext4_ext_next_allocated_block:
1637 * returns allocated block in subsequent extent or EXT_MAX_BLOCKS.
1638 * NOTE: it considers block number from index entry as
1639 * allocated block. Thus, index entries have to be consistent
1640 * with leaves.
1641 */
1642ext4_lblk_t
1643ext4_ext_next_allocated_block(struct ext4_ext_path *path)
1644{
1645        int depth;
1646
1647        BUG_ON(path == NULL);
1648        depth = path->p_depth;
1649
1650        if (depth == 0 && path->p_ext == NULL)
1651                return EXT_MAX_BLOCKS;
1652
1653        while (depth >= 0) {
1654                struct ext4_ext_path *p = &path[depth];
1655
1656                if (depth == path->p_depth) {
1657                        /* leaf */
1658                        if (p->p_ext && p->p_ext != EXT_LAST_EXTENT(p->p_hdr))
1659                                return le32_to_cpu(p->p_ext[1].ee_block);
1660                } else {
1661                        /* index */
1662                        if (p->p_idx != EXT_LAST_INDEX(p->p_hdr))
1663                                return le32_to_cpu(p->p_idx[1].ei_block);
1664                }
1665                depth--;
1666        }
1667
1668        return EXT_MAX_BLOCKS;
1669}
1670
1671/*
1672 * ext4_ext_next_leaf_block:
1673 * returns first allocated block from next leaf or EXT_MAX_BLOCKS
1674 */
1675static ext4_lblk_t ext4_ext_next_leaf_block(struct ext4_ext_path *path)
1676{
1677        int depth;
1678
1679        BUG_ON(path == NULL);
1680        depth = path->p_depth;
1681
1682        /* zero-tree has no leaf blocks at all */
1683        if (depth == 0)
1684                return EXT_MAX_BLOCKS;
1685
1686        /* go to index block */
1687        depth--;
1688
1689        while (depth >= 0) {
1690                if (path[depth].p_idx !=
1691                                EXT_LAST_INDEX(path[depth].p_hdr))
1692                        return (ext4_lblk_t)
1693                                le32_to_cpu(path[depth].p_idx[1].ei_block);
1694                depth--;
1695        }
1696
1697        return EXT_MAX_BLOCKS;
1698}
1699
1700/*
1701 * ext4_ext_correct_indexes:
1702 * if leaf gets modified and modified extent is first in the leaf,
1703 * then we have to correct all indexes above.
1704 * TODO: do we need to correct tree in all cases?
1705 */
1706static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
1707                                struct ext4_ext_path *path)
1708{
1709        struct ext4_extent_header *eh;
1710        int depth = ext_depth(inode);
1711        struct ext4_extent *ex;
1712        __le32 border;
1713        int k, err = 0;
1714
1715        eh = path[depth].p_hdr;
1716        ex = path[depth].p_ext;
1717
1718        if (unlikely(ex == NULL || eh == NULL)) {
1719                EXT4_ERROR_INODE(inode,
1720                                 "ex %p == NULL or eh %p == NULL", ex, eh);
1721                return -EFSCORRUPTED;
1722        }
1723
1724        if (depth == 0) {
1725                /* there is no tree at all */
1726                return 0;
1727        }
1728
1729        if (ex != EXT_FIRST_EXTENT(eh)) {
1730                /* we correct tree if first leaf got modified only */
1731                return 0;
1732        }
1733
1734        /*
1735         * TODO: we need correction if border is smaller than current one
1736         */
1737        k = depth - 1;
1738        border = path[depth].p_ext->ee_block;
1739        err = ext4_ext_get_access(handle, inode, path + k);
1740        if (err)
1741                return err;
1742        path[k].p_idx->ei_block = border;
1743        err = ext4_ext_dirty(handle, inode, path + k);
1744        if (err)
1745                return err;
1746
1747        while (k--) {
1748                /* change all left-side indexes */
1749                if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr))
1750                        break;
1751                err = ext4_ext_get_access(handle, inode, path + k);
1752                if (err)
1753                        break;
1754                path[k].p_idx->ei_block = border;
1755                err = ext4_ext_dirty(handle, inode, path + k);
1756                if (err)
1757                        break;
1758        }
1759
1760        return err;
1761}
1762
1763int
1764ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
1765                                struct ext4_extent *ex2)
1766{
1767        unsigned short ext1_ee_len, ext2_ee_len;
1768
1769        if (ext4_ext_is_unwritten(ex1) != ext4_ext_is_unwritten(ex2))
1770                return 0;
1771
1772        ext1_ee_len = ext4_ext_get_actual_len(ex1);
1773        ext2_ee_len = ext4_ext_get_actual_len(ex2);
1774
1775        if (le32_to_cpu(ex1->ee_block) + ext1_ee_len !=
1776                        le32_to_cpu(ex2->ee_block))
1777                return 0;
1778
1779        /*
1780         * To allow future support for preallocated extents to be added
1781         * as an RO_COMPAT feature, refuse to merge to extents if
1782         * this can result in the top bit of ee_len being set.
1783         */
1784        if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN)
1785                return 0;
1786        /*
1787         * The check for IO to unwritten extent is somewhat racy as we
1788         * increment i_unwritten / set EXT4_STATE_DIO_UNWRITTEN only after
1789         * dropping i_data_sem. But reserved blocks should save us in that
1790         * case.
1791         */
1792        if (ext4_ext_is_unwritten(ex1) &&
1793            (ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN) ||
1794             atomic_read(&EXT4_I(inode)->i_unwritten) ||
1795             (ext1_ee_len + ext2_ee_len > EXT_UNWRITTEN_MAX_LEN)))
1796                return 0;
1797#ifdef AGGRESSIVE_TEST
1798        if (ext1_ee_len >= 4)
1799                return 0;
1800#endif
1801
1802        if (ext4_ext_pblock(ex1) + ext1_ee_len == ext4_ext_pblock(ex2))
1803                return 1;
1804        return 0;
1805}
1806
1807/*
1808 * This function tries to merge the "ex" extent to the next extent in the tree.
1809 * It always tries to merge towards right. If you want to merge towards
1810 * left, pass "ex - 1" as argument instead of "ex".
1811 * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns
1812 * 1 if they got merged.
1813 */
1814static int ext4_ext_try_to_merge_right(struct inode *inode,
1815                                 struct ext4_ext_path *path,
1816                                 struct ext4_extent *ex)
1817{
1818        struct ext4_extent_header *eh;
1819        unsigned int depth, len;
1820        int merge_done = 0, unwritten;
1821
1822        depth = ext_depth(inode);
1823        BUG_ON(path[depth].p_hdr == NULL);
1824        eh = path[depth].p_hdr;
1825
1826        while (ex < EXT_LAST_EXTENT(eh)) {
1827                if (!ext4_can_extents_be_merged(inode, ex, ex + 1))
1828                        break;
1829                /* merge with next extent! */
1830                unwritten = ext4_ext_is_unwritten(ex);
1831                ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
1832                                + ext4_ext_get_actual_len(ex + 1));
1833                if (unwritten)
1834                        ext4_ext_mark_unwritten(ex);
1835
1836                if (ex + 1 < EXT_LAST_EXTENT(eh)) {
1837                        len = (EXT_LAST_EXTENT(eh) - ex - 1)
1838                                * sizeof(struct ext4_extent);
1839                        memmove(ex + 1, ex + 2, len);
1840                }
1841                le16_add_cpu(&eh->eh_entries, -1);
1842                merge_done = 1;
1843                WARN_ON(eh->eh_entries == 0);
1844                if (!eh->eh_entries)
1845                        EXT4_ERROR_INODE(inode, "eh->eh_entries = 0!");
1846        }
1847
1848        return merge_done;
1849}
1850
1851/*
1852 * This function does a very simple check to see if we can collapse
1853 * an extent tree with a single extent tree leaf block into the inode.
1854 */
1855static void ext4_ext_try_to_merge_up(handle_t *handle,
1856                                     struct inode *inode,
1857                                     struct ext4_ext_path *path)
1858{
1859        size_t s;
1860        unsigned max_root = ext4_ext_space_root(inode, 0);
1861        ext4_fsblk_t blk;
1862
1863        if ((path[0].p_depth != 1) ||
1864            (le16_to_cpu(path[0].p_hdr->eh_entries) != 1) ||
1865            (le16_to_cpu(path[1].p_hdr->eh_entries) > max_root))
1866                return;
1867
1868        /*
1869         * We need to modify the block allocation bitmap and the block
1870         * group descriptor to release the extent tree block.  If we
1871         * can't get the journal credits, give up.
1872         */
1873        if (ext4_journal_extend(handle, 2,
1874                        ext4_free_metadata_revoke_credits(inode->i_sb, 1)))
1875                return;
1876
1877        /*
1878         * Copy the extent data up to the inode
1879         */
1880        blk = ext4_idx_pblock(path[0].p_idx);
1881        s = le16_to_cpu(path[1].p_hdr->eh_entries) *
1882                sizeof(struct ext4_extent_idx);
1883        s += sizeof(struct ext4_extent_header);
1884
1885        path[1].p_maxdepth = path[0].p_maxdepth;
1886        memcpy(path[0].p_hdr, path[1].p_hdr, s);
1887        path[0].p_depth = 0;
1888        path[0].p_ext = EXT_FIRST_EXTENT(path[0].p_hdr) +
1889                (path[1].p_ext - EXT_FIRST_EXTENT(path[1].p_hdr));
1890        path[0].p_hdr->eh_max = cpu_to_le16(max_root);
1891
1892        brelse(path[1].p_bh);
1893        ext4_free_blocks(handle, inode, NULL, blk, 1,
1894                         EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
1895}
1896
1897/*
1898 * This function tries to merge the @ex extent to neighbours in the tree, then
1899 * tries to collapse the extent tree into the inode.
1900 */
1901static void ext4_ext_try_to_merge(handle_t *handle,
1902                                  struct inode *inode,
1903                                  struct ext4_ext_path *path,
1904                                  struct ext4_extent *ex)
1905{
1906        struct ext4_extent_header *eh;
1907        unsigned int depth;
1908        int merge_done = 0;
1909
1910        depth = ext_depth(inode);
1911        BUG_ON(path[depth].p_hdr == NULL);
1912        eh = path[depth].p_hdr;
1913
1914        if (ex > EXT_FIRST_EXTENT(eh))
1915                merge_done = ext4_ext_try_to_merge_right(inode, path, ex - 1);
1916
1917        if (!merge_done)
1918                (void) ext4_ext_try_to_merge_right(inode, path, ex);
1919
1920        ext4_ext_try_to_merge_up(handle, inode, path);
1921}
1922
1923/*
1924 * check if a portion of the "newext" extent overlaps with an
1925 * existing extent.
1926 *
1927 * If there is an overlap discovered, it updates the length of the newext
1928 * such that there will be no overlap, and then returns 1.
1929 * If there is no overlap found, it returns 0.
1930 */
1931static unsigned int ext4_ext_check_overlap(struct ext4_sb_info *sbi,
1932                                           struct inode *inode,
1933                                           struct ext4_extent *newext,
1934                                           struct ext4_ext_path *path)
1935{
1936        ext4_lblk_t b1, b2;
1937        unsigned int depth, len1;
1938        unsigned int ret = 0;
1939
1940        b1 = le32_to_cpu(newext->ee_block);
1941        len1 = ext4_ext_get_actual_len(newext);
1942        depth = ext_depth(inode);
1943        if (!path[depth].p_ext)
1944                goto out;
1945        b2 = EXT4_LBLK_CMASK(sbi, le32_to_cpu(path[depth].p_ext->ee_block));
1946
1947        /*
1948         * get the next allocated block if the extent in the path
1949         * is before the requested block(s)
1950         */
1951        if (b2 < b1) {
1952                b2 = ext4_ext_next_allocated_block(path);
1953                if (b2 == EXT_MAX_BLOCKS)
1954                        goto out;
1955                b2 = EXT4_LBLK_CMASK(sbi, b2);
1956        }
1957
1958        /* check for wrap through zero on extent logical start block*/
1959        if (b1 + len1 < b1) {
1960                len1 = EXT_MAX_BLOCKS - b1;
1961                newext->ee_len = cpu_to_le16(len1);
1962                ret = 1;
1963        }
1964
1965        /* check for overlap */
1966        if (b1 + len1 > b2) {
1967                newext->ee_len = cpu_to_le16(b2 - b1);
1968                ret = 1;
1969        }
1970out:
1971        return ret;
1972}
1973
1974/*
1975 * ext4_ext_insert_extent:
1976 * tries to merge requsted extent into the existing extent or
1977 * inserts requested extent as new one into the tree,
1978 * creating new leaf in the no-space case.
1979 */
1980int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1981                                struct ext4_ext_path **ppath,
1982                                struct ext4_extent *newext, int gb_flags)
1983{
1984        struct ext4_ext_path *path = *ppath;
1985        struct ext4_extent_header *eh;
1986        struct ext4_extent *ex, *fex;
1987        struct ext4_extent *nearex; /* nearest extent */
1988        struct ext4_ext_path *npath = NULL;
1989        int depth, len, err;
1990        ext4_lblk_t next;
1991        int mb_flags = 0, unwritten;
1992
1993        if (gb_flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
1994                mb_flags |= EXT4_MB_DELALLOC_RESERVED;
1995        if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {
1996                EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0");
1997                return -EFSCORRUPTED;
1998        }
1999        depth = ext_depth(inode);
2000        ex = path[depth].p_ext;
2001        eh = path[depth].p_hdr;
2002        if (unlikely(path[depth].p_hdr == NULL)) {
2003                EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
2004                return -EFSCORRUPTED;
2005        }
2006
2007        /* try to insert block into found extent and return */
2008        if (ex && !(gb_flags & EXT4_GET_BLOCKS_PRE_IO)) {
2009
2010                /*
2011                 * Try to see whether we should rather test the extent on
2012                 * right from ex, or from the left of ex. This is because
2013                 * ext4_find_extent() can return either extent on the
2014                 * left, or on the right from the searched position. This
2015                 * will make merging more effective.
2016                 */
2017                if (ex < EXT_LAST_EXTENT(eh) &&
2018                    (le32_to_cpu(ex->ee_block) +
2019                    ext4_ext_get_actual_len(ex) <
2020                    le32_to_cpu(newext->ee_block))) {
2021                        ex += 1;
2022                        goto prepend;
2023                } else if ((ex > EXT_FIRST_EXTENT(eh)) &&
2024                           (le32_to_cpu(newext->ee_block) +
2025                           ext4_ext_get_actual_len(newext) <
2026                           le32_to_cpu(ex->ee_block)))
2027                        ex -= 1;
2028
2029                /* Try to append newex to the ex */
2030                if (ext4_can_extents_be_merged(inode, ex, newext)) {
2031                        ext_debug("append [%d]%d block to %u:[%d]%d"
2032                                  "(from %llu)\n",
2033                                  ext4_ext_is_unwritten(newext),
2034                                  ext4_ext_get_actual_len(newext),
2035                                  le32_to_cpu(ex->ee_block),
2036                                  ext4_ext_is_unwritten(ex),
2037                                  ext4_ext_get_actual_len(ex),
2038                                  ext4_ext_pblock(ex));
2039                        err = ext4_ext_get_access(handle, inode,
2040                                                  path + depth);
2041                        if (err)
2042                                return err;
2043                        unwritten = ext4_ext_is_unwritten(ex);
2044                        ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
2045                                        + ext4_ext_get_actual_len(newext));
2046                        if (unwritten)
2047                                ext4_ext_mark_unwritten(ex);
2048                        eh = path[depth].p_hdr;
2049                        nearex = ex;
2050                        goto merge;
2051                }
2052
2053prepend:
2054                /* Try to prepend newex to the ex */
2055                if (ext4_can_extents_be_merged(inode, newext, ex)) {
2056                        ext_debug("prepend %u[%d]%d block to %u:[%d]%d"
2057                                  "(from %llu)\n",
2058                                  le32_to_cpu(newext->ee_block),
2059                                  ext4_ext_is_unwritten(newext),
2060                                  ext4_ext_get_actual_len(newext),
2061                                  le32_to_cpu(ex->ee_block),
2062                                  ext4_ext_is_unwritten(ex),
2063                                  ext4_ext_get_actual_len(ex),
2064                                  ext4_ext_pblock(ex));
2065                        err = ext4_ext_get_access(handle, inode,
2066                                                  path + depth);
2067                        if (err)
2068                                return err;
2069
2070                        unwritten = ext4_ext_is_unwritten(ex);
2071                        ex->ee_block = newext->ee_block;
2072                        ext4_ext_store_pblock(ex, ext4_ext_pblock(newext));
2073                        ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
2074                                        + ext4_ext_get_actual_len(newext));
2075                        if (unwritten)
2076                                ext4_ext_mark_unwritten(ex);
2077                        eh = path[depth].p_hdr;
2078                        nearex = ex;
2079                        goto merge;
2080                }
2081        }
2082
2083        depth = ext_depth(inode);
2084        eh = path[depth].p_hdr;
2085        if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max))
2086                goto has_space;
2087
2088        /* probably next leaf has space for us? */
2089        fex = EXT_LAST_EXTENT(eh);
2090        next = EXT_MAX_BLOCKS;
2091        if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block))
2092                next = ext4_ext_next_leaf_block(path);
2093        if (next != EXT_MAX_BLOCKS) {
2094                ext_debug("next leaf block - %u\n", next);
2095                BUG_ON(npath != NULL);
2096                npath = ext4_find_extent(inode, next, NULL, gb_flags);
2097                if (IS_ERR(npath))
2098                        return PTR_ERR(npath);
2099                BUG_ON(npath->p_depth != path->p_depth);
2100                eh = npath[depth].p_hdr;
2101                if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) {
2102                        ext_debug("next leaf isn't full(%d)\n",
2103                                  le16_to_cpu(eh->eh_entries));
2104                        path = npath;
2105                        goto has_space;
2106                }
2107                ext_debug("next leaf has no free space(%d,%d)\n",
2108                          le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
2109        }
2110
2111        /*
2112         * There is no free space in the found leaf.
2113         * We're gonna add a new leaf in the tree.
2114         */
2115        if (gb_flags & EXT4_GET_BLOCKS_METADATA_NOFAIL)
2116                mb_flags |= EXT4_MB_USE_RESERVED;
2117        err = ext4_ext_create_new_leaf(handle, inode, mb_flags, gb_flags,
2118                                       ppath, newext);
2119        if (err)
2120                goto cleanup;
2121        depth = ext_depth(inode);
2122        eh = path[depth].p_hdr;
2123
2124has_space:
2125        nearex = path[depth].p_ext;
2126
2127        err = ext4_ext_get_access(handle, inode, path + depth);
2128        if (err)
2129                goto cleanup;
2130
2131        if (!nearex) {
2132                /* there is no extent in this leaf, create first one */
2133                ext_debug("first extent in the leaf: %u:%llu:[%d]%d\n",
2134                                le32_to_cpu(newext->ee_block),
2135                                ext4_ext_pblock(newext),
2136                                ext4_ext_is_unwritten(newext),
2137                                ext4_ext_get_actual_len(newext));
2138                nearex = EXT_FIRST_EXTENT(eh);
2139        } else {
2140                if (le32_to_cpu(newext->ee_block)
2141                           > le32_to_cpu(nearex->ee_block)) {
2142                        /* Insert after */
2143                        ext_debug("insert %u:%llu:[%d]%d before: "
2144                                        "nearest %p\n",
2145                                        le32_to_cpu(newext->ee_block),
2146                                        ext4_ext_pblock(newext),
2147                                        ext4_ext_is_unwritten(newext),
2148                                        ext4_ext_get_actual_len(newext),
2149                                        nearex);
2150                        nearex++;
2151                } else {
2152                        /* Insert before */
2153                        BUG_ON(newext->ee_block == nearex->ee_block);
2154                        ext_debug("insert %u:%llu:[%d]%d after: "
2155                                        "nearest %p\n",
2156                                        le32_to_cpu(newext->ee_block),
2157                                        ext4_ext_pblock(newext),
2158                                        ext4_ext_is_unwritten(newext),
2159                                        ext4_ext_get_actual_len(newext),
2160                                        nearex);
2161                }
2162                len = EXT_LAST_EXTENT(eh) - nearex + 1;
2163                if (len > 0) {
2164                        ext_debug("insert %u:%llu:[%d]%d: "
2165                                        "move %d extents from 0x%p to 0x%p\n",
2166                                        le32_to_cpu(newext->ee_block),
2167                                        ext4_ext_pblock(newext),
2168                                        ext4_ext_is_unwritten(newext),
2169                                        ext4_ext_get_actual_len(newext),
2170                                        len, nearex, nearex + 1);
2171                        memmove(nearex + 1, nearex,
2172                                len * sizeof(struct ext4_extent));
2173                }
2174        }
2175
2176        le16_add_cpu(&eh->eh_entries, 1);
2177        path[depth].p_ext = nearex;
2178        nearex->ee_block = newext->ee_block;
2179        ext4_ext_store_pblock(nearex, ext4_ext_pblock(newext));
2180        nearex->ee_len = newext->ee_len;
2181
2182merge:
2183        /* try to merge extents */
2184        if (!(gb_flags & EXT4_GET_BLOCKS_PRE_IO))
2185                ext4_ext_try_to_merge(handle, inode, path, nearex);
2186
2187
2188        /* time to correct all indexes above */
2189        err = ext4_ext_correct_indexes(handle, inode, path);
2190        if (err)
2191                goto cleanup;
2192
2193        err = ext4_ext_dirty(handle, inode, path + path->p_depth);
2194
2195cleanup:
2196        ext4_ext_drop_refs(npath);
2197        kfree(npath);
2198        return err;
2199}
2200
2201static int ext4_fill_fiemap_extents(struct inode *inode,
2202                                    ext4_lblk_t block, ext4_lblk_t num,
2203                                    struct fiemap_extent_info *fieinfo)
2204{
2205        struct ext4_ext_path *path = NULL;
2206        struct ext4_extent *ex;
2207        struct extent_status es;
2208        ext4_lblk_t next, next_del, start = 0, end = 0;
2209        ext4_lblk_t last = block + num;
2210        int exists, depth = 0, err = 0;
2211        unsigned int flags = 0;
2212        unsigned char blksize_bits = inode->i_sb->s_blocksize_bits;
2213
2214        while (block < last && block != EXT_MAX_BLOCKS) {
2215                num = last - block;
2216                /* find extent for this block */
2217                down_read(&EXT4_I(inode)->i_data_sem);
2218
2219                path = ext4_find_extent(inode, block, &path, 0);
2220                if (IS_ERR(path)) {
2221                        up_read(&EXT4_I(inode)->i_data_sem);
2222                        err = PTR_ERR(path);
2223                        path = NULL;
2224                        break;
2225                }
2226
2227                depth = ext_depth(inode);
2228                if (unlikely(path[depth].p_hdr == NULL)) {
2229                        up_read(&EXT4_I(inode)->i_data_sem);
2230                        EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
2231                        err = -EFSCORRUPTED;
2232                        break;
2233                }
2234                ex = path[depth].p_ext;
2235                next = ext4_ext_next_allocated_block(path);
2236
2237                flags = 0;
2238                exists = 0;
2239                if (!ex) {
2240                        /* there is no extent yet, so try to allocate
2241                         * all requested space */
2242                        start = block;
2243                        end = block + num;
2244                } else if (le32_to_cpu(ex->ee_block) > block) {
2245                        /* need to allocate space before found extent */
2246                        start = block;
2247                        end = le32_to_cpu(ex->ee_block);
2248                        if (block + num < end)
2249                                end = block + num;
2250                } else if (block >= le32_to_cpu(ex->ee_block)
2251                                        + ext4_ext_get_actual_len(ex)) {
2252                        /* need to allocate space after found extent */
2253                        start = block;
2254                        end = block + num;
2255                        if (end >= next)
2256                                end = next;
2257                } else if (block >= le32_to_cpu(ex->ee_block)) {
2258                        /*
2259                         * some part of requested space is covered
2260                         * by found extent
2261                         */
2262                        start = block;
2263                        end = le32_to_cpu(ex->ee_block)
2264                                + ext4_ext_get_actual_len(ex);
2265                        if (block + num < end)
2266                                end = block + num;
2267                        exists = 1;
2268                } else {
2269                        BUG();
2270                }
2271                BUG_ON(end <= start);
2272
2273                if (!exists) {
2274                        es.es_lblk = start;
2275                        es.es_len = end - start;
2276                        es.es_pblk = 0;
2277                } else {
2278                        es.es_lblk = le32_to_cpu(ex->ee_block);
2279                        es.es_len = ext4_ext_get_actual_len(ex);
2280                        es.es_pblk = ext4_ext_pblock(ex);
2281                        if (ext4_ext_is_unwritten(ex))
2282                                flags |= FIEMAP_EXTENT_UNWRITTEN;
2283                }
2284
2285                /*
2286                 * Find delayed extent and update es accordingly. We call
2287                 * it even in !exists case to find out whether es is the
2288                 * last existing extent or not.
2289                 */
2290                next_del = ext4_find_delayed_extent(inode, &es);
2291                if (!exists && next_del) {
2292                        exists = 1;
2293                        flags |= (FIEMAP_EXTENT_DELALLOC |
2294                                  FIEMAP_EXTENT_UNKNOWN);
2295                }
2296                up_read(&EXT4_I(inode)->i_data_sem);
2297
2298                if (unlikely(es.es_len == 0)) {
2299                        EXT4_ERROR_INODE(inode, "es.es_len == 0");
2300                        err = -EFSCORRUPTED;
2301                        break;
2302                }
2303
2304                /*
2305                 * This is possible iff next == next_del == EXT_MAX_BLOCKS.
2306                 * we need to check next == EXT_MAX_BLOCKS because it is
2307                 * possible that an extent is with unwritten and delayed
2308                 * status due to when an extent is delayed allocated and
2309                 * is allocated by fallocate status tree will track both of
2310                 * them in a extent.
2311                 *
2312                 * So we could return a unwritten and delayed extent, and
2313                 * its block is equal to 'next'.
2314                 */
2315                if (next == next_del && next == EXT_MAX_BLOCKS) {
2316                        flags |= FIEMAP_EXTENT_LAST;
2317                        if (unlikely(next_del != EXT_MAX_BLOCKS ||
2318                                     next != EXT_MAX_BLOCKS)) {
2319                                EXT4_ERROR_INODE(inode,
2320                                                 "next extent == %u, next "
2321                                                 "delalloc extent = %u",
2322                                                 next, next_del);
2323                                err = -EFSCORRUPTED;
2324                                break;
2325                        }
2326                }
2327
2328                if (exists) {
2329                        err = fiemap_fill_next_extent(fieinfo,
2330                                (__u64)es.es_lblk << blksize_bits,
2331                                (__u64)es.es_pblk << blksize_bits,
2332                                (__u64)es.es_len << blksize_bits,
2333                                flags);
2334                        if (err < 0)
2335                                break;
2336                        if (err == 1) {
2337                                err = 0;
2338                                break;
2339                        }
2340                }
2341
2342                block = es.es_lblk + es.es_len;
2343        }
2344
2345        ext4_ext_drop_refs(path);
2346        kfree(path);
2347        return err;
2348}
2349
2350/*
2351 * ext4_ext_determine_hole - determine hole around given block
2352 * @inode:      inode we lookup in
2353 * @path:       path in extent tree to @lblk
2354 * @lblk:       pointer to logical block around which we want to determine hole
2355 *
2356 * Determine hole length (and start if easily possible) around given logical
2357 * block. We don't try too hard to find the beginning of the hole but @path
2358 * actually points to extent before @lblk, we provide it.
2359 *
2360 * The function returns the length of a hole starting at @lblk. We update @lblk
2361 * to the beginning of the hole if we managed to find it.
2362 */
2363static ext4_lblk_t ext4_ext_determine_hole(struct inode *inode,
2364                                           struct ext4_ext_path *path,
2365                                           ext4_lblk_t *lblk)
2366{
2367        int depth = ext_depth(inode);
2368        struct ext4_extent *ex;
2369        ext4_lblk_t len;
2370
2371        ex = path[depth].p_ext;
2372        if (ex == NULL) {
2373                /* there is no extent yet, so gap is [0;-] */
2374                *lblk = 0;
2375                len = EXT_MAX_BLOCKS;
2376        } else if (*lblk < le32_to_cpu(ex->ee_block)) {
2377                len = le32_to_cpu(ex->ee_block) - *lblk;
2378        } else if (*lblk >= le32_to_cpu(ex->ee_block)
2379                        + ext4_ext_get_actual_len(ex)) {
2380                ext4_lblk_t next;
2381
2382                *lblk = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex);
2383                next = ext4_ext_next_allocated_block(path);
2384                BUG_ON(next == *lblk);
2385                len = next - *lblk;
2386        } else {
2387                BUG();
2388        }
2389        return len;
2390}
2391
2392/*
2393 * ext4_ext_put_gap_in_cache:
2394 * calculate boundaries of the gap that the requested block fits into
2395 * and cache this gap
2396 */
2397static void
2398ext4_ext_put_gap_in_cache(struct inode *inode, ext4_lblk_t hole_start,
2399                          ext4_lblk_t hole_len)
2400{
2401        struct extent_status es;
2402
2403        ext4_es_find_extent_range(inode, &ext4_es_is_delayed, hole_start,
2404                                  hole_start + hole_len - 1, &es);
2405        if (es.es_len) {
2406                /* There's delayed extent containing lblock? */
2407                if (es.es_lblk <= hole_start)
2408                        return;
2409                hole_len = min(es.es_lblk - hole_start, hole_len);
2410        }
2411        ext_debug(" -> %u:%u\n", hole_start, hole_len);
2412        ext4_es_insert_extent(inode, hole_start, hole_len, ~0,
2413                              EXTENT_STATUS_HOLE);
2414}
2415
2416/*
2417 * ext4_ext_rm_idx:
2418 * removes index from the index block.
2419 */
2420static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
2421                        struct ext4_ext_path *path, int depth)
2422{
2423        int err;
2424        ext4_fsblk_t leaf;
2425
2426        /* free index block */
2427        depth--;
2428        path = path + depth;
2429        leaf = ext4_idx_pblock(path->p_idx);
2430        if (unlikely(path->p_hdr->eh_entries == 0)) {
2431                EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0");
2432                return -EFSCORRUPTED;
2433        }
2434        err = ext4_ext_get_access(handle, inode, path);
2435        if (err)
2436                return err;
2437
2438        if (path->p_idx != EXT_LAST_INDEX(path->p_hdr)) {
2439                int len = EXT_LAST_INDEX(path->p_hdr) - path->p_idx;
2440                len *= sizeof(struct ext4_extent_idx);
2441                memmove(path->p_idx, path->p_idx + 1, len);
2442        }
2443
2444        le16_add_cpu(&path->p_hdr->eh_entries, -1);
2445        err = ext4_ext_dirty(handle, inode, path);
2446        if (err)
2447                return err;
2448        ext_debug("index is empty, remove it, free block %llu\n", leaf);
2449        trace_ext4_ext_rm_idx(inode, leaf);
2450
2451        ext4_free_blocks(handle, inode, NULL, leaf, 1,
2452                         EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
2453
2454        while (--depth >= 0) {
2455                if (path->p_idx != EXT_FIRST_INDEX(path->p_hdr))
2456                        break;
2457                path--;
2458                err = ext4_ext_get_access(handle, inode, path);
2459                if (err)
2460                        break;
2461                path->p_idx->ei_block = (path+1)->p_idx->ei_block;
2462                err = ext4_ext_dirty(handle, inode, path);
2463                if (err)
2464                        break;
2465        }
2466        return err;
2467}
2468
2469/*
2470 * ext4_ext_calc_credits_for_single_extent:
2471 * This routine returns max. credits that needed to insert an extent
2472 * to the extent tree.
2473 * When pass the actual path, the caller should calculate credits
2474 * under i_data_sem.
2475 */
2476int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks,
2477                                                struct ext4_ext_path *path)
2478{
2479        if (path) {
2480                int depth = ext_depth(inode);
2481                int ret = 0;
2482
2483                /* probably there is space in leaf? */
2484                if (le16_to_cpu(path[depth].p_hdr->eh_entries)
2485                                < le16_to_cpu(path[depth].p_hdr->eh_max)) {
2486
2487                        /*
2488                         *  There are some space in the leaf tree, no
2489                         *  need to account for leaf block credit
2490                         *
2491                         *  bitmaps and block group descriptor blocks
2492                         *  and other metadata blocks still need to be
2493                         *  accounted.
2494                         */
2495                        /* 1 bitmap, 1 block group descriptor */
2496                        ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb);
2497                        return ret;
2498                }
2499        }
2500
2501        return ext4_chunk_trans_blocks(inode, nrblocks);
2502}
2503
2504/*
2505 * How many index/leaf blocks need to change/allocate to add @extents extents?
2506 *
2507 * If we add a single extent, then in the worse case, each tree level
2508 * index/leaf need to be changed in case of the tree split.
2509 *
2510 * If more extents are inserted, they could cause the whole tree split more
2511 * than once, but this is really rare.
2512 */
2513int ext4_ext_index_trans_blocks(struct inode *inode, int extents)
2514{
2515        int index;
2516        int depth;
2517
2518        /* If we are converting the inline data, only one is needed here. */
2519        if (ext4_has_inline_data(inode))
2520                return 1;
2521
2522        depth = ext_depth(inode);
2523
2524        if (extents <= 1)
2525                index = depth * 2;
2526        else
2527                index = depth * 3;
2528
2529        return index;
2530}
2531
2532static inline int get_default_free_blocks_flags(struct inode *inode)
2533{
2534        if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode) ||
2535            ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE))
2536                return EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET;
2537        else if (ext4_should_journal_data(inode))
2538                return EXT4_FREE_BLOCKS_FORGET;
2539        return 0;
2540}
2541
2542/*
2543 * ext4_rereserve_cluster - increment the reserved cluster count when
2544 *                          freeing a cluster with a pending reservation
2545 *
2546 * @inode - file containing the cluster
2547 * @lblk - logical block in cluster to be reserved
2548 *
2549 * Increments the reserved cluster count and adjusts quota in a bigalloc
2550 * file system when freeing a partial cluster containing at least one
2551 * delayed and unwritten block.  A partial cluster meeting that
2552 * requirement will have a pending reservation.  If so, the
2553 * RERESERVE_CLUSTER flag is used when calling ext4_free_blocks() to
2554 * defer reserved and allocated space accounting to a subsequent call
2555 * to this function.
2556 */
2557static void ext4_rereserve_cluster(struct inode *inode, ext4_lblk_t lblk)
2558{
2559        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2560        struct ext4_inode_info *ei = EXT4_I(inode);
2561
2562        dquot_reclaim_block(inode, EXT4_C2B(sbi, 1));
2563
2564        spin_lock(&ei->i_block_reservation_lock);
2565        ei->i_reserved_data_blocks++;
2566        percpu_counter_add(&sbi->s_dirtyclusters_counter, 1);
2567        spin_unlock(&ei->i_block_reservation_lock);
2568
2569        percpu_counter_add(&sbi->s_freeclusters_counter, 1);
2570        ext4_remove_pending(inode, lblk);
2571}
2572
2573static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
2574                              struct ext4_extent *ex,
2575                              struct partial_cluster *partial,
2576                              ext4_lblk_t from, ext4_lblk_t to)
2577{
2578        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2579        unsigned short ee_len = ext4_ext_get_actual_len(ex);
2580        ext4_fsblk_t last_pblk, pblk;
2581        ext4_lblk_t num;
2582        int flags;
2583
2584        /* only extent tail removal is allowed */
2585        if (from < le32_to_cpu(ex->ee_block) ||
2586            to != le32_to_cpu(ex->ee_block) + ee_len - 1) {
2587                ext4_error(sbi->s_sb,
2588                           "strange request: removal(2) %u-%u from %u:%u",
2589                           from, to, le32_to_cpu(ex->ee_block), ee_len);
2590                return 0;
2591        }
2592
2593#ifdef EXTENTS_STATS
2594        spin_lock(&sbi->s_ext_stats_lock);
2595        sbi->s_ext_blocks += ee_len;
2596        sbi->s_ext_extents++;
2597        if (ee_len < sbi->s_ext_min)
2598                sbi->s_ext_min = ee_len;
2599        if (ee_len > sbi->s_ext_max)
2600                sbi->s_ext_max = ee_len;
2601        if (ext_depth(inode) > sbi->s_depth_max)
2602                sbi->s_depth_max = ext_depth(inode);
2603        spin_unlock(&sbi->s_ext_stats_lock);
2604#endif
2605
2606        trace_ext4_remove_blocks(inode, ex, from, to, partial);
2607
2608        /*
2609         * if we have a partial cluster, and it's different from the
2610         * cluster of the last block in the extent, we free it
2611         */
2612        last_pblk = ext4_ext_pblock(ex) + ee_len - 1;
2613
2614        if (partial->state != initial &&
2615            partial->pclu != EXT4_B2C(sbi, last_pblk)) {
2616                if (partial->state == tofree) {
2617                        flags = get_default_free_blocks_flags(inode);
2618                        if (ext4_is_pending(inode, partial->lblk))
2619                                flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER;
2620                        ext4_free_blocks(handle, inode, NULL,
2621                                         EXT4_C2B(sbi, partial->pclu),
2622                                         sbi->s_cluster_ratio, flags);
2623                        if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)
2624                                ext4_rereserve_cluster(inode, partial->lblk);
2625                }
2626                partial->state = initial;
2627        }
2628
2629        num = le32_to_cpu(ex->ee_block) + ee_len - from;
2630        pblk = ext4_ext_pblock(ex) + ee_len - num;
2631
2632        /*
2633         * We free the partial cluster at the end of the extent (if any),
2634         * unless the cluster is used by another extent (partial_cluster
2635         * state is nofree).  If a partial cluster exists here, it must be
2636         * shared with the last block in the extent.
2637         */
2638        flags = get_default_free_blocks_flags(inode);
2639
2640        /* partial, left end cluster aligned, right end unaligned */
2641        if ((EXT4_LBLK_COFF(sbi, to) != sbi->s_cluster_ratio - 1) &&
2642            (EXT4_LBLK_CMASK(sbi, to) >= from) &&
2643            (partial->state != nofree)) {
2644                if (ext4_is_pending(inode, to))
2645                        flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER;
2646                ext4_free_blocks(handle, inode, NULL,
2647                                 EXT4_PBLK_CMASK(sbi, last_pblk),
2648                                 sbi->s_cluster_ratio, flags);
2649                if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)
2650                        ext4_rereserve_cluster(inode, to);
2651                partial->state = initial;
2652                flags = get_default_free_blocks_flags(inode);
2653        }
2654
2655        flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER;
2656
2657        /*
2658         * For bigalloc file systems, we never free a partial cluster
2659         * at the beginning of the extent.  Instead, we check to see if we
2660         * need to free it on a subsequent call to ext4_remove_blocks,
2661         * or at the end of ext4_ext_rm_leaf or ext4_ext_remove_space.
2662         */
2663        flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER;
2664        ext4_free_blocks(handle, inode, NULL, pblk, num, flags);
2665
2666        /* reset the partial cluster if we've freed past it */
2667        if (partial->state != initial && partial->pclu != EXT4_B2C(sbi, pblk))
2668                partial->state = initial;
2669
2670        /*
2671         * If we've freed the entire extent but the beginning is not left
2672         * cluster aligned and is not marked as ineligible for freeing we
2673         * record the partial cluster at the beginning of the extent.  It
2674         * wasn't freed by the preceding ext4_free_blocks() call, and we
2675         * need to look farther to the left to determine if it's to be freed
2676         * (not shared with another extent). Else, reset the partial
2677         * cluster - we're either  done freeing or the beginning of the
2678         * extent is left cluster aligned.
2679         */
2680        if (EXT4_LBLK_COFF(sbi, from) && num == ee_len) {
2681                if (partial->state == initial) {
2682                        partial->pclu = EXT4_B2C(sbi, pblk);
2683                        partial->lblk = from;
2684                        partial->state = tofree;
2685                }
2686        } else {
2687                partial->state = initial;
2688        }
2689
2690        return 0;
2691}
2692
2693/*
2694 * ext4_ext_rm_leaf() Removes the extents associated with the
2695 * blocks appearing between "start" and "end".  Both "start"
2696 * and "end" must appear in the same extent or EIO is returned.
2697 *
2698 * @handle: The journal handle
2699 * @inode:  The files inode
2700 * @path:   The path to the leaf
2701 * @partial_cluster: The cluster which we'll have to free if all extents
2702 *                   has been released from it.  However, if this value is
2703 *                   negative, it's a cluster just to the right of the
2704 *                   punched region and it must not be freed.
2705 * @start:  The first block to remove
2706 * @end:   The last block to remove
2707 */
2708static int
2709ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2710                 struct ext4_ext_path *path,
2711                 struct partial_cluster *partial,
2712                 ext4_lblk_t start, ext4_lblk_t end)
2713{
2714        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2715        int err = 0, correct_index = 0;
2716        int depth = ext_depth(inode), credits, revoke_credits;
2717        struct ext4_extent_header *eh;
2718        ext4_lblk_t a, b;
2719        unsigned num;
2720        ext4_lblk_t ex_ee_block;
2721        unsigned short ex_ee_len;
2722        unsigned unwritten = 0;
2723        struct ext4_extent *ex;
2724        ext4_fsblk_t pblk;
2725
2726        /* the header must be checked already in ext4_ext_remove_space() */
2727        ext_debug("truncate since %u in leaf to %u\n", start, end);
2728        if (!path[depth].p_hdr)
2729                path[depth].p_hdr = ext_block_hdr(path[depth].p_bh);
2730        eh = path[depth].p_hdr;
2731        if (unlikely(path[depth].p_hdr == NULL)) {
2732                EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
2733                return -EFSCORRUPTED;
2734        }
2735        /* find where to start removing */
2736        ex = path[depth].p_ext;
2737        if (!ex)
2738                ex = EXT_LAST_EXTENT(eh);
2739
2740        ex_ee_block = le32_to_cpu(ex->ee_block);
2741        ex_ee_len = ext4_ext_get_actual_len(ex);
2742
2743        trace_ext4_ext_rm_leaf(inode, start, ex, partial);
2744
2745        while (ex >= EXT_FIRST_EXTENT(eh) &&
2746                        ex_ee_block + ex_ee_len > start) {
2747
2748                if (ext4_ext_is_unwritten(ex))
2749                        unwritten = 1;
2750                else
2751                        unwritten = 0;
2752
2753                ext_debug("remove ext %u:[%d]%d\n", ex_ee_block,
2754                          unwritten, ex_ee_len);
2755                path[depth].p_ext = ex;
2756
2757                a = ex_ee_block > start ? ex_ee_block : start;
2758                b = ex_ee_block+ex_ee_len - 1 < end ?
2759                        ex_ee_block+ex_ee_len - 1 : end;
2760
2761                ext_debug("  border %u:%u\n", a, b);
2762
2763                /* If this extent is beyond the end of the hole, skip it */
2764                if (end < ex_ee_block) {
2765                        /*
2766                         * We're going to skip this extent and move to another,
2767                         * so note that its first cluster is in use to avoid
2768                         * freeing it when removing blocks.  Eventually, the
2769                         * right edge of the truncated/punched region will
2770                         * be just to the left.
2771                         */
2772                        if (sbi->s_cluster_ratio > 1) {
2773                                pblk = ext4_ext_pblock(ex);
2774                                partial->pclu = EXT4_B2C(sbi, pblk);
2775                                partial->state = nofree;
2776                        }
2777                        ex--;
2778                        ex_ee_block = le32_to_cpu(ex->ee_block);
2779                        ex_ee_len = ext4_ext_get_actual_len(ex);
2780                        continue;
2781                } else if (b != ex_ee_block + ex_ee_len - 1) {
2782                        EXT4_ERROR_INODE(inode,
2783                                         "can not handle truncate %u:%u "
2784                                         "on extent %u:%u",
2785                                         start, end, ex_ee_block,
2786                                         ex_ee_block + ex_ee_len - 1);
2787                        err = -EFSCORRUPTED;
2788                        goto out;
2789                } else if (a != ex_ee_block) {
2790                        /* remove tail of the extent */
2791                        num = a - ex_ee_block;
2792                } else {
2793                        /* remove whole extent: excellent! */
2794                        num = 0;
2795                }
2796                /*
2797                 * 3 for leaf, sb, and inode plus 2 (bmap and group
2798                 * descriptor) for each block group; assume two block
2799                 * groups plus ex_ee_len/blocks_per_block_group for
2800                 * the worst case
2801                 */
2802                credits = 7 + 2*(ex_ee_len/EXT4_BLOCKS_PER_GROUP(inode->i_sb));
2803                if (ex == EXT_FIRST_EXTENT(eh)) {
2804                        correct_index = 1;
2805                        credits += (ext_depth(inode)) + 1;
2806                }
2807                credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
2808                /*
2809                 * We may end up freeing some index blocks and data from the
2810                 * punched range. Note that partial clusters are accounted for
2811                 * by ext4_free_data_revoke_credits().
2812                 */
2813                revoke_credits =
2814                        ext4_free_metadata_revoke_credits(inode->i_sb,
2815                                                          ext_depth(inode)) +
2816                        ext4_free_data_revoke_credits(inode, b - a + 1);
2817
2818                err = ext4_datasem_ensure_credits(handle, inode, credits,
2819                                                  credits, revoke_credits);
2820                if (err) {
2821                        if (err > 0)
2822                                err = -EAGAIN;
2823                        goto out;
2824                }
2825
2826                err = ext4_ext_get_access(handle, inode, path + depth);
2827                if (err)
2828                        goto out;
2829
2830                err = ext4_remove_blocks(handle, inode, ex, partial, a, b);
2831                if (err)
2832                        goto out;
2833
2834                if (num == 0)
2835                        /* this extent is removed; mark slot entirely unused */
2836                        ext4_ext_store_pblock(ex, 0);
2837
2838                ex->ee_len = cpu_to_le16(num);
2839                /*
2840                 * Do not mark unwritten if all the blocks in the
2841                 * extent have been removed.
2842                 */
2843                if (unwritten && num)
2844                        ext4_ext_mark_unwritten(ex);
2845                /*
2846                 * If the extent was completely released,
2847                 * we need to remove it from the leaf
2848                 */
2849                if (num == 0) {
2850                        if (end != EXT_MAX_BLOCKS - 1) {
2851                                /*
2852                                 * For hole punching, we need to scoot all the
2853                                 * extents up when an extent is removed so that
2854                                 * we dont have blank extents in the middle
2855                                 */
2856                                memmove(ex, ex+1, (EXT_LAST_EXTENT(eh) - ex) *
2857                                        sizeof(struct ext4_extent));
2858
2859                                /* Now get rid of the one at the end */
2860                                memset(EXT_LAST_EXTENT(eh), 0,
2861                                        sizeof(struct ext4_extent));
2862                        }
2863                        le16_add_cpu(&eh->eh_entries, -1);
2864                }
2865
2866                err = ext4_ext_dirty(handle, inode, path + depth);
2867                if (err)
2868                        goto out;
2869
2870                ext_debug("new extent: %u:%u:%llu\n", ex_ee_block, num,
2871                                ext4_ext_pblock(ex));
2872                ex--;
2873                ex_ee_block = le32_to_cpu(ex->ee_block);
2874                ex_ee_len = ext4_ext_get_actual_len(ex);
2875        }
2876
2877        if (correct_index && eh->eh_entries)
2878                err = ext4_ext_correct_indexes(handle, inode, path);
2879
2880        /*
2881         * If there's a partial cluster and at least one extent remains in
2882         * the leaf, free the partial cluster if it isn't shared with the
2883         * current extent.  If it is shared with the current extent
2884         * we reset the partial cluster because we've reached the start of the
2885         * truncated/punched region and we're done removing blocks.
2886         */
2887        if (partial->state == tofree && ex >= EXT_FIRST_EXTENT(eh)) {
2888                pblk = ext4_ext_pblock(ex) + ex_ee_len - 1;
2889                if (partial->pclu != EXT4_B2C(sbi, pblk)) {
2890                        int flags = get_default_free_blocks_flags(inode);
2891
2892                        if (ext4_is_pending(inode, partial->lblk))
2893                                flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER;
2894                        ext4_free_blocks(handle, inode, NULL,
2895                                         EXT4_C2B(sbi, partial->pclu),
2896                                         sbi->s_cluster_ratio, flags);
2897                        if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)
2898                                ext4_rereserve_cluster(inode, partial->lblk);
2899                }
2900                partial->state = initial;
2901        }
2902
2903        /* if this leaf is free, then we should
2904         * remove it from index block above */
2905        if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL)
2906                err = ext4_ext_rm_idx(handle, inode, path, depth);
2907
2908out:
2909        return err;
2910}
2911
2912/*
2913 * ext4_ext_more_to_rm:
2914 * returns 1 if current index has to be freed (even partial)
2915 */
2916static int
2917ext4_ext_more_to_rm(struct ext4_ext_path *path)
2918{
2919        BUG_ON(path->p_idx == NULL);
2920
2921        if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr))
2922                return 0;
2923
2924        /*
2925         * if truncate on deeper level happened, it wasn't partial,
2926         * so we have to consider current index for truncation
2927         */
2928        if (le16_to_cpu(path->p_hdr->eh_entries) == path->p_block)
2929                return 0;
2930        return 1;
2931}
2932
2933int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
2934                          ext4_lblk_t end)
2935{
2936        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2937        int depth = ext_depth(inode);
2938        struct ext4_ext_path *path = NULL;
2939        struct partial_cluster partial;
2940        handle_t *handle;
2941        int i = 0, err = 0;
2942
2943        partial.pclu = 0;
2944        partial.lblk = 0;
2945        partial.state = initial;
2946
2947        ext_debug("truncate since %u to %u\n", start, end);
2948
2949        /* probably first extent we're gonna free will be last in block */
2950        handle = ext4_journal_start_with_revoke(inode, EXT4_HT_TRUNCATE,
2951                        depth + 1,
2952                        ext4_free_metadata_revoke_credits(inode->i_sb, depth));
2953        if (IS_ERR(handle))
2954                return PTR_ERR(handle);
2955
2956again:
2957        trace_ext4_ext_remove_space(inode, start, end, depth);
2958
2959        /*
2960         * Check if we are removing extents inside the extent tree. If that
2961         * is the case, we are going to punch a hole inside the extent tree
2962         * so we have to check whether we need to split the extent covering
2963         * the last block to remove so we can easily remove the part of it
2964         * in ext4_ext_rm_leaf().
2965         */
2966        if (end < EXT_MAX_BLOCKS - 1) {
2967                struct ext4_extent *ex;
2968                ext4_lblk_t ee_block, ex_end, lblk;
2969                ext4_fsblk_t pblk;
2970
2971                /* find extent for or closest extent to this block */
2972                path = ext4_find_extent(inode, end, NULL,
2973                                        EXT4_EX_NOCACHE | EXT4_EX_NOFAIL);
2974                if (IS_ERR(path)) {
2975                        ext4_journal_stop(handle);
2976                        return PTR_ERR(path);
2977                }
2978                depth = ext_depth(inode);
2979                /* Leaf not may not exist only if inode has no blocks at all */
2980                ex = path[depth].p_ext;
2981                if (!ex) {
2982                        if (depth) {
2983                                EXT4_ERROR_INODE(inode,
2984                                                 "path[%d].p_hdr == NULL",
2985                                                 depth);
2986                                err = -EFSCORRUPTED;
2987                        }
2988                        goto out;
2989                }
2990
2991                ee_block = le32_to_cpu(ex->ee_block);
2992                ex_end = ee_block + ext4_ext_get_actual_len(ex) - 1;
2993
2994                /*
2995                 * See if the last block is inside the extent, if so split
2996                 * the extent at 'end' block so we can easily remove the
2997                 * tail of the first part of the split extent in
2998                 * ext4_ext_rm_leaf().
2999                 */
3000                if (end >= ee_block && end < ex_end) {
3001
3002                        /*
3003                         * If we're going to split the extent, note that
3004                         * the cluster containing the block after 'end' is
3005                         * in use to avoid freeing it when removing blocks.
3006                         */
3007                        if (sbi->s_cluster_ratio > 1) {
3008                                pblk = ext4_ext_pblock(ex) + end - ee_block + 1;
3009                                partial.pclu = EXT4_B2C(sbi, pblk);
3010                                partial.state = nofree;
3011                        }
3012
3013                        /*
3014                         * Split the extent in two so that 'end' is the last
3015                         * block in the first new extent. Also we should not
3016                         * fail removing space due to ENOSPC so try to use
3017                         * reserved block if that happens.
3018                         */
3019                        err = ext4_force_split_extent_at(handle, inode, &path,
3020                                                         end + 1, 1);
3021                        if (err < 0)
3022                                goto out;
3023
3024                } else if (sbi->s_cluster_ratio > 1 && end >= ex_end &&
3025                           partial.state == initial) {
3026                        /*
3027                         * If we're punching, there's an extent to the right.
3028                         * If the partial cluster hasn't been set, set it to
3029                         * that extent's first cluster and its state to nofree
3030                         * so it won't be freed should it contain blocks to be
3031                         * removed. If it's already set (tofree/nofree), we're
3032                         * retrying and keep the original partial cluster info
3033                         * so a cluster marked tofree as a result of earlier
3034                         * extent removal is not lost.
3035                         */
3036                        lblk = ex_end + 1;
3037                        err = ext4_ext_search_right(inode, path, &lblk, &pblk,
3038                                                    NULL);
3039                        if (err < 0)
3040                                goto out;
3041                        if (pblk) {
3042                                partial.pclu = EXT4_B2C(sbi, pblk);
3043                                partial.state = nofree;
3044                        }
3045                }
3046        }
3047        /*
3048         * We start scanning from right side, freeing all the blocks
3049         * after i_size and walking into the tree depth-wise.
3050         */
3051        depth = ext_depth(inode);
3052        if (path) {
3053                int k = i = depth;
3054                while (--k > 0)
3055                        path[k].p_block =
3056                                le16_to_cpu(path[k].p_hdr->eh_entries)+1;
3057        } else {
3058                path = kcalloc(depth + 1, sizeof(struct ext4_ext_path),
3059                               GFP_NOFS | __GFP_NOFAIL);
3060                if (path == NULL) {
3061                        ext4_journal_stop(handle);
3062                        return -ENOMEM;
3063                }
3064                path[0].p_maxdepth = path[0].p_depth = depth;
3065                path[0].p_hdr = ext_inode_hdr(inode);
3066                i = 0;
3067
3068                if (ext4_ext_check(inode, path[0].p_hdr, depth, 0)) {
3069                        err = -EFSCORRUPTED;
3070                        goto out;
3071                }
3072        }
3073        err = 0;
3074
3075        while (i >= 0 && err == 0) {
3076                if (i == depth) {
3077                        /* this is leaf block */
3078                        err = ext4_ext_rm_leaf(handle, inode, path,
3079                                               &partial, start, end);
3080                        /* root level has p_bh == NULL, brelse() eats this */
3081                        brelse(path[i].p_bh);
3082                        path[i].p_bh = NULL;
3083                        i--;
3084                        continue;
3085                }
3086
3087                /* this is index block */
3088                if (!path[i].p_hdr) {
3089                        ext_debug("initialize header\n");
3090                        path[i].p_hdr = ext_block_hdr(path[i].p_bh);
3091                }
3092
3093                if (!path[i].p_idx) {
3094                        /* this level hasn't been touched yet */
3095                        path[i].p_idx = EXT_LAST_INDEX(path[i].p_hdr);
3096                        path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries)+1;
3097                        ext_debug("init index ptr: hdr 0x%p, num %d\n",
3098                                  path[i].p_hdr,
3099                                  le16_to_cpu(path[i].p_hdr->eh_entries));
3100                } else {
3101                        /* we were already here, see at next index */
3102                        path[i].p_idx--;
3103                }
3104
3105                ext_debug("level %d - index, first 0x%p, cur 0x%p\n",
3106                                i, EXT_FIRST_INDEX(path[i].p_hdr),
3107                                path[i].p_idx);
3108                if (ext4_ext_more_to_rm(path + i)) {
3109                        struct buffer_head *bh;
3110                        /* go to the next level */
3111                        ext_debug("move to level %d (block %llu)\n",
3112                                  i + 1, ext4_idx_pblock(path[i].p_idx));
3113                        memset(path + i + 1, 0, sizeof(*path));
3114                        bh = read_extent_tree_block(inode,
3115                                ext4_idx_pblock(path[i].p_idx), depth - i - 1,
3116                                EXT4_EX_NOCACHE);
3117                        if (IS_ERR(bh)) {
3118                                /* should we reset i_size? */
3119                                err = PTR_ERR(bh);
3120                                break;
3121                        }
3122                        /* Yield here to deal with large extent trees.
3123                         * Should be a no-op if we did IO above. */
3124                        cond_resched();
3125                        if (WARN_ON(i + 1 > depth)) {
3126                                err = -EFSCORRUPTED;
3127                                break;
3128                        }
3129                        path[i + 1].p_bh = bh;
3130
3131                        /* save actual number of indexes since this
3132                         * number is changed at the next iteration */
3133                        path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries);
3134                        i++;
3135                } else {
3136                        /* we finished processing this index, go up */
3137                        if (path[i].p_hdr->eh_entries == 0 && i > 0) {
3138                                /* index is empty, remove it;
3139                                 * handle must be already prepared by the
3140                                 * truncatei_leaf() */
3141                                err = ext4_ext_rm_idx(handle, inode, path, i);
3142                        }
3143                        /* root level has p_bh == NULL, brelse() eats this */
3144                        brelse(path[i].p_bh);
3145                        path[i].p_bh = NULL;
3146                        i--;
3147                        ext_debug("return to level %d\n", i);
3148                }
3149        }
3150
3151        trace_ext4_ext_remove_space_done(inode, start, end, depth, &partial,
3152                                         path->p_hdr->eh_entries);
3153
3154        /*
3155         * if there's a partial cluster and we have removed the first extent
3156         * in the file, then we also free the partial cluster, if any
3157         */
3158        if (partial.state == tofree && err == 0) {
3159                int flags = get_default_free_blocks_flags(inode);
3160
3161                if (ext4_is_pending(inode, partial.lblk))
3162                        flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER;
3163                ext4_free_blocks(handle, inode, NULL,
3164                                 EXT4_C2B(sbi, partial.pclu),
3165                                 sbi->s_cluster_ratio, flags);
3166                if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)
3167                        ext4_rereserve_cluster(inode, partial.lblk);
3168                partial.state = initial;
3169        }
3170
3171        /* TODO: flexible tree reduction should be here */
3172        if (path->p_hdr->eh_entries == 0) {
3173                /*
3174                 * truncate to zero freed all the tree,
3175                 * so we need to correct eh_depth
3176                 */
3177                err = ext4_ext_get_access(handle, inode, path);
3178                if (err == 0) {
3179                        ext_inode_hdr(inode)->eh_depth = 0;
3180                        ext_inode_hdr(inode)->eh_max =
3181                                cpu_to_le16(ext4_ext_space_root(inode, 0));
3182                        err = ext4_ext_dirty(handle, inode, path);
3183                }
3184        }
3185out:
3186        ext4_ext_drop_refs(path);
3187        kfree(path);
3188        path = NULL;
3189        if (err == -EAGAIN)
3190                goto again;
3191        ext4_journal_stop(handle);
3192
3193        return err;
3194}
3195
3196/*
3197 * called at mount time
3198 */
3199void ext4_ext_init(struct super_block *sb)
3200{
3201        /*
3202         * possible initialization would be here
3203         */
3204
3205        if (ext4_has_feature_extents(sb)) {
3206#if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS)
3207                printk(KERN_INFO "EXT4-fs: file extents enabled"
3208#ifdef AGGRESSIVE_TEST
3209                       ", aggressive tests"
3210#endif
3211#ifdef CHECK_BINSEARCH
3212                       ", check binsearch"
3213#endif
3214#ifdef EXTENTS_STATS
3215                       ", stats"
3216#endif
3217                       "\n");
3218#endif
3219#ifdef EXTENTS_STATS
3220                spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock);
3221                EXT4_SB(sb)->s_ext_min = 1 << 30;
3222                EXT4_SB(sb)->s_ext_max = 0;
3223#endif
3224        }
3225}
3226
3227/*
3228 * called at umount time
3229 */
3230void ext4_ext_release(struct super_block *sb)
3231{
3232        if (!ext4_has_feature_extents(sb))
3233                return;
3234
3235#ifdef EXTENTS_STATS
3236        if (EXT4_SB(sb)->s_ext_blocks && EXT4_SB(sb)->s_ext_extents) {
3237                struct ext4_sb_info *sbi = EXT4_SB(sb);
3238                printk(KERN_ERR "EXT4-fs: %lu blocks in %lu extents (%lu ave)\n",
3239                        sbi->s_ext_blocks, sbi->s_ext_extents,
3240                        sbi->s_ext_blocks / sbi->s_ext_extents);
3241                printk(KERN_ERR "EXT4-fs: extents: %lu min, %lu max, max depth %lu\n",
3242                        sbi->s_ext_min, sbi->s_ext_max, sbi->s_depth_max);
3243        }
3244#endif
3245}
3246
3247static int ext4_zeroout_es(struct inode *inode, struct ext4_extent *ex)
3248{
3249        ext4_lblk_t  ee_block;
3250        ext4_fsblk_t ee_pblock;
3251        unsigned int ee_len;
3252
3253        ee_block  = le32_to_cpu(ex->ee_block);
3254        ee_len    = ext4_ext_get_actual_len(ex);
3255        ee_pblock = ext4_ext_pblock(ex);
3256
3257        if (ee_len == 0)
3258                return 0;
3259
3260        return ext4_es_insert_extent(inode, ee_block, ee_len, ee_pblock,
3261                                     EXTENT_STATUS_WRITTEN);
3262}
3263
3264/* FIXME!! we need to try to merge to left or right after zero-out  */
3265static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
3266{
3267        ext4_fsblk_t ee_pblock;
3268        unsigned int ee_len;
3269
3270        ee_len    = ext4_ext_get_actual_len(ex);
3271        ee_pblock = ext4_ext_pblock(ex);
3272        return ext4_issue_zeroout(inode, le32_to_cpu(ex->ee_block), ee_pblock,
3273                                  ee_len);
3274}
3275
3276/*
3277 * ext4_split_extent_at() splits an extent at given block.
3278 *
3279 * @handle: the journal handle
3280 * @inode: the file inode
3281 * @path: the path to the extent
3282 * @split: the logical block where the extent is splitted.
3283 * @split_flags: indicates if the extent could be zeroout if split fails, and
3284 *               the states(init or unwritten) of new extents.
3285 * @flags: flags used to insert new extent to extent tree.
3286 *
3287 *
3288 * Splits extent [a, b] into two extents [a, @split) and [@split, b], states
3289 * of which are deterimined by split_flag.
3290 *
3291 * There are two cases:
3292 *  a> the extent are splitted into two extent.
3293 *  b> split is not needed, and just mark the extent.
3294 *
3295 * return 0 on success.
3296 */
3297static int ext4_split_extent_at(handle_t *handle,
3298                             struct inode *inode,
3299                             struct ext4_ext_path **ppath,
3300                             ext4_lblk_t split,
3301                             int split_flag,
3302                             int flags)
3303{
3304        struct ext4_ext_path *path = *ppath;
3305        ext4_fsblk_t newblock;
3306        ext4_lblk_t ee_block;
3307        struct ext4_extent *ex, newex, orig_ex, zero_ex;
3308        struct ext4_extent *ex2 = NULL;
3309        unsigned int ee_len, depth;
3310        int err = 0;
3311
3312        BUG_ON((split_flag & (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2)) ==
3313               (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2));
3314
3315        ext_debug("ext4_split_extents_at: inode %lu, logical"
3316                "block %llu\n", inode->i_ino, (unsigned long long)split);
3317
3318        ext4_ext_show_leaf(inode, path);
3319
3320        depth = ext_depth(inode);
3321        ex = path[depth].p_ext;
3322        ee_block = le32_to_cpu(ex->ee_block);
3323        ee_len = ext4_ext_get_actual_len(ex);
3324        newblock = split - ee_block + ext4_ext_pblock(ex);
3325
3326        BUG_ON(split < ee_block || split >= (ee_block + ee_len));
3327        BUG_ON(!ext4_ext_is_unwritten(ex) &&
3328               split_flag & (EXT4_EXT_MAY_ZEROOUT |
3329                             EXT4_EXT_MARK_UNWRIT1 |
3330                             EXT4_EXT_MARK_UNWRIT2));
3331
3332        err = ext4_ext_get_access(handle, inode, path + depth);
3333        if (err)
3334                goto out;
3335
3336        if (split == ee_block) {
3337                /*
3338                 * case b: block @split is the block that the extent begins with
3339                 * then we just change the state of the extent, and splitting
3340                 * is not needed.
3341                 */
3342                if (split_flag & EXT4_EXT_MARK_UNWRIT2)
3343                        ext4_ext_mark_unwritten(ex);
3344                else
3345                        ext4_ext_mark_initialized(ex);
3346
3347                if (!(flags & EXT4_GET_BLOCKS_PRE_IO))
3348                        ext4_ext_try_to_merge(handle, inode, path, ex);
3349
3350                err = ext4_ext_dirty(handle, inode, path + path->p_depth);
3351                goto out;
3352        }
3353
3354        /* case a */
3355        memcpy(&orig_ex, ex, sizeof(orig_ex));
3356        ex->ee_len = cpu_to_le16(split - ee_block);
3357        if (split_flag & EXT4_EXT_MARK_UNWRIT1)
3358                ext4_ext_mark_unwritten(ex);
3359
3360        /*
3361         * path may lead to new leaf, not to original leaf any more
3362         * after ext4_ext_insert_extent() returns,
3363         */
3364        err = ext4_ext_dirty(handle, inode, path + depth);
3365        if (err)
3366                goto fix_extent_len;
3367
3368        ex2 = &newex;
3369        ex2->ee_block = cpu_to_le32(split);
3370        ex2->ee_len   = cpu_to_le16(ee_len - (split - ee_block));
3371        ext4_ext_store_pblock(ex2, newblock);
3372        if (split_flag & EXT4_EXT_MARK_UNWRIT2)
3373                ext4_ext_mark_unwritten(ex2);
3374
3375        err = ext4_ext_insert_extent(handle, inode, ppath, &newex, flags);
3376        if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
3377                if (split_flag & (EXT4_EXT_DATA_VALID1|EXT4_EXT_DATA_VALID2)) {
3378                        if (split_flag & EXT4_EXT_DATA_VALID1) {
3379                                err = ext4_ext_zeroout(inode, ex2);
3380                                zero_ex.ee_block = ex2->ee_block;
3381                                zero_ex.ee_len = cpu_to_le16(
3382                                                ext4_ext_get_actual_len(ex2));
3383                                ext4_ext_store_pblock(&zero_ex,
3384                                                      ext4_ext_pblock(ex2));
3385                        } else {
3386                                err = ext4_ext_zeroout(inode, ex);
3387                                zero_ex.ee_block = ex->ee_block;
3388                                zero_ex.ee_len = cpu_to_le16(
3389                                                ext4_ext_get_actual_len(ex));
3390                                ext4_ext_store_pblock(&zero_ex,
3391                                                      ext4_ext_pblock(ex));
3392                        }
3393                } else {
3394                        err = ext4_ext_zeroout(inode, &orig_ex);
3395                        zero_ex.ee_block = orig_ex.ee_block;
3396                        zero_ex.ee_len = cpu_to_le16(
3397                                                ext4_ext_get_actual_len(&orig_ex));
3398                        ext4_ext_store_pblock(&zero_ex,
3399                                              ext4_ext_pblock(&orig_ex));
3400                }
3401
3402                if (err)
3403                        goto fix_extent_len;
3404                /* update the extent length and mark as initialized */
3405                ex->ee_len = cpu_to_le16(ee_len);
3406                ext4_ext_try_to_merge(handle, inode, path, ex);
3407                err = ext4_ext_dirty(handle, inode, path + path->p_depth);
3408                if (err)
3409                        goto fix_extent_len;
3410
3411                /* update extent status tree */
3412                err = ext4_zeroout_es(inode, &zero_ex);
3413
3414                goto out;
3415        } else if (err)
3416                goto fix_extent_len;
3417
3418out:
3419        ext4_ext_show_leaf(inode, path);
3420        return err;
3421
3422fix_extent_len:
3423        ex->ee_len = orig_ex.ee_len;
3424        ext4_ext_dirty(handle, inode, path + path->p_depth);
3425        return err;
3426}
3427
3428/*
3429 * ext4_split_extents() splits an extent and mark extent which is covered
3430 * by @map as split_flags indicates
3431 *
3432 * It may result in splitting the extent into multiple extents (up to three)
3433 * There are three possibilities:
3434 *   a> There is no split required
3435 *   b> Splits in two extents: Split is happening at either end of the extent
3436 *   c> Splits in three extents: Somone is splitting in middle of the extent
3437 *
3438 */
3439static int ext4_split_extent(handle_t *handle,
3440                              struct inode *inode,
3441                              struct ext4_ext_path **ppath,
3442                              struct ext4_map_blocks *map,
3443                              int split_flag,
3444                              int flags)
3445{
3446        struct ext4_ext_path *path = *ppath;
3447        ext4_lblk_t ee_block;
3448        struct ext4_extent *ex;
3449        unsigned int ee_len, depth;
3450        int err = 0;
3451        int unwritten;
3452        int split_flag1, flags1;
3453        int allocated = map->m_len;
3454
3455        depth = ext_depth(inode);
3456        ex = path[depth].p_ext;
3457        ee_block = le32_to_cpu(ex->ee_block);
3458        ee_len = ext4_ext_get_actual_len(ex);
3459        unwritten = ext4_ext_is_unwritten(ex);
3460
3461        if (map->m_lblk + map->m_len < ee_block + ee_len) {
3462                split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT;
3463                flags1 = flags | EXT4_GET_BLOCKS_PRE_IO;
3464                if (unwritten)
3465                        split_flag1 |= EXT4_EXT_MARK_UNWRIT1 |
3466                                       EXT4_EXT_MARK_UNWRIT2;
3467                if (split_flag & EXT4_EXT_DATA_VALID2)
3468                        split_flag1 |= EXT4_EXT_DATA_VALID1;
3469                err = ext4_split_extent_at(handle, inode, ppath,
3470                                map->m_lblk + map->m_len, split_flag1, flags1);
3471                if (err)
3472                        goto out;
3473        } else {
3474                allocated = ee_len - (map->m_lblk - ee_block);
3475        }
3476        /*
3477         * Update path is required because previous ext4_split_extent_at() may
3478         * result in split of original leaf or extent zeroout.
3479         */
3480        path = ext4_find_extent(inode, map->m_lblk, ppath, flags);
3481        if (IS_ERR(path))
3482                return PTR_ERR(path);
3483        depth = ext_depth(inode);
3484        ex = path[depth].p_ext;
3485        if (!ex) {
3486                EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
3487                                 (unsigned long) map->m_lblk);
3488                return -EFSCORRUPTED;
3489        }
3490        unwritten = ext4_ext_is_unwritten(ex);
3491        split_flag1 = 0;
3492
3493        if (map->m_lblk >= ee_block) {
3494                split_flag1 = split_flag & EXT4_EXT_DATA_VALID2;
3495                if (unwritten) {
3496                        split_flag1 |= EXT4_EXT_MARK_UNWRIT1;
3497                        split_flag1 |= split_flag & (EXT4_EXT_MAY_ZEROOUT |
3498                                                     EXT4_EXT_MARK_UNWRIT2);
3499                }
3500                err = ext4_split_extent_at(handle, inode, ppath,
3501                                map->m_lblk, split_flag1, flags);
3502                if (err)
3503                        goto out;
3504        }
3505
3506        ext4_ext_show_leaf(inode, path);
3507out:
3508        return err ? err : allocated;
3509}
3510
3511/*
3512 * This function is called by ext4_ext_map_blocks() if someone tries to write
3513 * to an unwritten extent. It may result in splitting the unwritten
3514 * extent into multiple extents (up to three - one initialized and two
3515 * unwritten).
3516 * There are three possibilities:
3517 *   a> There is no split required: Entire extent should be initialized
3518 *   b> Splits in two extents: Write is happening at either end of the extent
3519 *   c> Splits in three extents: Somone is writing in middle of the extent
3520 *
3521 * Pre-conditions:
3522 *  - The extent pointed to by 'path' is unwritten.
3523 *  - The extent pointed to by 'path' contains a superset
3524 *    of the logical span [map->m_lblk, map->m_lblk + map->m_len).
3525 *
3526 * Post-conditions on success:
3527 *  - the returned value is the number of blocks beyond map->l_lblk
3528 *    that are allocated and initialized.
3529 *    It is guaranteed to be >= map->m_len.
3530 */
3531static int ext4_ext_convert_to_initialized(handle_t *handle,
3532                                           struct inode *inode,
3533                                           struct ext4_map_blocks *map,
3534                                           struct ext4_ext_path **ppath,
3535                                           int flags)
3536{
3537        struct ext4_ext_path *path = *ppath;
3538        struct ext4_sb_info *sbi;
3539        struct ext4_extent_header *eh;
3540        struct ext4_map_blocks split_map;
3541        struct ext4_extent zero_ex1, zero_ex2;
3542        struct ext4_extent *ex, *abut_ex;
3543        ext4_lblk_t ee_block, eof_block;
3544        unsigned int ee_len, depth, map_len = map->m_len;
3545        int allocated = 0, max_zeroout = 0;
3546        int err = 0;
3547        int split_flag = EXT4_EXT_DATA_VALID2;
3548
3549        ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical"
3550                "block %llu, max_blocks %u\n", inode->i_ino,
3551                (unsigned long long)map->m_lblk, map_len);
3552
3553        sbi = EXT4_SB(inode->i_sb);
3554        eof_block = (EXT4_I(inode)->i_disksize + inode->i_sb->s_blocksize - 1)
3555                        >> inode->i_sb->s_blocksize_bits;
3556        if (eof_block < map->m_lblk + map_len)
3557                eof_block = map->m_lblk + map_len;
3558
3559        depth = ext_depth(inode);
3560        eh = path[depth].p_hdr;
3561        ex = path[depth].p_ext;
3562        ee_block = le32_to_cpu(ex->ee_block);
3563        ee_len = ext4_ext_get_actual_len(ex);
3564        zero_ex1.ee_len = 0;
3565        zero_ex2.ee_len = 0;
3566
3567        trace_ext4_ext_convert_to_initialized_enter(inode, map, ex);
3568
3569        /* Pre-conditions */
3570        BUG_ON(!ext4_ext_is_unwritten(ex));
3571        BUG_ON(!in_range(map->m_lblk, ee_block, ee_len));
3572
3573        /*
3574         * Attempt to transfer newly initialized blocks from the currently
3575         * unwritten extent to its neighbor. This is much cheaper
3576         * than an insertion followed by a merge as those involve costly
3577         * memmove() calls. Transferring to the left is the common case in
3578         * steady state for workloads doing fallocate(FALLOC_FL_KEEP_SIZE)
3579         * followed by append writes.
3580         *
3581         * Limitations of the current logic:
3582         *  - L1: we do not deal with writes covering the whole extent.
3583         *    This would require removing the extent if the transfer
3584         *    is possible.
3585         *  - L2: we only attempt to merge with an extent stored in the
3586         *    same extent tree node.
3587         */
3588        if ((map->m_lblk == ee_block) &&
3589                /* See if we can merge left */
3590                (map_len < ee_len) &&           /*L1*/
3591                (ex > EXT_FIRST_EXTENT(eh))) {  /*L2*/
3592                ext4_lblk_t prev_lblk;
3593                ext4_fsblk_t prev_pblk, ee_pblk;
3594                unsigned int prev_len;
3595
3596                abut_ex = ex - 1;
3597                prev_lblk = le32_to_cpu(abut_ex->ee_block);
3598                prev_len = ext4_ext_get_actual_len(abut_ex);
3599                prev_pblk = ext4_ext_pblock(abut_ex);
3600                ee_pblk = ext4_ext_pblock(ex);
3601
3602                /*
3603                 * A transfer of blocks from 'ex' to 'abut_ex' is allowed
3604                 * upon those conditions:
3605                 * - C1: abut_ex is initialized,
3606                 * - C2: abut_ex is logically abutting ex,
3607                 * - C3: abut_ex is physically abutting ex,
3608                 * - C4: abut_ex can receive the additional blocks without
3609                 *   overflowing the (initialized) length limit.
3610                 */
3611                if ((!ext4_ext_is_unwritten(abut_ex)) &&                /*C1*/
3612                        ((prev_lblk + prev_len) == ee_block) &&         /*C2*/
3613                        ((prev_pblk + prev_len) == ee_pblk) &&          /*C3*/
3614                        (prev_len < (EXT_INIT_MAX_LEN - map_len))) {    /*C4*/
3615                        err = ext4_ext_get_access(handle, inode, path + depth);
3616                        if (err)
3617                                goto out;
3618
3619                        trace_ext4_ext_convert_to_initialized_fastpath(inode,
3620                                map, ex, abut_ex);
3621
3622                        /* Shift the start of ex by 'map_len' blocks */
3623                        ex->ee_block = cpu_to_le32(ee_block + map_len);
3624                        ext4_ext_store_pblock(ex, ee_pblk + map_len);
3625                        ex->ee_len = cpu_to_le16(ee_len - map_len);
3626                        ext4_ext_mark_unwritten(ex); /* Restore the flag */
3627
3628                        /* Extend abut_ex by 'map_len' blocks */
3629                        abut_ex->ee_len = cpu_to_le16(prev_len + map_len);
3630
3631                        /* Result: number of initialized blocks past m_lblk */
3632                        allocated = map_len;
3633                }
3634        } else if (((map->m_lblk + map_len) == (ee_block + ee_len)) &&
3635                   (map_len < ee_len) &&        /*L1*/
3636                   ex < EXT_LAST_EXTENT(eh)) {  /*L2*/
3637                /* See if we can merge right */
3638                ext4_lblk_t next_lblk;
3639                ext4_fsblk_t next_pblk, ee_pblk;
3640                unsigned int next_len;
3641
3642                abut_ex = ex + 1;
3643                next_lblk = le32_to_cpu(abut_ex->ee_block);
3644                next_len = ext4_ext_get_actual_len(abut_ex);
3645                next_pblk = ext4_ext_pblock(abut_ex);
3646                ee_pblk = ext4_ext_pblock(ex);
3647
3648                /*
3649                 * A transfer of blocks from 'ex' to 'abut_ex' is allowed
3650                 * upon those conditions:
3651                 * - C1: abut_ex is initialized,
3652                 * - C2: abut_ex is logically abutting ex,
3653                 * - C3: abut_ex is physically abutting ex,
3654                 * - C4: abut_ex can receive the additional blocks without
3655                 *   overflowing the (initialized) length limit.
3656                 */
3657                if ((!ext4_ext_is_unwritten(abut_ex)) &&                /*C1*/
3658                    ((map->m_lblk + map_len) == next_lblk) &&           /*C2*/
3659                    ((ee_pblk + ee_len) == next_pblk) &&                /*C3*/
3660                    (next_len < (EXT_INIT_MAX_LEN - map_len))) {        /*C4*/
3661                        err = ext4_ext_get_access(handle, inode, path + depth);
3662                        if (err)
3663                                goto out;
3664
3665                        trace_ext4_ext_convert_to_initialized_fastpath(inode,
3666                                map, ex, abut_ex);
3667
3668                        /* Shift the start of abut_ex by 'map_len' blocks */
3669                        abut_ex->ee_block = cpu_to_le32(next_lblk - map_len);
3670                        ext4_ext_store_pblock(abut_ex, next_pblk - map_len);
3671                        ex->ee_len = cpu_to_le16(ee_len - map_len);
3672                        ext4_ext_mark_unwritten(ex); /* Restore the flag */
3673
3674                        /* Extend abut_ex by 'map_len' blocks */
3675                        abut_ex->ee_len = cpu_to_le16(next_len + map_len);
3676
3677                        /* Result: number of initialized blocks past m_lblk */
3678                        allocated = map_len;
3679                }
3680        }
3681        if (allocated) {
3682                /* Mark the block containing both extents as dirty */
3683                ext4_ext_dirty(handle, inode, path + depth);
3684
3685                /* Update path to point to the right extent */
3686                path[depth].p_ext = abut_ex;
3687                goto out;
3688        } else
3689                allocated = ee_len - (map->m_lblk - ee_block);
3690
3691        WARN_ON(map->m_lblk < ee_block);
3692        /*
3693         * It is safe to convert extent to initialized via explicit
3694         * zeroout only if extent is fully inside i_size or new_size.
3695         */
3696        split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
3697
3698        if (EXT4_EXT_MAY_ZEROOUT & split_flag)
3699                max_zeroout = sbi->s_extent_max_zeroout_kb >>
3700                        (inode->i_sb->s_blocksize_bits - 10);
3701
3702        if (IS_ENCRYPTED(inode))
3703                max_zeroout = 0;
3704
3705        /*
3706         * five cases:
3707         * 1. split the extent into three extents.
3708         * 2. split the extent into two extents, zeroout the head of the first
3709         *    extent.
3710         * 3. split the extent into two extents, zeroout the tail of the second
3711         *    extent.
3712         * 4. split the extent into two extents with out zeroout.
3713         * 5. no splitting needed, just possibly zeroout the head and / or the
3714         *    tail of the extent.
3715         */
3716        split_map.m_lblk = map->m_lblk;
3717        split_map.m_len = map->m_len;
3718
3719        if (max_zeroout && (allocated > split_map.m_len)) {
3720                if (allocated <= max_zeroout) {
3721                        /* case 3 or 5 */
3722                        zero_ex1.ee_block =
3723                                 cpu_to_le32(split_map.m_lblk +
3724                                             split_map.m_len);
3725                        zero_ex1.ee_len =
3726                                cpu_to_le16(allocated - split_map.m_len);
3727                        ext4_ext_store_pblock(&zero_ex1,
3728                                ext4_ext_pblock(ex) + split_map.m_lblk +
3729                                split_map.m_len - ee_block);
3730                        err = ext4_ext_zeroout(inode, &zero_ex1);
3731                        if (err)
3732                                goto out;
3733                        split_map.m_len = allocated;
3734                }
3735                if (split_map.m_lblk - ee_block + split_map.m_len <
3736                                                                max_zeroout) {
3737                        /* case 2 or 5 */
3738                        if (split_map.m_lblk != ee_block) {
3739                                zero_ex2.ee_block = ex->ee_block;
3740                                zero_ex2.ee_len = cpu_to_le16(split_map.m_lblk -
3741                                                        ee_block);
3742                                ext4_ext_store_pblock(&zero_ex2,
3743                                                      ext4_ext_pblock(ex));
3744                                err = ext4_ext_zeroout(inode, &zero_ex2);
3745                                if (err)
3746                                        goto out;
3747                        }
3748
3749                        split_map.m_len += split_map.m_lblk - ee_block;
3750                        split_map.m_lblk = ee_block;
3751                        allocated = map->m_len;
3752                }
3753        }
3754
3755        err = ext4_split_extent(handle, inode, ppath, &split_map, split_flag,
3756                                flags);
3757        if (err > 0)
3758                err = 0;
3759out:
3760        /* If we have gotten a failure, don't zero out status tree */
3761        if (!err) {
3762                err = ext4_zeroout_es(inode, &zero_ex1);
3763                if (!err)
3764                        err = ext4_zeroout_es(inode, &zero_ex2);
3765        }
3766        return err ? err : allocated;
3767}
3768
3769/*
3770 * This function is called by ext4_ext_map_blocks() from
3771 * ext4_get_blocks_dio_write() when DIO to write
3772 * to an unwritten extent.
3773 *
3774 * Writing to an unwritten extent may result in splitting the unwritten
3775 * extent into multiple initialized/unwritten extents (up to three)
3776 * There are three possibilities:
3777 *   a> There is no split required: Entire extent should be unwritten
3778 *   b> Splits in two extents: Write is happening at either end of the extent
3779 *   c> Splits in three extents: Somone is writing in middle of the extent
3780 *
3781 * This works the same way in the case of initialized -> unwritten conversion.
3782 *
3783 * One of more index blocks maybe needed if the extent tree grow after
3784 * the unwritten extent split. To prevent ENOSPC occur at the IO
3785 * complete, we need to split the unwritten extent before DIO submit
3786 * the IO. The unwritten extent called at this time will be split
3787 * into three unwritten extent(at most). After IO complete, the part
3788 * being filled will be convert to initialized by the end_io callback function
3789 * via ext4_convert_unwritten_extents().
3790 *
3791 * Returns the size of unwritten extent to be written on success.
3792 */
3793static int ext4_split_convert_extents(handle_t *handle,
3794                                        struct inode *inode,
3795                                        struct ext4_map_blocks *map,
3796                                        struct ext4_ext_path **ppath,
3797                                        int flags)
3798{
3799        struct ext4_ext_path *path = *ppath;
3800        ext4_lblk_t eof_block;
3801        ext4_lblk_t ee_block;
3802        struct ext4_extent *ex;
3803        unsigned int ee_len;
3804        int split_flag = 0, depth;
3805
3806        ext_debug("%s: inode %lu, logical block %llu, max_blocks %u\n",
3807                  __func__, inode->i_ino,
3808                  (unsigned long long)map->m_lblk, map->m_len);
3809
3810        eof_block = (EXT4_I(inode)->i_disksize + inode->i_sb->s_blocksize - 1)
3811                        >> inode->i_sb->s_blocksize_bits;
3812        if (eof_block < map->m_lblk + map->m_len)
3813                eof_block = map->m_lblk + map->m_len;
3814        /*
3815         * It is safe to convert extent to initialized via explicit
3816         * zeroout only if extent is fully insde i_size or new_size.
3817         */
3818        depth = ext_depth(inode);
3819        ex = path[depth].p_ext;
3820        ee_block = le32_to_cpu(ex->ee_block);
3821        ee_len = ext4_ext_get_actual_len(ex);
3822
3823        /* Convert to unwritten */
3824        if (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN) {
3825                split_flag |= EXT4_EXT_DATA_VALID1;
3826        /* Convert to initialized */
3827        } else if (flags & EXT4_GET_BLOCKS_CONVERT) {
3828                split_flag |= ee_block + ee_len <= eof_block ?
3829                              EXT4_EXT_MAY_ZEROOUT : 0;
3830                split_flag |= (EXT4_EXT_MARK_UNWRIT2 | EXT4_EXT_DATA_VALID2);
3831        }
3832        flags |= EXT4_GET_BLOCKS_PRE_IO;
3833        return ext4_split_extent(handle, inode, ppath, map, split_flag, flags);
3834}
3835
3836static int ext4_convert_unwritten_extents_endio(handle_t *handle,
3837                                                struct inode *inode,
3838                                                struct ext4_map_blocks *map,
3839                                                struct ext4_ext_path **ppath)
3840{
3841        struct ext4_ext_path *path = *ppath;
3842        struct ext4_extent *ex;
3843        ext4_lblk_t ee_block;
3844        unsigned int ee_len;
3845        int depth;
3846        int err = 0;
3847
3848        depth = ext_depth(inode);
3849        ex = path[depth].p_ext;
3850        ee_block = le32_to_cpu(ex->ee_block);
3851        ee_len = ext4_ext_get_actual_len(ex);
3852
3853        ext_debug("ext4_convert_unwritten_extents_endio: inode %lu, logical"
3854                "block %llu, max_blocks %u\n", inode->i_ino,
3855                  (unsigned long long)ee_block, ee_len);
3856
3857        /* If extent is larger than requested it is a clear sign that we still
3858         * have some extent state machine issues left. So extent_split is still
3859         * required.
3860         * TODO: Once all related issues will be fixed this situation should be
3861         * illegal.
3862         */
3863        if (ee_block != map->m_lblk || ee_len > map->m_len) {
3864#ifdef CONFIG_EXT4_DEBUG
3865                ext4_warning(inode->i_sb, "Inode (%ld) finished: extent logical block %llu,"
3866                             " len %u; IO logical block %llu, len %u",
3867                             inode->i_ino, (unsigned long long)ee_block, ee_len,
3868                             (unsigned long long)map->m_lblk, map->m_len);
3869#endif
3870                err = ext4_split_convert_extents(handle, inode, map, ppath,
3871                                                 EXT4_GET_BLOCKS_CONVERT);
3872                if (err < 0)
3873                        return err;
3874                path = ext4_find_extent(inode, map->m_lblk, ppath, 0);
3875                if (IS_ERR(path))
3876                        return PTR_ERR(path);
3877                depth = ext_depth(inode);
3878                ex = path[depth].p_ext;
3879        }
3880
3881        err = ext4_ext_get_access(handle, inode, path + depth);
3882        if (err)
3883                goto out;
3884        /* first mark the extent as initialized */
3885        ext4_ext_mark_initialized(ex);
3886
3887        /* note: ext4_ext_correct_indexes() isn't needed here because
3888         * borders are not changed
3889         */
3890        ext4_ext_try_to_merge(handle, inode, path, ex);
3891
3892        /* Mark modified extent as dirty */
3893        err = ext4_ext_dirty(handle, inode, path + path->p_depth);
3894out:
3895        ext4_ext_show_leaf(inode, path);
3896        return err;
3897}
3898
3899/*
3900 * Handle EOFBLOCKS_FL flag, clearing it if necessary
3901 */
3902static int check_eofblocks_fl(handle_t *handle, struct inode *inode,
3903                              ext4_lblk_t lblk,
3904                              struct ext4_ext_path *path,
3905                              unsigned int len)
3906{
3907        int i, depth;
3908        struct ext4_extent_header *eh;
3909        struct ext4_extent *last_ex;
3910
3911        if (!ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))
3912                return 0;
3913
3914        depth = ext_depth(inode);
3915        eh = path[depth].p_hdr;
3916
3917        /*
3918         * We're going to remove EOFBLOCKS_FL entirely in future so we
3919         * do not care for this case anymore. Simply remove the flag
3920         * if there are no extents.
3921         */
3922        if (unlikely(!eh->eh_entries))
3923                goto out;
3924        last_ex = EXT_LAST_EXTENT(eh);
3925        /*
3926         * We should clear the EOFBLOCKS_FL flag if we are writing the
3927         * last block in the last extent in the file.  We test this by
3928         * first checking to see if the caller to
3929         * ext4_ext_get_blocks() was interested in the last block (or
3930         * a block beyond the last block) in the current extent.  If
3931         * this turns out to be false, we can bail out from this
3932         * function immediately.
3933         */
3934        if (lblk + len < le32_to_cpu(last_ex->ee_block) +
3935            ext4_ext_get_actual_len(last_ex))
3936                return 0;
3937        /*
3938         * If the caller does appear to be planning to write at or
3939         * beyond the end of the current extent, we then test to see
3940         * if the current extent is the last extent in the file, by
3941         * checking to make sure it was reached via the rightmost node
3942         * at each level of the tree.
3943         */
3944        for (i = depth-1; i >= 0; i--)
3945                if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr))
3946                        return 0;
3947out:
3948        ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
3949        return ext4_mark_inode_dirty(handle, inode);
3950}
3951
3952static int
3953convert_initialized_extent(handle_t *handle, struct inode *inode,
3954                           struct ext4_map_blocks *map,
3955                           struct ext4_ext_path **ppath,
3956                           unsigned int *allocated)
3957{
3958        struct ext4_ext_path *path = *ppath;
3959        struct ext4_extent *ex;
3960        ext4_lblk_t ee_block;
3961        unsigned int ee_len;
3962        int depth;
3963        int err = 0;
3964
3965        /*
3966         * Make sure that the extent is no bigger than we support with
3967         * unwritten extent
3968         */
3969        if (map->m_len > EXT_UNWRITTEN_MAX_LEN)
3970                map->m_len = EXT_UNWRITTEN_MAX_LEN / 2;
3971
3972        depth = ext_depth(inode);
3973        ex = path[depth].p_ext;
3974        ee_block = le32_to_cpu(ex->ee_block);
3975        ee_len = ext4_ext_get_actual_len(ex);
3976
3977        ext_debug("%s: inode %lu, logical"
3978                "block %llu, max_blocks %u\n", __func__, inode->i_ino,
3979                  (unsigned long long)ee_block, ee_len);
3980
3981        if (ee_block != map->m_lblk || ee_len > map->m_len) {
3982                err = ext4_split_convert_extents(handle, inode, map, ppath,
3983                                EXT4_GET_BLOCKS_CONVERT_UNWRITTEN);
3984                if (err < 0)
3985                        return err;
3986                path = ext4_find_extent(inode, map->m_lblk, ppath, 0);
3987                if (IS_ERR(path))
3988                        return PTR_ERR(path);
3989                depth = ext_depth(inode);
3990                ex = path[depth].p_ext;
3991                if (!ex) {
3992                        EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
3993                                         (unsigned long) map->m_lblk);
3994                        return -EFSCORRUPTED;
3995                }
3996        }
3997
3998        err = ext4_ext_get_access(handle, inode, path + depth);
3999        if (err)
4000                return err;
4001        /* first mark the extent as unwritten */
4002        ext4_ext_mark_unwritten(ex);
4003
4004        /* note: ext4_ext_correct_indexes() isn't needed here because
4005         * borders are not changed
4006         */
4007        ext4_ext_try_to_merge(handle, inode, path, ex);
4008
4009        /* Mark modified extent as dirty */
4010        err = ext4_ext_dirty(handle, inode, path + path->p_depth);
4011        if (err)
4012                return err;
4013        ext4_ext_show_leaf(inode, path);
4014
4015        ext4_update_inode_fsync_trans(handle, inode, 1);
4016        err = check_eofblocks_fl(handle, inode, map->m_lblk, path, map->m_len);
4017        if (err)
4018                return err;
4019        map->m_flags |= EXT4_MAP_UNWRITTEN;
4020        if (*allocated > map->m_len)
4021                *allocated = map->m_len;
4022        map->m_len = *allocated;
4023        return 0;
4024}
4025
4026static int
4027ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode,
4028                        struct ext4_map_blocks *map,
4029                        struct ext4_ext_path **ppath, int flags,
4030                        unsigned int allocated, ext4_fsblk_t newblock)
4031{
4032        struct ext4_ext_path *path = *ppath;
4033        int ret = 0;
4034        int err = 0;
4035
4036        ext_debug("ext4_ext_handle_unwritten_extents: inode %lu, logical "
4037                  "block %llu, max_blocks %u, flags %x, allocated %u\n",
4038                  inode->i_ino, (unsigned long long)map->m_lblk, map->m_len,
4039                  flags, allocated);
4040        ext4_ext_show_leaf(inode, path);
4041
4042        /*
4043         * When writing into unwritten space, we should not fail to
4044         * allocate metadata blocks for the new extent block if needed.
4045         */
4046        flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL;
4047
4048        trace_ext4_ext_handle_unwritten_extents(inode, map, flags,
4049                                                    allocated, newblock);
4050
4051        /* get_block() before submitting IO, split the extent */
4052        if (flags & EXT4_GET_BLOCKS_PRE_IO) {
4053                ret = ext4_split_convert_extents(handle, inode, map, ppath,
4054                                         flags | EXT4_GET_BLOCKS_CONVERT);
4055                if (ret < 0) {
4056                        err = ret;
4057                        goto out2;
4058                }
4059                /*
4060                 * shouldn't get a 0 return when splitting an extent unless
4061                 * m_len is 0 (bug) or extent has been corrupted
4062                 */
4063                if (unlikely(ret == 0)) {
4064                        EXT4_ERROR_INODE(inode,
4065                                         "unexpected ret == 0, m_len = %u",
4066                                         map->m_len);
4067                        err = -EFSCORRUPTED;
4068                        goto out2;
4069                }
4070                map->m_flags |= EXT4_MAP_UNWRITTEN;
4071                goto out;
4072        }
4073        /* IO end_io complete, convert the filled extent to written */
4074        if (flags & EXT4_GET_BLOCKS_CONVERT) {
4075                if (flags & EXT4_GET_BLOCKS_ZERO) {
4076                        if (allocated > map->m_len)
4077                                allocated = map->m_len;
4078                        err = ext4_issue_zeroout(inode, map->m_lblk, newblock,
4079                                                 allocated);
4080                        if (err < 0)
4081                                goto out2;
4082                }
4083                ret = ext4_convert_unwritten_extents_endio(handle, inode, map,
4084                                                           ppath);
4085                if (ret >= 0) {
4086                        ext4_update_inode_fsync_trans(handle, inode, 1);
4087                        err = check_eofblocks_fl(handle, inode, map->m_lblk,
4088                                                 path, map->m_len);
4089                } else
4090                        err = ret;
4091                map->m_flags |= EXT4_MAP_MAPPED;
4092                map->m_pblk = newblock;
4093                if (allocated > map->m_len)
4094                        allocated = map->m_len;
4095                map->m_len = allocated;
4096                goto out2;
4097        }
4098        /* buffered IO case */
4099        /*
4100         * repeat fallocate creation request
4101         * we already have an unwritten extent
4102         */
4103        if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT) {
4104                map->m_flags |= EXT4_MAP_UNWRITTEN;
4105                goto map_out;
4106        }
4107
4108        /* buffered READ or buffered write_begin() lookup */
4109        if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
4110                /*
4111                 * We have blocks reserved already.  We
4112                 * return allocated blocks so that delalloc
4113                 * won't do block reservation for us.  But
4114                 * the buffer head will be unmapped so that
4115                 * a read from the block returns 0s.
4116                 */
4117                map->m_flags |= EXT4_MAP_UNWRITTEN;
4118                goto out1;
4119        }
4120
4121        /*
4122         * Default case when (flags & EXT4_GET_BLOCKS_CREATE) == 1.
4123         * For buffered writes, at writepage time, etc.  Convert a
4124         * discovered unwritten extent to written.
4125         */
4126        ret = ext4_ext_convert_to_initialized(handle, inode, map, ppath, flags);
4127        if (ret < 0) {
4128                err = ret;
4129                goto out2;
4130        }
4131        ext4_update_inode_fsync_trans(handle, inode, 1);
4132        /*
4133         * shouldn't get a 0 return when converting an unwritten extent
4134         * unless m_len is 0 (bug) or extent has been corrupted
4135         */
4136        if (unlikely(ret == 0)) {
4137                EXT4_ERROR_INODE(inode, "unexpected ret == 0, m_len = %u",
4138                                 map->m_len);
4139                err = -EFSCORRUPTED;
4140                goto out2;
4141        }
4142
4143out:
4144        allocated = ret;
4145        map->m_flags |= EXT4_MAP_NEW;
4146        if (allocated > map->m_len)
4147                allocated = map->m_len;
4148        map->m_len = allocated;
4149
4150map_out:
4151        map->m_flags |= EXT4_MAP_MAPPED;
4152        if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0) {
4153                err = check_eofblocks_fl(handle, inode, map->m_lblk, path,
4154                                         map->m_len);
4155                if (err < 0)
4156                        goto out2;
4157        }
4158out1:
4159        if (allocated > map->m_len)
4160                allocated = map->m_len;
4161        ext4_ext_show_leaf(inode, path);
4162        map->m_pblk = newblock;
4163        map->m_len = allocated;
4164out2:
4165        return err ? err : allocated;
4166}
4167
4168/*
4169 * get_implied_cluster_alloc - check to see if the requested
4170 * allocation (in the map structure) overlaps with a cluster already
4171 * allocated in an extent.
4172 *      @sb     The filesystem superblock structure
4173 *      @map    The requested lblk->pblk mapping
4174 *      @ex     The extent structure which might contain an implied
4175 *                      cluster allocation
4176 *
4177 * This function is called by ext4_ext_map_blocks() after we failed to
4178 * find blocks that were already in the inode's extent tree.  Hence,
4179 * we know that the beginning of the requested region cannot overlap
4180 * the extent from the inode's extent tree.  There are three cases we
4181 * want to catch.  The first is this case:
4182 *
4183 *               |--- cluster # N--|
4184 *    |--- extent ---|  |---- requested region ---|
4185 *                      |==========|
4186 *
4187 * The second case that we need to test for is this one:
4188 *
4189 *   |--------- cluster # N ----------------|
4190 *         |--- requested region --|   |------- extent ----|
4191 *         |=======================|
4192 *
4193 * The third case is when the requested region lies between two extents
4194 * within the same cluster:
4195 *          |------------- cluster # N-------------|
4196 * |----- ex -----|                  |---- ex_right ----|
4197 *                  |------ requested region ------|
4198 *                  |================|
4199 *
4200 * In each of the above cases, we need to set the map->m_pblk and
4201 * map->m_len so it corresponds to the return the extent labelled as
4202 * "|====|" from cluster #N, since it is already in use for data in
4203 * cluster EXT4_B2C(sbi, map->m_lblk).  We will then return 1 to
4204 * signal to ext4_ext_map_blocks() that map->m_pblk should be treated
4205 * as a new "allocated" block region.  Otherwise, we will return 0 and
4206 * ext4_ext_map_blocks() will then allocate one or more new clusters
4207 * by calling ext4_mb_new_blocks().
4208 */
4209static int get_implied_cluster_alloc(struct super_block *sb,
4210                                     struct ext4_map_blocks *map,
4211                                     struct ext4_extent *ex,
4212                                     struct ext4_ext_path *path)
4213{
4214        struct ext4_sb_info *sbi = EXT4_SB(sb);
4215        ext4_lblk_t c_offset = EXT4_LBLK_COFF(sbi, map->m_lblk);
4216        ext4_lblk_t ex_cluster_start, ex_cluster_end;
4217        ext4_lblk_t rr_cluster_start;
4218        ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
4219        ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
4220        unsigned short ee_len = ext4_ext_get_actual_len(ex);
4221
4222        /* The extent passed in that we are trying to match */
4223        ex_cluster_start = EXT4_B2C(sbi, ee_block);
4224        ex_cluster_end = EXT4_B2C(sbi, ee_block + ee_len - 1);
4225
4226        /* The requested region passed into ext4_map_blocks() */
4227        rr_cluster_start = EXT4_B2C(sbi, map->m_lblk);
4228
4229        if ((rr_cluster_start == ex_cluster_end) ||
4230            (rr_cluster_start == ex_cluster_start)) {
4231                if (rr_cluster_start == ex_cluster_end)
4232                        ee_start += ee_len - 1;
4233                map->m_pblk = EXT4_PBLK_CMASK(sbi, ee_start) + c_offset;
4234                map->m_len = min(map->m_len,
4235                                 (unsigned) sbi->s_cluster_ratio - c_offset);
4236                /*
4237                 * Check for and handle this case:
4238                 *
4239                 *   |--------- cluster # N-------------|
4240                 *                     |------- extent ----|
4241                 *         |--- requested region ---|
4242                 *         |===========|
4243                 */
4244
4245                if (map->m_lblk < ee_block)
4246                        map->m_len = min(map->m_len, ee_block - map->m_lblk);
4247
4248                /*
4249                 * Check for the case where there is already another allocated
4250                 * block to the right of 'ex' but before the end of the cluster.
4251                 *
4252                 *          |------------- cluster # N-------------|
4253                 * |----- ex -----|                  |---- ex_right ----|
4254                 *                  |------ requested region ------|
4255                 *                  |================|
4256                 */
4257                if (map->m_lblk > ee_block) {
4258                        ext4_lblk_t next = ext4_ext_next_allocated_block(path);
4259                        map->m_len = min(map->m_len, next - map->m_lblk);
4260                }
4261
4262                trace_ext4_get_implied_cluster_alloc_exit(sb, map, 1);
4263                return 1;
4264        }
4265
4266        trace_ext4_get_implied_cluster_alloc_exit(sb, map, 0);
4267        return 0;
4268}
4269
4270
4271/*
4272 * Block allocation/map/preallocation routine for extents based files
4273 *
4274 *
4275 * Need to be called with
4276 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block
4277 * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem)
4278 *
4279 * return > 0, number of of blocks already mapped/allocated
4280 *          if create == 0 and these are pre-allocated blocks
4281 *              buffer head is unmapped
4282 *          otherwise blocks are mapped
4283 *
4284 * return = 0, if plain look up failed (blocks have not been allocated)
4285 *          buffer head is unmapped
4286 *
4287 * return < 0, error case.
4288 */
4289int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
4290                        struct ext4_map_blocks *map, int flags)
4291{
4292        struct ext4_ext_path *path = NULL;
4293        struct ext4_extent newex, *ex, ex2;
4294        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
4295        ext4_fsblk_t newblock = 0;
4296        int free_on_err = 0, err = 0, depth, ret;
4297        unsigned int allocated = 0, offset = 0;
4298        unsigned int allocated_clusters = 0;
4299        struct ext4_allocation_request ar;
4300        ext4_lblk_t cluster_offset;
4301        bool map_from_cluster = false;
4302
4303        ext_debug("blocks %u/%u requested for inode %lu\n",
4304                  map->m_lblk, map->m_len, inode->i_ino);
4305        trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
4306
4307        /* find extent for this block */
4308        path = ext4_find_extent(inode, map->m_lblk, NULL, 0);
4309        if (IS_ERR(path)) {
4310                err = PTR_ERR(path);
4311                path = NULL;
4312                goto out2;
4313        }
4314
4315        depth = ext_depth(inode);
4316
4317        /*
4318         * consistent leaf must not be empty;
4319         * this situation is possible, though, _during_ tree modification;
4320         * this is why assert can't be put in ext4_find_extent()
4321         */
4322        if (unlikely(path[depth].p_ext == NULL && depth != 0)) {
4323                EXT4_ERROR_INODE(inode, "bad extent address "
4324                                 "lblock: %lu, depth: %d pblock %lld",
4325                                 (unsigned long) map->m_lblk, depth,
4326                                 path[depth].p_block);
4327                err = -EFSCORRUPTED;
4328                goto out2;
4329        }
4330
4331        ex = path[depth].p_ext;
4332        if (ex) {
4333                ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
4334                ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
4335                unsigned short ee_len;
4336
4337
4338                /*
4339                 * unwritten extents are treated as holes, except that
4340                 * we split out initialized portions during a write.
4341                 */
4342                ee_len = ext4_ext_get_actual_len(ex);
4343
4344                trace_ext4_ext_show_extent(inode, ee_block, ee_start, ee_len);
4345
4346                /* if found extent covers block, simply return it */
4347                if (in_range(map->m_lblk, ee_block, ee_len)) {
4348                        newblock = map->m_lblk - ee_block + ee_start;
4349                        /* number of remaining blocks in the extent */
4350                        allocated = ee_len - (map->m_lblk - ee_block);
4351                        ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk,
4352                                  ee_block, ee_len, newblock);
4353
4354                        /*
4355                         * If the extent is initialized check whether the
4356                         * caller wants to convert it to unwritten.
4357                         */
4358                        if ((!ext4_ext_is_unwritten(ex)) &&
4359                            (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) {
4360                                err = convert_initialized_extent(handle,
4361                                        inode, map, &path, &allocated);
4362                                goto out2;
4363                        } else if (!ext4_ext_is_unwritten(ex)) {
4364                                goto out;
4365                        }
4366
4367                        ret = ext4_ext_handle_unwritten_extents(
4368                                handle, inode, map, &path, flags,
4369                                allocated, newblock);
4370                        if (ret < 0)
4371                                err = ret;
4372                        else
4373                                allocated = ret;
4374                        goto out2;
4375                }
4376        }
4377
4378        /*
4379         * requested block isn't allocated yet;
4380         * we couldn't try to create block if create flag is zero
4381         */
4382        if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
4383                ext4_lblk_t hole_start, hole_len;
4384
4385                hole_start = map->m_lblk;
4386                hole_len = ext4_ext_determine_hole(inode, path, &hole_start);
4387                /*
4388                 * put just found gap into cache to speed up
4389                 * subsequent requests
4390                 */
4391                ext4_ext_put_gap_in_cache(inode, hole_start, hole_len);
4392
4393                /* Update hole_len to reflect hole size after map->m_lblk */
4394                if (hole_start != map->m_lblk)
4395                        hole_len -= map->m_lblk - hole_start;
4396                map->m_pblk = 0;
4397                map->m_len = min_t(unsigned int, map->m_len, hole_len);
4398
4399                goto out2;
4400        }
4401
4402        /*
4403         * Okay, we need to do block allocation.
4404         */
4405        newex.ee_block = cpu_to_le32(map->m_lblk);
4406        cluster_offset = EXT4_LBLK_COFF(sbi, map->m_lblk);
4407
4408        /*
4409         * If we are doing bigalloc, check to see if the extent returned
4410         * by ext4_find_extent() implies a cluster we can use.
4411         */
4412        if (cluster_offset && ex &&
4413            get_implied_cluster_alloc(inode->i_sb, map, ex, path)) {
4414                ar.len = allocated = map->m_len;
4415                newblock = map->m_pblk;
4416                map_from_cluster = true;
4417                goto got_allocated_blocks;
4418        }
4419
4420        /* find neighbour allocated blocks */
4421        ar.lleft = map->m_lblk;
4422        err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft);
4423        if (err)
4424                goto out2;
4425        ar.lright = map->m_lblk;
4426        err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright, &ex2);
4427        if (err < 0)
4428                goto out2;
4429
4430        /* Check if the extent after searching to the right implies a
4431         * cluster we can use. */
4432        if ((sbi->s_cluster_ratio > 1) && err &&
4433            get_implied_cluster_alloc(inode->i_sb, map, &ex2, path)) {
4434                ar.len = allocated = map->m_len;
4435                newblock = map->m_pblk;
4436                map_from_cluster = true;
4437                goto got_allocated_blocks;
4438        }
4439
4440        /*
4441         * See if request is beyond maximum number of blocks we can have in
4442         * a single extent. For an initialized extent this limit is
4443         * EXT_INIT_MAX_LEN and for an unwritten extent this limit is
4444         * EXT_UNWRITTEN_MAX_LEN.
4445         */
4446        if (map->m_len > EXT_INIT_MAX_LEN &&
4447            !(flags & EXT4_GET_BLOCKS_UNWRIT_EXT))
4448                map->m_len = EXT_INIT_MAX_LEN;
4449        else if (map->m_len > EXT_UNWRITTEN_MAX_LEN &&
4450                 (flags & EXT4_GET_BLOCKS_UNWRIT_EXT))
4451                map->m_len = EXT_UNWRITTEN_MAX_LEN;
4452
4453        /* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */
4454        newex.ee_len = cpu_to_le16(map->m_len);
4455        err = ext4_ext_check_overlap(sbi, inode, &newex, path);
4456        if (err)
4457                allocated = ext4_ext_get_actual_len(&newex);
4458        else
4459                allocated = map->m_len;
4460
4461        /* allocate new block */
4462        ar.inode = inode;
4463        ar.goal = ext4_ext_find_goal(inode, path, map->m_lblk);
4464        ar.logical = map->m_lblk;
4465        /*
4466         * We calculate the offset from the beginning of the cluster
4467         * for the logical block number, since when we allocate a
4468         * physical cluster, the physical block should start at the
4469         * same offset from the beginning of the cluster.  This is
4470         * needed so that future calls to get_implied_cluster_alloc()
4471         * work correctly.
4472         */
4473        offset = EXT4_LBLK_COFF(sbi, map->m_lblk);
4474        ar.len = EXT4_NUM_B2C(sbi, offset+allocated);
4475        ar.goal -= offset;
4476        ar.logical -= offset;
4477        if (S_ISREG(inode->i_mode))
4478                ar.flags = EXT4_MB_HINT_DATA;
4479        else
4480                /* disable in-core preallocation for non-regular files */
4481                ar.flags = 0;
4482        if (flags & EXT4_GET_BLOCKS_NO_NORMALIZE)
4483                ar.flags |= EXT4_MB_HINT_NOPREALLOC;
4484        if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
4485                ar.flags |= EXT4_MB_DELALLOC_RESERVED;
4486        if (flags & EXT4_GET_BLOCKS_METADATA_NOFAIL)
4487                ar.flags |= EXT4_MB_USE_RESERVED;
4488        newblock = ext4_mb_new_blocks(handle, &ar, &err);
4489        if (!newblock)
4490                goto out2;
4491        ext_debug("allocate new block: goal %llu, found %llu/%u\n",
4492                  ar.goal, newblock, allocated);
4493        free_on_err = 1;
4494        allocated_clusters = ar.len;
4495        ar.len = EXT4_C2B(sbi, ar.len) - offset;
4496        if (ar.len > allocated)
4497                ar.len = allocated;
4498
4499got_allocated_blocks:
4500        /* try to insert new extent into found leaf and return */
4501        ext4_ext_store_pblock(&newex, newblock + offset);
4502        newex.ee_len = cpu_to_le16(ar.len);
4503        /* Mark unwritten */
4504        if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT){
4505                ext4_ext_mark_unwritten(&newex);
4506                map->m_flags |= EXT4_MAP_UNWRITTEN;
4507        }
4508
4509        err = 0;
4510        if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0)
4511                err = check_eofblocks_fl(handle, inode, map->m_lblk,
4512                                         path, ar.len);
4513        if (!err)
4514                err = ext4_ext_insert_extent(handle, inode, &path,
4515                                             &newex, flags);
4516
4517        if (err && free_on_err) {
4518                int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ?
4519                        EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0;
4520                /* free data blocks we just allocated */
4521                /* not a good idea to call discard here directly,
4522                 * but otherwise we'd need to call it every free() */
4523                ext4_discard_preallocations(inode);
4524                ext4_free_blocks(handle, inode, NULL, newblock,
4525                                 EXT4_C2B(sbi, allocated_clusters), fb_flags);
4526                goto out2;
4527        }
4528
4529        /* previous routine could use block we allocated */
4530        newblock = ext4_ext_pblock(&newex);
4531        allocated = ext4_ext_get_actual_len(&newex);
4532        if (allocated > map->m_len)
4533                allocated = map->m_len;
4534        map->m_flags |= EXT4_MAP_NEW;
4535
4536        /*
4537         * Reduce the reserved cluster count to reflect successful deferred
4538         * allocation of delayed allocated clusters or direct allocation of
4539         * clusters discovered to be delayed allocated.  Once allocated, a
4540         * cluster is not included in the reserved count.
4541         */
4542        if (test_opt(inode->i_sb, DELALLOC) && !map_from_cluster) {
4543                if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
4544                        /*
4545                         * When allocating delayed allocated clusters, simply
4546                         * reduce the reserved cluster count and claim quota
4547                         */
4548                        ext4_da_update_reserve_space(inode, allocated_clusters,
4549                                                        1);
4550                } else {
4551                        ext4_lblk_t lblk, len;
4552                        unsigned int n;
4553
4554                        /*
4555                         * When allocating non-delayed allocated clusters
4556                         * (from fallocate, filemap, DIO, or clusters
4557                         * allocated when delalloc has been disabled by
4558                         * ext4_nonda_switch), reduce the reserved cluster
4559                         * count by the number of allocated clusters that
4560                         * have previously been delayed allocated.  Quota
4561                         * has been claimed by ext4_mb_new_blocks() above,
4562                         * so release the quota reservations made for any
4563                         * previously delayed allocated clusters.
4564                         */
4565                        lblk = EXT4_LBLK_CMASK(sbi, map->m_lblk);
4566                        len = allocated_clusters << sbi->s_cluster_bits;
4567                        n = ext4_es_delayed_clu(inode, lblk, len);
4568                        if (n > 0)
4569                                ext4_da_update_reserve_space(inode, (int) n, 0);
4570                }
4571        }
4572
4573        /*
4574         * Cache the extent and update transaction to commit on fdatasync only
4575         * when it is _not_ an unwritten extent.
4576         */
4577        if ((flags & EXT4_GET_BLOCKS_UNWRIT_EXT) == 0)
4578                ext4_update_inode_fsync_trans(handle, inode, 1);
4579        else
4580                ext4_update_inode_fsync_trans(handle, inode, 0);
4581out:
4582        if (allocated > map->m_len)
4583                allocated = map->m_len;
4584        ext4_ext_show_leaf(inode, path);
4585        map->m_flags |= EXT4_MAP_MAPPED;
4586        map->m_pblk = newblock;
4587        map->m_len = allocated;
4588out2:
4589        ext4_ext_drop_refs(path);
4590        kfree(path);
4591
4592        trace_ext4_ext_map_blocks_exit(inode, flags, map,
4593                                       err ? err : allocated);
4594        return err ? err : allocated;
4595}
4596
4597int ext4_ext_truncate(handle_t *handle, struct inode *inode)
4598{
4599        struct super_block *sb = inode->i_sb;
4600        ext4_lblk_t last_block;
4601        int err = 0;
4602
4603        /*
4604         * TODO: optimization is possible here.
4605         * Probably we need not scan at all,
4606         * because page truncation is enough.
4607         */
4608
4609        /* we have to know where to truncate from in crash case */
4610        EXT4_I(inode)->i_disksize = inode->i_size;
4611        err = ext4_mark_inode_dirty(handle, inode);
4612        if (err)
4613                return err;
4614
4615        last_block = (inode->i_size + sb->s_blocksize - 1)
4616                        >> EXT4_BLOCK_SIZE_BITS(sb);
4617retry:
4618        err = ext4_es_remove_extent(inode, last_block,
4619                                    EXT_MAX_BLOCKS - last_block);
4620        if (err == -ENOMEM) {
4621                cond_resched();
4622                congestion_wait(BLK_RW_ASYNC, HZ/50);
4623                goto retry;
4624        }
4625        if (err)
4626                return err;
4627retry_remove_space:
4628        err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1);
4629        if (err == -ENOMEM) {
4630                cond_resched();
4631                congestion_wait(BLK_RW_ASYNC, HZ/50);
4632                goto retry_remove_space;
4633        }
4634        return err;
4635}
4636
4637static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
4638                                  ext4_lblk_t len, loff_t new_size,
4639                                  int flags)
4640{
4641        struct inode *inode = file_inode(file);
4642        handle_t *handle;
4643        int ret = 0;
4644        int ret2 = 0;
4645        int retries = 0;
4646        int depth = 0;
4647        struct ext4_map_blocks map;
4648        unsigned int credits;
4649        loff_t epos;
4650
4651        BUG_ON(!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS));
4652        map.m_lblk = offset;
4653        map.m_len = len;
4654        /*
4655         * Don't normalize the request if it can fit in one extent so
4656         * that it doesn't get unnecessarily split into multiple
4657         * extents.
4658         */
4659        if (len <= EXT_UNWRITTEN_MAX_LEN)
4660                flags |= EXT4_GET_BLOCKS_NO_NORMALIZE;
4661
4662        /*
4663         * credits to insert 1 extent into extent tree
4664         */
4665        credits = ext4_chunk_trans_blocks(inode, len);
4666        depth = ext_depth(inode);
4667
4668retry:
4669        while (ret >= 0 && len) {
4670                /*
4671                 * Recalculate credits when extent tree depth changes.
4672                 */
4673                if (depth != ext_depth(inode)) {
4674                        credits = ext4_chunk_trans_blocks(inode, len);
4675                        depth = ext_depth(inode);
4676                }
4677
4678                handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
4679                                            credits);
4680                if (IS_ERR(handle)) {
4681                        ret = PTR_ERR(handle);
4682                        break;
4683                }
4684                ret = ext4_map_blocks(handle, inode, &map, flags);
4685                if (ret <= 0) {
4686                        ext4_debug("inode #%lu: block %u: len %u: "
4687                                   "ext4_ext_map_blocks returned %d",
4688                                   inode->i_ino, map.m_lblk,
4689                                   map.m_len, ret);
4690                        ext4_mark_inode_dirty(handle, inode);
4691                        ret2 = ext4_journal_stop(handle);
4692                        break;
4693                }
4694                map.m_lblk += ret;
4695                map.m_len = len = len - ret;
4696                epos = (loff_t)map.m_lblk << inode->i_blkbits;
4697                inode->i_ctime = current_time(inode);
4698                if (new_size) {
4699                        if (epos > new_size)
4700                                epos = new_size;
4701                        if (ext4_update_inode_size(inode, epos) & 0x1)
4702                                inode->i_mtime = inode->i_ctime;
4703                } else {
4704                        if (epos > inode->i_size)
4705                                ext4_set_inode_flag(inode,
4706                                                    EXT4_INODE_EOFBLOCKS);
4707                }
4708                ext4_mark_inode_dirty(handle, inode);
4709                ext4_update_inode_fsync_trans(handle, inode, 1);
4710                ret2 = ext4_journal_stop(handle);
4711                if (ret2)
4712                        break;
4713        }
4714        if (ret == -ENOSPC &&
4715                        ext4_should_retry_alloc(inode->i_sb, &retries)) {
4716                ret = 0;
4717                goto retry;
4718        }
4719
4720        return ret > 0 ? ret2 : ret;
4721}
4722
4723static long ext4_zero_range(struct file *file, loff_t offset,
4724                            loff_t len, int mode)
4725{
4726        struct inode *inode = file_inode(file);
4727        handle_t *handle = NULL;
4728        unsigned int max_blocks;
4729        loff_t new_size = 0;
4730        int ret = 0;
4731        int flags;
4732        int credits;
4733        int partial_begin, partial_end;
4734        loff_t start, end;
4735        ext4_lblk_t lblk;
4736        unsigned int blkbits = inode->i_blkbits;
4737
4738        trace_ext4_zero_range(inode, offset, len, mode);
4739
4740        if (!S_ISREG(inode->i_mode))
4741                return -EINVAL;
4742
4743        /* Call ext4_force_commit to flush all data in case of data=journal. */
4744        if (ext4_should_journal_data(inode)) {
4745                ret = ext4_force_commit(inode->i_sb);
4746                if (ret)
4747                        return ret;
4748        }
4749
4750        /*
4751         * Round up offset. This is not fallocate, we neet to zero out
4752         * blocks, so convert interior block aligned part of the range to
4753         * unwritten and possibly manually zero out unaligned parts of the
4754         * range.
4755         */
4756        start = round_up(offset, 1 << blkbits);
4757        end = round_down((offset + len), 1 << blkbits);
4758
4759        if (start < offset || end > offset + len)
4760                return -EINVAL;
4761        partial_begin = offset & ((1 << blkbits) - 1);
4762        partial_end = (offset + len) & ((1 << blkbits) - 1);
4763
4764        lblk = start >> blkbits;
4765        max_blocks = (end >> blkbits);
4766        if (max_blocks < lblk)
4767                max_blocks = 0;
4768        else
4769                max_blocks -= lblk;
4770
4771        inode_lock(inode);
4772
4773        /*
4774         * Indirect files do not support unwritten extnets
4775         */
4776        if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
4777                ret = -EOPNOTSUPP;
4778                goto out_mutex;
4779        }
4780
4781        if (!(mode & FALLOC_FL_KEEP_SIZE) &&
4782            (offset + len > i_size_read(inode) ||
4783             offset + len > EXT4_I(inode)->i_disksize)) {
4784                new_size = offset + len;
4785                ret = inode_newsize_ok(inode, new_size);
4786                if (ret)
4787                        goto out_mutex;
4788        }
4789
4790        flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT;
4791        if (mode & FALLOC_FL_KEEP_SIZE)
4792                flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
4793
4794        /* Wait all existing dio workers, newcomers will block on i_mutex */
4795        inode_dio_wait(inode);
4796
4797        /* Preallocate the range including the unaligned edges */
4798        if (partial_begin || partial_end) {
4799                ret = ext4_alloc_file_blocks(file,
4800                                round_down(offset, 1 << blkbits) >> blkbits,
4801                                (round_up((offset + len), 1 << blkbits) -
4802                                 round_down(offset, 1 << blkbits)) >> blkbits,
4803                                new_size, flags);
4804                if (ret)
4805                        goto out_mutex;
4806
4807        }
4808
4809        /* Zero range excluding the unaligned edges */
4810        if (max_blocks > 0) {
4811                flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN |
4812                          EXT4_EX_NOCACHE);
4813
4814                /*
4815                 * Prevent page faults from reinstantiating pages we have
4816                 * released from page cache.
4817                 */
4818                down_write(&EXT4_I(inode)->i_mmap_sem);
4819
4820                ret = ext4_break_layouts(inode);
4821                if (ret) {
4822                        up_write(&EXT4_I(inode)->i_mmap_sem);
4823                        goto out_mutex;
4824                }
4825
4826                ret = ext4_update_disksize_before_punch(inode, offset, len);
4827                if (ret) {
4828                        up_write(&EXT4_I(inode)->i_mmap_sem);
4829                        goto out_mutex;
4830                }
4831                /* Now release the pages and zero block aligned part of pages */
4832                truncate_pagecache_range(inode, start, end - 1);
4833                inode->i_mtime = inode->i_ctime = current_time(inode);
4834
4835                ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
4836                                             flags);
4837                up_write(&EXT4_I(inode)->i_mmap_sem);
4838                if (ret)
4839                        goto out_mutex;
4840        }
4841        if (!partial_begin && !partial_end)
4842                goto out_mutex;
4843
4844        /*
4845         * In worst case we have to writeout two nonadjacent unwritten
4846         * blocks and update the inode
4847         */
4848        credits = (2 * ext4_ext_index_trans_blocks(inode, 2)) + 1;
4849        if (ext4_should_journal_data(inode))
4850                credits += 2;
4851        handle = ext4_journal_start(inode, EXT4_HT_MISC, credits);
4852        if (IS_ERR(handle)) {
4853                ret = PTR_ERR(handle);
4854                ext4_std_error(inode->i_sb, ret);
4855                goto out_mutex;
4856        }
4857
4858        inode->i_mtime = inode->i_ctime = current_time(inode);
4859        if (new_size) {
4860                ext4_update_inode_size(inode, new_size);
4861        } else {
4862                /*
4863                * Mark that we allocate beyond EOF so the subsequent truncate
4864                * can proceed even if the new size is the same as i_size.
4865                */
4866                if ((offset + len) > i_size_read(inode))
4867                        ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
4868        }
4869        ext4_mark_inode_dirty(handle, inode);
4870
4871        /* Zero out partial block at the edges of the range */
4872        ret = ext4_zero_partial_blocks(handle, inode, offset, len);
4873        if (ret >= 0)
4874                ext4_update_inode_fsync_trans(handle, inode, 1);
4875
4876        if (file->f_flags & O_SYNC)
4877                ext4_handle_sync(handle);
4878
4879        ext4_journal_stop(handle);
4880out_mutex:
4881        inode_unlock(inode);
4882        return ret;
4883}
4884
4885/*
4886 * preallocate space for a file. This implements ext4's fallocate file
4887 * operation, which gets called from sys_fallocate system call.
4888 * For block-mapped files, posix_fallocate should fall back to the method
4889 * of writing zeroes to the required new blocks (the same behavior which is
4890 * expected for file systems which do not support fallocate() system call).
4891 */
4892long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
4893{
4894        struct inode *inode = file_inode(file);
4895        loff_t new_size = 0;
4896        unsigned int max_blocks;
4897        int ret = 0;
4898        int flags;
4899        ext4_lblk_t lblk;
4900        unsigned int blkbits = inode->i_blkbits;
4901
4902        /*
4903         * Encrypted inodes can't handle collapse range or insert
4904         * range since we would need to re-encrypt blocks with a
4905         * different IV or XTS tweak (which are based on the logical
4906         * block number).
4907         *
4908         * XXX It's not clear why zero range isn't working, but we'll
4909         * leave it disabled for encrypted inodes for now.  This is a
4910         * bug we should fix....
4911         */
4912        if (IS_ENCRYPTED(inode) &&
4913            (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE |
4914                     FALLOC_FL_ZERO_RANGE)))
4915                return -EOPNOTSUPP;
4916
4917        /* Return error if mode is not supported */
4918        if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
4919                     FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |
4920                     FALLOC_FL_INSERT_RANGE))
4921                return -EOPNOTSUPP;
4922
4923        if (mode & FALLOC_FL_PUNCH_HOLE)
4924                return ext4_punch_hole(inode, offset, len);
4925
4926        ret = ext4_convert_inline_data(inode);
4927        if (ret)
4928                return ret;
4929
4930        if (mode & FALLOC_FL_COLLAPSE_RANGE)
4931                return ext4_collapse_range(inode, offset, len);
4932
4933        if (mode & FALLOC_FL_INSERT_RANGE)
4934                return ext4_insert_range(inode, offset, len);
4935
4936        if (mode & FALLOC_FL_ZERO_RANGE)
4937                return ext4_zero_range(file, offset, len, mode);
4938
4939        trace_ext4_fallocate_enter(inode, offset, len, mode);
4940        lblk = offset >> blkbits;
4941
4942        max_blocks = EXT4_MAX_BLOCKS(len, offset, blkbits);
4943        flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT;
4944        if (mode & FALLOC_FL_KEEP_SIZE)
4945                flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
4946
4947        inode_lock(inode);
4948
4949        /*
4950         * We only support preallocation for extent-based files only
4951         */
4952        if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
4953                ret = -EOPNOTSUPP;
4954                goto out;
4955        }
4956
4957        if (!(mode & FALLOC_FL_KEEP_SIZE) &&
4958            (offset + len > i_size_read(inode) ||
4959             offset + len > EXT4_I(inode)->i_disksize)) {
4960                new_size = offset + len;
4961                ret = inode_newsize_ok(inode, new_size);
4962                if (ret)
4963                        goto out;
4964        }
4965
4966        /* Wait all existing dio workers, newcomers will block on i_mutex */
4967        inode_dio_wait(inode);
4968
4969        ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags);
4970        if (ret)
4971                goto out;
4972
4973        if (file->f_flags & O_SYNC && EXT4_SB(inode->i_sb)->s_journal) {
4974                ret = jbd2_complete_transaction(EXT4_SB(inode->i_sb)->s_journal,
4975                                                EXT4_I(inode)->i_sync_tid);
4976        }
4977out:
4978        inode_unlock(inode);
4979        trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
4980        return ret;
4981}
4982
4983/*
4984 * This function convert a range of blocks to written extents
4985 * The caller of this function will pass the start offset and the size.
4986 * all unwritten extents within this range will be converted to
4987 * written extents.
4988 *
4989 * This function is called from the direct IO end io call back
4990 * function, to convert the fallocated extents after IO is completed.
4991 * Returns 0 on success.
4992 */
4993int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
4994                                   loff_t offset, ssize_t len)
4995{
4996        unsigned int max_blocks;
4997        int ret = 0;
4998        int ret2 = 0;
4999        struct ext4_map_blocks map;
5000        unsigned int credits, blkbits = inode->i_blkbits;
5001
5002        map.m_lblk = offset >> blkbits;
5003        max_blocks = EXT4_MAX_BLOCKS(len, offset, blkbits);
5004
5005        /*
5006         * This is somewhat ugly but the idea is clear: When transaction is
5007         * reserved, everything goes into it. Otherwise we rather start several
5008         * smaller transactions for conversion of each extent separately.
5009         */
5010        if (handle) {
5011                handle = ext4_journal_start_reserved(handle,
5012                                                     EXT4_HT_EXT_CONVERT);
5013                if (IS_ERR(handle))
5014                        return PTR_ERR(handle);
5015                credits = 0;
5016        } else {
5017                /*
5018                 * credits to insert 1 extent into extent tree
5019                 */
5020                credits = ext4_chunk_trans_blocks(inode, max_blocks);
5021        }
5022        while (ret >= 0 && ret < max_blocks) {
5023                map.m_lblk += ret;
5024                map.m_len = (max_blocks -= ret);
5025                if (credits) {
5026                        handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
5027                                                    credits);
5028                        if (IS_ERR(handle)) {
5029                                ret = PTR_ERR(handle);
5030                                break;
5031                        }
5032                }
5033                ret = ext4_map_blocks(handle, inode, &map,
5034                                      EXT4_GET_BLOCKS_IO_CONVERT_EXT);
5035                if (ret <= 0)
5036                        ext4_warning(inode->i_sb,
5037                                     "inode #%lu: block %u: len %u: "
5038                                     "ext4_ext_map_blocks returned %d",
5039                                     inode->i_ino, map.m_lblk,
5040                                     map.m_len, ret);
5041                ext4_mark_inode_dirty(handle, inode);
5042                if (credits)
5043                        ret2 = ext4_journal_stop(handle);
5044                if (ret <= 0 || ret2)
5045                        break;
5046        }
5047        if (!credits)
5048                ret2 = ext4_journal_stop(handle);
5049        return ret > 0 ? ret2 : ret;
5050}
5051
5052/*
5053 * If newes is not existing extent (newes->ec_pblk equals zero) find
5054 * delayed extent at start of newes and update newes accordingly and
5055 * return start of the next delayed extent.
5056 *
5057 * If newes is existing extent (newes->ec_pblk is not equal zero)
5058 * return start of next delayed extent or EXT_MAX_BLOCKS if no delayed
5059 * extent found. Leave newes unmodified.
5060 */
5061static int ext4_find_delayed_extent(struct inode *inode,
5062                                    struct extent_status *newes)
5063{
5064        struct extent_status es;
5065        ext4_lblk_t block, next_del;
5066
5067        if (newes->es_pblk == 0) {
5068                ext4_es_find_extent_range(inode, &ext4_es_is_delayed,
5069                                          newes->es_lblk,
5070                                          newes->es_lblk + newes->es_len - 1,
5071                                          &es);
5072
5073                /*
5074                 * No extent in extent-tree contains block @newes->es_pblk,
5075                 * then the block may stay in 1)a hole or 2)delayed-extent.
5076                 */
5077                if (es.es_len == 0)
5078                        /* A hole found. */
5079                        return 0;
5080
5081                if (es.es_lblk > newes->es_lblk) {
5082                        /* A hole found. */
5083                        newes->es_len = min(es.es_lblk - newes->es_lblk,
5084                                            newes->es_len);
5085                        return 0;
5086                }
5087
5088                newes->es_len = es.es_lblk + es.es_len - newes->es_lblk;
5089        }
5090
5091        block = newes->es_lblk + newes->es_len;
5092        ext4_es_find_extent_range(inode, &ext4_es_is_delayed, block,
5093                                  EXT_MAX_BLOCKS, &es);
5094        if (es.es_len == 0)
5095                next_del = EXT_MAX_BLOCKS;
5096        else
5097                next_del = es.es_lblk;
5098
5099        return next_del;
5100}
5101/* fiemap flags we can handle specified here */
5102#define EXT4_FIEMAP_FLAGS       (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
5103
5104static int ext4_xattr_fiemap(struct inode *inode,
5105                                struct fiemap_extent_info *fieinfo)
5106{
5107        __u64 physical = 0;
5108        __u64 length;
5109        __u32 flags = FIEMAP_EXTENT_LAST;
5110        int blockbits = inode->i_sb->s_blocksize_bits;
5111        int error = 0;
5112
5113        /* in-inode? */
5114        if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
5115                struct ext4_iloc iloc;
5116                int offset;     /* offset of xattr in inode */
5117
5118                error = ext4_get_inode_loc(inode, &iloc);
5119                if (error)
5120                        return error;
5121                physical = (__u64)iloc.bh->b_blocknr << blockbits;
5122                offset = EXT4_GOOD_OLD_INODE_SIZE +
5123                                EXT4_I(inode)->i_extra_isize;
5124                physical += offset;
5125                length = EXT4_SB(inode->i_sb)->s_inode_size - offset;
5126                flags |= FIEMAP_EXTENT_DATA_INLINE;
5127                brelse(iloc.bh);
5128        } else { /* external block */
5129                physical = (__u64)EXT4_I(inode)->i_file_acl << blockbits;
5130                length = inode->i_sb->s_blocksize;
5131        }
5132
5133        if (physical)
5134                error = fiemap_fill_next_extent(fieinfo, 0, physical,
5135                                                length, flags);
5136        return (error < 0 ? error : 0);
5137}
5138
5139int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
5140                __u64 start, __u64 len)
5141{
5142        ext4_lblk_t start_blk;
5143        int error = 0;
5144
5145        if (ext4_has_inline_data(inode)) {
5146                int has_inline = 1;
5147
5148                error = ext4_inline_data_fiemap(inode, fieinfo, &has_inline,
5149                                                start, len);
5150
5151                if (has_inline)
5152                        return error;
5153        }
5154
5155        if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) {
5156                error = ext4_ext_precache(inode);
5157                if (error)
5158                        return error;
5159        }
5160
5161        /* fallback to generic here if not in extents fmt */
5162        if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
5163                return generic_block_fiemap(inode, fieinfo, start, len,
5164                        ext4_get_block);
5165
5166        if (fiemap_check_flags(fieinfo, EXT4_FIEMAP_FLAGS))
5167                return -EBADR;
5168
5169        if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) {
5170                error = ext4_xattr_fiemap(inode, fieinfo);
5171        } else {
5172                ext4_lblk_t len_blks;
5173                __u64 last_blk;
5174
5175                start_blk = start >> inode->i_sb->s_blocksize_bits;
5176                last_blk = (start + len - 1) >> inode->i_sb->s_blocksize_bits;
5177                if (last_blk >= EXT_MAX_BLOCKS)
5178                        last_blk = EXT_MAX_BLOCKS-1;
5179                len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1;
5180
5181                /*
5182                 * Walk the extent tree gathering extent information
5183                 * and pushing extents back to the user.
5184                 */
5185                error = ext4_fill_fiemap_extents(inode, start_blk,
5186                                                 len_blks, fieinfo);
5187        }
5188        return error;
5189}
5190
5191/*
5192 * ext4_access_path:
5193 * Function to access the path buffer for marking it dirty.
5194 * It also checks if there are sufficient credits left in the journal handle
5195 * to update path.
5196 */
5197static int
5198ext4_access_path(handle_t *handle, struct inode *inode,
5199                struct ext4_ext_path *path)
5200{
5201        int credits, err;
5202
5203        if (!ext4_handle_valid(handle))
5204                return 0;
5205
5206        /*
5207         * Check if need to extend journal credits
5208         * 3 for leaf, sb, and inode plus 2 (bmap and group
5209         * descriptor) for each block group; assume two block
5210         * groups
5211         */
5212        credits = ext4_writepage_trans_blocks(inode);
5213        err = ext4_datasem_ensure_credits(handle, inode, 7, credits, 0);
5214        if (err < 0)
5215                return err;
5216
5217        err = ext4_ext_get_access(handle, inode, path);
5218        return err;
5219}
5220
5221/*
5222 * ext4_ext_shift_path_extents:
5223 * Shift the extents of a path structure lying between path[depth].p_ext
5224 * and EXT_LAST_EXTENT(path[depth].p_hdr), by @shift blocks. @SHIFT tells
5225 * if it is right shift or left shift operation.
5226 */
5227static int
5228ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift,
5229                            struct inode *inode, handle_t *handle,
5230                            enum SHIFT_DIRECTION SHIFT)
5231{
5232        int depth, err = 0;
5233        struct ext4_extent *ex_start, *ex_last;
5234        bool update = 0;
5235        depth = path->p_depth;
5236
5237        while (depth >= 0) {
5238                if (depth == path->p_depth) {
5239                        ex_start = path[depth].p_ext;
5240                        if (!ex_start)
5241                                return -EFSCORRUPTED;
5242
5243                        ex_last = EXT_LAST_EXTENT(path[depth].p_hdr);
5244
5245                        err = ext4_access_path(handle, inode, path + depth);
5246                        if (err)
5247                                goto out;
5248
5249                        if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr))
5250                                update = 1;
5251
5252                        while (ex_start <= ex_last) {
5253                                if (SHIFT == SHIFT_LEFT) {
5254                                        le32_add_cpu(&ex_start->ee_block,
5255                                                -shift);
5256                                        /* Try to merge to the left. */
5257                                        if ((ex_start >
5258                                            EXT_FIRST_EXTENT(path[depth].p_hdr))
5259                                            &&
5260                                            ext4_ext_try_to_merge_right(inode,
5261                                            path, ex_start - 1))
5262                                                ex_last--;
5263                                        else
5264                                                ex_start++;
5265                                } else {
5266                                        le32_add_cpu(&ex_last->ee_block, shift);
5267                                        ext4_ext_try_to_merge_right(inode, path,
5268                                                ex_last);
5269                                        ex_last--;
5270                                }
5271                        }
5272                        err = ext4_ext_dirty(handle, inode, path + depth);
5273                        if (err)
5274                                goto out;
5275
5276                        if (--depth < 0 || !update)
5277                                break;
5278                }
5279
5280                /* Update index too */
5281                err = ext4_access_path(handle, inode, path + depth);
5282                if (err)
5283                        goto out;
5284
5285                if (SHIFT == SHIFT_LEFT)
5286                        le32_add_cpu(&path[depth].p_idx->ei_block, -shift);
5287                else
5288                        le32_add_cpu(&path[depth].p_idx->ei_block, shift);
5289                err = ext4_ext_dirty(handle, inode, path + depth);
5290                if (err)
5291                        goto out;
5292
5293                /* we are done if current index is not a starting index */
5294                if (path[depth].p_idx != EXT_FIRST_INDEX(path[depth].p_hdr))
5295                        break;
5296
5297                depth--;
5298        }
5299
5300out:
5301        return err;
5302}
5303
5304/*
5305 * ext4_ext_shift_extents:
5306 * All the extents which lies in the range from @start to the last allocated
5307 * block for the @inode are shifted either towards left or right (depending
5308 * upon @SHIFT) by @shift blocks.
5309 * On success, 0 is returned, error otherwise.
5310 */
5311static int
5312ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
5313                       ext4_lblk_t start, ext4_lblk_t shift,
5314                       enum SHIFT_DIRECTION SHIFT)
5315{
5316        struct ext4_ext_path *path;
5317        int ret = 0, depth;
5318        struct ext4_extent *extent;
5319        ext4_lblk_t stop, *iterator, ex_start, ex_end;
5320
5321        /* Let path point to the last extent */
5322        path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL,
5323                                EXT4_EX_NOCACHE);
5324        if (IS_ERR(path))
5325                return PTR_ERR(path);
5326
5327        depth = path->p_depth;
5328        extent = path[depth].p_ext;
5329        if (!extent)
5330                goto out;
5331
5332        stop = le32_to_cpu(extent->ee_block);
5333
5334       /*
5335        * For left shifts, make sure the hole on the left is big enough to
5336        * accommodate the shift.  For right shifts, make sure the last extent
5337        * won't be shifted beyond EXT_MAX_BLOCKS.
5338        */
5339        if (SHIFT == SHIFT_LEFT) {
5340                path = ext4_find_extent(inode, start - 1, &path,
5341                                        EXT4_EX_NOCACHE);
5342                if (IS_ERR(path))
5343                        return PTR_ERR(path);
5344                depth = path->p_depth;
5345                extent =  path[depth].p_ext;
5346                if (extent) {
5347                        ex_start = le32_to_cpu(extent->ee_block);
5348                        ex_end = le32_to_cpu(extent->ee_block) +
5349                                ext4_ext_get_actual_len(extent);
5350                } else {
5351                        ex_start = 0;
5352                        ex_end = 0;
5353                }
5354
5355                if ((start == ex_start && shift > ex_start) ||
5356                    (shift > start - ex_end)) {
5357                        ret = -EINVAL;
5358                        goto out;
5359                }
5360        } else {
5361                if (shift > EXT_MAX_BLOCKS -
5362                    (stop + ext4_ext_get_actual_len(extent))) {
5363                        ret = -EINVAL;
5364                        goto out;
5365                }
5366        }
5367
5368        /*
5369         * In case of left shift, iterator points to start and it is increased
5370         * till we reach stop. In case of right shift, iterator points to stop
5371         * and it is decreased till we reach start.
5372         */
5373        if (SHIFT == SHIFT_LEFT)
5374                iterator = &start;
5375        else
5376                iterator = &stop;
5377
5378        /*
5379         * Its safe to start updating extents.  Start and stop are unsigned, so
5380         * in case of right shift if extent with 0 block is reached, iterator
5381         * becomes NULL to indicate the end of the loop.
5382         */
5383        while (iterator && start <= stop) {
5384                path = ext4_find_extent(inode, *iterator, &path,
5385                                        EXT4_EX_NOCACHE);
5386                if (IS_ERR(path))
5387                        return PTR_ERR(path);
5388                depth = path->p_depth;
5389                extent = path[depth].p_ext;
5390                if (!extent) {
5391                        EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
5392                                         (unsigned long) *iterator);
5393                        return -EFSCORRUPTED;
5394                }
5395                if (SHIFT == SHIFT_LEFT && *iterator >
5396                    le32_to_cpu(extent->ee_block)) {
5397                        /* Hole, move to the next extent */
5398                        if (extent < EXT_LAST_EXTENT(path[depth].p_hdr)) {
5399                                path[depth].p_ext++;
5400                        } else {
5401                                *iterator = ext4_ext_next_allocated_block(path);
5402                                continue;
5403                        }
5404                }
5405
5406                if (SHIFT == SHIFT_LEFT) {
5407                        extent = EXT_LAST_EXTENT(path[depth].p_hdr);
5408                        *iterator = le32_to_cpu(extent->ee_block) +
5409                                        ext4_ext_get_actual_len(extent);
5410                } else {
5411                        extent = EXT_FIRST_EXTENT(path[depth].p_hdr);
5412                        if (le32_to_cpu(extent->ee_block) > 0)
5413                                *iterator = le32_to_cpu(extent->ee_block) - 1;
5414                        else
5415                                /* Beginning is reached, end of the loop */
5416                                iterator = NULL;
5417                        /* Update path extent in case we need to stop */
5418                        while (le32_to_cpu(extent->ee_block) < start)
5419                                extent++;
5420                        path[depth].p_ext = extent;
5421                }
5422                ret = ext4_ext_shift_path_extents(path, shift, inode,
5423                                handle, SHIFT);
5424                if (ret)
5425                        break;
5426        }
5427out:
5428        ext4_ext_drop_refs(path);
5429        kfree(path);
5430        return ret;
5431}
5432
5433/*
5434 * ext4_collapse_range:
5435 * This implements the fallocate's collapse range functionality for ext4
5436 * Returns: 0 and non-zero on error.
5437 */
5438int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
5439{
5440        struct super_block *sb = inode->i_sb;
5441        ext4_lblk_t punch_start, punch_stop;
5442        handle_t *handle;
5443        unsigned int credits;
5444        loff_t new_size, ioffset;
5445        int ret;
5446
5447        /*
5448         * We need to test this early because xfstests assumes that a
5449         * collapse range of (0, 1) will return EOPNOTSUPP if the file
5450         * system does not support collapse range.
5451         */
5452        if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
5453                return -EOPNOTSUPP;
5454
5455        /* Collapse range works only on fs block size aligned offsets. */
5456        if (offset & (EXT4_CLUSTER_SIZE(sb) - 1) ||
5457            len & (EXT4_CLUSTER_SIZE(sb) - 1))
5458                return -EINVAL;
5459
5460        if (!S_ISREG(inode->i_mode))
5461                return -EINVAL;
5462
5463        trace_ext4_collapse_range(inode, offset, len);
5464
5465        punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb);
5466        punch_stop = (offset + len) >> EXT4_BLOCK_SIZE_BITS(sb);
5467
5468        /* Call ext4_force_commit to flush all data in case of data=journal. */
5469        if (ext4_should_journal_data(inode)) {
5470                ret = ext4_force_commit(inode->i_sb);
5471                if (ret)
5472                        return ret;
5473        }
5474
5475        inode_lock(inode);
5476        /*
5477         * There is no need to overlap collapse range with EOF, in which case
5478         * it is effectively a truncate operation
5479         */
5480        if (offset + len >= i_size_read(inode)) {
5481                ret = -EINVAL;
5482                goto out_mutex;
5483        }
5484
5485        /* Currently just for extent based files */
5486        if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
5487                ret = -EOPNOTSUPP;
5488                goto out_mutex;
5489        }
5490
5491        /* Wait for existing dio to complete */
5492        inode_dio_wait(inode);
5493
5494        /*
5495         * Prevent page faults from reinstantiating pages we have released from
5496         * page cache.
5497         */
5498        down_write(&EXT4_I(inode)->i_mmap_sem);
5499
5500        ret = ext4_break_layouts(inode);
5501        if (ret)
5502                goto out_mmap;
5503
5504        /*
5505         * Need to round down offset to be aligned with page size boundary
5506         * for page size > block size.
5507         */
5508        ioffset = round_down(offset, PAGE_SIZE);
5509        /*
5510         * Write tail of the last page before removed range since it will get
5511         * removed from the page cache below.
5512         */
5513        ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, offset);
5514        if (ret)
5515                goto out_mmap;
5516        /*
5517         * Write data that will be shifted to preserve them when discarding
5518         * page cache below. We are also protected from pages becoming dirty
5519         * by i_mmap_sem.
5520         */
5521        ret = filemap_write_and_wait_range(inode->i_mapping, offset + len,
5522                                           LLONG_MAX);
5523        if (ret)
5524                goto out_mmap;
5525        truncate_pagecache(inode, ioffset);
5526
5527        credits = ext4_writepage_trans_blocks(inode);
5528        handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
5529        if (IS_ERR(handle)) {
5530                ret = PTR_ERR(handle);
5531                goto out_mmap;
5532        }
5533
5534        down_write(&EXT4_I(inode)->i_data_sem);
5535        ext4_discard_preallocations(inode);
5536
5537        ret = ext4_es_remove_extent(inode, punch_start,
5538                                    EXT_MAX_BLOCKS - punch_start);
5539        if (ret) {
5540                up_write(&EXT4_I(inode)->i_data_sem);
5541                goto out_stop;
5542        }
5543
5544        ret = ext4_ext_remove_space(inode, punch_start, punch_stop - 1);
5545        if (ret) {
5546                up_write(&EXT4_I(inode)->i_data_sem);
5547                goto out_stop;
5548        }
5549        ext4_discard_preallocations(inode);
5550
5551        ret = ext4_ext_shift_extents(inode, handle, punch_stop,
5552                                     punch_stop - punch_start, SHIFT_LEFT);
5553        if (ret) {
5554                up_write(&EXT4_I(inode)->i_data_sem);
5555                goto out_stop;
5556        }
5557
5558        new_size = i_size_read(inode) - len;
5559        i_size_write(inode, new_size);
5560        EXT4_I(inode)->i_disksize = new_size;
5561
5562        up_write(&EXT4_I(inode)->i_data_sem);
5563        if (IS_SYNC(inode))
5564                ext4_handle_sync(handle);
5565        inode->i_mtime = inode->i_ctime = current_time(inode);
5566        ext4_mark_inode_dirty(handle, inode);
5567        ext4_update_inode_fsync_trans(handle, inode, 1);
5568
5569out_stop:
5570        ext4_journal_stop(handle);
5571out_mmap:
5572        up_write(&EXT4_I(inode)->i_mmap_sem);
5573out_mutex:
5574        inode_unlock(inode);
5575        return ret;
5576}
5577
5578/*
5579 * ext4_insert_range:
5580 * This function implements the FALLOC_FL_INSERT_RANGE flag of fallocate.
5581 * The data blocks starting from @offset to the EOF are shifted by @len
5582 * towards right to create a hole in the @inode. Inode size is increased
5583 * by len bytes.
5584 * Returns 0 on success, error otherwise.
5585 */
5586int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
5587{
5588        struct super_block *sb = inode->i_sb;
5589        handle_t *handle;
5590        struct ext4_ext_path *path;
5591        struct ext4_extent *extent;
5592        ext4_lblk_t offset_lblk, len_lblk, ee_start_lblk = 0;
5593        unsigned int credits, ee_len;
5594        int ret = 0, depth, split_flag = 0;
5595        loff_t ioffset;
5596
5597        /*
5598         * We need to test this early because xfstests assumes that an
5599         * insert range of (0, 1) will return EOPNOTSUPP if the file
5600         * system does not support insert range.
5601         */
5602        if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
5603                return -EOPNOTSUPP;
5604
5605        /* Insert range works only on fs block size aligned offsets. */
5606        if (offset & (EXT4_CLUSTER_SIZE(sb) - 1) ||
5607                        len & (EXT4_CLUSTER_SIZE(sb) - 1))
5608                return -EINVAL;
5609
5610        if (!S_ISREG(inode->i_mode))
5611                return -EOPNOTSUPP;
5612
5613        trace_ext4_insert_range(inode, offset, len);
5614
5615        offset_lblk = offset >> EXT4_BLOCK_SIZE_BITS(sb);
5616        len_lblk = len >> EXT4_BLOCK_SIZE_BITS(sb);
5617
5618        /* Call ext4_force_commit to flush all data in case of data=journal */
5619        if (ext4_should_journal_data(inode)) {
5620                ret = ext4_force_commit(inode->i_sb);
5621                if (ret)
5622                        return ret;
5623        }
5624
5625        inode_lock(inode);
5626        /* Currently just for extent based files */
5627        if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
5628                ret = -EOPNOTSUPP;
5629                goto out_mutex;
5630        }
5631
5632        /* Check for wrap through zero */
5633        if (inode->i_size + len > inode->i_sb->s_maxbytes) {
5634                ret = -EFBIG;
5635                goto out_mutex;
5636        }
5637
5638        /* Offset should be less than i_size */
5639        if (offset >= i_size_read(inode)) {
5640                ret = -EINVAL;
5641                goto out_mutex;
5642        }
5643
5644        /* Wait for existing dio to complete */
5645        inode_dio_wait(inode);
5646
5647        /*
5648         * Prevent page faults from reinstantiating pages we have released from
5649         * page cache.
5650         */
5651        down_write(&EXT4_I(inode)->i_mmap_sem);
5652
5653        ret = ext4_break_layouts(inode);
5654        if (ret)
5655                goto out_mmap;
5656
5657        /*
5658         * Need to round down to align start offset to page size boundary
5659         * for page size > block size.
5660         */
5661        ioffset = round_down(offset, PAGE_SIZE);
5662        /* Write out all dirty pages */
5663        ret = filemap_write_and_wait_range(inode->i_mapping, ioffset,
5664                        LLONG_MAX);
5665        if (ret)
5666                goto out_mmap;
5667        truncate_pagecache(inode, ioffset);
5668
5669        credits = ext4_writepage_trans_blocks(inode);
5670        handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
5671        if (IS_ERR(handle)) {
5672                ret = PTR_ERR(handle);
5673                goto out_mmap;
5674        }
5675
5676        /* Expand file to avoid data loss if there is error while shifting */
5677        inode->i_size += len;
5678        EXT4_I(inode)->i_disksize += len;
5679        inode->i_mtime = inode->i_ctime = current_time(inode);
5680        ret = ext4_mark_inode_dirty(handle, inode);
5681        if (ret)
5682                goto out_stop;
5683
5684        down_write(&EXT4_I(inode)->i_data_sem);
5685        ext4_discard_preallocations(inode);
5686
5687        path = ext4_find_extent(inode, offset_lblk, NULL, 0);
5688        if (IS_ERR(path)) {
5689                up_write(&EXT4_I(inode)->i_data_sem);
5690                goto out_stop;
5691        }
5692
5693        depth = ext_depth(inode);
5694        extent = path[depth].p_ext;
5695        if (extent) {
5696                ee_start_lblk = le32_to_cpu(extent->ee_block);
5697                ee_len = ext4_ext_get_actual_len(extent);
5698
5699                /*
5700                 * If offset_lblk is not the starting block of extent, split
5701                 * the extent @offset_lblk
5702                 */
5703                if ((offset_lblk > ee_start_lblk) &&
5704                                (offset_lblk < (ee_start_lblk + ee_len))) {
5705                        if (ext4_ext_is_unwritten(extent))
5706                                split_flag = EXT4_EXT_MARK_UNWRIT1 |
5707                                        EXT4_EXT_MARK_UNWRIT2;
5708                        ret = ext4_split_extent_at(handle, inode, &path,
5709                                        offset_lblk, split_flag,
5710                                        EXT4_EX_NOCACHE |
5711                                        EXT4_GET_BLOCKS_PRE_IO |
5712                                        EXT4_GET_BLOCKS_METADATA_NOFAIL);
5713                }
5714
5715                ext4_ext_drop_refs(path);
5716                kfree(path);
5717                if (ret < 0) {
5718                        up_write(&EXT4_I(inode)->i_data_sem);
5719                        goto out_stop;
5720                }
5721        } else {
5722                ext4_ext_drop_refs(path);
5723                kfree(path);
5724        }
5725
5726        ret = ext4_es_remove_extent(inode, offset_lblk,
5727                        EXT_MAX_BLOCKS - offset_lblk);
5728        if (ret) {
5729                up_write(&EXT4_I(inode)->i_data_sem);
5730                goto out_stop;
5731        }
5732
5733        /*
5734         * if offset_lblk lies in a hole which is at start of file, use
5735         * ee_start_lblk to shift extents
5736         */
5737        ret = ext4_ext_shift_extents(inode, handle,
5738                ee_start_lblk > offset_lblk ? ee_start_lblk : offset_lblk,
5739                len_lblk, SHIFT_RIGHT);
5740
5741        up_write(&EXT4_I(inode)->i_data_sem);
5742        if (IS_SYNC(inode))
5743                ext4_handle_sync(handle);
5744        if (ret >= 0)
5745                ext4_update_inode_fsync_trans(handle, inode, 1);
5746
5747out_stop:
5748        ext4_journal_stop(handle);
5749out_mmap:
5750        up_write(&EXT4_I(inode)->i_mmap_sem);
5751out_mutex:
5752        inode_unlock(inode);
5753        return ret;
5754}
5755
5756/**
5757 * ext4_swap_extents - Swap extents between two inodes
5758 *
5759 * @inode1:     First inode
5760 * @inode2:     Second inode
5761 * @lblk1:      Start block for first inode
5762 * @lblk2:      Start block for second inode
5763 * @count:      Number of blocks to swap
5764 * @unwritten: Mark second inode's extents as unwritten after swap
5765 * @erp:        Pointer to save error value
5766 *
5767 * This helper routine does exactly what is promise "swap extents". All other
5768 * stuff such as page-cache locking consistency, bh mapping consistency or
5769 * extent's data copying must be performed by caller.
5770 * Locking:
5771 *              i_mutex is held for both inodes
5772 *              i_data_sem is locked for write for both inodes
5773 * Assumptions:
5774 *              All pages from requested range are locked for both inodes
5775 */
5776int
5777ext4_swap_extents(handle_t *handle, struct inode *inode1,
5778                  struct inode *inode2, ext4_lblk_t lblk1, ext4_lblk_t lblk2,
5779                  ext4_lblk_t count, int unwritten, int *erp)
5780{
5781        struct ext4_ext_path *path1 = NULL;
5782        struct ext4_ext_path *path2 = NULL;
5783        int replaced_count = 0;
5784
5785        BUG_ON(!rwsem_is_locked(&EXT4_I(inode1)->i_data_sem));
5786        BUG_ON(!rwsem_is_locked(&EXT4_I(inode2)->i_data_sem));
5787        BUG_ON(!inode_is_locked(inode1));
5788        BUG_ON(!inode_is_locked(inode2));
5789
5790        *erp = ext4_es_remove_extent(inode1, lblk1, count);
5791        if (unlikely(*erp))
5792                return 0;
5793        *erp = ext4_es_remove_extent(inode2, lblk2, count);
5794        if (unlikely(*erp))
5795                return 0;
5796
5797        while (count) {
5798                struct ext4_extent *ex1, *ex2, tmp_ex;
5799                ext4_lblk_t e1_blk, e2_blk;
5800                int e1_len, e2_len, len;
5801                int split = 0;
5802
5803                path1 = ext4_find_extent(inode1, lblk1, NULL, EXT4_EX_NOCACHE);
5804                if (IS_ERR(path1)) {
5805                        *erp = PTR_ERR(path1);
5806                        path1 = NULL;
5807                finish:
5808                        count = 0;
5809                        goto repeat;
5810                }
5811                path2 = ext4_find_extent(inode2, lblk2, NULL, EXT4_EX_NOCACHE);
5812                if (IS_ERR(path2)) {
5813                        *erp = PTR_ERR(path2);
5814                        path2 = NULL;
5815                        goto finish;
5816                }
5817                ex1 = path1[path1->p_depth].p_ext;
5818                ex2 = path2[path2->p_depth].p_ext;
5819                /* Do we have somthing to swap ? */
5820                if (unlikely(!ex2 || !ex1))
5821                        goto finish;
5822
5823                e1_blk = le32_to_cpu(ex1->ee_block);
5824                e2_blk = le32_to_cpu(ex2->ee_block);
5825                e1_len = ext4_ext_get_actual_len(ex1);
5826                e2_len = ext4_ext_get_actual_len(ex2);
5827
5828                /* Hole handling */
5829                if (!in_range(lblk1, e1_blk, e1_len) ||
5830                    !in_range(lblk2, e2_blk, e2_len)) {
5831                        ext4_lblk_t next1, next2;
5832
5833                        /* if hole after extent, then go to next extent */
5834                        next1 = ext4_ext_next_allocated_block(path1);
5835                        next2 = ext4_ext_next_allocated_block(path2);
5836                        /* If hole before extent, then shift to that extent */
5837                        if (e1_blk > lblk1)
5838                                next1 = e1_blk;
5839                        if (e2_blk > lblk2)
5840                                next2 = e2_blk;
5841                        /* Do we have something to swap */
5842                        if (next1 == EXT_MAX_BLOCKS || next2 == EXT_MAX_BLOCKS)
5843                                goto finish;
5844                        /* Move to the rightest boundary */
5845                        len = next1 - lblk1;
5846                        if (len < next2 - lblk2)
5847                                len = next2 - lblk2;
5848                        if (len > count)
5849                                len = count;
5850                        lblk1 += len;
5851                        lblk2 += len;
5852                        count -= len;
5853                        goto repeat;
5854                }
5855
5856                /* Prepare left boundary */
5857                if (e1_blk < lblk1) {
5858                        split = 1;
5859                        *erp = ext4_force_split_extent_at(handle, inode1,
5860                                                &path1, lblk1, 0);
5861                        if (unlikely(*erp))
5862                                goto finish;
5863                }
5864                if (e2_blk < lblk2) {
5865                        split = 1;
5866                        *erp = ext4_force_split_extent_at(handle, inode2,
5867                                                &path2,  lblk2, 0);
5868                        if (unlikely(*erp))
5869                                goto finish;
5870                }
5871                /* ext4_split_extent_at() may result in leaf extent split,
5872                 * path must to be revalidated. */
5873                if (split)
5874                        goto repeat;
5875
5876                /* Prepare right boundary */
5877                len = count;
5878                if (len > e1_blk + e1_len - lblk1)
5879                        len = e1_blk + e1_len - lblk1;
5880                if (len > e2_blk + e2_len - lblk2)
5881                        len = e2_blk + e2_len - lblk2;
5882
5883                if (len != e1_len) {
5884                        split = 1;
5885                        *erp = ext4_force_split_extent_at(handle, inode1,
5886                                                &path1, lblk1 + len, 0);
5887                        if (unlikely(*erp))
5888                                goto finish;
5889                }
5890                if (len != e2_len) {
5891                        split = 1;
5892                        *erp = ext4_force_split_extent_at(handle, inode2,
5893                                                &path2, lblk2 + len, 0);
5894                        if (*erp)
5895                                goto finish;
5896                }
5897                /* ext4_split_extent_at() may result in leaf extent split,
5898                 * path must to be revalidated. */
5899                if (split)
5900                        goto repeat;
5901
5902                BUG_ON(e2_len != e1_len);
5903                *erp = ext4_ext_get_access(handle, inode1, path1 + path1->p_depth);
5904                if (unlikely(*erp))
5905                        goto finish;
5906                *erp = ext4_ext_get_access(handle, inode2, path2 + path2->p_depth);
5907                if (unlikely(*erp))
5908                        goto finish;
5909
5910                /* Both extents are fully inside boundaries. Swap it now */
5911                tmp_ex = *ex1;
5912                ext4_ext_store_pblock(ex1, ext4_ext_pblock(ex2));
5913                ext4_ext_store_pblock(ex2, ext4_ext_pblock(&tmp_ex));
5914                ex1->ee_len = cpu_to_le16(e2_len);
5915                ex2->ee_len = cpu_to_le16(e1_len);
5916                if (unwritten)
5917                        ext4_ext_mark_unwritten(ex2);
5918                if (ext4_ext_is_unwritten(&tmp_ex))
5919                        ext4_ext_mark_unwritten(ex1);
5920
5921                ext4_ext_try_to_merge(handle, inode2, path2, ex2);
5922                ext4_ext_try_to_merge(handle, inode1, path1, ex1);
5923                *erp = ext4_ext_dirty(handle, inode2, path2 +
5924                                      path2->p_depth);
5925                if (unlikely(*erp))
5926                        goto finish;
5927                *erp = ext4_ext_dirty(handle, inode1, path1 +
5928                                      path1->p_depth);
5929                /*
5930                 * Looks scarry ah..? second inode already points to new blocks,
5931                 * and it was successfully dirtied. But luckily error may happen
5932                 * only due to journal error, so full transaction will be
5933                 * aborted anyway.
5934                 */
5935                if (unlikely(*erp))
5936                        goto finish;
5937                lblk1 += len;
5938                lblk2 += len;
5939                replaced_count += len;
5940                count -= len;
5941
5942        repeat:
5943                ext4_ext_drop_refs(path1);
5944                kfree(path1);
5945                ext4_ext_drop_refs(path2);
5946                kfree(path2);
5947                path1 = path2 = NULL;
5948        }
5949        return replaced_count;
5950}
5951
5952/*
5953 * ext4_clu_mapped - determine whether any block in a logical cluster has
5954 *                   been mapped to a physical cluster
5955 *
5956 * @inode - file containing the logical cluster
5957 * @lclu - logical cluster of interest
5958 *
5959 * Returns 1 if any block in the logical cluster is mapped, signifying
5960 * that a physical cluster has been allocated for it.  Otherwise,
5961 * returns 0.  Can also return negative error codes.  Derived from
5962 * ext4_ext_map_blocks().
5963 */
5964int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu)
5965{
5966        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
5967        struct ext4_ext_path *path;
5968        int depth, mapped = 0, err = 0;
5969        struct ext4_extent *extent;
5970        ext4_lblk_t first_lblk, first_lclu, last_lclu;
5971
5972        /* search for the extent closest to the first block in the cluster */
5973        path = ext4_find_extent(inode, EXT4_C2B(sbi, lclu), NULL, 0);
5974        if (IS_ERR(path)) {
5975                err = PTR_ERR(path);
5976                path = NULL;
5977                goto out;
5978        }
5979
5980        depth = ext_depth(inode);
5981
5982        /*
5983         * A consistent leaf must not be empty.  This situation is possible,
5984         * though, _during_ tree modification, and it's why an assert can't
5985         * be put in ext4_find_extent().
5986         */
5987        if (unlikely(path[depth].p_ext == NULL && depth != 0)) {
5988                EXT4_ERROR_INODE(inode,
5989                    "bad extent address - lblock: %lu, depth: %d, pblock: %lld",
5990                                 (unsigned long) EXT4_C2B(sbi, lclu),
5991                                 depth, path[depth].p_block);
5992                err = -EFSCORRUPTED;
5993                goto out;
5994        }
5995
5996        extent = path[depth].p_ext;
5997
5998        /* can't be mapped if the extent tree is empty */
5999        if (extent == NULL)
6000                goto out;
6001
6002        first_lblk = le32_to_cpu(extent->ee_block);
6003        first_lclu = EXT4_B2C(sbi, first_lblk);
6004
6005        /*
6006         * Three possible outcomes at this point - found extent spanning
6007         * the target cluster, to the left of the target cluster, or to the
6008         * right of the target cluster.  The first two cases are handled here.
6009         * The last case indicates the target cluster is not mapped.
6010         */
6011        if (lclu >= first_lclu) {
6012                last_lclu = EXT4_B2C(sbi, first_lblk +
6013                                     ext4_ext_get_actual_len(extent) - 1);
6014                if (lclu <= last_lclu) {
6015                        mapped = 1;
6016                } else {
6017                        first_lblk = ext4_ext_next_allocated_block(path);
6018                        first_lclu = EXT4_B2C(sbi, first_lblk);
6019                        if (lclu == first_lclu)
6020                                mapped = 1;
6021                }
6022        }
6023
6024out:
6025        ext4_ext_drop_refs(path);
6026        kfree(path);
6027
6028        return err ? err : mapped;
6029}
6030