linux/fs/ext4/extents.c
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
   4 * Written by Alex Tomas <alex@clusterfs.com>
   5 *
   6 * Architecture independence:
   7 *   Copyright (c) 2005, Bull S.A.
   8 *   Written by Pierre Peiffer <pierre.peiffer@bull.net>
   9 */
  10
  11/*
  12 * Extents support for EXT4
  13 *
  14 * TODO:
  15 *   - ext4*_error() should be used in some situations
  16 *   - analyze all BUG()/BUG_ON(), use -EIO where appropriate
  17 *   - smart tree reduction
  18 */
  19
  20#include <linux/fs.h>
  21#include <linux/time.h>
  22#include <linux/jbd2.h>
  23#include <linux/highuid.h>
  24#include <linux/pagemap.h>
  25#include <linux/quotaops.h>
  26#include <linux/string.h>
  27#include <linux/slab.h>
  28#include <linux/uaccess.h>
  29#include <linux/fiemap.h>
  30#include <linux/backing-dev.h>
  31#include <linux/iomap.h>
  32#include "ext4_jbd2.h"
  33#include "ext4_extents.h"
  34#include "xattr.h"
  35
  36#include <trace/events/ext4.h>
  37
  38/*
  39 * used by extent splitting.
  40 */
  41#define EXT4_EXT_MAY_ZEROOUT    0x1  /* safe to zeroout if split fails \
  42                                        due to ENOSPC */
  43#define EXT4_EXT_MARK_UNWRIT1   0x2  /* mark first half unwritten */
  44#define EXT4_EXT_MARK_UNWRIT2   0x4  /* mark second half unwritten */
  45
  46#define EXT4_EXT_DATA_VALID1    0x8  /* first half contains valid data */
  47#define EXT4_EXT_DATA_VALID2    0x10 /* second half contains valid data */
  48
  49static __le32 ext4_extent_block_csum(struct inode *inode,
  50                                     struct ext4_extent_header *eh)
  51{
  52        struct ext4_inode_info *ei = EXT4_I(inode);
  53        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
  54        __u32 csum;
  55
  56        csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)eh,
  57                           EXT4_EXTENT_TAIL_OFFSET(eh));
  58        return cpu_to_le32(csum);
  59}
  60
  61static int ext4_extent_block_csum_verify(struct inode *inode,
  62                                         struct ext4_extent_header *eh)
  63{
  64        struct ext4_extent_tail *et;
  65
  66        if (!ext4_has_metadata_csum(inode->i_sb))
  67                return 1;
  68
  69        et = find_ext4_extent_tail(eh);
  70        if (et->et_checksum != ext4_extent_block_csum(inode, eh))
  71                return 0;
  72        return 1;
  73}
  74
  75static void ext4_extent_block_csum_set(struct inode *inode,
  76                                       struct ext4_extent_header *eh)
  77{
  78        struct ext4_extent_tail *et;
  79
  80        if (!ext4_has_metadata_csum(inode->i_sb))
  81                return;
  82
  83        et = find_ext4_extent_tail(eh);
  84        et->et_checksum = ext4_extent_block_csum(inode, eh);
  85}
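/*
 * Note: the checksum above covers the header and all entries up to
 * EXT4_EXTENT_TAIL_OFFSET() and lives in the ext4_extent_tail at the end of
 * the block.  It is recomputed whenever an on-disk tree block is about to be
 * marked dirty (see __ext4_ext_dirty() and ext4_ext_split() below) and is
 * verified via __ext4_ext_check() when a non-root tree block is read in.
 */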
  86
  87static int ext4_split_extent_at(handle_t *handle,
  88                             struct inode *inode,
  89                             struct ext4_ext_path **ppath,
  90                             ext4_lblk_t split,
  91                             int split_flag,
  92                             int flags);
  93
  94static int ext4_ext_trunc_restart_fn(struct inode *inode, int *dropped)
  95{
  96        /*
  97         * Drop i_data_sem to avoid deadlock with ext4_map_blocks.  At this
  98         * moment, get_block can be called only for blocks inside i_size since
   99         * page cache has already been dropped and writes are blocked by
 100         * i_mutex. So we can safely drop the i_data_sem here.
 101         */
 102        BUG_ON(EXT4_JOURNAL(inode) == NULL);
 103        ext4_discard_preallocations(inode, 0);
 104        up_write(&EXT4_I(inode)->i_data_sem);
 105        *dropped = 1;
 106        return 0;
 107}
 108
 109/*
  110 * Make sure 'handle' has at least 'check_cred' credits. If not, restart the
  111 * transaction with 'restart_cred' credits. The function drops i_data_sem
  112 * when restarting the transaction and takes it again after the restart.
 113 *
 114 * The function returns 0 on success, 1 if transaction had to be restarted,
 115 * and < 0 in case of fatal error.
 116 */
 117int ext4_datasem_ensure_credits(handle_t *handle, struct inode *inode,
 118                                int check_cred, int restart_cred,
 119                                int revoke_cred)
 120{
 121        int ret;
 122        int dropped = 0;
 123
 124        ret = ext4_journal_ensure_credits_fn(handle, check_cred, restart_cred,
 125                revoke_cred, ext4_ext_trunc_restart_fn(inode, &dropped));
 126        if (dropped)
 127                down_write(&EXT4_I(inode)->i_data_sem);
 128        return ret;
 129}
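/*
 * Typical caller pattern (a sketch, not an actual call site in this file):
 * the caller holds i_data_sem for writing and must treat a positive return
 * value as "the transaction was restarted", which means i_data_sem was
 * dropped and re-taken, so any cached extent path has to be revalidated:
 *
 *	err = ext4_datasem_ensure_credits(handle, inode, check_cred,
 *					  restart_cred, revoke_cred);
 *	if (err < 0)
 *		return err;
 *	if (err > 0)
 *		goto again;	// transaction restarted: re-lookup the path
 */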
 130
 131/*
 132 * could return:
 133 *  - EROFS
 134 *  - ENOMEM
 135 */
 136static int ext4_ext_get_access(handle_t *handle, struct inode *inode,
 137                                struct ext4_ext_path *path)
 138{
 139        if (path->p_bh) {
 140                /* path points to block */
 141                BUFFER_TRACE(path->p_bh, "get_write_access");
 142                return ext4_journal_get_write_access(handle, inode->i_sb,
 143                                                     path->p_bh, EXT4_JTR_NONE);
 144        }
 145        /* path points to leaf/index in inode body */
 146        /* we use in-core data, no need to protect them */
 147        return 0;
 148}
 149
 150/*
 151 * could return:
 152 *  - EROFS
 153 *  - ENOMEM
 154 *  - EIO
 155 */
 156static int __ext4_ext_dirty(const char *where, unsigned int line,
 157                            handle_t *handle, struct inode *inode,
 158                            struct ext4_ext_path *path)
 159{
 160        int err;
 161
 162        WARN_ON(!rwsem_is_locked(&EXT4_I(inode)->i_data_sem));
 163        if (path->p_bh) {
 164                ext4_extent_block_csum_set(inode, ext_block_hdr(path->p_bh));
 165                /* path points to block */
 166                err = __ext4_handle_dirty_metadata(where, line, handle,
 167                                                   inode, path->p_bh);
 168        } else {
 169                /* path points to leaf/index in inode body */
 170                err = ext4_mark_inode_dirty(handle, inode);
 171        }
 172        return err;
 173}
 174
 175#define ext4_ext_dirty(handle, inode, path) \
 176                __ext4_ext_dirty(__func__, __LINE__, (handle), (inode), (path))
 177
 178static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
 179                              struct ext4_ext_path *path,
 180                              ext4_lblk_t block)
 181{
 182        if (path) {
 183                int depth = path->p_depth;
 184                struct ext4_extent *ex;
 185
 186                /*
 187                 * Try to predict block placement assuming that we are
 188                 * filling in a file which will eventually be
 189                 * non-sparse --- i.e., in the case of libbfd writing
  190                 * ELF object sections out-of-order but in a way
  191                 * that eventually results in a contiguous object or
 192                 * executable file, or some database extending a table
 193                 * space file.  However, this is actually somewhat
 194                 * non-ideal if we are writing a sparse file such as
 195                 * qemu or KVM writing a raw image file that is going
 196                 * to stay fairly sparse, since it will end up
 197                 * fragmenting the file system's free space.  Maybe we
  198                 * should have some heuristics or some way to allow
  199                 * userspace to pass a hint to the file system,
 200                 * especially if the latter case turns out to be
 201                 * common.
 202                 */
 203                ex = path[depth].p_ext;
 204                if (ex) {
 205                        ext4_fsblk_t ext_pblk = ext4_ext_pblock(ex);
 206                        ext4_lblk_t ext_block = le32_to_cpu(ex->ee_block);
 207
 208                        if (block > ext_block)
 209                                return ext_pblk + (block - ext_block);
 210                        else
 211                                return ext_pblk - (ext_block - block);
 212                }
 213
  214                /* it looks like the index is empty;
  215                 * try to find the starting block from the index itself */
 216                if (path[depth].p_bh)
 217                        return path[depth].p_bh->b_blocknr;
 218        }
 219
 220        /* OK. use inode's group */
 221        return ext4_inode_to_goal_block(inode);
 222}
 223
 224/*
  225 * Allocation for a metadata block
 226 */
 227static ext4_fsblk_t
 228ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
 229                        struct ext4_ext_path *path,
 230                        struct ext4_extent *ex, int *err, unsigned int flags)
 231{
 232        ext4_fsblk_t goal, newblock;
 233
 234        goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
 235        newblock = ext4_new_meta_blocks(handle, inode, goal, flags,
 236                                        NULL, err);
 237        return newblock;
 238}
 239
 240static inline int ext4_ext_space_block(struct inode *inode, int check)
 241{
 242        int size;
 243
 244        size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
 245                        / sizeof(struct ext4_extent);
 246#ifdef AGGRESSIVE_TEST
 247        if (!check && size > 6)
 248                size = 6;
 249#endif
 250        return size;
 251}
 252
 253static inline int ext4_ext_space_block_idx(struct inode *inode, int check)
 254{
 255        int size;
 256
 257        size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
 258                        / sizeof(struct ext4_extent_idx);
 259#ifdef AGGRESSIVE_TEST
 260        if (!check && size > 5)
 261                size = 5;
 262#endif
 263        return size;
 264}
 265
 266static inline int ext4_ext_space_root(struct inode *inode, int check)
 267{
 268        int size;
 269
 270        size = sizeof(EXT4_I(inode)->i_data);
 271        size -= sizeof(struct ext4_extent_header);
 272        size /= sizeof(struct ext4_extent);
 273#ifdef AGGRESSIVE_TEST
 274        if (!check && size > 3)
 275                size = 3;
 276#endif
 277        return size;
 278}
 279
 280static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
 281{
 282        int size;
 283
 284        size = sizeof(EXT4_I(inode)->i_data);
 285        size -= sizeof(struct ext4_extent_header);
 286        size /= sizeof(struct ext4_extent_idx);
 287#ifdef AGGRESSIVE_TEST
 288        if (!check && size > 4)
 289                size = 4;
 290#endif
 291        return size;
 292}
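/*
 * Worked example for the four helpers above (assuming a 4KiB block size and
 * AGGRESSIVE_TEST disabled): struct ext4_extent_header, struct ext4_extent
 * and struct ext4_extent_idx are 12 bytes each, and i_data in the inode is
 * 60 bytes, so an external tree block holds (4096 - 12) / 12 = 340 extents
 * or indexes, while the in-inode root holds (60 - 12) / 12 = 4 entries.
 */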
 293
 294static inline int
 295ext4_force_split_extent_at(handle_t *handle, struct inode *inode,
 296                           struct ext4_ext_path **ppath, ext4_lblk_t lblk,
 297                           int nofail)
 298{
 299        struct ext4_ext_path *path = *ppath;
 300        int unwritten = ext4_ext_is_unwritten(path[path->p_depth].p_ext);
 301        int flags = EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_PRE_IO;
 302
 303        if (nofail)
 304                flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL | EXT4_EX_NOFAIL;
 305
 306        return ext4_split_extent_at(handle, inode, ppath, lblk, unwritten ?
 307                        EXT4_EXT_MARK_UNWRIT1|EXT4_EXT_MARK_UNWRIT2 : 0,
 308                        flags);
 309}
 310
 311static int
 312ext4_ext_max_entries(struct inode *inode, int depth)
 313{
 314        int max;
 315
 316        if (depth == ext_depth(inode)) {
 317                if (depth == 0)
 318                        max = ext4_ext_space_root(inode, 1);
 319                else
 320                        max = ext4_ext_space_root_idx(inode, 1);
 321        } else {
 322                if (depth == 0)
 323                        max = ext4_ext_space_block(inode, 1);
 324                else
 325                        max = ext4_ext_space_block_idx(inode, 1);
 326        }
 327
 328        return max;
 329}
 330
 331static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
 332{
 333        ext4_fsblk_t block = ext4_ext_pblock(ext);
 334        int len = ext4_ext_get_actual_len(ext);
 335        ext4_lblk_t lblock = le32_to_cpu(ext->ee_block);
 336
 337        /*
 338         * We allow neither:
 339         *  - zero length
 340         *  - overflow/wrap-around
 341         */
 342        if (lblock + len <= lblock)
 343                return 0;
 344        return ext4_inode_block_valid(inode, block, len);
 345}
 346
 347static int ext4_valid_extent_idx(struct inode *inode,
 348                                struct ext4_extent_idx *ext_idx)
 349{
 350        ext4_fsblk_t block = ext4_idx_pblock(ext_idx);
 351
 352        return ext4_inode_block_valid(inode, block, 1);
 353}
 354
 355static int ext4_valid_extent_entries(struct inode *inode,
 356                                     struct ext4_extent_header *eh,
 357                                     ext4_fsblk_t *pblk, int depth)
 358{
 359        unsigned short entries;
 360        if (eh->eh_entries == 0)
 361                return 1;
 362
 363        entries = le16_to_cpu(eh->eh_entries);
 364
 365        if (depth == 0) {
 366                /* leaf entries */
 367                struct ext4_extent *ext = EXT_FIRST_EXTENT(eh);
 368                ext4_lblk_t lblock = 0;
 369                ext4_lblk_t prev = 0;
 370                int len = 0;
 371                while (entries) {
 372                        if (!ext4_valid_extent(inode, ext))
 373                                return 0;
 374
 375                        /* Check for overlapping extents */
 376                        lblock = le32_to_cpu(ext->ee_block);
 377                        len = ext4_ext_get_actual_len(ext);
 378                        if ((lblock <= prev) && prev) {
 379                                *pblk = ext4_ext_pblock(ext);
 380                                return 0;
 381                        }
 382                        ext++;
 383                        entries--;
 384                        prev = lblock + len - 1;
 385                }
 386        } else {
 387                struct ext4_extent_idx *ext_idx = EXT_FIRST_INDEX(eh);
 388                while (entries) {
 389                        if (!ext4_valid_extent_idx(inode, ext_idx))
 390                                return 0;
 391                        ext_idx++;
 392                        entries--;
 393                }
 394        }
 395        return 1;
 396}
 397
 398static int __ext4_ext_check(const char *function, unsigned int line,
 399                            struct inode *inode, struct ext4_extent_header *eh,
 400                            int depth, ext4_fsblk_t pblk)
 401{
 402        const char *error_msg;
 403        int max = 0, err = -EFSCORRUPTED;
 404
 405        if (unlikely(eh->eh_magic != EXT4_EXT_MAGIC)) {
 406                error_msg = "invalid magic";
 407                goto corrupted;
 408        }
 409        if (unlikely(le16_to_cpu(eh->eh_depth) != depth)) {
 410                error_msg = "unexpected eh_depth";
 411                goto corrupted;
 412        }
 413        if (unlikely(eh->eh_max == 0)) {
 414                error_msg = "invalid eh_max";
 415                goto corrupted;
 416        }
 417        max = ext4_ext_max_entries(inode, depth);
 418        if (unlikely(le16_to_cpu(eh->eh_max) > max)) {
 419                error_msg = "too large eh_max";
 420                goto corrupted;
 421        }
 422        if (unlikely(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max))) {
 423                error_msg = "invalid eh_entries";
 424                goto corrupted;
 425        }
 426        if (!ext4_valid_extent_entries(inode, eh, &pblk, depth)) {
 427                error_msg = "invalid extent entries";
 428                goto corrupted;
 429        }
 430        if (unlikely(depth > 32)) {
 431                error_msg = "too large eh_depth";
 432                goto corrupted;
 433        }
 434        /* Verify checksum on non-root extent tree nodes */
 435        if (ext_depth(inode) != depth &&
 436            !ext4_extent_block_csum_verify(inode, eh)) {
 437                error_msg = "extent tree corrupted";
 438                err = -EFSBADCRC;
 439                goto corrupted;
 440        }
 441        return 0;
 442
 443corrupted:
 444        ext4_error_inode_err(inode, function, line, 0, -err,
 445                             "pblk %llu bad header/extent: %s - magic %x, "
 446                             "entries %u, max %u(%u), depth %u(%u)",
 447                             (unsigned long long) pblk, error_msg,
 448                             le16_to_cpu(eh->eh_magic),
 449                             le16_to_cpu(eh->eh_entries),
 450                             le16_to_cpu(eh->eh_max),
 451                             max, le16_to_cpu(eh->eh_depth), depth);
 452        return err;
 453}
 454
 455#define ext4_ext_check(inode, eh, depth, pblk)                  \
 456        __ext4_ext_check(__func__, __LINE__, (inode), (eh), (depth), (pblk))
 457
 458int ext4_ext_check_inode(struct inode *inode)
 459{
 460        return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode), 0);
 461}
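/*
 * A pblk of 0 here denotes the root node kept in the inode body; on-disk
 * tree blocks pass their real physical block number so that the error
 * message in __ext4_ext_check() can point at the corrupted block.
 */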
 462
 463static void ext4_cache_extents(struct inode *inode,
 464                               struct ext4_extent_header *eh)
 465{
 466        struct ext4_extent *ex = EXT_FIRST_EXTENT(eh);
 467        ext4_lblk_t prev = 0;
 468        int i;
 469
 470        for (i = le16_to_cpu(eh->eh_entries); i > 0; i--, ex++) {
 471                unsigned int status = EXTENT_STATUS_WRITTEN;
 472                ext4_lblk_t lblk = le32_to_cpu(ex->ee_block);
 473                int len = ext4_ext_get_actual_len(ex);
 474
 475                if (prev && (prev != lblk))
 476                        ext4_es_cache_extent(inode, prev, lblk - prev, ~0,
 477                                             EXTENT_STATUS_HOLE);
 478
 479                if (ext4_ext_is_unwritten(ex))
 480                        status = EXTENT_STATUS_UNWRITTEN;
 481                ext4_es_cache_extent(inode, lblk, len,
 482                                     ext4_ext_pblock(ex), status);
 483                prev = lblk + len;
 484        }
 485}
 486
 487static struct buffer_head *
 488__read_extent_tree_block(const char *function, unsigned int line,
 489                         struct inode *inode, ext4_fsblk_t pblk, int depth,
 490                         int flags)
 491{
 492        struct buffer_head              *bh;
 493        int                             err;
 494        gfp_t                           gfp_flags = __GFP_MOVABLE | GFP_NOFS;
 495
 496        if (flags & EXT4_EX_NOFAIL)
 497                gfp_flags |= __GFP_NOFAIL;
 498
 499        bh = sb_getblk_gfp(inode->i_sb, pblk, gfp_flags);
 500        if (unlikely(!bh))
 501                return ERR_PTR(-ENOMEM);
 502
 503        if (!bh_uptodate_or_lock(bh)) {
 504                trace_ext4_ext_load_extent(inode, pblk, _RET_IP_);
 505                err = ext4_read_bh(bh, 0, NULL);
 506                if (err < 0)
 507                        goto errout;
 508        }
 509        if (buffer_verified(bh) && !(flags & EXT4_EX_FORCE_CACHE))
 510                return bh;
 511        err = __ext4_ext_check(function, line, inode,
 512                               ext_block_hdr(bh), depth, pblk);
 513        if (err)
 514                goto errout;
 515        set_buffer_verified(bh);
 516        /*
 517         * If this is a leaf block, cache all of its entries
 518         */
 519        if (!(flags & EXT4_EX_NOCACHE) && depth == 0) {
 520                struct ext4_extent_header *eh = ext_block_hdr(bh);
 521                ext4_cache_extents(inode, eh);
 522        }
 523        return bh;
 524errout:
 525        put_bh(bh);
 526        return ERR_PTR(err);
 527
 528}
 529
 530#define read_extent_tree_block(inode, pblk, depth, flags)               \
 531        __read_extent_tree_block(__func__, __LINE__, (inode), (pblk),   \
 532                                 (depth), (flags))
 533
 534/*
 535 * This function is called to cache a file's extent information in the
 536 * extent status tree
 537 */
 538int ext4_ext_precache(struct inode *inode)
 539{
 540        struct ext4_inode_info *ei = EXT4_I(inode);
 541        struct ext4_ext_path *path = NULL;
 542        struct buffer_head *bh;
 543        int i = 0, depth, ret = 0;
 544
 545        if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
 546                return 0;       /* not an extent-mapped inode */
 547
 548        down_read(&ei->i_data_sem);
 549        depth = ext_depth(inode);
 550
 551        /* Don't cache anything if there are no external extent blocks */
 552        if (!depth) {
 553                up_read(&ei->i_data_sem);
 554                return ret;
 555        }
 556
 557        path = kcalloc(depth + 1, sizeof(struct ext4_ext_path),
 558                       GFP_NOFS);
 559        if (path == NULL) {
 560                up_read(&ei->i_data_sem);
 561                return -ENOMEM;
 562        }
 563
 564        path[0].p_hdr = ext_inode_hdr(inode);
 565        ret = ext4_ext_check(inode, path[0].p_hdr, depth, 0);
 566        if (ret)
 567                goto out;
 568        path[0].p_idx = EXT_FIRST_INDEX(path[0].p_hdr);
 569        while (i >= 0) {
 570                /*
 571                 * If this is a leaf block or we've reached the end of
 572                 * the index block, go up
 573                 */
 574                if ((i == depth) ||
 575                    path[i].p_idx > EXT_LAST_INDEX(path[i].p_hdr)) {
 576                        brelse(path[i].p_bh);
 577                        path[i].p_bh = NULL;
 578                        i--;
 579                        continue;
 580                }
 581                bh = read_extent_tree_block(inode,
 582                                            ext4_idx_pblock(path[i].p_idx++),
 583                                            depth - i - 1,
 584                                            EXT4_EX_FORCE_CACHE);
 585                if (IS_ERR(bh)) {
 586                        ret = PTR_ERR(bh);
 587                        break;
 588                }
 589                i++;
 590                path[i].p_bh = bh;
 591                path[i].p_hdr = ext_block_hdr(bh);
 592                path[i].p_idx = EXT_FIRST_INDEX(path[i].p_hdr);
 593        }
 594        ext4_set_inode_state(inode, EXT4_STATE_EXT_PRECACHED);
 595out:
 596        up_read(&ei->i_data_sem);
 597        ext4_ext_drop_refs(path);
 598        kfree(path);
 599        return ret;
 600}
 601
 602#ifdef EXT_DEBUG
 603static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
 604{
 605        int k, l = path->p_depth;
 606
 607        ext_debug(inode, "path:");
 608        for (k = 0; k <= l; k++, path++) {
 609                if (path->p_idx) {
 610                        ext_debug(inode, "  %d->%llu",
 611                                  le32_to_cpu(path->p_idx->ei_block),
 612                                  ext4_idx_pblock(path->p_idx));
 613                } else if (path->p_ext) {
 614                        ext_debug(inode, "  %d:[%d]%d:%llu ",
 615                                  le32_to_cpu(path->p_ext->ee_block),
 616                                  ext4_ext_is_unwritten(path->p_ext),
 617                                  ext4_ext_get_actual_len(path->p_ext),
 618                                  ext4_ext_pblock(path->p_ext));
 619                } else
 620                        ext_debug(inode, "  []");
 621        }
 622        ext_debug(inode, "\n");
 623}
 624
 625static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
 626{
 627        int depth = ext_depth(inode);
 628        struct ext4_extent_header *eh;
 629        struct ext4_extent *ex;
 630        int i;
 631
 632        if (!path)
 633                return;
 634
 635        eh = path[depth].p_hdr;
 636        ex = EXT_FIRST_EXTENT(eh);
 637
 638        ext_debug(inode, "Displaying leaf extents\n");
 639
 640        for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) {
 641                ext_debug(inode, "%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block),
 642                          ext4_ext_is_unwritten(ex),
 643                          ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex));
 644        }
 645        ext_debug(inode, "\n");
 646}
 647
 648static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path,
 649                        ext4_fsblk_t newblock, int level)
 650{
 651        int depth = ext_depth(inode);
 652        struct ext4_extent *ex;
 653
 654        if (depth != level) {
 655                struct ext4_extent_idx *idx;
 656                idx = path[level].p_idx;
 657                while (idx <= EXT_MAX_INDEX(path[level].p_hdr)) {
 658                        ext_debug(inode, "%d: move %d:%llu in new index %llu\n",
 659                                  level, le32_to_cpu(idx->ei_block),
 660                                  ext4_idx_pblock(idx), newblock);
 661                        idx++;
 662                }
 663
 664                return;
 665        }
 666
 667        ex = path[depth].p_ext;
 668        while (ex <= EXT_MAX_EXTENT(path[depth].p_hdr)) {
 669                ext_debug(inode, "move %d:%llu:[%d]%d in new leaf %llu\n",
 670                                le32_to_cpu(ex->ee_block),
 671                                ext4_ext_pblock(ex),
 672                                ext4_ext_is_unwritten(ex),
 673                                ext4_ext_get_actual_len(ex),
 674                                newblock);
 675                ex++;
 676        }
 677}
 678
 679#else
 680#define ext4_ext_show_path(inode, path)
 681#define ext4_ext_show_leaf(inode, path)
 682#define ext4_ext_show_move(inode, path, newblock, level)
 683#endif
 684
 685void ext4_ext_drop_refs(struct ext4_ext_path *path)
 686{
 687        int depth, i;
 688
 689        if (!path)
 690                return;
 691        depth = path->p_depth;
 692        for (i = 0; i <= depth; i++, path++) {
 693                brelse(path->p_bh);
 694                path->p_bh = NULL;
 695        }
 696}
 697
 698/*
 699 * ext4_ext_binsearch_idx:
 700 * binary search for the closest index of the given block
 701 * the header must be checked before calling this
 702 */
 703static void
 704ext4_ext_binsearch_idx(struct inode *inode,
 705                        struct ext4_ext_path *path, ext4_lblk_t block)
 706{
 707        struct ext4_extent_header *eh = path->p_hdr;
 708        struct ext4_extent_idx *r, *l, *m;
 709
 710
 711        ext_debug(inode, "binsearch for %u(idx):  ", block);
 712
 713        l = EXT_FIRST_INDEX(eh) + 1;
 714        r = EXT_LAST_INDEX(eh);
 715        while (l <= r) {
 716                m = l + (r - l) / 2;
 717                if (block < le32_to_cpu(m->ei_block))
 718                        r = m - 1;
 719                else
 720                        l = m + 1;
 721                ext_debug(inode, "%p(%u):%p(%u):%p(%u) ", l,
 722                          le32_to_cpu(l->ei_block), m, le32_to_cpu(m->ei_block),
 723                          r, le32_to_cpu(r->ei_block));
 724        }
 725
 726        path->p_idx = l - 1;
 727        ext_debug(inode, "  -> %u->%lld ", le32_to_cpu(path->p_idx->ei_block),
 728                  ext4_idx_pblock(path->p_idx));
 729
 730#ifdef CHECK_BINSEARCH
 731        {
 732                struct ext4_extent_idx *chix, *ix;
 733                int k;
 734
 735                chix = ix = EXT_FIRST_INDEX(eh);
 736                for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) {
 737                        if (k != 0 && le32_to_cpu(ix->ei_block) <=
 738                            le32_to_cpu(ix[-1].ei_block)) {
 739                                printk(KERN_DEBUG "k=%d, ix=0x%p, "
 740                                       "first=0x%p\n", k,
 741                                       ix, EXT_FIRST_INDEX(eh));
 742                                printk(KERN_DEBUG "%u <= %u\n",
 743                                       le32_to_cpu(ix->ei_block),
 744                                       le32_to_cpu(ix[-1].ei_block));
 745                        }
 746                        BUG_ON(k && le32_to_cpu(ix->ei_block)
 747                                           <= le32_to_cpu(ix[-1].ei_block));
 748                        if (block < le32_to_cpu(ix->ei_block))
 749                                break;
 750                        chix = ix;
 751                }
 752                BUG_ON(chix != path->p_idx);
 753        }
 754#endif
 755
 756}
 757
 758/*
 759 * ext4_ext_binsearch:
 760 * binary search for closest extent of the given block
 761 * the header must be checked before calling this
 762 */
 763static void
 764ext4_ext_binsearch(struct inode *inode,
 765                struct ext4_ext_path *path, ext4_lblk_t block)
 766{
 767        struct ext4_extent_header *eh = path->p_hdr;
 768        struct ext4_extent *r, *l, *m;
 769
 770        if (eh->eh_entries == 0) {
 771                /*
 772                 * this leaf is empty:
 773                 * we get such a leaf in split/add case
 774                 */
 775                return;
 776        }
 777
 778        ext_debug(inode, "binsearch for %u:  ", block);
 779
 780        l = EXT_FIRST_EXTENT(eh) + 1;
 781        r = EXT_LAST_EXTENT(eh);
 782
 783        while (l <= r) {
 784                m = l + (r - l) / 2;
 785                if (block < le32_to_cpu(m->ee_block))
 786                        r = m - 1;
 787                else
 788                        l = m + 1;
 789                ext_debug(inode, "%p(%u):%p(%u):%p(%u) ", l,
 790                          le32_to_cpu(l->ee_block), m, le32_to_cpu(m->ee_block),
 791                          r, le32_to_cpu(r->ee_block));
 792        }
 793
 794        path->p_ext = l - 1;
 795        ext_debug(inode, "  -> %d:%llu:[%d]%d ",
 796                        le32_to_cpu(path->p_ext->ee_block),
 797                        ext4_ext_pblock(path->p_ext),
 798                        ext4_ext_is_unwritten(path->p_ext),
 799                        ext4_ext_get_actual_len(path->p_ext));
 800
 801#ifdef CHECK_BINSEARCH
 802        {
 803                struct ext4_extent *chex, *ex;
 804                int k;
 805
 806                chex = ex = EXT_FIRST_EXTENT(eh);
 807                for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ex++) {
 808                        BUG_ON(k && le32_to_cpu(ex->ee_block)
 809                                          <= le32_to_cpu(ex[-1].ee_block));
 810                        if (block < le32_to_cpu(ex->ee_block))
 811                                break;
 812                        chex = ex;
 813                }
 814                BUG_ON(chex != path->p_ext);
 815        }
 816#endif
 817
 818}
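/*
 * Example of the binary search above: with leaf extents starting at logical
 * blocks 0, 100 and 200, a lookup of block 150 leaves path->p_ext pointing
 * at the extent starting at 100, i.e. the last entry whose ee_block does not
 * exceed the requested block.  The caller still has to check whether the
 * target actually falls inside that extent.
 */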
 819
 820void ext4_ext_tree_init(handle_t *handle, struct inode *inode)
 821{
 822        struct ext4_extent_header *eh;
 823
 824        eh = ext_inode_hdr(inode);
 825        eh->eh_depth = 0;
 826        eh->eh_entries = 0;
 827        eh->eh_magic = EXT4_EXT_MAGIC;
 828        eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0));
 829        eh->eh_generation = 0;
 830        ext4_mark_inode_dirty(handle, inode);
 831}
 832
 833struct ext4_ext_path *
 834ext4_find_extent(struct inode *inode, ext4_lblk_t block,
 835                 struct ext4_ext_path **orig_path, int flags)
 836{
 837        struct ext4_extent_header *eh;
 838        struct buffer_head *bh;
 839        struct ext4_ext_path *path = orig_path ? *orig_path : NULL;
 840        short int depth, i, ppos = 0;
 841        int ret;
 842        gfp_t gfp_flags = GFP_NOFS;
 843
 844        if (flags & EXT4_EX_NOFAIL)
 845                gfp_flags |= __GFP_NOFAIL;
 846
 847        eh = ext_inode_hdr(inode);
 848        depth = ext_depth(inode);
 849        if (depth < 0 || depth > EXT4_MAX_EXTENT_DEPTH) {
 850                EXT4_ERROR_INODE(inode, "inode has invalid extent depth: %d",
 851                                 depth);
 852                ret = -EFSCORRUPTED;
 853                goto err;
 854        }
 855
 856        if (path) {
 857                ext4_ext_drop_refs(path);
 858                if (depth > path[0].p_maxdepth) {
 859                        kfree(path);
 860                        *orig_path = path = NULL;
 861                }
 862        }
 863        if (!path) {
 864                /* account possible depth increase */
 865                path = kcalloc(depth + 2, sizeof(struct ext4_ext_path),
 866                                gfp_flags);
 867                if (unlikely(!path))
 868                        return ERR_PTR(-ENOMEM);
 869                path[0].p_maxdepth = depth + 1;
 870        }
 871        path[0].p_hdr = eh;
 872        path[0].p_bh = NULL;
 873
 874        i = depth;
 875        if (!(flags & EXT4_EX_NOCACHE) && depth == 0)
 876                ext4_cache_extents(inode, eh);
 877        /* walk through the tree */
 878        while (i) {
 879                ext_debug(inode, "depth %d: num %d, max %d\n",
 880                          ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
 881
 882                ext4_ext_binsearch_idx(inode, path + ppos, block);
 883                path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx);
 884                path[ppos].p_depth = i;
 885                path[ppos].p_ext = NULL;
 886
 887                bh = read_extent_tree_block(inode, path[ppos].p_block, --i,
 888                                            flags);
 889                if (IS_ERR(bh)) {
 890                        ret = PTR_ERR(bh);
 891                        goto err;
 892                }
 893
 894                eh = ext_block_hdr(bh);
 895                ppos++;
 896                path[ppos].p_bh = bh;
 897                path[ppos].p_hdr = eh;
 898        }
 899
 900        path[ppos].p_depth = i;
 901        path[ppos].p_ext = NULL;
 902        path[ppos].p_idx = NULL;
 903
 904        /* find extent */
 905        ext4_ext_binsearch(inode, path + ppos, block);
 906        /* if not an empty leaf */
 907        if (path[ppos].p_ext)
 908                path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext);
 909
 910        ext4_ext_show_path(inode, path);
 911
 912        return path;
 913
 914err:
 915        ext4_ext_drop_refs(path);
 916        kfree(path);
 917        if (orig_path)
 918                *orig_path = NULL;
 919        return ERR_PTR(ret);
 920}
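/*
 * Typical caller pattern (sketch): the returned path array is released with
 * ext4_ext_drop_refs() followed by kfree(), as ext4_ext_precache() above
 * does:
 *
 *	path = ext4_find_extent(inode, lblk, NULL, 0);
 *	if (IS_ERR(path))
 *		return PTR_ERR(path);
 *	...
 *	ext4_ext_drop_refs(path);
 *	kfree(path);
 */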
 921
 922/*
 923 * ext4_ext_insert_index:
 924 * insert new index [@logical;@ptr] into the block at @curp;
 925 * check where to insert: before @curp or after @curp
 926 */
 927static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
 928                                 struct ext4_ext_path *curp,
 929                                 int logical, ext4_fsblk_t ptr)
 930{
 931        struct ext4_extent_idx *ix;
 932        int len, err;
 933
 934        err = ext4_ext_get_access(handle, inode, curp);
 935        if (err)
 936                return err;
 937
 938        if (unlikely(logical == le32_to_cpu(curp->p_idx->ei_block))) {
 939                EXT4_ERROR_INODE(inode,
 940                                 "logical %d == ei_block %d!",
 941                                 logical, le32_to_cpu(curp->p_idx->ei_block));
 942                return -EFSCORRUPTED;
 943        }
 944
 945        if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries)
 946                             >= le16_to_cpu(curp->p_hdr->eh_max))) {
 947                EXT4_ERROR_INODE(inode,
 948                                 "eh_entries %d >= eh_max %d!",
 949                                 le16_to_cpu(curp->p_hdr->eh_entries),
 950                                 le16_to_cpu(curp->p_hdr->eh_max));
 951                return -EFSCORRUPTED;
 952        }
 953
 954        if (logical > le32_to_cpu(curp->p_idx->ei_block)) {
 955                /* insert after */
 956                ext_debug(inode, "insert new index %d after: %llu\n",
 957                          logical, ptr);
 958                ix = curp->p_idx + 1;
 959        } else {
 960                /* insert before */
 961                ext_debug(inode, "insert new index %d before: %llu\n",
 962                          logical, ptr);
 963                ix = curp->p_idx;
 964        }
 965
 966        len = EXT_LAST_INDEX(curp->p_hdr) - ix + 1;
 967        BUG_ON(len < 0);
 968        if (len > 0) {
 969                ext_debug(inode, "insert new index %d: "
 970                                "move %d indices from 0x%p to 0x%p\n",
 971                                logical, len, ix, ix + 1);
 972                memmove(ix + 1, ix, len * sizeof(struct ext4_extent_idx));
 973        }
 974
 975        if (unlikely(ix > EXT_MAX_INDEX(curp->p_hdr))) {
 976                EXT4_ERROR_INODE(inode, "ix > EXT_MAX_INDEX!");
 977                return -EFSCORRUPTED;
 978        }
 979
 980        ix->ei_block = cpu_to_le32(logical);
 981        ext4_idx_store_pblock(ix, ptr);
 982        le16_add_cpu(&curp->p_hdr->eh_entries, 1);
 983
 984        if (unlikely(ix > EXT_LAST_INDEX(curp->p_hdr))) {
 985                EXT4_ERROR_INODE(inode, "ix > EXT_LAST_INDEX!");
 986                return -EFSCORRUPTED;
 987        }
 988
 989        err = ext4_ext_dirty(handle, inode, curp);
 990        ext4_std_error(inode->i_sb, err);
 991
 992        return err;
 993}
 994
 995/*
 996 * ext4_ext_split:
 997 * inserts new subtree into the path, using free index entry
 998 * at depth @at:
 999 * - allocates all needed blocks (new leaf and all intermediate index blocks)
1000 * - makes decision where to split
1001 * - moves remaining extents and index entries (right to the split point)
1002 *   into the newly allocated blocks
1003 * - initializes subtree
1004 */
1005static int ext4_ext_split(handle_t *handle, struct inode *inode,
1006                          unsigned int flags,
1007                          struct ext4_ext_path *path,
1008                          struct ext4_extent *newext, int at)
1009{
1010        struct buffer_head *bh = NULL;
1011        int depth = ext_depth(inode);
1012        struct ext4_extent_header *neh;
1013        struct ext4_extent_idx *fidx;
1014        int i = at, k, m, a;
1015        ext4_fsblk_t newblock, oldblock;
1016        __le32 border;
1017        ext4_fsblk_t *ablocks = NULL; /* array of allocated blocks */
1018        gfp_t gfp_flags = GFP_NOFS;
1019        int err = 0;
1020        size_t ext_size = 0;
1021
1022        if (flags & EXT4_EX_NOFAIL)
1023                gfp_flags |= __GFP_NOFAIL;
1024
1025        /* make decision: where to split? */
1026        /* FIXME: now decision is simplest: at current extent */
1027
1028        /* if current leaf will be split, then we should use
1029         * border from split point */
1030        if (unlikely(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr))) {
1031                EXT4_ERROR_INODE(inode, "p_ext > EXT_MAX_EXTENT!");
1032                return -EFSCORRUPTED;
1033        }
1034        if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) {
1035                border = path[depth].p_ext[1].ee_block;
1036                ext_debug(inode, "leaf will be split."
1037                                " next leaf starts at %d\n",
1038                                  le32_to_cpu(border));
1039        } else {
1040                border = newext->ee_block;
1041                ext_debug(inode, "leaf will be added."
1042                                " next leaf starts at %d\n",
1043                                le32_to_cpu(border));
1044        }
1045
1046        /*
 1047         * If an error occurs, we stop processing and mark the
 1048         * filesystem read-only. The index won't be inserted and
 1049         * the tree will be left in a consistent state. The next
 1050         * mount will repair the buffers too.
1051         */
1052
1053        /*
1054         * Get array to track all allocated blocks.
 1055         * We need it so that, on error, we can free the
 1056         * blocks we have allocated.
1057         */
1058        ablocks = kcalloc(depth, sizeof(ext4_fsblk_t), gfp_flags);
1059        if (!ablocks)
1060                return -ENOMEM;
1061
1062        /* allocate all needed blocks */
1063        ext_debug(inode, "allocate %d blocks for indexes/leaf\n", depth - at);
1064        for (a = 0; a < depth - at; a++) {
1065                newblock = ext4_ext_new_meta_block(handle, inode, path,
1066                                                   newext, &err, flags);
1067                if (newblock == 0)
1068                        goto cleanup;
1069                ablocks[a] = newblock;
1070        }
1071
1072        /* initialize new leaf */
1073        newblock = ablocks[--a];
1074        if (unlikely(newblock == 0)) {
1075                EXT4_ERROR_INODE(inode, "newblock == 0!");
1076                err = -EFSCORRUPTED;
1077                goto cleanup;
1078        }
1079        bh = sb_getblk_gfp(inode->i_sb, newblock, __GFP_MOVABLE | GFP_NOFS);
1080        if (unlikely(!bh)) {
1081                err = -ENOMEM;
1082                goto cleanup;
1083        }
1084        lock_buffer(bh);
1085
1086        err = ext4_journal_get_create_access(handle, inode->i_sb, bh,
1087                                             EXT4_JTR_NONE);
1088        if (err)
1089                goto cleanup;
1090
1091        neh = ext_block_hdr(bh);
1092        neh->eh_entries = 0;
1093        neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
1094        neh->eh_magic = EXT4_EXT_MAGIC;
1095        neh->eh_depth = 0;
1096        neh->eh_generation = 0;
1097
1098        /* move remainder of path[depth] to the new leaf */
1099        if (unlikely(path[depth].p_hdr->eh_entries !=
1100                     path[depth].p_hdr->eh_max)) {
1101                EXT4_ERROR_INODE(inode, "eh_entries %d != eh_max %d!",
1102                                 path[depth].p_hdr->eh_entries,
1103                                 path[depth].p_hdr->eh_max);
1104                err = -EFSCORRUPTED;
1105                goto cleanup;
1106        }
1107        /* start copy from next extent */
1108        m = EXT_MAX_EXTENT(path[depth].p_hdr) - path[depth].p_ext++;
1109        ext4_ext_show_move(inode, path, newblock, depth);
1110        if (m) {
1111                struct ext4_extent *ex;
1112                ex = EXT_FIRST_EXTENT(neh);
1113                memmove(ex, path[depth].p_ext, sizeof(struct ext4_extent) * m);
1114                le16_add_cpu(&neh->eh_entries, m);
1115        }
1116
1117        /* zero out unused area in the extent block */
1118        ext_size = sizeof(struct ext4_extent_header) +
1119                sizeof(struct ext4_extent) * le16_to_cpu(neh->eh_entries);
1120        memset(bh->b_data + ext_size, 0, inode->i_sb->s_blocksize - ext_size);
1121        ext4_extent_block_csum_set(inode, neh);
1122        set_buffer_uptodate(bh);
1123        unlock_buffer(bh);
1124
1125        err = ext4_handle_dirty_metadata(handle, inode, bh);
1126        if (err)
1127                goto cleanup;
1128        brelse(bh);
1129        bh = NULL;
1130
1131        /* correct old leaf */
1132        if (m) {
1133                err = ext4_ext_get_access(handle, inode, path + depth);
1134                if (err)
1135                        goto cleanup;
1136                le16_add_cpu(&path[depth].p_hdr->eh_entries, -m);
1137                err = ext4_ext_dirty(handle, inode, path + depth);
1138                if (err)
1139                        goto cleanup;
1140
1141        }
1142
1143        /* create intermediate indexes */
1144        k = depth - at - 1;
1145        if (unlikely(k < 0)) {
1146                EXT4_ERROR_INODE(inode, "k %d < 0!", k);
1147                err = -EFSCORRUPTED;
1148                goto cleanup;
1149        }
1150        if (k)
1151                ext_debug(inode, "create %d intermediate indices\n", k);
1152        /* insert new index into current index block */
1153        /* current depth stored in i var */
1154        i = depth - 1;
1155        while (k--) {
1156                oldblock = newblock;
1157                newblock = ablocks[--a];
1158                bh = sb_getblk(inode->i_sb, newblock);
1159                if (unlikely(!bh)) {
1160                        err = -ENOMEM;
1161                        goto cleanup;
1162                }
1163                lock_buffer(bh);
1164
1165                err = ext4_journal_get_create_access(handle, inode->i_sb, bh,
1166                                                     EXT4_JTR_NONE);
1167                if (err)
1168                        goto cleanup;
1169
1170                neh = ext_block_hdr(bh);
1171                neh->eh_entries = cpu_to_le16(1);
1172                neh->eh_magic = EXT4_EXT_MAGIC;
1173                neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
1174                neh->eh_depth = cpu_to_le16(depth - i);
1175                neh->eh_generation = 0;
1176                fidx = EXT_FIRST_INDEX(neh);
1177                fidx->ei_block = border;
1178                ext4_idx_store_pblock(fidx, oldblock);
1179
1180                ext_debug(inode, "int.index at %d (block %llu): %u -> %llu\n",
1181                                i, newblock, le32_to_cpu(border), oldblock);
1182
1183                /* move remainder of path[i] to the new index block */
1184                if (unlikely(EXT_MAX_INDEX(path[i].p_hdr) !=
1185                                        EXT_LAST_INDEX(path[i].p_hdr))) {
1186                        EXT4_ERROR_INODE(inode,
1187                                         "EXT_MAX_INDEX != EXT_LAST_INDEX ee_block %d!",
1188                                         le32_to_cpu(path[i].p_ext->ee_block));
1189                        err = -EFSCORRUPTED;
1190                        goto cleanup;
1191                }
1192                /* start copy indexes */
1193                m = EXT_MAX_INDEX(path[i].p_hdr) - path[i].p_idx++;
1194                ext_debug(inode, "cur 0x%p, last 0x%p\n", path[i].p_idx,
1195                                EXT_MAX_INDEX(path[i].p_hdr));
1196                ext4_ext_show_move(inode, path, newblock, i);
1197                if (m) {
1198                        memmove(++fidx, path[i].p_idx,
1199                                sizeof(struct ext4_extent_idx) * m);
1200                        le16_add_cpu(&neh->eh_entries, m);
1201                }
1202                /* zero out unused area in the extent block */
1203                ext_size = sizeof(struct ext4_extent_header) +
1204                   (sizeof(struct ext4_extent) * le16_to_cpu(neh->eh_entries));
1205                memset(bh->b_data + ext_size, 0,
1206                        inode->i_sb->s_blocksize - ext_size);
1207                ext4_extent_block_csum_set(inode, neh);
1208                set_buffer_uptodate(bh);
1209                unlock_buffer(bh);
1210
1211                err = ext4_handle_dirty_metadata(handle, inode, bh);
1212                if (err)
1213                        goto cleanup;
1214                brelse(bh);
1215                bh = NULL;
1216
1217                /* correct old index */
1218                if (m) {
1219                        err = ext4_ext_get_access(handle, inode, path + i);
1220                        if (err)
1221                                goto cleanup;
1222                        le16_add_cpu(&path[i].p_hdr->eh_entries, -m);
1223                        err = ext4_ext_dirty(handle, inode, path + i);
1224                        if (err)
1225                                goto cleanup;
1226                }
1227
1228                i--;
1229        }
1230
1231        /* insert new index */
1232        err = ext4_ext_insert_index(handle, inode, path + at,
1233                                    le32_to_cpu(border), newblock);
1234
1235cleanup:
1236        if (bh) {
1237                if (buffer_locked(bh))
1238                        unlock_buffer(bh);
1239                brelse(bh);
1240        }
1241
1242        if (err) {
1243                /* free all allocated blocks in error case */
1244                for (i = 0; i < depth; i++) {
1245                        if (!ablocks[i])
1246                                continue;
1247                        ext4_free_blocks(handle, inode, NULL, ablocks[i], 1,
1248                                         EXT4_FREE_BLOCKS_METADATA);
1249                }
1250        }
1251        kfree(ablocks);
1252
1253        return err;
1254}
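/*
 * Block accounting for the split above: for a split at level @at in a tree
 * of depth 'depth', ext4_ext_split() allocates depth - at blocks in total,
 * one new leaf plus depth - at - 1 intermediate index blocks, and finally
 * inserts one new index entry into the existing block at path[at].
 */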
1255
1256/*
1257 * ext4_ext_grow_indepth:
1258 * implements tree growing procedure:
1259 * - allocates new block
1260 * - moves top-level data (index block or leaf) into the new block
1261 * - initializes new top-level, creating index that points to the
1262 *   just created block
1263 */
1264static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
1265                                 unsigned int flags)
1266{
1267        struct ext4_extent_header *neh;
1268        struct buffer_head *bh;
1269        ext4_fsblk_t newblock, goal = 0;
1270        struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
1271        int err = 0;
1272        size_t ext_size = 0;
1273
1274        /* Try to prepend new index to old one */
1275        if (ext_depth(inode))
1276                goal = ext4_idx_pblock(EXT_FIRST_INDEX(ext_inode_hdr(inode)));
1277        if (goal > le32_to_cpu(es->s_first_data_block)) {
1278                flags |= EXT4_MB_HINT_TRY_GOAL;
1279                goal--;
1280        } else
1281                goal = ext4_inode_to_goal_block(inode);
1282        newblock = ext4_new_meta_blocks(handle, inode, goal, flags,
1283                                        NULL, &err);
1284        if (newblock == 0)
1285                return err;
1286
1287        bh = sb_getblk_gfp(inode->i_sb, newblock, __GFP_MOVABLE | GFP_NOFS);
1288        if (unlikely(!bh))
1289                return -ENOMEM;
1290        lock_buffer(bh);
1291
1292        err = ext4_journal_get_create_access(handle, inode->i_sb, bh,
1293                                             EXT4_JTR_NONE);
1294        if (err) {
1295                unlock_buffer(bh);
1296                goto out;
1297        }
1298
1299        ext_size = sizeof(EXT4_I(inode)->i_data);
1300        /* move top-level index/leaf into new block */
1301        memmove(bh->b_data, EXT4_I(inode)->i_data, ext_size);
1302        /* zero out unused area in the extent block */
1303        memset(bh->b_data + ext_size, 0, inode->i_sb->s_blocksize - ext_size);
1304
1305        /* set size of new block */
1306        neh = ext_block_hdr(bh);
1307        /* old root could have indexes or leaves
 1308         * so calculate eh_max the right way */
1309        if (ext_depth(inode))
1310                neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
1311        else
1312                neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
1313        neh->eh_magic = EXT4_EXT_MAGIC;
1314        ext4_extent_block_csum_set(inode, neh);
1315        set_buffer_uptodate(bh);
1316        set_buffer_verified(bh);
1317        unlock_buffer(bh);
1318
1319        err = ext4_handle_dirty_metadata(handle, inode, bh);
1320        if (err)
1321                goto out;
1322
1323        /* Update top-level index: num,max,pointer */
1324        neh = ext_inode_hdr(inode);
1325        neh->eh_entries = cpu_to_le16(1);
1326        ext4_idx_store_pblock(EXT_FIRST_INDEX(neh), newblock);
1327        if (neh->eh_depth == 0) {
1328                /* Root extent block becomes index block */
1329                neh->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0));
1330                EXT_FIRST_INDEX(neh)->ei_block =
1331                        EXT_FIRST_EXTENT(neh)->ee_block;
1332        }
1333        ext_debug(inode, "new root: num %d(%d), lblock %d, ptr %llu\n",
1334                  le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max),
1335                  le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block),
1336                  ext4_idx_pblock(EXT_FIRST_INDEX(neh)));
1337
1338        le16_add_cpu(&neh->eh_depth, 1);
1339        err = ext4_mark_inode_dirty(handle, inode);
1340out:
1341        brelse(bh);
1342
1343        return err;
1344}
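/*
 * Sketch of the depth growth above for a depth-0 tree: the extents that used
 * to live in i_data are copied into the newly allocated block, and the root
 * is rewritten as a single index entry pointing at it:
 *
 *	before:	root (i_data): [ext0 ext1 ext2 ext3]                 depth 0
 *	after:	root (i_data): [idx] ---> new block: [ext0 ext1 ...] depth 1
 */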
1345
1346/*
1347 * ext4_ext_create_new_leaf:
 1348 * finds an empty index slot and adds a new leaf there;
 1349 * if no free index is found, it grows the tree in depth instead.
1350 */
1351static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode,
1352                                    unsigned int mb_flags,
1353                                    unsigned int gb_flags,
1354                                    struct ext4_ext_path **ppath,
1355                                    struct ext4_extent *newext)
1356{
1357        struct ext4_ext_path *path = *ppath;
1358        struct ext4_ext_path *curp;
1359        int depth, i, err = 0;
1360
1361repeat:
1362        i = depth = ext_depth(inode);
1363
 1364        /* walk up the tree and look for a free index entry */
1365        curp = path + depth;
1366        while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) {
1367                i--;
1368                curp--;
1369        }
1370
1371        /* we use already allocated block for index block,
1372         * so subsequent data blocks should be contiguous */
1373        if (EXT_HAS_FREE_INDEX(curp)) {
1374                /* if we found index with free entry, then use that
1375                 * entry: create all needed subtree and add new leaf */
1376                err = ext4_ext_split(handle, inode, mb_flags, path, newext, i);
1377                if (err)
1378                        goto out;
1379
1380                /* refill path */
1381                path = ext4_find_extent(inode,
1382                                    (ext4_lblk_t)le32_to_cpu(newext->ee_block),
1383                                    ppath, gb_flags);
1384                if (IS_ERR(path))
1385                        err = PTR_ERR(path);
1386        } else {
1387                /* tree is full, time to grow in depth */
1388                err = ext4_ext_grow_indepth(handle, inode, mb_flags);
1389                if (err)
1390                        goto out;
1391
1392                /* refill path */
1393                path = ext4_find_extent(inode,
1394                                   (ext4_lblk_t)le32_to_cpu(newext->ee_block),
1395                                    ppath, gb_flags);
1396                if (IS_ERR(path)) {
1397                        err = PTR_ERR(path);
1398                        goto out;
1399                }
1400
1401                /*
1402                 * only first (depth 0 -> 1) produces free space;
1403                 * in all other cases we have to split the grown tree
1404                 */
1405                depth = ext_depth(inode);
1406                if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) {
1407                        /* now we need to split */
1408                        goto repeat;
1409                }
1410        }
1411
1412out:
1413        return err;
1414}
1415
1416/*
 1417 * Search the closest allocated block to the left of *logical
 1418 * and return it at @logical + its physical address at @phys.
 1419 * If *logical is the smallest allocated block, the function
 1420 * returns 0 at @phys.
 1421 * The return value is 0 (success) or an error code.
1422 */
1423static int ext4_ext_search_left(struct inode *inode,
1424                                struct ext4_ext_path *path,
1425                                ext4_lblk_t *logical, ext4_fsblk_t *phys)
1426{
1427        struct ext4_extent_idx *ix;
1428        struct ext4_extent *ex;
1429        int depth, ee_len;
1430
1431        if (unlikely(path == NULL)) {
1432                EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
1433                return -EFSCORRUPTED;
1434        }
1435        depth = path->p_depth;
1436        *phys = 0;
1437
1438        if (depth == 0 && path->p_ext == NULL)
1439                return 0;
1440
 1441        /* usually the extent in the path covers blocks smaller
 1442         * than *logical, but it can happen that the extent is the
 1443         * first one in the file */
1444
1445        ex = path[depth].p_ext;
1446        ee_len = ext4_ext_get_actual_len(ex);
1447        if (*logical < le32_to_cpu(ex->ee_block)) {
1448                if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
1449                        EXT4_ERROR_INODE(inode,
1450                                         "EXT_FIRST_EXTENT != ex *logical %d ee_block %d!",
1451                                         *logical, le32_to_cpu(ex->ee_block));
1452                        return -EFSCORRUPTED;
1453                }
1454                while (--depth >= 0) {
1455                        ix = path[depth].p_idx;
1456                        if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
1457                                EXT4_ERROR_INODE(inode,
1458                                  "ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!",
1459                                  ix != NULL ? le32_to_cpu(ix->ei_block) : 0,
1460                                  EXT_FIRST_INDEX(path[depth].p_hdr) != NULL ?
1461                le32_to_cpu(EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block) : 0,
1462                                  depth);
1463                                return -EFSCORRUPTED;
1464                        }
1465                }
1466                return 0;
1467        }
1468
1469        if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
1470                EXT4_ERROR_INODE(inode,
1471                                 "logical %d < ee_block %d + ee_len %d!",
1472                                 *logical, le32_to_cpu(ex->ee_block), ee_len);
1473                return -EFSCORRUPTED;
1474        }
1475
1476        *logical = le32_to_cpu(ex->ee_block) + ee_len - 1;
1477        *phys = ext4_ext_pblock(ex) + ee_len - 1;
1478        return 0;
1479}
1480
1481/*
1482 * Search the closest allocated block to the right of *logical
1483 * and return it at @logical together with its physical address at @phys.
1484 * If no such block exists, return 0 and set @phys to 0.  Otherwise return
1485 * 1, which means an allocated block was found and *ret_ex is valid.
1486 * Or return a (< 0) error code.
1487 */
1488static int ext4_ext_search_right(struct inode *inode,
1489                                 struct ext4_ext_path *path,
1490                                 ext4_lblk_t *logical, ext4_fsblk_t *phys,
1491                                 struct ext4_extent *ret_ex)
1492{
1493        struct buffer_head *bh = NULL;
1494        struct ext4_extent_header *eh;
1495        struct ext4_extent_idx *ix;
1496        struct ext4_extent *ex;
1497        ext4_fsblk_t block;
1498        int depth;      /* Note, NOT eh_depth; depth from top of tree */
1499        int ee_len;
1500
1501        if (unlikely(path == NULL)) {
1502                EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
1503                return -EFSCORRUPTED;
1504        }
1505        depth = path->p_depth;
1506        *phys = 0;
1507
1508        if (depth == 0 && path->p_ext == NULL)
1509                return 0;
1510
1511        /* usually the extent in the path covers blocks smaller
1512         * than *logical, but it may be that the extent is the
1513         * first one in the file */
1514
1515        ex = path[depth].p_ext;
1516        ee_len = ext4_ext_get_actual_len(ex);
1517        if (*logical < le32_to_cpu(ex->ee_block)) {
1518                if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
1519                        EXT4_ERROR_INODE(inode,
1520                                         "first_extent(path[%d].p_hdr) != ex",
1521                                         depth);
1522                        return -EFSCORRUPTED;
1523                }
1524                while (--depth >= 0) {
1525                        ix = path[depth].p_idx;
1526                        if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
1527                                EXT4_ERROR_INODE(inode,
1528                                                 "ix != EXT_FIRST_INDEX *logical %d!",
1529                                                 *logical);
1530                                return -EFSCORRUPTED;
1531                        }
1532                }
1533                goto found_extent;
1534        }
1535
1536        if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
1537                EXT4_ERROR_INODE(inode,
1538                                 "logical %d < ee_block %d + ee_len %d!",
1539                                 *logical, le32_to_cpu(ex->ee_block), ee_len);
1540                return -EFSCORRUPTED;
1541        }
1542
1543        if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) {
1544                /* next allocated block in this leaf */
1545                ex++;
1546                goto found_extent;
1547        }
1548
1549        /* go up and search for index to the right */
1550        while (--depth >= 0) {
1551                ix = path[depth].p_idx;
1552                if (ix != EXT_LAST_INDEX(path[depth].p_hdr))
1553                        goto got_index;
1554        }
1555
1556        /* we've gone up to the root and found no index to the right */
1557        return 0;
1558
1559got_index:
1560        /* we've found index to the right, let's
1561         * follow it and find the closest allocated
1562         * block to the right */
1563        ix++;
1564        block = ext4_idx_pblock(ix);
1565        while (++depth < path->p_depth) {
1566                /* subtract from p_depth to get proper eh_depth */
1567                bh = read_extent_tree_block(inode, block,
1568                                            path->p_depth - depth, 0);
1569                if (IS_ERR(bh))
1570                        return PTR_ERR(bh);
1571                eh = ext_block_hdr(bh);
1572                ix = EXT_FIRST_INDEX(eh);
1573                block = ext4_idx_pblock(ix);
1574                put_bh(bh);
1575        }
1576
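            /* 'block' now refers to the leaf: read it and take its first
             * extent, the closest allocated block to the right */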
1577        bh = read_extent_tree_block(inode, block, path->p_depth - depth, 0);
1578        if (IS_ERR(bh))
1579                return PTR_ERR(bh);
1580        eh = ext_block_hdr(bh);
1581        ex = EXT_FIRST_EXTENT(eh);
1582found_extent:
1583        *logical = le32_to_cpu(ex->ee_block);
1584        *phys = ext4_ext_pblock(ex);
1585        if (ret_ex)
1586                *ret_ex = *ex;
1587        if (bh)
1588                put_bh(bh);
1589        return 1;
1590}
1591
1592/*
1593 * ext4_ext_next_allocated_block:
1594 * returns the allocated block in the subsequent extent, or EXT_MAX_BLOCKS.
1595 * NOTE: it treats the block number from an index entry as an
1596 * allocated block. Thus, index entries have to be consistent
1597 * with the leaves.
1598 */
1599ext4_lblk_t
1600ext4_ext_next_allocated_block(struct ext4_ext_path *path)
1601{
1602        int depth;
1603
1604        BUG_ON(path == NULL);
1605        depth = path->p_depth;
1606
1607        if (depth == 0 && path->p_ext == NULL)
1608                return EXT_MAX_BLOCKS;
1609
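            /*
             * Walk from the leaf towards the root.  At the first level that
             * has an entry to the right of the current position, return that
             * entry's starting block.
             */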
1610        while (depth >= 0) {
1611                struct ext4_ext_path *p = &path[depth];
1612
1613                if (depth == path->p_depth) {
1614                        /* leaf */
1615                        if (p->p_ext && p->p_ext != EXT_LAST_EXTENT(p->p_hdr))
1616                                return le32_to_cpu(p->p_ext[1].ee_block);
1617                } else {
1618                        /* index */
1619                        if (p->p_idx != EXT_LAST_INDEX(p->p_hdr))
1620                                return le32_to_cpu(p->p_idx[1].ei_block);
1621                }
1622                depth--;
1623        }
1624
1625        return EXT_MAX_BLOCKS;
1626}
1627
1628/*
1629 * ext4_ext_next_leaf_block:
1630 * returns the first allocated block of the next leaf, or EXT_MAX_BLOCKS
1631 */
1632static ext4_lblk_t ext4_ext_next_leaf_block(struct ext4_ext_path *path)
1633{
1634        int depth;
1635
1636        BUG_ON(path == NULL);
1637        depth = path->p_depth;
1638
1639        /* a zero-depth tree has no leaf blocks at all */
1640        if (depth == 0)
1641                return EXT_MAX_BLOCKS;
1642
1643        /* go to index block */
1644        depth--;
1645
1646        while (depth >= 0) {
1647                if (path[depth].p_idx !=
1648                                EXT_LAST_INDEX(path[depth].p_hdr))
1649                        return (ext4_lblk_t)
1650                                le32_to_cpu(path[depth].p_idx[1].ei_block);
1651                depth--;
1652        }
1653
1654        return EXT_MAX_BLOCKS;
1655}
1656
1657/*
1658 * ext4_ext_correct_indexes:
1659 * if the leaf is modified and the modified extent is the first in the leaf,
1660 * then we have to correct all indexes above it.
1661 * TODO: do we need to correct the tree in all cases?
1662 */
1663static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
1664                                struct ext4_ext_path *path)
1665{
1666        struct ext4_extent_header *eh;
1667        int depth = ext_depth(inode);
1668        struct ext4_extent *ex;
1669        __le32 border;
1670        int k, err = 0;
1671
1672        eh = path[depth].p_hdr;
1673        ex = path[depth].p_ext;
1674
1675        if (unlikely(ex == NULL || eh == NULL)) {
1676                EXT4_ERROR_INODE(inode,
1677                                 "ex %p == NULL or eh %p == NULL", ex, eh);
1678                return -EFSCORRUPTED;
1679        }
1680
1681        if (depth == 0) {
1682                /* there is no tree at all */
1683                return 0;
1684        }
1685
1686        if (ex != EXT_FIRST_EXTENT(eh)) {
1687                /* we only correct the tree if the leaf's first extent was modified */
1688                return 0;
1689        }
1690
1691        /*
1692         * TODO: we need correction if border is smaller than current one
1693         */
1694        k = depth - 1;
1695        border = path[depth].p_ext->ee_block;
1696        err = ext4_ext_get_access(handle, inode, path + k);
1697        if (err)
1698                return err;
1699        path[k].p_idx->ei_block = border;
1700        err = ext4_ext_dirty(handle, inode, path + k);
1701        if (err)
1702                return err;
1703
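            /*
             * Propagate the new border to the upper levels: a parent index
             * needs updating only as long as the index below it is the
             * leftmost entry in its block.
             */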
1704        while (k--) {
1705                /* change all left-side indexes */
1706                if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr))
1707                        break;
1708                err = ext4_ext_get_access(handle, inode, path + k);
1709                if (err)
1710                        break;
1711                path[k].p_idx->ei_block = border;
1712                err = ext4_ext_dirty(handle, inode, path + k);
1713                if (err)
1714                        break;
1715        }
1716
1717        return err;
1718}
1719
1720static int ext4_can_extents_be_merged(struct inode *inode,
1721                                      struct ext4_extent *ex1,
1722                                      struct ext4_extent *ex2)
1723{
1724        unsigned short ext1_ee_len, ext2_ee_len;
1725
1726        if (ext4_ext_is_unwritten(ex1) != ext4_ext_is_unwritten(ex2))
1727                return 0;
1728
1729        ext1_ee_len = ext4_ext_get_actual_len(ex1);
1730        ext2_ee_len = ext4_ext_get_actual_len(ex2);
1731
1732        if (le32_to_cpu(ex1->ee_block) + ext1_ee_len !=
1733                        le32_to_cpu(ex2->ee_block))
1734                return 0;
1735
1736        if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN)
1737                return 0;
1738
1739        if (ext4_ext_is_unwritten(ex1) &&
1740            ext1_ee_len + ext2_ee_len > EXT_UNWRITTEN_MAX_LEN)
1741                return 0;
1742#ifdef AGGRESSIVE_TEST
1743        if (ext1_ee_len >= 4)
1744                return 0;
1745#endif
1746
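            /* finally, the two extents must also be physically contiguous */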
1747        if (ext4_ext_pblock(ex1) + ext1_ee_len == ext4_ext_pblock(ex2))
1748                return 1;
1749        return 0;
1750}
1751
1752/*
1753 * This function tries to merge the "ex" extent with the next extent in the tree.
1754 * It always tries to merge towards the right. If you want to merge towards
1755 * the left, pass "ex - 1" as the argument instead of "ex".
1756 * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns
1757 * 1 if they got merged.
1758 */
1759static int ext4_ext_try_to_merge_right(struct inode *inode,
1760                                 struct ext4_ext_path *path,
1761                                 struct ext4_extent *ex)
1762{
1763        struct ext4_extent_header *eh;
1764        unsigned int depth, len;
1765        int merge_done = 0, unwritten;
1766
1767        depth = ext_depth(inode);
1768        BUG_ON(path[depth].p_hdr == NULL);
1769        eh = path[depth].p_hdr;
1770
1771        while (ex < EXT_LAST_EXTENT(eh)) {
1772                if (!ext4_can_extents_be_merged(inode, ex, ex + 1))
1773                        break;
1774                /* merge with next extent! */
1775                unwritten = ext4_ext_is_unwritten(ex);
1776                ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
1777                                + ext4_ext_get_actual_len(ex + 1));
1778                if (unwritten)
1779                        ext4_ext_mark_unwritten(ex);
1780
1781                if (ex + 1 < EXT_LAST_EXTENT(eh)) {
1782                        len = (EXT_LAST_EXTENT(eh) - ex - 1)
1783                                * sizeof(struct ext4_extent);
1784                        memmove(ex + 1, ex + 2, len);
1785                }
1786                le16_add_cpu(&eh->eh_entries, -1);
1787                merge_done = 1;
1788                WARN_ON(eh->eh_entries == 0);
1789                if (!eh->eh_entries)
1790                        EXT4_ERROR_INODE(inode, "eh->eh_entries = 0!");
1791        }
1792
1793        return merge_done;
1794}
1795
1796/*
1797 * This function does a very simple check to see if we can collapse
1798 * an extent tree with a single extent tree leaf block into the inode.
1799 */
1800static void ext4_ext_try_to_merge_up(handle_t *handle,
1801                                     struct inode *inode,
1802                                     struct ext4_ext_path *path)
1803{
1804        size_t s;
1805        unsigned max_root = ext4_ext_space_root(inode, 0);
1806        ext4_fsblk_t blk;
1807
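            /*
             * Collapsing is only possible when the tree has depth 1, the
             * root contains a single index, and all entries of the single
             * leaf fit into the space available in the inode's i_data.
             */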
1808        if ((path[0].p_depth != 1) ||
1809            (le16_to_cpu(path[0].p_hdr->eh_entries) != 1) ||
1810            (le16_to_cpu(path[1].p_hdr->eh_entries) > max_root))
1811                return;
1812
1813        /*
1814         * We need to modify the block allocation bitmap and the block
1815         * group descriptor to release the extent tree block.  If we
1816         * can't get the journal credits, give up.
1817         */
1818        if (ext4_journal_extend(handle, 2,
1819                        ext4_free_metadata_revoke_credits(inode->i_sb, 1)))
1820                return;
1821
1822        /*
1823         * Copy the extent data up to the inode
1824         */
1825        blk = ext4_idx_pblock(path[0].p_idx);
1826        s = le16_to_cpu(path[1].p_hdr->eh_entries) *
1827                sizeof(struct ext4_extent_idx);
1828        s += sizeof(struct ext4_extent_header);
1829
1830        path[1].p_maxdepth = path[0].p_maxdepth;
1831        memcpy(path[0].p_hdr, path[1].p_hdr, s);
1832        path[0].p_depth = 0;
1833        path[0].p_ext = EXT_FIRST_EXTENT(path[0].p_hdr) +
1834                (path[1].p_ext - EXT_FIRST_EXTENT(path[1].p_hdr));
1835        path[0].p_hdr->eh_max = cpu_to_le16(max_root);
1836
1837        brelse(path[1].p_bh);
1838        ext4_free_blocks(handle, inode, NULL, blk, 1,
1839                         EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
1840}
1841
1842/*
1843 * This function tries to merge the @ex extent to neighbours in the tree, then
1844 * tries to collapse the extent tree into the inode.
1845 */
1846static void ext4_ext_try_to_merge(handle_t *handle,
1847                                  struct inode *inode,
1848                                  struct ext4_ext_path *path,
1849                                  struct ext4_extent *ex)
1850{
1851        struct ext4_extent_header *eh;
1852        unsigned int depth;
1853        int merge_done = 0;
1854
1855        depth = ext_depth(inode);
1856        BUG_ON(path[depth].p_hdr == NULL);
1857        eh = path[depth].p_hdr;
1858
1859        if (ex > EXT_FIRST_EXTENT(eh))
1860                merge_done = ext4_ext_try_to_merge_right(inode, path, ex - 1);
1861
1862        if (!merge_done)
1863                (void) ext4_ext_try_to_merge_right(inode, path, ex);
1864
1865        ext4_ext_try_to_merge_up(handle, inode, path);
1866}
1867
1868/*
1869 * check if a portion of the "newext" extent overlaps with an
1870 * existing extent.
1871 *
1872 * If there is an overlap discovered, it updates the length of the newext
1873 * such that there will be no overlap, and then returns 1.
1874 * If there is no overlap found, it returns 0.
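     *
     * For example (with a cluster ratio of 1): if newext covers logical
     * blocks 100..119 and the next allocated block is 110, newext's length
     * is trimmed to 10 so that it covers 100..109 only.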
1875 */
1876static unsigned int ext4_ext_check_overlap(struct ext4_sb_info *sbi,
1877                                           struct inode *inode,
1878                                           struct ext4_extent *newext,
1879                                           struct ext4_ext_path *path)
1880{
1881        ext4_lblk_t b1, b2;
1882        unsigned int depth, len1;
1883        unsigned int ret = 0;
1884
1885        b1 = le32_to_cpu(newext->ee_block);
1886        len1 = ext4_ext_get_actual_len(newext);
1887        depth = ext_depth(inode);
1888        if (!path[depth].p_ext)
1889                goto out;
1890        b2 = EXT4_LBLK_CMASK(sbi, le32_to_cpu(path[depth].p_ext->ee_block));
1891
1892        /*
1893         * get the next allocated block if the extent in the path
1894         * is before the requested block(s)
1895         */
1896        if (b2 < b1) {
1897                b2 = ext4_ext_next_allocated_block(path);
1898                if (b2 == EXT_MAX_BLOCKS)
1899                        goto out;
1900                b2 = EXT4_LBLK_CMASK(sbi, b2);
1901        }
1902
1903        /* check for wrap through zero on extent logical start block */
1904        if (b1 + len1 < b1) {
1905                len1 = EXT_MAX_BLOCKS - b1;
1906                newext->ee_len = cpu_to_le16(len1);
1907                ret = 1;
1908        }
1909
1910        /* check for overlap */
1911        if (b1 + len1 > b2) {
1912                newext->ee_len = cpu_to_le16(b2 - b1);
1913                ret = 1;
1914        }
1915out:
1916        return ret;
1917}
1918
1919/*
1920 * ext4_ext_insert_extent:
1921 * tries to merge the requested extent into an existing extent, or
1922 * inserts the requested extent as a new one into the tree,
1923 * creating a new leaf in the no-space case.
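     * Typically called from ext4_ext_map_blocks() once new blocks have been
     * allocated for the requested mapping.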
1924 */
1925int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1926                                struct ext4_ext_path **ppath,
1927                                struct ext4_extent *newext, int gb_flags)
1928{
1929        struct ext4_ext_path *path = *ppath;
1930        struct ext4_extent_header *eh;
1931        struct ext4_extent *ex, *fex;
1932        struct ext4_extent *nearex; /* nearest extent */
1933        struct ext4_ext_path *npath = NULL;
1934        int depth, len, err;
1935        ext4_lblk_t next;
1936        int mb_flags = 0, unwritten;
1937
1938        if (gb_flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
1939                mb_flags |= EXT4_MB_DELALLOC_RESERVED;
1940        if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {
1941                EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0");
1942                return -EFSCORRUPTED;
1943        }
1944        depth = ext_depth(inode);
1945        ex = path[depth].p_ext;
1946        eh = path[depth].p_hdr;
1947        if (unlikely(path[depth].p_hdr == NULL)) {
1948                EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
1949                return -EFSCORRUPTED;
1950        }
1951
1952        /* try to insert block into found extent and return */
1953        if (ex && !(gb_flags & EXT4_GET_BLOCKS_PRE_IO)) {
1954
1955                /*
1956                 * Try to see whether we should rather test the extent on
1957                 * right from ex, or from the left of ex. This is because
1958                 * ext4_find_extent() can return either extent on the
1959                 * left, or on the right from the searched position. This
1960                 * will make merging more effective.
1961                 */
1962                if (ex < EXT_LAST_EXTENT(eh) &&
1963                    (le32_to_cpu(ex->ee_block) +
1964                    ext4_ext_get_actual_len(ex) <
1965                    le32_to_cpu(newext->ee_block))) {
1966                        ex += 1;
1967                        goto prepend;
1968                } else if ((ex > EXT_FIRST_EXTENT(eh)) &&
1969                           (le32_to_cpu(newext->ee_block) +
1970                           ext4_ext_get_actual_len(newext) <
1971                           le32_to_cpu(ex->ee_block)))
1972                        ex -= 1;
1973
1974                /* Try to append newex to the ex */
1975                if (ext4_can_extents_be_merged(inode, ex, newext)) {
1976                        ext_debug(inode, "append [%d]%d block to %u:[%d]%d"
1977                                  "(from %llu)\n",
1978                                  ext4_ext_is_unwritten(newext),
1979                                  ext4_ext_get_actual_len(newext),
1980                                  le32_to_cpu(ex->ee_block),
1981                                  ext4_ext_is_unwritten(ex),
1982                                  ext4_ext_get_actual_len(ex),
1983                                  ext4_ext_pblock(ex));
1984                        err = ext4_ext_get_access(handle, inode,
1985                                                  path + depth);
1986                        if (err)
1987                                return err;
1988                        unwritten = ext4_ext_is_unwritten(ex);
1989                        ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
1990                                        + ext4_ext_get_actual_len(newext));
1991                        if (unwritten)
1992                                ext4_ext_mark_unwritten(ex);
1993                        eh = path[depth].p_hdr;
1994                        nearex = ex;
1995                        goto merge;
1996                }
1997
1998prepend:
1999                /* Try to prepend newex to the ex */
2000                if (ext4_can_extents_be_merged(inode, newext, ex)) {
2001                        ext_debug(inode, "prepend %u[%d]%d block to %u:[%d]%d"
2002                                  "(from %llu)\n",
2003                                  le32_to_cpu(newext->ee_block),
2004                                  ext4_ext_is_unwritten(newext),
2005                                  ext4_ext_get_actual_len(newext),
2006                                  le32_to_cpu(ex->ee_block),
2007                                  ext4_ext_is_unwritten(ex),
2008                                  ext4_ext_get_actual_len(ex),
2009                                  ext4_ext_pblock(ex));
2010                        err = ext4_ext_get_access(handle, inode,
2011                                                  path + depth);
2012                        if (err)
2013                                return err;
2014
2015                        unwritten = ext4_ext_is_unwritten(ex);
2016                        ex->ee_block = newext->ee_block;
2017                        ext4_ext_store_pblock(ex, ext4_ext_pblock(newext));
2018                        ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
2019                                        + ext4_ext_get_actual_len(newext));
2020                        if (unwritten)
2021                                ext4_ext_mark_unwritten(ex);
2022                        eh = path[depth].p_hdr;
2023                        nearex = ex;
2024                        goto merge;
2025                }
2026        }
2027
2028        depth = ext_depth(inode);
2029        eh = path[depth].p_hdr;
2030        if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max))
2031                goto has_space;
2032
2033        /* probably next leaf has space for us? */
2034        fex = EXT_LAST_EXTENT(eh);
2035        next = EXT_MAX_BLOCKS;
2036        if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block))
2037                next = ext4_ext_next_leaf_block(path);
2038        if (next != EXT_MAX_BLOCKS) {
2039                ext_debug(inode, "next leaf block - %u\n", next);
2040                BUG_ON(npath != NULL);
2041                npath = ext4_find_extent(inode, next, NULL, gb_flags);
2042                if (IS_ERR(npath))
2043                        return PTR_ERR(npath);
2044                BUG_ON(npath->p_depth != path->p_depth);
2045                eh = npath[depth].p_hdr;
2046                if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) {
2047                        ext_debug(inode, "next leaf isn't full(%d)\n",
2048                                  le16_to_cpu(eh->eh_entries));
2049                        path = npath;
2050                        goto has_space;
2051                }
2052                ext_debug(inode, "next leaf has no free space(%d,%d)\n",
2053                          le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
2054        }
2055
2056        /*
2057         * There is no free space in the found leaf.
2057         * We're going to add a new leaf to the tree.
2059         */
2060        if (gb_flags & EXT4_GET_BLOCKS_METADATA_NOFAIL)
2061                mb_flags |= EXT4_MB_USE_RESERVED;
2062        err = ext4_ext_create_new_leaf(handle, inode, mb_flags, gb_flags,
2063                                       ppath, newext);
2064        if (err)
2065                goto cleanup;
2066        depth = ext_depth(inode);
2067        eh = path[depth].p_hdr;
2068
2069has_space:
2070        nearex = path[depth].p_ext;
2071
2072        err = ext4_ext_get_access(handle, inode, path + depth);
2073        if (err)
2074                goto cleanup;
2075
2076        if (!nearex) {
2077                /* there is no extent in this leaf, create first one */
2078                ext_debug(inode, "first extent in the leaf: %u:%llu:[%d]%d\n",
2079                                le32_to_cpu(newext->ee_block),
2080                                ext4_ext_pblock(newext),
2081                                ext4_ext_is_unwritten(newext),
2082                                ext4_ext_get_actual_len(newext));
2083                nearex = EXT_FIRST_EXTENT(eh);
2084        } else {
2085                if (le32_to_cpu(newext->ee_block)
2086                           > le32_to_cpu(nearex->ee_block)) {
2087                        /* Insert after */
2088                        ext_debug(inode, "insert %u:%llu:[%d]%d before: "
2089                                        "nearest %p\n",
2090                                        le32_to_cpu(newext->ee_block),
2091                                        ext4_ext_pblock(newext),
2092                                        ext4_ext_is_unwritten(newext),
2093                                        ext4_ext_get_actual_len(newext),
2094                                        nearex);
2095                        nearex++;
2096                } else {
2097                        /* Insert before */
2098                        BUG_ON(newext->ee_block == nearex->ee_block);
2099                        ext_debug(inode, "insert %u:%llu:[%d]%d after: "
2100                                        "nearest %p\n",
2101                                        le32_to_cpu(newext->ee_block),
2102                                        ext4_ext_pblock(newext),
2103                                        ext4_ext_is_unwritten(newext),
2104                                        ext4_ext_get_actual_len(newext),
2105                                        nearex);
2106                }
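                    /*
                     * Make room for the new extent by shifting the extents
                     * from the insertion point to the end of the leaf one
                     * slot to the right.
                     */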
2107                len = EXT_LAST_EXTENT(eh) - nearex + 1;
2108                if (len > 0) {
2109                        ext_debug(inode, "insert %u:%llu:[%d]%d: "
2110                                        "move %d extents from 0x%p to 0x%p\n",
2111                                        le32_to_cpu(newext->ee_block),
2112                                        ext4_ext_pblock(newext),
2113                                        ext4_ext_is_unwritten(newext),
2114                                        ext4_ext_get_actual_len(newext),
2115                                        len, nearex, nearex + 1);
2116                        memmove(nearex + 1, nearex,
2117                                len * sizeof(struct ext4_extent));
2118                }
2119        }
2120
2121        le16_add_cpu(&eh->eh_entries, 1);
2122        path[depth].p_ext = nearex;
2123        nearex->ee_block = newext->ee_block;
2124        ext4_ext_store_pblock(nearex, ext4_ext_pblock(newext));
2125        nearex->ee_len = newext->ee_len;
2126
2127merge:
2128        /* try to merge extents */
2129        if (!(gb_flags & EXT4_GET_BLOCKS_PRE_IO))
2130                ext4_ext_try_to_merge(handle, inode, path, nearex);
2131
2132
2133        /* time to correct all indexes above */
2134        err = ext4_ext_correct_indexes(handle, inode, path);
2135        if (err)
2136                goto cleanup;
2137
2138        err = ext4_ext_dirty(handle, inode, path + path->p_depth);
2139
2140cleanup:
2141        ext4_ext_drop_refs(npath);
2142        kfree(npath);
2143        return err;
2144}
2145
2146static int ext4_fill_es_cache_info(struct inode *inode,
2147                                   ext4_lblk_t block, ext4_lblk_t num,
2148                                   struct fiemap_extent_info *fieinfo)
2149{
2150        ext4_lblk_t next, end = block + num - 1;
2151        struct extent_status es;
2152        unsigned char blksize_bits = inode->i_sb->s_blocksize_bits;
2153        unsigned int flags;
2154        int err;
2155
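            /*
             * Walk the cached extent status entries from 'block' to 'end',
             * translating each one into a fiemap extent with matching flags;
             * stop when no entry is cached for the next block or the last
             * extent has been reported.
             */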
2156        while (block <= end) {
2157                next = 0;
2158                flags = 0;
2159                if (!ext4_es_lookup_extent(inode, block, &next, &es))
2160                        break;
2161                if (ext4_es_is_unwritten(&es))
2162                        flags |= FIEMAP_EXTENT_UNWRITTEN;
2163                if (ext4_es_is_delayed(&es))
2164                        flags |= (FIEMAP_EXTENT_DELALLOC |
2165                                  FIEMAP_EXTENT_UNKNOWN);
2166                if (ext4_es_is_hole(&es))
2167                        flags |= EXT4_FIEMAP_EXTENT_HOLE;
2168                if (next == 0)
2169                        flags |= FIEMAP_EXTENT_LAST;
2170                if (flags & (FIEMAP_EXTENT_DELALLOC|
2171                             EXT4_FIEMAP_EXTENT_HOLE))
2172                        es.es_pblk = 0;
2173                else
2174                        es.es_pblk = ext4_es_pblock(&es);
2175                err = fiemap_fill_next_extent(fieinfo,
2176                                (__u64)es.es_lblk << blksize_bits,
2177                                (__u64)es.es_pblk << blksize_bits,
2178                                (__u64)es.es_len << blksize_bits,
2179                                flags);
2180                if (next == 0)
2181                        break;
2182                block = next;
2183                if (err < 0)
2184                        return err;
2185                if (err == 1)
2186                        return 0;
2187        }
2188        return 0;
2189}
2190
2191
2192/*
2193 * ext4_ext_determine_hole - determine hole around given block
2194 * @inode:      inode we lookup in
2195 * @path:       path in extent tree to @lblk
2196 * @lblk:       pointer to logical block around which we want to determine hole
2197 *
2198 * Determine the hole length (and start, if easily possible) around the given
2199 * logical block. We don't try too hard to find the beginning of the hole, but
2200 * if @path points to the extent just before @lblk, we provide it.
2201 *
2202 * The function returns the length of a hole starting at @lblk. We update @lblk
2203 * to the beginning of the hole if we managed to find it.
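     *
     * For example, if @lblk falls in a gap after an extent ending at block
     * 199 and the next allocated block is 350, *lblk is set to 200 and the
     * returned length is 150.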
2204 */
2205static ext4_lblk_t ext4_ext_determine_hole(struct inode *inode,
2206                                           struct ext4_ext_path *path,
2207                                           ext4_lblk_t *lblk)
2208{
2209        int depth = ext_depth(inode);
2210        struct ext4_extent *ex;
2211        ext4_lblk_t len;
2212
2213        ex = path[depth].p_ext;
2214        if (ex == NULL) {
2215                /* there is no extent yet, so gap is [0;-] */
2216                *lblk = 0;
2217                len = EXT_MAX_BLOCKS;
2218        } else if (*lblk < le32_to_cpu(ex->ee_block)) {
2219                len = le32_to_cpu(ex->ee_block) - *lblk;
2220        } else if (*lblk >= le32_to_cpu(ex->ee_block)
2221                        + ext4_ext_get_actual_len(ex)) {
2222                ext4_lblk_t next;
2223
2224                *lblk = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex);
2225                next = ext4_ext_next_allocated_block(path);
2226                BUG_ON(next == *lblk);
2227                len = next - *lblk;
2228        } else {
2229                BUG();
2230        }
2231        return len;
2232}
2233
2234/*
2235 * ext4_ext_put_gap_in_cache:
2236 * calculate boundaries of the gap that the requested block fits into
2237 * and cache this gap
2238 */
2239static void
2240ext4_ext_put_gap_in_cache(struct inode *inode, ext4_lblk_t hole_start,
2241                          ext4_lblk_t hole_len)
2242{
2243        struct extent_status es;
2244
2245        ext4_es_find_extent_range(inode, &ext4_es_is_delayed, hole_start,
2246                                  hole_start + hole_len - 1, &es);
2247        if (es.es_len) {
2248                /* Is there a delayed extent containing lblock? */
2249                if (es.es_lblk <= hole_start)
2250                        return;
2251                hole_len = min(es.es_lblk - hole_start, hole_len);
2252        }
2253        ext_debug(inode, " -> %u:%u\n", hole_start, hole_len);
2254        ext4_es_insert_extent(inode, hole_start, hole_len, ~0,
2255                              EXTENT_STATUS_HOLE);
2256}
2257
2258/*
2259 * ext4_ext_rm_idx:
2260 * removes index from the index block.
2261 */
2262static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
2263                        struct ext4_ext_path *path, int depth)
2264{
2265        int err;
2266        ext4_fsblk_t leaf;
2267
2268        /* free index block */
2269        depth--;
2270        path = path + depth;
2271        leaf = ext4_idx_pblock(path->p_idx);
2272        if (unlikely(path->p_hdr->eh_entries == 0)) {
2273                EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0");
2274                return -EFSCORRUPTED;
2275        }
2276        err = ext4_ext_get_access(handle, inode, path);
2277        if (err)
2278                return err;
2279
2280        if (path->p_idx != EXT_LAST_INDEX(path->p_hdr)) {
2281                int len = EXT_LAST_INDEX(path->p_hdr) - path->p_idx;
2282                len *= sizeof(struct ext4_extent_idx);
2283                memmove(path->p_idx, path->p_idx + 1, len);
2284        }
2285
2286        le16_add_cpu(&path->p_hdr->eh_entries, -1);
2287        err = ext4_ext_dirty(handle, inode, path);
2288        if (err)
2289                return err;
2290        ext_debug(inode, "index is empty, remove it, free block %llu\n", leaf);
2291        trace_ext4_ext_rm_idx(inode, leaf);
2292
2293        ext4_free_blocks(handle, inode, NULL, leaf, 1,
2294                         EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
2295
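            /*
             * Walk up the tree: while the index at each level is the
             * leftmost entry in its block, update the parent index's
             * starting block to match the level below.
             */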
2296        while (--depth >= 0) {
2297                if (path->p_idx != EXT_FIRST_INDEX(path->p_hdr))
2298                        break;
2299                path--;
2300                err = ext4_ext_get_access(handle, inode, path);
2301                if (err)
2302                        break;
2303                path->p_idx->ei_block = (path+1)->p_idx->ei_block;
2304                err = ext4_ext_dirty(handle, inode, path);
2305                if (err)
2306                        break;
2307        }
2308        return err;
2309}
2310
2311/*
2312 * ext4_ext_calc_credits_for_single_extent:
2313 * This routine returns the maximum number of credits needed to insert an
2314 * extent into the extent tree.
2315 * When passing the actual path, the caller should calculate the credits
2316 * under i_data_sem.
2317 */
2318int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks,
2319                                                struct ext4_ext_path *path)
2320{
2321        if (path) {
2322                int depth = ext_depth(inode);
2323                int ret = 0;
2324
2325                /* probably there is space in leaf? */
2326                if (le16_to_cpu(path[depth].p_hdr->eh_entries)
2327                                < le16_to_cpu(path[depth].p_hdr->eh_max)) {
2328
2329                        /*
2330                         *  There is some space in the leaf; no need
2331                         *  to account for a leaf block credit.
2332                         *
2333                         *  Bitmaps, block group descriptor blocks
2334                         *  and other metadata blocks still need to be
2335                         *  accounted for.
2336                         */
2337                        /* 1 bitmap, 1 block group descriptor */
2338                        ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb);
2339                        return ret;
2340                }
2341        }
2342
2343        return ext4_chunk_trans_blocks(inode, nrblocks);
2344}
2345
2346/*
2347 * How many index/leaf blocks need to change/allocate to add @extents extents?
2348 *
2349 * If we add a single extent, then in the worst case each tree level's
2350 * index/leaf needs to be changed in case the tree splits.
2351 *
2352 * If more extents are inserted, they could cause the whole tree to split more
2353 * than once, but this is really rare.
2354 */
2355int ext4_ext_index_trans_blocks(struct inode *inode, int extents)
2356{
2357        int index;
2358        int depth;
2359
2360        /* If we are converting the inline data, only one is needed here. */
2361        if (ext4_has_inline_data(inode))
2362                return 1;
2363
2364        depth = ext_depth(inode);
2365
2366        if (extents <= 1)
2367                index = depth * 2;
2368        else
2369                index = depth * 3;
2370
2371        return index;
2372}
2373
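    /*
     * Pick default flags for ext4_free_blocks(): blocks of directories,
     * symlinks and EA inodes are journalled metadata, so they get both the
     * METADATA and FORGET treatment; with data journalling, freed data
     * blocks also need a journal forget.  Ordinary data blocks in
     * ordered/writeback mode need neither.
     */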
2374static inline int get_default_free_blocks_flags(struct inode *inode)
2375{
2376        if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode) ||
2377            ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE))
2378                return EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET;
2379        else if (ext4_should_journal_data(inode))
2380                return EXT4_FREE_BLOCKS_FORGET;
2381        return 0;
2382}
2383
2384/*
2385 * ext4_rereserve_cluster - increment the reserved cluster count when
2386 *                          freeing a cluster with a pending reservation
2387 *
2388 * @inode - file containing the cluster
2389 * @lblk - logical block in cluster to be reserved
2390 *
2391 * Increments the reserved cluster count and adjusts quota in a bigalloc
2392 * file system when freeing a partial cluster containing at least one
2393 * delayed and unwritten block.  A partial cluster meeting that
2394 * requirement will have a pending reservation.  If so, the
2395 * RERESERVE_CLUSTER flag is used when calling ext4_free_blocks() to
2396 * defer reserved and allocated space accounting to a subsequent call
2397 * to this function.
2398 */
2399static void ext4_rereserve_cluster(struct inode *inode, ext4_lblk_t lblk)
2400{
2401        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2402        struct ext4_inode_info *ei = EXT4_I(inode);
2403
2404        dquot_reclaim_block(inode, EXT4_C2B(sbi, 1));
2405
2406        spin_lock(&ei->i_block_reservation_lock);
2407        ei->i_reserved_data_blocks++;
2408        percpu_counter_add(&sbi->s_dirtyclusters_counter, 1);
2409        spin_unlock(&ei->i_block_reservation_lock);
2410
2411        percpu_counter_add(&sbi->s_freeclusters_counter, 1);
2412        ext4_remove_pending(inode, lblk);
2413}
2414
2415static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
2416                              struct ext4_extent *ex,
2417                              struct partial_cluster *partial,
2418                              ext4_lblk_t from, ext4_lblk_t to)
2419{
2420        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2421        unsigned short ee_len = ext4_ext_get_actual_len(ex);
2422        ext4_fsblk_t last_pblk, pblk;
2423        ext4_lblk_t num;
2424        int flags;
2425
2426        /* only extent tail removal is allowed */
2427        if (from < le32_to_cpu(ex->ee_block) ||
2428            to != le32_to_cpu(ex->ee_block) + ee_len - 1) {
2429                ext4_error(sbi->s_sb,
2430                           "strange request: removal(2) %u-%u from %u:%u",
2431                           from, to, le32_to_cpu(ex->ee_block), ee_len);
2432                return 0;
2433        }
2434
2435#ifdef EXTENTS_STATS
2436        spin_lock(&sbi->s_ext_stats_lock);
2437        sbi->s_ext_blocks += ee_len;
2438        sbi->s_ext_extents++;
2439        if (ee_len < sbi->s_ext_min)
2440                sbi->s_ext_min = ee_len;
2441        if (ee_len > sbi->s_ext_max)
2442                sbi->s_ext_max = ee_len;
2443        if (ext_depth(inode) > sbi->s_depth_max)
2444                sbi->s_depth_max = ext_depth(inode);
2445        spin_unlock(&sbi->s_ext_stats_lock);
2446#endif
2447
2448        trace_ext4_remove_blocks(inode, ex, from, to, partial);
2449
2450        /*
2451         * if we have a partial cluster, and it's different from the
2452         * cluster of the last block in the extent, we free it
2453         */
2454        last_pblk = ext4_ext_pblock(ex) + ee_len - 1;
2455
2456        if (partial->state != initial &&
2457            partial->pclu != EXT4_B2C(sbi, last_pblk)) {
2458                if (partial->state == tofree) {
2459                        flags = get_default_free_blocks_flags(inode);
2460                        if (ext4_is_pending(inode, partial->lblk))
2461                                flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER;
2462                        ext4_free_blocks(handle, inode, NULL,
2463                                         EXT4_C2B(sbi, partial->pclu),
2464                                         sbi->s_cluster_ratio, flags);
2465                        if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)
2466                                ext4_rereserve_cluster(inode, partial->lblk);
2467                }
2468                partial->state = initial;
2469        }
2470
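            /*
             * num is the number of blocks being removed from the tail of
             * the extent (from 'from' to its end), and pblk is the physical
             * block corresponding to 'from'.
             */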
2471        num = le32_to_cpu(ex->ee_block) + ee_len - from;
2472        pblk = ext4_ext_pblock(ex) + ee_len - num;
2473
2474        /*
2475         * We free the partial cluster at the end of the extent (if any),
2476         * unless the cluster is used by another extent (partial_cluster
2477         * state is nofree).  If a partial cluster exists here, it must be
2478         * shared with the last block in the extent.
2479         */
2480        flags = get_default_free_blocks_flags(inode);
2481
2482        /* partial, left end cluster aligned, right end unaligned */
2483        if ((EXT4_LBLK_COFF(sbi, to) != sbi->s_cluster_ratio - 1) &&
2484            (EXT4_LBLK_CMASK(sbi, to) >= from) &&
2485            (partial->state != nofree)) {
2486                if (ext4_is_pending(inode, to))
2487                        flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER;
2488                ext4_free_blocks(handle, inode, NULL,
2489                                 EXT4_PBLK_CMASK(sbi, last_pblk),
2490                                 sbi->s_cluster_ratio, flags);
2491                if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)
2492                        ext4_rereserve_cluster(inode, to);
2493                partial->state = initial;
2494                flags = get_default_free_blocks_flags(inode);
2495        }
2496
2497        flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER;
2498
2499        /*
2500         * For bigalloc file systems, we never free a partial cluster
2501         * at the beginning of the extent.  Instead, we check to see if we
2502         * need to free it on a subsequent call to ext4_remove_blocks,
2503         * or at the end of ext4_ext_rm_leaf or ext4_ext_remove_space.
2504         */
2505        flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER;
2506        ext4_free_blocks(handle, inode, NULL, pblk, num, flags);
2507
2508        /* reset the partial cluster if we've freed past it */
2509        if (partial->state != initial && partial->pclu != EXT4_B2C(sbi, pblk))
2510                partial->state = initial;
2511
2512        /*
2513         * If we've freed the entire extent but the beginning is not left
2514         * cluster aligned and is not marked as ineligible for freeing we
2515         * record the partial cluster at the beginning of the extent.  It
2516         * wasn't freed by the preceding ext4_free_blocks() call, and we
2517         * need to look farther to the left to determine if it's to be freed
2518         * (not shared with another extent). Else, reset the partial
2519         * cluster - we're either done freeing or the beginning of the
2520         * extent is left cluster aligned.
2521         */
2522        if (EXT4_LBLK_COFF(sbi, from) && num == ee_len) {
2523                if (partial->state == initial) {
2524                        partial->pclu = EXT4_B2C(sbi, pblk);
2525                        partial->lblk = from;
2526                        partial->state = tofree;
2527                }
2528        } else {
2529                partial->state = initial;
2530        }
2531
2532        return 0;
2533}
2534
2535/*
2536 * ext4_ext_rm_leaf() Removes the extents associated with the
2537 * blocks appearing between "start" and "end".  Both "start"
2538 * and "end" must appear in the same extent or EIO is returned.
2539 *
2540 * @handle: The journal handle
2541 * @inode:  The file's inode
2542 * @path:   The path to the leaf
2543 * @partial_cluster: The cluster which we'll have to free if all extents
2544 *                   have been released from it.  However, if this value is
2545 *                   negative, it's a cluster just to the right of the
2546 *                   punched region and it must not be freed.
2547 * @start:  The first block to remove
2548 * @end:   The last block to remove
2549 */
2550static int
2551ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2552                 struct ext4_ext_path *path,
2553                 struct partial_cluster *partial,
2554                 ext4_lblk_t start, ext4_lblk_t end)
2555{
2556        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2557        int err = 0, correct_index = 0;
2558        int depth = ext_depth(inode), credits, revoke_credits;
2559        struct ext4_extent_header *eh;
2560        ext4_lblk_t a, b;
2561        unsigned num;
2562        ext4_lblk_t ex_ee_block;
2563        unsigned short ex_ee_len;
2564        unsigned unwritten = 0;
2565        struct ext4_extent *ex;
2566        ext4_fsblk_t pblk;
2567
2568        /* the header must be checked already in ext4_ext_remove_space() */
2569        ext_debug(inode, "truncate since %u in leaf to %u\n", start, end);
2570        if (!path[depth].p_hdr)
2571                path[depth].p_hdr = ext_block_hdr(path[depth].p_bh);
2572        eh = path[depth].p_hdr;
2573        if (unlikely(path[depth].p_hdr == NULL)) {
2574                EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
2575                return -EFSCORRUPTED;
2576        }
2577        /* find where to start removing */
2578        ex = path[depth].p_ext;
2579        if (!ex)
2580                ex = EXT_LAST_EXTENT(eh);
2581
2582        ex_ee_block = le32_to_cpu(ex->ee_block);
2583        ex_ee_len = ext4_ext_get_actual_len(ex);
2584
2585        trace_ext4_ext_rm_leaf(inode, start, ex, partial);
2586
2587        while (ex >= EXT_FIRST_EXTENT(eh) &&
2588                        ex_ee_block + ex_ee_len > start) {
2589
2590                if (ext4_ext_is_unwritten(ex))
2591                        unwritten = 1;
2592                else
2593                        unwritten = 0;
2594
2595                ext_debug(inode, "remove ext %u:[%d]%d\n", ex_ee_block,
2596                          unwritten, ex_ee_len);
2597                path[depth].p_ext = ex;
2598
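                    /*
                     * Clamp the removal range to this extent: [a, b] is the
                     * part of [start, end] that falls within it.
                     */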
2599                a = ex_ee_block > start ? ex_ee_block : start;
2600                b = ex_ee_block+ex_ee_len - 1 < end ?
2601                        ex_ee_block+ex_ee_len - 1 : end;
2602
2603                ext_debug(inode, "  border %u:%u\n", a, b);
2604
2605                /* If this extent is beyond the end of the hole, skip it */
2606                if (end < ex_ee_block) {
2607                        /*
2608                         * We're going to skip this extent and move to another,
2609                         * so note that its first cluster is in use to avoid
2610                         * freeing it when removing blocks.  Eventually, the
2611                         * right edge of the truncated/punched region will
2612                         * be just to the left.
2613                         */
2614                        if (sbi->s_cluster_ratio > 1) {
2615                                pblk = ext4_ext_pblock(ex);
2616                                partial->pclu = EXT4_B2C(sbi, pblk);
2617                                partial->state = nofree;
2618                        }
2619                        ex--;
2620                        ex_ee_block = le32_to_cpu(ex->ee_block);
2621                        ex_ee_len = ext4_ext_get_actual_len(ex);
2622                        continue;
2623                } else if (b != ex_ee_block + ex_ee_len - 1) {
2624                        EXT4_ERROR_INODE(inode,
2625                                         "can not handle truncate %u:%u "
2626                                         "on extent %u:%u",
2627                                         start, end, ex_ee_block,
2628                                         ex_ee_block + ex_ee_len - 1);
2629                        err = -EFSCORRUPTED;
2630                        goto out;
2631                } else if (a != ex_ee_block) {
2632                        /* remove tail of the extent */
2633                        num = a - ex_ee_block;
2634                } else {
2635                        /* remove whole extent: excellent! */
2636                        num = 0;
2637                }
2638                /*
2639                 * 3 for leaf, sb, and inode plus 2 (bmap and group
2640                 * descriptor) for each block group; assume two block
2641                 * groups plus ex_ee_len/blocks_per_block_group for
2642                 * the worst case
2643                 */
2644                credits = 7 + 2*(ex_ee_len/EXT4_BLOCKS_PER_GROUP(inode->i_sb));
2645                if (ex == EXT_FIRST_EXTENT(eh)) {
2646                        correct_index = 1;
2647                        credits += (ext_depth(inode)) + 1;
2648                }
2649                credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
2650                /*
2651                 * We may end up freeing some index blocks and data from the
2652                 * punched range. Note that partial clusters are accounted for
2653                 * by ext4_free_data_revoke_credits().
2654                 */
2655                revoke_credits =
2656                        ext4_free_metadata_revoke_credits(inode->i_sb,
2657                                                          ext_depth(inode)) +
2658                        ext4_free_data_revoke_credits(inode, b - a + 1);
2659
2660                err = ext4_datasem_ensure_credits(handle, inode, credits,
2661                                                  credits, revoke_credits);
2662                if (err) {
2663                        if (err > 0)
2664                                err = -EAGAIN;
2665                        goto out;
2666                }
2667
2668                err = ext4_ext_get_access(handle, inode, path + depth);
2669                if (err)
2670                        goto out;
2671
2672                err = ext4_remove_blocks(handle, inode, ex, partial, a, b);
2673                if (err)
2674                        goto out;
2675
2676                if (num == 0)
2677                        /* this extent is removed; mark slot entirely unused */
2678                        ext4_ext_store_pblock(ex, 0);
2679
2680                ex->ee_len = cpu_to_le16(num);
2681                /*
2682                 * Do not mark unwritten if all the blocks in the
2683                 * extent have been removed.
2684                 */
2685                if (unwritten && num)
2686                        ext4_ext_mark_unwritten(ex);
2687                /*
2688                 * If the extent was completely released,
2689                 * we need to remove it from the leaf
2690                 */
2691                if (num == 0) {
2692                        if (end != EXT_MAX_BLOCKS - 1) {
2693                                /*
2694                                 * For hole punching, we need to scoot all the
2695                                 * extents up when an extent is removed so that
2696                                 * we don't have blank extents in the middle
2697                                 */
2698                                memmove(ex, ex+1, (EXT_LAST_EXTENT(eh) - ex) *
2699                                        sizeof(struct ext4_extent));
2700
2701                                /* Now get rid of the one at the end */
2702                                memset(EXT_LAST_EXTENT(eh), 0,
2703                                        sizeof(struct ext4_extent));
2704                        }
2705                        le16_add_cpu(&eh->eh_entries, -1);
2706                }
2707
2708                err = ext4_ext_dirty(handle, inode, path + depth);
2709                if (err)
2710                        goto out;
2711
2712                ext_debug(inode, "new extent: %u:%u:%llu\n", ex_ee_block, num,
2713                                ext4_ext_pblock(ex));
2714                ex--;
2715                ex_ee_block = le32_to_cpu(ex->ee_block);
2716                ex_ee_len = ext4_ext_get_actual_len(ex);
2717        }
2718
2719        if (correct_index && eh->eh_entries)
2720                err = ext4_ext_correct_indexes(handle, inode, path);
2721
2722        /*
2723         * If there's a partial cluster and at least one extent remains in
2724         * the leaf, free the partial cluster if it isn't shared with the
2725         * current extent.  If it is shared with the current extent
2726         * we reset the partial cluster because we've reached the start of the
2727         * truncated/punched region and we're done removing blocks.
2728         */
2729        if (partial->state == tofree && ex >= EXT_FIRST_EXTENT(eh)) {
2730                pblk = ext4_ext_pblock(ex) + ex_ee_len - 1;
2731                if (partial->pclu != EXT4_B2C(sbi, pblk)) {
2732                        int flags = get_default_free_blocks_flags(inode);
2733
2734                        if (ext4_is_pending(inode, partial->lblk))
2735                                flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER;
2736                        ext4_free_blocks(handle, inode, NULL,
2737                                         EXT4_C2B(sbi, partial->pclu),
2738                                         sbi->s_cluster_ratio, flags);
2739                        if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)
2740                                ext4_rereserve_cluster(inode, partial->lblk);
2741                }
2742                partial->state = initial;
2743        }
2744
2745        /* if this leaf is free, then we should
2746         * remove it from index block above */
2747        if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL)
2748                err = ext4_ext_rm_idx(handle, inode, path, depth);
2749
2750out:
2751        return err;
2752}
2753
2754/*
2755 * ext4_ext_more_to_rm:
2756 * returns 1 if current index has to be freed (even partial)
2757 */
2758static int
2759ext4_ext_more_to_rm(struct ext4_ext_path *path)
2760{
2761        BUG_ON(path->p_idx == NULL);
2762
2763        if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr))
2764                return 0;
2765
2766        /*
2767         * if truncate on deeper level happened, it wasn't partial,
2768         * so we have to consider current index for truncation
2769         */
2770        if (le16_to_cpu(path->p_hdr->eh_entries) == path->p_block)
2771                return 0;
2772        return 1;
2773}
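
/*
 * Rough shape of the walk that uses this predicate (a simplified sketch of
 * ext4_ext_remove_space() below, with error handling omitted).  Leaves are
 * handled by ext4_ext_rm_leaf(); index blocks that become empty are removed
 * with ext4_ext_rm_idx() on the way back up:
 *
 *	i = 0;
 *	while (i >= 0) {
 *		if (i == depth) {
 *			ext4_ext_rm_leaf(...);
 *			i--;
 *		} else if (ext4_ext_more_to_rm(path + i)) {
 *			i++;
 *		} else {
 *			ext4_ext_rm_idx(...);
 *			i--;
 *		}
 *	}
 *
 * path[i].p_block caches the number of index entries at each level so a level
 * already fully processed on a deeper pass is not revisited.
 */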
2774
2775int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
2776                          ext4_lblk_t end)
2777{
2778        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2779        int depth = ext_depth(inode);
2780        struct ext4_ext_path *path = NULL;
2781        struct partial_cluster partial;
2782        handle_t *handle;
2783        int i = 0, err = 0;
2784
2785        partial.pclu = 0;
2786        partial.lblk = 0;
2787        partial.state = initial;
2788
2789        ext_debug(inode, "truncate since %u to %u\n", start, end);
2790
2791        /* probably the first extent we're going to free will be the last in the block */
2792        handle = ext4_journal_start_with_revoke(inode, EXT4_HT_TRUNCATE,
2793                        depth + 1,
2794                        ext4_free_metadata_revoke_credits(inode->i_sb, depth));
2795        if (IS_ERR(handle))
2796                return PTR_ERR(handle);
2797
2798again:
2799        trace_ext4_ext_remove_space(inode, start, end, depth);
2800
2801        /*
2802         * Check if we are removing extents inside the extent tree. If that
2803         * is the case, we are going to punch a hole inside the extent tree
2804         * so we have to check whether we need to split the extent covering
2805         * the last block to remove so we can easily remove the part of it
2806         * in ext4_ext_rm_leaf().
2807         */
2808        if (end < EXT_MAX_BLOCKS - 1) {
2809                struct ext4_extent *ex;
2810                ext4_lblk_t ee_block, ex_end, lblk;
2811                ext4_fsblk_t pblk;
2812
2813                /* find extent for or closest extent to this block */
2814                path = ext4_find_extent(inode, end, NULL,
2815                                        EXT4_EX_NOCACHE | EXT4_EX_NOFAIL);
2816                if (IS_ERR(path)) {
2817                        ext4_journal_stop(handle);
2818                        return PTR_ERR(path);
2819                }
2820                depth = ext_depth(inode);
2821                /* Leaf may not exist only if inode has no blocks at all */
2822                ex = path[depth].p_ext;
2823                if (!ex) {
2824                        if (depth) {
2825                                EXT4_ERROR_INODE(inode,
2826                                                 "path[%d].p_hdr == NULL",
2827                                                 depth);
2828                                err = -EFSCORRUPTED;
2829                        }
2830                        goto out;
2831                }
2832
2833                ee_block = le32_to_cpu(ex->ee_block);
2834                ex_end = ee_block + ext4_ext_get_actual_len(ex) - 1;
2835
2836                /*
2837                 * See if the last block is inside the extent; if so, split
2838                 * the extent at 'end' block so we can easily remove the
2839                 * tail of the first part of the split extent in
2840                 * ext4_ext_rm_leaf().
2841                 */
2842                if (end >= ee_block && end < ex_end) {
2843
2844                        /*
2845                         * If we're going to split the extent, note that
2846                         * the cluster containing the block after 'end' is
2847                         * in use to avoid freeing it when removing blocks.
2848                         */
2849                        if (sbi->s_cluster_ratio > 1) {
2850                                pblk = ext4_ext_pblock(ex) + end - ee_block + 1;
2851                                partial.pclu = EXT4_B2C(sbi, pblk);
2852                                partial.state = nofree;
2853                        }
2854
2855                        /*
2856                         * Split the extent in two so that 'end' is the last
2857                         * block in the first new extent. Also we should not
2858                         * fail removing space due to ENOSPC so try to use
2859                         * reserved block if that happens.
2860                         */
2861                        err = ext4_force_split_extent_at(handle, inode, &path,
2862                                                         end + 1, 1);
2863                        if (err < 0)
2864                                goto out;
2865
2866                } else if (sbi->s_cluster_ratio > 1 && end >= ex_end &&
2867                           partial.state == initial) {
2868                        /*
2869                         * If we're punching, there's an extent to the right.
2870                         * If the partial cluster hasn't been set, set it to
2871                         * that extent's first cluster and its state to nofree
2872                         * so it won't be freed should it contain blocks to be
2873                         * removed. If it's already set (tofree/nofree), we're
2874                         * retrying and keep the original partial cluster info
2875                         * so a cluster marked tofree as a result of earlier
2876                         * extent removal is not lost.
2877                         */
2878                        lblk = ex_end + 1;
2879                        err = ext4_ext_search_right(inode, path, &lblk, &pblk,
2880                                                    NULL);
2881                        if (err < 0)
2882                                goto out;
2883                        if (pblk) {
2884                                partial.pclu = EXT4_B2C(sbi, pblk);
2885                                partial.state = nofree;
2886                        }
2887                }
2888        }
2889        /*
2890         * We start scanning from the right side, freeing all the blocks
2891         * after i_size and walking into the tree depth-wise.
2892         */
2893        depth = ext_depth(inode);
2894        if (path) {
2895                int k = i = depth;
2896                while (--k > 0)
2897                        path[k].p_block =
2898                                le16_to_cpu(path[k].p_hdr->eh_entries)+1;
2899        } else {
2900                path = kcalloc(depth + 1, sizeof(struct ext4_ext_path),
2901                               GFP_NOFS | __GFP_NOFAIL);
2902                if (path == NULL) {
2903                        ext4_journal_stop(handle);
2904                        return -ENOMEM;
2905                }
2906                path[0].p_maxdepth = path[0].p_depth = depth;
2907                path[0].p_hdr = ext_inode_hdr(inode);
2908                i = 0;
2909
2910                if (ext4_ext_check(inode, path[0].p_hdr, depth, 0)) {
2911                        err = -EFSCORRUPTED;
2912                        goto out;
2913                }
2914        }
2915        err = 0;
2916
2917        while (i >= 0 && err == 0) {
2918                if (i == depth) {
2919                        /* this is leaf block */
2920                        err = ext4_ext_rm_leaf(handle, inode, path,
2921                                               &partial, start, end);
2922                        /* root level has p_bh == NULL, brelse() eats this */
2923                        brelse(path[i].p_bh);
2924                        path[i].p_bh = NULL;
2925                        i--;
2926                        continue;
2927                }
2928
2929                /* this is index block */
2930                if (!path[i].p_hdr) {
2931                        ext_debug(inode, "initialize header\n");
2932                        path[i].p_hdr = ext_block_hdr(path[i].p_bh);
2933                }
2934
2935                if (!path[i].p_idx) {
2936                        /* this level hasn't been touched yet */
2937                        path[i].p_idx = EXT_LAST_INDEX(path[i].p_hdr);
2938                        path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries)+1;
2939                        ext_debug(inode, "init index ptr: hdr 0x%p, num %d\n",
2940                                  path[i].p_hdr,
2941                                  le16_to_cpu(path[i].p_hdr->eh_entries));
2942                } else {
2943                        /* we were already here, move on to the next index */
2944                        path[i].p_idx--;
2945                }
2946
2947                ext_debug(inode, "level %d - index, first 0x%p, cur 0x%p\n",
2948                                i, EXT_FIRST_INDEX(path[i].p_hdr),
2949                                path[i].p_idx);
2950                if (ext4_ext_more_to_rm(path + i)) {
2951                        struct buffer_head *bh;
2952                        /* go to the next level */
2953                        ext_debug(inode, "move to level %d (block %llu)\n",
2954                                  i + 1, ext4_idx_pblock(path[i].p_idx));
2955                        memset(path + i + 1, 0, sizeof(*path));
2956                        bh = read_extent_tree_block(inode,
2957                                ext4_idx_pblock(path[i].p_idx), depth - i - 1,
2958                                EXT4_EX_NOCACHE);
2959                        if (IS_ERR(bh)) {
2960                                /* should we reset i_size? */
2961                                err = PTR_ERR(bh);
2962                                break;
2963                        }
2964                        /* Yield here to deal with large extent trees.
2965                         * Should be a no-op if we did IO above. */
2966                        cond_resched();
2967                        if (WARN_ON(i + 1 > depth)) {
2968                                err = -EFSCORRUPTED;
2969                                break;
2970                        }
2971                        path[i + 1].p_bh = bh;
2972
2973                        /* save actual number of indexes since this
2974                         * number is changed at the next iteration */
2975                        path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries);
2976                        i++;
2977                } else {
2978                        /* we finished processing this index, go up */
2979                        if (path[i].p_hdr->eh_entries == 0 && i > 0) {
2980                                /* index is empty, remove it;
2981                                 * the handle must already be prepared by
2982                                 * the earlier leaf removal */
2983                                err = ext4_ext_rm_idx(handle, inode, path, i);
2984                        }
2985                        /* root level has p_bh == NULL, brelse() eats this */
2986                        brelse(path[i].p_bh);
2987                        path[i].p_bh = NULL;
2988                        i--;
2989                        ext_debug(inode, "return to level %d\n", i);
2990                }
2991        }
2992
2993        trace_ext4_ext_remove_space_done(inode, start, end, depth, &partial,
2994                                         path->p_hdr->eh_entries);
2995
2996        /*
2997         * if there's a partial cluster and we have removed the first extent
2998         * in the file, then we also free the partial cluster
2999         */
3000        if (partial.state == tofree && err == 0) {
3001                int flags = get_default_free_blocks_flags(inode);
3002
3003                if (ext4_is_pending(inode, partial.lblk))
3004                        flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER;
3005                ext4_free_blocks(handle, inode, NULL,
3006                                 EXT4_C2B(sbi, partial.pclu),
3007                                 sbi->s_cluster_ratio, flags);
3008                if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)
3009                        ext4_rereserve_cluster(inode, partial.lblk);
3010                partial.state = initial;
3011        }
3012
3013        /* TODO: flexible tree reduction should be here */
3014        if (path->p_hdr->eh_entries == 0) {
3015                /*
3016                 * truncate to zero freed the whole tree,
3017                 * so we need to correct eh_depth
3018                 */
3019                err = ext4_ext_get_access(handle, inode, path);
3020                if (err == 0) {
3021                        ext_inode_hdr(inode)->eh_depth = 0;
3022                        ext_inode_hdr(inode)->eh_max =
3023                                cpu_to_le16(ext4_ext_space_root(inode, 0));
3024                        err = ext4_ext_dirty(handle, inode, path);
3025                }
3026        }
3027out:
3028        ext4_ext_drop_refs(path);
3029        kfree(path);
3030        path = NULL;
3031        if (err == -EAGAIN)
3032                goto again;
3033        ext4_journal_stop(handle);
3034
3035        return err;
3036}
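
/*
 * Typical usage, sketched here for orientation (see the actual callers for
 * the exact arguments): truncation removes everything from the new EOF block
 * to the end of the logical address space, while hole punching removes a
 * bounded range.
 *
 *	err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1);
 *	err = ext4_ext_remove_space(inode, first_block, stop_block - 1);
 *
 * Passing end == EXT_MAX_BLOCKS - 1 skips the "split the extent covering
 * 'end'" work above, since nothing to the right of the range can survive.
 */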
3037
3038/*
3039 * called at mount time
3040 */
3041void ext4_ext_init(struct super_block *sb)
3042{
3043        /*
3044         * possible initialization would be here
3045         */
3046
3047        if (ext4_has_feature_extents(sb)) {
3048#if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS)
3049                printk(KERN_INFO "EXT4-fs: file extents enabled"
3050#ifdef AGGRESSIVE_TEST
3051                       ", aggressive tests"
3052#endif
3053#ifdef CHECK_BINSEARCH
3054                       ", check binsearch"
3055#endif
3056#ifdef EXTENTS_STATS
3057                       ", stats"
3058#endif
3059                       "\n");
3060#endif
3061#ifdef EXTENTS_STATS
3062                spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock);
3063                EXT4_SB(sb)->s_ext_min = 1 << 30;
3064                EXT4_SB(sb)->s_ext_max = 0;
3065#endif
3066        }
3067}
3068
3069/*
3070 * called at umount time
3071 */
3072void ext4_ext_release(struct super_block *sb)
3073{
3074        if (!ext4_has_feature_extents(sb))
3075                return;
3076
3077#ifdef EXTENTS_STATS
3078        if (EXT4_SB(sb)->s_ext_blocks && EXT4_SB(sb)->s_ext_extents) {
3079                struct ext4_sb_info *sbi = EXT4_SB(sb);
3080                printk(KERN_ERR "EXT4-fs: %lu blocks in %lu extents (%lu ave)\n",
3081                        sbi->s_ext_blocks, sbi->s_ext_extents,
3082                        sbi->s_ext_blocks / sbi->s_ext_extents);
3083                printk(KERN_ERR "EXT4-fs: extents: %lu min, %lu max, max depth %lu\n",
3084                        sbi->s_ext_min, sbi->s_ext_max, sbi->s_depth_max);
3085        }
3086#endif
3087}
3088
3089static int ext4_zeroout_es(struct inode *inode, struct ext4_extent *ex)
3090{
3091        ext4_lblk_t  ee_block;
3092        ext4_fsblk_t ee_pblock;
3093        unsigned int ee_len;
3094
3095        ee_block  = le32_to_cpu(ex->ee_block);
3096        ee_len    = ext4_ext_get_actual_len(ex);
3097        ee_pblock = ext4_ext_pblock(ex);
3098
3099        if (ee_len == 0)
3100                return 0;
3101
3102        return ext4_es_insert_extent(inode, ee_block, ee_len, ee_pblock,
3103                                     EXTENT_STATUS_WRITTEN);
3104}
3105
3106/* FIXME!! we need to try to merge to left or right after zero-out  */
3107static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
3108{
3109        ext4_fsblk_t ee_pblock;
3110        unsigned int ee_len;
3111
3112        ee_len    = ext4_ext_get_actual_len(ex);
3113        ee_pblock = ext4_ext_pblock(ex);
3114        return ext4_issue_zeroout(inode, le32_to_cpu(ex->ee_block), ee_pblock,
3115                                  ee_len);
3116}
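
/*
 * The two helpers above are used as a pair: ext4_ext_zeroout() writes zeroes
 * to the blocks backing an extent, and ext4_zeroout_es() then records that
 * range as written in the extent status tree so later lookups see initialized
 * data.  The split/convert paths below follow this pattern:
 *
 *	err = ext4_ext_zeroout(inode, &zero_ex);
 *	if (!err)
 *		err = ext4_zeroout_es(inode, &zero_ex);
 */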
3117
3118/*
3119 * ext4_split_extent_at() splits an extent at given block.
3120 *
3121 * @handle: the journal handle
3122 * @inode: the file inode
3123 * @ppath: the path to the extent
3124 * @split: the logical block where the extent is split.
3125 * @split_flag: indicates whether the extent may be zeroed out if the split
3126 *              fails, and the states (initialized or unwritten) of the new extents.
3127 * @flags: flags used to insert the new extent into the extent tree.
3128 *
3129 *
3130 * Splits extent [a, b] into two extents [a, @split) and [@split, b], whose
3131 * states are determined by split_flag.
3132 *
3133 * There are two cases:
3134 *  a> the extent is split into two extents.
3135 *  b> no split is needed, and the extent is just marked.
3136 *
3137 * Return 0 on success.
3138 */
3139static int ext4_split_extent_at(handle_t *handle,
3140                             struct inode *inode,
3141                             struct ext4_ext_path **ppath,
3142                             ext4_lblk_t split,
3143                             int split_flag,
3144                             int flags)
3145{
3146        struct ext4_ext_path *path = *ppath;
3147        ext4_fsblk_t newblock;
3148        ext4_lblk_t ee_block;
3149        struct ext4_extent *ex, newex, orig_ex, zero_ex;
3150        struct ext4_extent *ex2 = NULL;
3151        unsigned int ee_len, depth;
3152        int err = 0;
3153
3154        BUG_ON((split_flag & (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2)) ==
3155               (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2));
3156
3157        ext_debug(inode, "logical block %llu\n", (unsigned long long)split);
3158
3159        ext4_ext_show_leaf(inode, path);
3160
3161        depth = ext_depth(inode);
3162        ex = path[depth].p_ext;
3163        ee_block = le32_to_cpu(ex->ee_block);
3164        ee_len = ext4_ext_get_actual_len(ex);
3165        newblock = split - ee_block + ext4_ext_pblock(ex);
3166
3167        BUG_ON(split < ee_block || split >= (ee_block + ee_len));
3168        BUG_ON(!ext4_ext_is_unwritten(ex) &&
3169               split_flag & (EXT4_EXT_MAY_ZEROOUT |
3170                             EXT4_EXT_MARK_UNWRIT1 |
3171                             EXT4_EXT_MARK_UNWRIT2));
3172
3173        err = ext4_ext_get_access(handle, inode, path + depth);
3174        if (err)
3175                goto out;
3176
3177        if (split == ee_block) {
3178                /*
3179                 * case b: block @split is the block that the extent begins with,
3180                 * so we just change the state of the extent, and splitting
3181                 * is not needed.
3182                 */
3183                if (split_flag & EXT4_EXT_MARK_UNWRIT2)
3184                        ext4_ext_mark_unwritten(ex);
3185                else
3186                        ext4_ext_mark_initialized(ex);
3187
3188                if (!(flags & EXT4_GET_BLOCKS_PRE_IO))
3189                        ext4_ext_try_to_merge(handle, inode, path, ex);
3190
3191                err = ext4_ext_dirty(handle, inode, path + path->p_depth);
3192                goto out;
3193        }
3194
3195        /* case a */
3196        memcpy(&orig_ex, ex, sizeof(orig_ex));
3197        ex->ee_len = cpu_to_le16(split - ee_block);
3198        if (split_flag & EXT4_EXT_MARK_UNWRIT1)
3199                ext4_ext_mark_unwritten(ex);
3200
3201        /*
3202         * the path may lead to a new leaf, not to the original leaf any more,
3203         * after ext4_ext_insert_extent() returns.
3204         */
3205        err = ext4_ext_dirty(handle, inode, path + depth);
3206        if (err)
3207                goto fix_extent_len;
3208
3209        ex2 = &newex;
3210        ex2->ee_block = cpu_to_le32(split);
3211        ex2->ee_len   = cpu_to_le16(ee_len - (split - ee_block));
3212        ext4_ext_store_pblock(ex2, newblock);
3213        if (split_flag & EXT4_EXT_MARK_UNWRIT2)
3214                ext4_ext_mark_unwritten(ex2);
3215
3216        err = ext4_ext_insert_extent(handle, inode, ppath, &newex, flags);
3217        if (err != -ENOSPC && err != -EDQUOT)
3218                goto out;
3219
3220        if (EXT4_EXT_MAY_ZEROOUT & split_flag) {
3221                if (split_flag & (EXT4_EXT_DATA_VALID1|EXT4_EXT_DATA_VALID2)) {
3222                        if (split_flag & EXT4_EXT_DATA_VALID1) {
3223                                err = ext4_ext_zeroout(inode, ex2);
3224                                zero_ex.ee_block = ex2->ee_block;
3225                                zero_ex.ee_len = cpu_to_le16(
3226                                                ext4_ext_get_actual_len(ex2));
3227                                ext4_ext_store_pblock(&zero_ex,
3228                                                      ext4_ext_pblock(ex2));
3229                        } else {
3230                                err = ext4_ext_zeroout(inode, ex);
3231                                zero_ex.ee_block = ex->ee_block;
3232                                zero_ex.ee_len = cpu_to_le16(
3233                                                ext4_ext_get_actual_len(ex));
3234                                ext4_ext_store_pblock(&zero_ex,
3235                                                      ext4_ext_pblock(ex));
3236                        }
3237                } else {
3238                        err = ext4_ext_zeroout(inode, &orig_ex);
3239                        zero_ex.ee_block = orig_ex.ee_block;
3240                        zero_ex.ee_len = cpu_to_le16(
3241                                                ext4_ext_get_actual_len(&orig_ex));
3242                        ext4_ext_store_pblock(&zero_ex,
3243                                              ext4_ext_pblock(&orig_ex));
3244                }
3245
3246                if (!err) {
3247                        /* update the extent length and mark as initialized */
3248                        ex->ee_len = cpu_to_le16(ee_len);
3249                        ext4_ext_try_to_merge(handle, inode, path, ex);
3250                        err = ext4_ext_dirty(handle, inode, path + path->p_depth);
3251                        if (!err)
3252                                /* update extent status tree */
3253                                err = ext4_zeroout_es(inode, &zero_ex);
3254                        /* If we failed at this point, we don't know what
3255                         * state the extent tree is in, so don't try to fix the
3256                         * length of the original extent as it may do even more
3257                         * damage.
3258                         */
3259                        goto out;
3260                }
3261        }
3262
3263fix_extent_len:
3264        ex->ee_len = orig_ex.ee_len;
3265        /*
3266         * Ignore ext4_ext_dirty return value since we are already in error path
3267         * and err is a non-zero error code.
3268         */
3269        ext4_ext_dirty(handle, inode, path + path->p_depth);
3270        return err;
3271out:
3272        ext4_ext_show_leaf(inode, path);
3273        return err;
3274}
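
/*
 * Illustrative example of a split_flag combination accepted by the function
 * above (the @flags value shown is only an example).  For an unwritten
 * extent, EXT4_EXT_MARK_UNWRIT1 keeps the first half unwritten,
 * EXT4_EXT_DATA_VALID2 declares that the second half contains valid data,
 * and EXT4_EXT_MAY_ZEROOUT allows falling back to zeroing out the extent if
 * the insert fails with ENOSPC:
 *
 *	split_flag = EXT4_EXT_MARK_UNWRIT1 | EXT4_EXT_DATA_VALID2 |
 *		     EXT4_EXT_MAY_ZEROOUT;
 *	err = ext4_split_extent_at(handle, inode, &path, split,
 *				   split_flag, EXT4_GET_BLOCKS_PRE_IO);
 *
 * EXT4_EXT_DATA_VALID1 and EXT4_EXT_DATA_VALID2 must never be set together,
 * as asserted at the top of the function.
 */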
3275
3276/*
3277 * ext4_split_extent() splits an extent and marks the extent covered
3278 * by @map as split_flag indicates
3279 *
3280 * It may result in splitting the extent into multiple extents (up to three)
3281 * There are three possibilities:
3282 *   a> There is no split required
3283 *   b> Splits in two extents: Split is happening at either end of the extent
3284 *   c> Splits in three extents: Someone is splitting in the middle of the extent
3285 *
3286 */
3287static int ext4_split_extent(handle_t *handle,
3288                              struct inode *inode,
3289                              struct ext4_ext_path **ppath,
3290                              struct ext4_map_blocks *map,
3291                              int split_flag,
3292                              int flags)
3293{
3294        struct ext4_ext_path *path = *ppath;
3295        ext4_lblk_t ee_block;
3296        struct ext4_extent *ex;
3297        unsigned int ee_len, depth;
3298        int err = 0;
3299        int unwritten;
3300        int split_flag1, flags1;
3301        int allocated = map->m_len;
3302
3303        depth = ext_depth(inode);
3304        ex = path[depth].p_ext;
3305        ee_block = le32_to_cpu(ex->ee_block);
3306        ee_len = ext4_ext_get_actual_len(ex);
3307        unwritten = ext4_ext_is_unwritten(ex);
3308
3309        if (map->m_lblk + map->m_len < ee_block + ee_len) {
3310                split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT;
3311                flags1 = flags | EXT4_GET_BLOCKS_PRE_IO;
3312                if (unwritten)
3313                        split_flag1 |= EXT4_EXT_MARK_UNWRIT1 |
3314                                       EXT4_EXT_MARK_UNWRIT2;
3315                if (split_flag & EXT4_EXT_DATA_VALID2)
3316                        split_flag1 |= EXT4_EXT_DATA_VALID1;
3317                err = ext4_split_extent_at(handle, inode, ppath,
3318                                map->m_lblk + map->m_len, split_flag1, flags1);
3319                if (err)
3320                        goto out;
3321        } else {
3322                allocated = ee_len - (map->m_lblk - ee_block);
3323        }
3324        /*
3325         * An updated path is required because the previous ext4_split_extent_at()
3326         * may result in a split of the original leaf or an extent zeroout.
3327         */
3328        path = ext4_find_extent(inode, map->m_lblk, ppath, flags);
3329        if (IS_ERR(path))
3330                return PTR_ERR(path);
3331        depth = ext_depth(inode);
3332        ex = path[depth].p_ext;
3333        if (!ex) {
3334                EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
3335                                 (unsigned long) map->m_lblk);
3336                return -EFSCORRUPTED;
3337        }
3338        unwritten = ext4_ext_is_unwritten(ex);
3339        split_flag1 = 0;
3340
3341        if (map->m_lblk >= ee_block) {
3342                split_flag1 = split_flag & EXT4_EXT_DATA_VALID2;
3343                if (unwritten) {
3344                        split_flag1 |= EXT4_EXT_MARK_UNWRIT1;
3345                        split_flag1 |= split_flag & (EXT4_EXT_MAY_ZEROOUT |
3346                                                     EXT4_EXT_MARK_UNWRIT2);
3347                }
3348                err = ext4_split_extent_at(handle, inode, ppath,
3349                                map->m_lblk, split_flag1, flags);
3350                if (err)
3351                        goto out;
3352        }
3353
3354        ext4_ext_show_leaf(inode, path);
3355out:
3356        return err ? err : allocated;
3357}
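
/*
 * The three-way split above is achieved with at most two calls to
 * ext4_split_extent_at(): first at map->m_lblk + map->m_len, then, on the
 * refreshed path, at map->m_lblk.  Sketch of the resulting layout when the
 * mapped range falls in the middle of a single extent:
 *
 *	before:	[ee_block ................................. ee_block + ee_len)
 *	first:	[ee_block .............. m_lblk + m_len) [m_lblk + m_len .....)
 *	second:	[ee_block .. m_lblk) [m_lblk .. m_lblk + m_len) [..............)
 *
 * The middle piece is the one described by @map; its final state (written or
 * unwritten) is chosen by the split_flag bits passed by the caller.
 */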
3358
3359/*
3360 * This function is called by ext4_ext_map_blocks() if someone tries to write
3361 * to an unwritten extent. It may result in splitting the unwritten
3362 * extent into multiple extents (up to three - one initialized and two
3363 * unwritten).
3364 * There are three possibilities:
3365 *   a> There is no split required: Entire extent should be initialized
3366 *   b> Splits in two extents: Write is happening at either end of the extent
3367 *   c> Splits in three extents: Someone is writing in the middle of the extent
3368 *
3369 * Pre-conditions:
3370 *  - The extent pointed to by 'path' is unwritten.
3371 *  - The extent pointed to by 'path' contains a superset
3372 *    of the logical span [map->m_lblk, map->m_lblk + map->m_len).
3373 *
3374 * Post-conditions on success:
3375 *  - the returned value is the number of blocks beyond map->m_lblk
3376 *    that are allocated and initialized.
3377 *    It is guaranteed to be >= map->m_len.
3378 */
3379static int ext4_ext_convert_to_initialized(handle_t *handle,
3380                                           struct inode *inode,
3381                                           struct ext4_map_blocks *map,
3382                                           struct ext4_ext_path **ppath,
3383                                           int flags)
3384{
3385        struct ext4_ext_path *path = *ppath;
3386        struct ext4_sb_info *sbi;
3387        struct ext4_extent_header *eh;
3388        struct ext4_map_blocks split_map;
3389        struct ext4_extent zero_ex1, zero_ex2;
3390        struct ext4_extent *ex, *abut_ex;
3391        ext4_lblk_t ee_block, eof_block;
3392        unsigned int ee_len, depth, map_len = map->m_len;
3393        int allocated = 0, max_zeroout = 0;
3394        int err = 0;
3395        int split_flag = EXT4_EXT_DATA_VALID2;
3396
3397        ext_debug(inode, "logical block %llu, max_blocks %u\n",
3398                  (unsigned long long)map->m_lblk, map_len);
3399
3400        sbi = EXT4_SB(inode->i_sb);
3401        eof_block = (EXT4_I(inode)->i_disksize + inode->i_sb->s_blocksize - 1)
3402                        >> inode->i_sb->s_blocksize_bits;
3403        if (eof_block < map->m_lblk + map_len)
3404                eof_block = map->m_lblk + map_len;
3405
3406        depth = ext_depth(inode);
3407        eh = path[depth].p_hdr;
3408        ex = path[depth].p_ext;
3409        ee_block = le32_to_cpu(ex->ee_block);
3410        ee_len = ext4_ext_get_actual_len(ex);
3411        zero_ex1.ee_len = 0;
3412        zero_ex2.ee_len = 0;
3413
3414        trace_ext4_ext_convert_to_initialized_enter(inode, map, ex);
3415
3416        /* Pre-conditions */
3417        BUG_ON(!ext4_ext_is_unwritten(ex));
3418        BUG_ON(!in_range(map->m_lblk, ee_block, ee_len));
3419
3420        /*
3421         * Attempt to transfer newly initialized blocks from the currently
3422         * unwritten extent to its neighbor. This is much cheaper
3423         * than an insertion followed by a merge as those involve costly
3424         * memmove() calls. Transferring to the left is the common case in
3425         * steady state for workloads doing fallocate(FALLOC_FL_KEEP_SIZE)
3426         * followed by append writes.
3427         *
3428         * Limitations of the current logic:
3429         *  - L1: we do not deal with writes covering the whole extent.
3430         *    This would require removing the extent if the transfer
3431         *    is possible.
3432         *  - L2: we only attempt to merge with an extent stored in the
3433         *    same extent tree node.
3434         */
3435        if ((map->m_lblk == ee_block) &&
3436                /* See if we can merge left */
3437                (map_len < ee_len) &&           /*L1*/
3438                (ex > EXT_FIRST_EXTENT(eh))) {  /*L2*/
3439                ext4_lblk_t prev_lblk;
3440                ext4_fsblk_t prev_pblk, ee_pblk;
3441                unsigned int prev_len;
3442
3443                abut_ex = ex - 1;
3444                prev_lblk = le32_to_cpu(abut_ex->ee_block);
3445                prev_len = ext4_ext_get_actual_len(abut_ex);
3446                prev_pblk = ext4_ext_pblock(abut_ex);
3447                ee_pblk = ext4_ext_pblock(ex);
3448
3449                /*
3450                 * A transfer of blocks from 'ex' to 'abut_ex' is allowed
3451                 * upon those conditions:
3452                 * - C1: abut_ex is initialized,
3453                 * - C2: abut_ex is logically abutting ex,
3454                 * - C3: abut_ex is physically abutting ex,
3455                 * - C4: abut_ex can receive the additional blocks without
3456                 *   overflowing the (initialized) length limit.
3457                 */
3458                if ((!ext4_ext_is_unwritten(abut_ex)) &&                /*C1*/
3459                        ((prev_lblk + prev_len) == ee_block) &&         /*C2*/
3460                        ((prev_pblk + prev_len) == ee_pblk) &&          /*C3*/
3461                        (prev_len < (EXT_INIT_MAX_LEN - map_len))) {    /*C4*/
3462                        err = ext4_ext_get_access(handle, inode, path + depth);
3463                        if (err)
3464                                goto out;
3465
3466                        trace_ext4_ext_convert_to_initialized_fastpath(inode,
3467                                map, ex, abut_ex);
3468
3469                        /* Shift the start of ex by 'map_len' blocks */
3470                        ex->ee_block = cpu_to_le32(ee_block + map_len);
3471                        ext4_ext_store_pblock(ex, ee_pblk + map_len);
3472                        ex->ee_len = cpu_to_le16(ee_len - map_len);
3473                        ext4_ext_mark_unwritten(ex); /* Restore the flag */
3474
3475                        /* Extend abut_ex by 'map_len' blocks */
3476                        abut_ex->ee_len = cpu_to_le16(prev_len + map_len);
3477
3478                        /* Result: number of initialized blocks past m_lblk */
3479                        allocated = map_len;
3480                }
3481        } else if (((map->m_lblk + map_len) == (ee_block + ee_len)) &&
3482                   (map_len < ee_len) &&        /*L1*/
3483                   ex < EXT_LAST_EXTENT(eh)) {  /*L2*/
3484                /* See if we can merge right */
3485                ext4_lblk_t next_lblk;
3486                ext4_fsblk_t next_pblk, ee_pblk;
3487                unsigned int next_len;
3488
3489                abut_ex = ex + 1;
3490                next_lblk = le32_to_cpu(abut_ex->ee_block);
3491                next_len = ext4_ext_get_actual_len(abut_ex);
3492                next_pblk = ext4_ext_pblock(abut_ex);
3493                ee_pblk = ext4_ext_pblock(ex);
3494
3495                /*
3496                 * A transfer of blocks from 'ex' to 'abut_ex' is allowed
3497                 * upon those conditions:
3498                 * - C1: abut_ex is initialized,
3499                 * - C2: abut_ex is logically abutting ex,
3500                 * - C3: abut_ex is physically abutting ex,
3501                 * - C4: abut_ex can receive the additional blocks without
3502                 *   overflowing the (initialized) length limit.
3503                 */
3504                if ((!ext4_ext_is_unwritten(abut_ex)) &&                /*C1*/
3505                    ((map->m_lblk + map_len) == next_lblk) &&           /*C2*/
3506                    ((ee_pblk + ee_len) == next_pblk) &&                /*C3*/
3507                    (next_len < (EXT_INIT_MAX_LEN - map_len))) {        /*C4*/
3508                        err = ext4_ext_get_access(handle, inode, path + depth);
3509                        if (err)
3510                                goto out;
3511
3512                        trace_ext4_ext_convert_to_initialized_fastpath(inode,
3513                                map, ex, abut_ex);
3514
3515                        /* Shift the start of abut_ex by 'map_len' blocks */
3516                        abut_ex->ee_block = cpu_to_le32(next_lblk - map_len);
3517                        ext4_ext_store_pblock(abut_ex, next_pblk - map_len);
3518                        ex->ee_len = cpu_to_le16(ee_len - map_len);
3519                        ext4_ext_mark_unwritten(ex); /* Restore the flag */
3520
3521                        /* Extend abut_ex by 'map_len' blocks */
3522                        abut_ex->ee_len = cpu_to_le16(next_len + map_len);
3523
3524                        /* Result: number of initialized blocks past m_lblk */
3525                        allocated = map_len;
3526                }
3527        }
3528        if (allocated) {
3529                /* Mark the block containing both extents as dirty */
3530                err = ext4_ext_dirty(handle, inode, path + depth);
3531
3532                /* Update path to point to the right extent */
3533                path[depth].p_ext = abut_ex;
3534                goto out;
3535        } else
3536                allocated = ee_len - (map->m_lblk - ee_block);
3537
3538        WARN_ON(map->m_lblk < ee_block);
3539        /*
3540         * It is safe to convert extent to initialized via explicit
3541         * zeroout only if extent is fully inside i_size or new_size.
3542         */
3543        split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
3544
3545        if (EXT4_EXT_MAY_ZEROOUT & split_flag)
3546                max_zeroout = sbi->s_extent_max_zeroout_kb >>
3547                        (inode->i_sb->s_blocksize_bits - 10);
3548
3549        /*
3550         * five cases:
3551         * 1. split the extent into three extents.
3552         * 2. split the extent into two extents, zeroout the head of the first
3553         *    extent.
3554         * 3. split the extent into two extents, zeroout the tail of the second
3555         *    extent.
3556         * 4. split the extent into two extents without zeroout.
3557         * 5. no splitting needed, just possibly zeroout the head and / or the
3558         *    tail of the extent.
3559         */
3560        split_map.m_lblk = map->m_lblk;
3561        split_map.m_len = map->m_len;
3562
3563        if (max_zeroout && (allocated > split_map.m_len)) {
3564                if (allocated <= max_zeroout) {
3565                        /* case 3 or 5 */
3566                        zero_ex1.ee_block =
3567                                 cpu_to_le32(split_map.m_lblk +
3568                                             split_map.m_len);
3569                        zero_ex1.ee_len =
3570                                cpu_to_le16(allocated - split_map.m_len);
3571                        ext4_ext_store_pblock(&zero_ex1,
3572                                ext4_ext_pblock(ex) + split_map.m_lblk +
3573                                split_map.m_len - ee_block);
3574                        err = ext4_ext_zeroout(inode, &zero_ex1);
3575                        if (err)
3576                                goto fallback;
3577                        split_map.m_len = allocated;
3578                }
3579                if (split_map.m_lblk - ee_block + split_map.m_len <
3580                                                                max_zeroout) {
3581                        /* case 2 or 5 */
3582                        if (split_map.m_lblk != ee_block) {
3583                                zero_ex2.ee_block = ex->ee_block;
3584                                zero_ex2.ee_len = cpu_to_le16(split_map.m_lblk -
3585                                                        ee_block);
3586                                ext4_ext_store_pblock(&zero_ex2,
3587                                                      ext4_ext_pblock(ex));
3588                                err = ext4_ext_zeroout(inode, &zero_ex2);
3589                                if (err)
3590                                        goto fallback;
3591                        }
3592
3593                        split_map.m_len += split_map.m_lblk - ee_block;
3594                        split_map.m_lblk = ee_block;
3595                        allocated = map->m_len;
3596                }
3597        }
3598
3599fallback:
3600        err = ext4_split_extent(handle, inode, ppath, &split_map, split_flag,
3601                                flags);
3602        if (err > 0)
3603                err = 0;
3604out:
3605        /* If we have gotten a failure, don't update the extent status tree */
3606        if (!err) {
3607                err = ext4_zeroout_es(inode, &zero_ex1);
3608                if (!err)
3609                        err = ext4_zeroout_es(inode, &zero_ex2);
3610        }
3611        return err ? err : allocated;
3612}
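
/*
 * Worked example for the zeroout threshold used above, assuming the default
 * s_extent_max_zeroout_kb of 32 and a 4 KiB block size (both are assumptions
 * for illustration):
 *
 *	max_zeroout = 32 >> (12 - 10) = 8 blocks
 *
 * so when the remaining unwritten region is no longer than 8 blocks it is
 * zeroed out in place (cases 3 and 5 above) instead of being carved into
 * additional extents, trading a small amount of extra I/O for a shallower,
 * less fragmented extent tree.
 */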
3613
3614/*
3615 * This function is called by ext4_ext_map_blocks() from
3616 * ext4_get_blocks_dio_write() when DIO is used to write
3617 * to an unwritten extent.
3618 *
3619 * Writing to an unwritten extent may result in splitting the unwritten
3620 * extent into multiple initialized/unwritten extents (up to three)
3621 * There are three possibilities:
3622 *   a> There is no split required: Entire extent should be unwritten
3623 *   b> Splits in two extents: Write is happening at either end of the extent
3624 *   c> Splits in three extents: Someone is writing in the middle of the extent
3625 *
3626 * This works the same way in the case of initialized -> unwritten conversion.
3627 *
3628 * One or more index blocks may be needed if the extent tree grows after
3629 * the unwritten extent is split. To prevent ENOSPC from occurring at I/O
3630 * completion time, we need to split the unwritten extent before the DIO
3631 * is submitted. The unwritten extent being split will be split into at
3632 * most three unwritten extents. After the I/O completes, the part that
3633 * was filled will be converted to initialized by the end_io callback
3634 * via ext4_convert_unwritten_extents().
3635 *
3636 * Returns the size of the unwritten extent to be written on success.
3637 */
3638static int ext4_split_convert_extents(handle_t *handle,
3639                                        struct inode *inode,
3640                                        struct ext4_map_blocks *map,
3641                                        struct ext4_ext_path **ppath,
3642                                        int flags)
3643{
3644        struct ext4_ext_path *path = *ppath;
3645        ext4_lblk_t eof_block;
3646        ext4_lblk_t ee_block;
3647        struct ext4_extent *ex;
3648        unsigned int ee_len;
3649        int split_flag = 0, depth;
3650
3651        ext_debug(inode, "logical block %llu, max_blocks %u\n",
3652                  (unsigned long long)map->m_lblk, map->m_len);
3653
3654        eof_block = (EXT4_I(inode)->i_disksize + inode->i_sb->s_blocksize - 1)
3655                        >> inode->i_sb->s_blocksize_bits;
3656        if (eof_block < map->m_lblk + map->m_len)
3657                eof_block = map->m_lblk + map->m_len;
3658        /*
3659         * It is safe to convert extent to initialized via explicit
3660         * zeroout only if extent is fully inside i_size or new_size.
3661         */
3662        depth = ext_depth(inode);
3663        ex = path[depth].p_ext;
3664        ee_block = le32_to_cpu(ex->ee_block);
3665        ee_len = ext4_ext_get_actual_len(ex);
3666
3667        /* Convert to unwritten */
3668        if (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN) {
3669                split_flag |= EXT4_EXT_DATA_VALID1;
3670        /* Convert to initialized */
3671        } else if (flags & EXT4_GET_BLOCKS_CONVERT) {
3672                split_flag |= ee_block + ee_len <= eof_block ?
3673                              EXT4_EXT_MAY_ZEROOUT : 0;
3674                split_flag |= (EXT4_EXT_MARK_UNWRIT2 | EXT4_EXT_DATA_VALID2);
3675        }
3676        flags |= EXT4_GET_BLOCKS_PRE_IO;
3677        return ext4_split_extent(handle, inode, ppath, map, split_flag, flags);
3678}
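
/*
 * Summary of the flag mapping performed above (a restatement of the code,
 * kept here for quick reference):
 *
 *	EXT4_GET_BLOCKS_CONVERT_UNWRITTEN -> EXT4_EXT_DATA_VALID1
 *	EXT4_GET_BLOCKS_CONVERT           -> EXT4_EXT_MARK_UNWRIT2 |
 *	                                     EXT4_EXT_DATA_VALID2, plus
 *	                                     EXT4_EXT_MAY_ZEROOUT when the
 *	                                     extent lies entirely below
 *	                                     i_size/new_size
 *
 * EXT4_GET_BLOCKS_PRE_IO is added in both cases so the split, and any extent
 * tree growth it causes, happens before the I/O is submitted rather than at
 * I/O completion time.
 */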
3679
3680static int ext4_convert_unwritten_extents_endio(handle_t *handle,
3681                                                struct inode *inode,
3682                                                struct ext4_map_blocks *map,
3683                                                struct ext4_ext_path **ppath)
3684{
3685        struct ext4_ext_path *path = *ppath;
3686        struct ext4_extent *ex;
3687        ext4_lblk_t ee_block;
3688        unsigned int ee_len;
3689        int depth;
3690        int err = 0;
3691
3692        depth = ext_depth(inode);
3693        ex = path[depth].p_ext;
3694        ee_block = le32_to_cpu(ex->ee_block);
3695        ee_len = ext4_ext_get_actual_len(ex);
3696
3697        ext_debug(inode, "logical block %llu, max_blocks %u\n",
3698                  (unsigned long long)ee_block, ee_len);
3699
3700        /* If the extent is larger than requested, it is a clear sign that we still
3701         * have some extent state machine issues left. So extent_split is still
3702         * required.
3703         * TODO: Once all related issues are fixed, this situation should be
3704         * illegal.
3705         */
3706        if (ee_block != map->m_lblk || ee_len > map->m_len) {
3707#ifdef CONFIG_EXT4_DEBUG
3708                ext4_warning(inode->i_sb, "Inode (%ld) finished: extent logical block %llu,"
3709                             " len %u; IO logical block %llu, len %u",
3710                             inode->i_ino, (unsigned long long)ee_block, ee_len,
3711                             (unsigned long long)map->m_lblk, map->m_len);
3712#endif
3713                err = ext4_split_convert_extents(handle, inode, map, ppath,
3714                                                 EXT4_GET_BLOCKS_CONVERT);
3715                if (err < 0)
3716                        return err;
3717                path = ext4_find_extent(inode, map->m_lblk, ppath, 0);
3718                if (IS_ERR(path))
3719                        return PTR_ERR(path);
3720                depth = ext_depth(inode);
3721                ex = path[depth].p_ext;
3722        }
3723
3724        err = ext4_ext_get_access(handle, inode, path + depth);
3725        if (err)
3726                goto out;
3727        /* first mark the extent as initialized */
3728        ext4_ext_mark_initialized(ex);
3729
3730        /* note: ext4_ext_correct_indexes() isn't needed here because
3731         * borders are not changed
3732         */
3733        ext4_ext_try_to_merge(handle, inode, path, ex);
3734
3735        /* Mark modified extent as dirty */
3736        err = ext4_ext_dirty(handle, inode, path + path->p_depth);
3737out:
3738        ext4_ext_show_leaf(inode, path);
3739        return err;
3740}
3741
3742static int
3743convert_initialized_extent(handle_t *handle, struct inode *inode,
3744                           struct ext4_map_blocks *map,
3745                           struct ext4_ext_path **ppath,
3746                           unsigned int *allocated)
3747{
3748        struct ext4_ext_path *path = *ppath;
3749        struct ext4_extent *ex;
3750        ext4_lblk_t ee_block;
3751        unsigned int ee_len;
3752        int depth;
3753        int err = 0;
3754
3755        /*
3756         * Make sure that the extent is no bigger than we support with
3757         * an unwritten extent
3758         */
3759        if (map->m_len > EXT_UNWRITTEN_MAX_LEN)
3760                map->m_len = EXT_UNWRITTEN_MAX_LEN / 2;
3761
3762        depth = ext_depth(inode);
3763        ex = path[depth].p_ext;
3764        ee_block = le32_to_cpu(ex->ee_block);
3765        ee_len = ext4_ext_get_actual_len(ex);
3766
3767        ext_debug(inode, "logical block %llu, max_blocks %u\n",
3768                  (unsigned long long)ee_block, ee_len);
3769
3770        if (ee_block != map->m_lblk || ee_len > map->m_len) {
3771                err = ext4_split_convert_extents(handle, inode, map, ppath,
3772                                EXT4_GET_BLOCKS_CONVERT_UNWRITTEN);
3773                if (err < 0)
3774                        return err;
3775                path = ext4_find_extent(inode, map->m_lblk, ppath, 0);
3776                if (IS_ERR(path))
3777                        return PTR_ERR(path);
3778                depth = ext_depth(inode);
3779                ex = path[depth].p_ext;
3780                if (!ex) {
3781                        EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
3782                                         (unsigned long) map->m_lblk);
3783                        return -EFSCORRUPTED;
3784                }
3785        }
3786
3787        err = ext4_ext_get_access(handle, inode, path + depth);
3788        if (err)
3789                return err;
3790        /* first mark the extent as unwritten */
3791        ext4_ext_mark_unwritten(ex);
3792
3793        /* note: ext4_ext_correct_indexes() isn't needed here because
3794         * borders are not changed
3795         */
3796        ext4_ext_try_to_merge(handle, inode, path, ex);
3797
3798        /* Mark modified extent as dirty */
3799        err = ext4_ext_dirty(handle, inode, path + path->p_depth);
3800        if (err)
3801                return err;
3802        ext4_ext_show_leaf(inode, path);
3803
3804        ext4_update_inode_fsync_trans(handle, inode, 1);
3805
3806        map->m_flags |= EXT4_MAP_UNWRITTEN;
3807        if (*allocated > map->m_len)
3808                *allocated = map->m_len;
3809        map->m_len = *allocated;
3810        return 0;
3811}
3812
3813static int
3814ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode,
3815                        struct ext4_map_blocks *map,
3816                        struct ext4_ext_path **ppath, int flags,
3817                        unsigned int allocated, ext4_fsblk_t newblock)
3818{
3819        struct ext4_ext_path __maybe_unused *path = *ppath;
3820        int ret = 0;
3821        int err = 0;
3822
3823        ext_debug(inode, "logical block %llu, max_blocks %u, flags 0x%x, allocated %u\n",
3824                  (unsigned long long)map->m_lblk, map->m_len, flags,
3825                  allocated);
3826        ext4_ext_show_leaf(inode, path);
3827
3828        /*
3829         * When writing into unwritten space, we should not fail to
3830         * allocate metadata blocks for the new extent block if needed.
3831         */
3832        flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL;
3833
3834        trace_ext4_ext_handle_unwritten_extents(inode, map, flags,
3835                                                    allocated, newblock);
3836
3837        /* get_block() before submitting IO, split the extent */
3838        if (flags & EXT4_GET_BLOCKS_PRE_IO) {
3839                ret = ext4_split_convert_extents(handle, inode, map, ppath,
3840                                         flags | EXT4_GET_BLOCKS_CONVERT);
3841                if (ret < 0) {
3842                        err = ret;
3843                        goto out2;
3844                }
3845                /*
3846                 * shouldn't get a 0 return when splitting an extent unless
3847                 * m_len is 0 (bug) or extent has been corrupted
3848                 */
3849                if (unlikely(ret == 0)) {
3850                        EXT4_ERROR_INODE(inode,
3851                                         "unexpected ret == 0, m_len = %u",
3852                                         map->m_len);
3853                        err = -EFSCORRUPTED;
3854                        goto out2;
3855                }
3856                map->m_flags |= EXT4_MAP_UNWRITTEN;
3857                goto out;
3858        }
3859        /* IO end_io complete, convert the filled extent to written */
3860        if (flags & EXT4_GET_BLOCKS_CONVERT) {
3861                err = ext4_convert_unwritten_extents_endio(handle, inode, map,
3862                                                           ppath);
3863                if (err < 0)
3864                        goto out2;
3865                ext4_update_inode_fsync_trans(handle, inode, 1);
3866                goto map_out;
3867        }
3868        /* buffered IO cases */
3869        /*
3870         * repeat fallocate creation request
3871         * we already have an unwritten extent
3872         */
3873        if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT) {
3874                map->m_flags |= EXT4_MAP_UNWRITTEN;
3875                goto map_out;
3876        }
3877
3878        /* buffered READ or buffered write_begin() lookup */
3879        if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
3880                /*
3881                 * We have blocks reserved already.  We
3882                 * return allocated blocks so that delalloc
3883                 * won't do block reservation for us.  But
3884                 * the buffer head will be unmapped so that
3885                 * a read from the block returns 0s.
3886                 */
3887                map->m_flags |= EXT4_MAP_UNWRITTEN;
3888                goto out1;
3889        }
3890
3891        /*
3892         * Default case when (flags & EXT4_GET_BLOCKS_CREATE) == 1.
3893         * For buffered writes, at writepage time, etc.  Convert a
3894         * discovered unwritten extent to written.
3895         */
3896        ret = ext4_ext_convert_to_initialized(handle, inode, map, ppath, flags);
3897        if (ret < 0) {
3898                err = ret;
3899                goto out2;
3900        }
3901        ext4_update_inode_fsync_trans(handle, inode, 1);
3902        /*
3903         * shouldn't get a 0 return when converting an unwritten extent
3904         * unless m_len is 0 (bug) or extent has been corrupted
3905         */
3906        if (unlikely(ret == 0)) {
3907                EXT4_ERROR_INODE(inode, "unexpected ret == 0, m_len = %u",
3908                                 map->m_len);
3909                err = -EFSCORRUPTED;
3910                goto out2;
3911        }
3912
3913out:
3914        allocated = ret;
3915        map->m_flags |= EXT4_MAP_NEW;
3916map_out:
3917        map->m_flags |= EXT4_MAP_MAPPED;
3918out1:
3919        map->m_pblk = newblock;
3920        if (allocated > map->m_len)
3921                allocated = map->m_len;
3922        map->m_len = allocated;
3923        ext4_ext_show_leaf(inode, path);
3924out2:
3925        return err ? err : allocated;
3926}
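
/*
 * Dispatch summary for the function above (a restatement of the code):
 *
 *	EXT4_GET_BLOCKS_PRE_IO      split the extent before the I/O is submitted
 *	EXT4_GET_BLOCKS_CONVERT     I/O has completed: convert the filled part
 *	                            to written
 *	EXT4_GET_BLOCKS_UNWRIT_EXT  repeated fallocate: keep the extent unwritten
 *	no EXT4_GET_BLOCKS_CREATE   lookup only: report the range as unwritten,
 *	                            leave the buffer head unmapped
 *	otherwise                   buffered write: convert to initialized via
 *	                            ext4_ext_convert_to_initialized()
 *
 * On success the return value is the number of blocks mapped, as with
 * ext4_ext_map_blocks(); on failure it is a negative error code.
 */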
3927
3928/*
3929 * get_implied_cluster_alloc - check to see if the requested
3930 * allocation (in the map structure) overlaps with a cluster already
3931 * allocated in an extent.
3932 *      @sb     The filesystem superblock structure
3933 *      @map    The requested lblk->pblk mapping
3934 *      @ex     The extent structure which might contain an implied
3935 *                      cluster allocation
3936 *
3937 * This function is called by ext4_ext_map_blocks() after we failed to
3938 * find blocks that were already in the inode's extent tree.  Hence,
3939 * we know that the beginning of the requested region cannot overlap
3940 * the extent from the inode's extent tree.  There are three cases we
3941 * want to catch.  The first is this case:
3942 *
3943 *               |--- cluster # N--|
3944 *    |--- extent ---|  |---- requested region ---|
3945 *                      |==========|
3946 *
3947 * The second case that we need to test for is this one:
3948 *
3949 *   |--------- cluster # N ----------------|
3950 *         |--- requested region --|   |------- extent ----|
3951 *         |=======================|
3952 *
3953 * The third case is when the requested region lies between two extents
3954 * within the same cluster:
3955 *          |------------- cluster # N-------------|
3956 * |----- ex -----|                  |---- ex_right ----|
3957 *                  |------ requested region ------|
3958 *                  |================|
3959 *
3960 * In each of the above cases, we need to set map->m_pblk and
3961 * map->m_len so they correspond to the region labelled as
3962 * "|====|" from cluster #N, since it is already in use for data in
3963 * cluster EXT4_B2C(sbi, map->m_lblk).  We will then return 1 to
3964 * signal to ext4_ext_map_blocks() that map->m_pblk should be treated
3965 * as a new "allocated" block region.  Otherwise, we will return 0 and
3966 * ext4_ext_map_blocks() will then allocate one or more new clusters
3967 * by calling ext4_mb_new_blocks().
3968 */
3969static int get_implied_cluster_alloc(struct super_block *sb,
3970                                     struct ext4_map_blocks *map,
3971                                     struct ext4_extent *ex,
3972                                     struct ext4_ext_path *path)
3973{
3974        struct ext4_sb_info *sbi = EXT4_SB(sb);
3975        ext4_lblk_t c_offset = EXT4_LBLK_COFF(sbi, map->m_lblk);
3976        ext4_lblk_t ex_cluster_start, ex_cluster_end;
3977        ext4_lblk_t rr_cluster_start;
3978        ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
3979        ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
3980        unsigned short ee_len = ext4_ext_get_actual_len(ex);
3981
3982        /* The extent passed in that we are trying to match */
3983        ex_cluster_start = EXT4_B2C(sbi, ee_block);
3984        ex_cluster_end = EXT4_B2C(sbi, ee_block + ee_len - 1);
3985
3986        /* The requested region passed into ext4_map_blocks() */
3987        rr_cluster_start = EXT4_B2C(sbi, map->m_lblk);
3988
3989        if ((rr_cluster_start == ex_cluster_end) ||
3990            (rr_cluster_start == ex_cluster_start)) {
3991                if (rr_cluster_start == ex_cluster_end)
3992                        ee_start += ee_len - 1;
3993                map->m_pblk = EXT4_PBLK_CMASK(sbi, ee_start) + c_offset;
3994                map->m_len = min(map->m_len,
3995                                 (unsigned) sbi->s_cluster_ratio - c_offset);
3996                /*
3997                 * Check for and handle this case:
3998                 *
3999                 *   |--------- cluster # N-------------|
4000                 *                     |------- extent ----|
4001                 *         |--- requested region ---|
4002                 *         |===========|
4003                 */
4004
4005                if (map->m_lblk < ee_block)
4006                        map->m_len = min(map->m_len, ee_block - map->m_lblk);
4007
4008                /*
4009                 * Check for the case where there is already another allocated
4010                 * block to the right of 'ex' but before the end of the cluster.
4011                 *
4012                 *          |------------- cluster # N-------------|
4013                 * |----- ex -----|                  |---- ex_right ----|
4014                 *                  |------ requested region ------|
4015                 *                  |================|
4016                 */
4017                if (map->m_lblk > ee_block) {
4018                        ext4_lblk_t next = ext4_ext_next_allocated_block(path);
4019                        map->m_len = min(map->m_len, next - map->m_lblk);
4020                }
4021
4022                trace_ext4_get_implied_cluster_alloc_exit(sb, map, 1);
4023                return 1;
4024        }
4025
4026        trace_ext4_get_implied_cluster_alloc_exit(sb, map, 0);
4027        return 0;
4028}
4029
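/*
 * Editor's illustrative example (not in the original source): a worked run
 * of the bigalloc arithmetic in get_implied_cluster_alloc() above, assuming
 * a hypothetical cluster ratio of 16 blocks per cluster (s_cluster_bits = 4)
 * and made-up block numbers.
 *
 *	ex:  ee_block = 100, ee_len = 10, ee_start = 1000
 *	     => logical blocks 100..109, clusters EXT4_B2C(100) = 6 .. EXT4_B2C(109) = 6
 *	map: m_lblk = 110  => rr_cluster_start = EXT4_B2C(110) = 6
 *
 * The requested block shares cluster #6 with the end of 'ex', so:
 *
 *	c_offset    = EXT4_LBLK_COFF(110)            = 110 & 15 = 14
 *	ee_start   += ee_len - 1                     = 1009
 *	map->m_pblk = EXT4_PBLK_CMASK(1009) + 14     = 1008 + 14 = 1022
 *	map->m_len  = min(map->m_len, 16 - 14)       = at most 2 blocks
 *	              (possibly clamped further by the next allocated block)
 *
 * and the function returns 1, telling ext4_ext_map_blocks() to treat these
 * blocks as coming from the already-allocated cluster instead of calling
 * ext4_mb_new_blocks().
 */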
4030
4031/*
4032 * Block allocation/map/preallocation routine for extent-based files
4033 *
4034 * Needs to be called with
4035 * down_read(&EXT4_I(inode)->i_data_sem) held if not allocating file system
4036 * blocks (i.e., create is zero); otherwise with
4037 * down_write(&EXT4_I(inode)->i_data_sem) held.
4038 *
4039 * return > 0, number of blocks already mapped/allocated
4040 *          if create == 0 and these are pre-allocated blocks
4041 *              buffer head is unmapped
4042 *          otherwise blocks are mapped
4043 *
4044 * return = 0, if plain look up failed (blocks have not been allocated)
4045 *          buffer head is unmapped
4046 *
4047 * return < 0, error case.
4048 */
4049int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
4050                        struct ext4_map_blocks *map, int flags)
4051{
4052        struct ext4_ext_path *path = NULL;
4053        struct ext4_extent newex, *ex, ex2;
4054        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
4055        ext4_fsblk_t newblock = 0, pblk;
4056        int err = 0, depth, ret;
4057        unsigned int allocated = 0, offset = 0;
4058        unsigned int allocated_clusters = 0;
4059        struct ext4_allocation_request ar;
4060        ext4_lblk_t cluster_offset;
4061
4062        ext_debug(inode, "blocks %u/%u requested\n", map->m_lblk, map->m_len);
4063        trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
4064
4065        /* find extent for this block */
4066        path = ext4_find_extent(inode, map->m_lblk, NULL, 0);
4067        if (IS_ERR(path)) {
4068                err = PTR_ERR(path);
4069                path = NULL;
4070                goto out;
4071        }
4072
4073        depth = ext_depth(inode);
4074
4075        /*
4076         * A consistent leaf must not be empty;
4077         * this situation is possible, though, _during_ tree modification;
4078         * this is why the assert can't be put in ext4_find_extent()
4079         */
4080        if (unlikely(path[depth].p_ext == NULL && depth != 0)) {
4081                EXT4_ERROR_INODE(inode, "bad extent address "
4082                                 "lblock: %lu, depth: %d pblock %lld",
4083                                 (unsigned long) map->m_lblk, depth,
4084                                 path[depth].p_block);
4085                err = -EFSCORRUPTED;
4086                goto out;
4087        }
4088
4089        ex = path[depth].p_ext;
4090        if (ex) {
4091                ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
4092                ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
4093                unsigned short ee_len;
4094
4095
4096                /*
4097                 * unwritten extents are treated as holes, except that
4098                 * we split out initialized portions during a write.
4099                 */
4100                ee_len = ext4_ext_get_actual_len(ex);
4101
4102                trace_ext4_ext_show_extent(inode, ee_block, ee_start, ee_len);
4103
4104                /* if found extent covers block, simply return it */
4105                if (in_range(map->m_lblk, ee_block, ee_len)) {
4106                        newblock = map->m_lblk - ee_block + ee_start;
4107                        /* number of remaining blocks in the extent */
4108                        allocated = ee_len - (map->m_lblk - ee_block);
4109                        ext_debug(inode, "%u fit into %u:%d -> %llu\n",
4110                                  map->m_lblk, ee_block, ee_len, newblock);
4111
4112                        /*
4113                         * If the extent is initialized check whether the
4114                         * caller wants to convert it to unwritten.
4115                         */
4116                        if ((!ext4_ext_is_unwritten(ex)) &&
4117                            (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) {
4118                                err = convert_initialized_extent(handle,
4119                                        inode, map, &path, &allocated);
4120                                goto out;
4121                        } else if (!ext4_ext_is_unwritten(ex)) {
4122                                map->m_flags |= EXT4_MAP_MAPPED;
4123                                map->m_pblk = newblock;
4124                                if (allocated > map->m_len)
4125                                        allocated = map->m_len;
4126                                map->m_len = allocated;
4127                                ext4_ext_show_leaf(inode, path);
4128                                goto out;
4129                        }
4130
4131                        ret = ext4_ext_handle_unwritten_extents(
4132                                handle, inode, map, &path, flags,
4133                                allocated, newblock);
4134                        if (ret < 0)
4135                                err = ret;
4136                        else
4137                                allocated = ret;
4138                        goto out;
4139                }
4140        }
4141
4142        /*
4143         * requested block isn't allocated yet;
4144 * we can't create blocks if the create flag is zero
4145         */
4146        if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
4147                ext4_lblk_t hole_start, hole_len;
4148
4149                hole_start = map->m_lblk;
4150                hole_len = ext4_ext_determine_hole(inode, path, &hole_start);
4151                /*
4152                 * put just found gap into cache to speed up
4153                 * subsequent requests
4154                 */
4155                ext4_ext_put_gap_in_cache(inode, hole_start, hole_len);
4156
4157                /* Update hole_len to reflect hole size after map->m_lblk */
4158                if (hole_start != map->m_lblk)
4159                        hole_len -= map->m_lblk - hole_start;
4160                map->m_pblk = 0;
4161                map->m_len = min_t(unsigned int, map->m_len, hole_len);
4162
4163                goto out;
4164        }
4165
4166        /*
4167         * Okay, we need to do block allocation.
4168         */
4169        newex.ee_block = cpu_to_le32(map->m_lblk);
4170        cluster_offset = EXT4_LBLK_COFF(sbi, map->m_lblk);
4171
4172        /*
4173         * If we are doing bigalloc, check to see if the extent returned
4174         * by ext4_find_extent() implies a cluster we can use.
4175         */
4176        if (cluster_offset && ex &&
4177            get_implied_cluster_alloc(inode->i_sb, map, ex, path)) {
4178                ar.len = allocated = map->m_len;
4179                newblock = map->m_pblk;
4180                goto got_allocated_blocks;
4181        }
4182
4183        /* find neighbour allocated blocks */
4184        ar.lleft = map->m_lblk;
4185        err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft);
4186        if (err)
4187                goto out;
4188        ar.lright = map->m_lblk;
4189        err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright, &ex2);
4190        if (err < 0)
4191                goto out;
4192
4193        /* Check if the extent after searching to the right implies a
4194         * cluster we can use. */
4195        if ((sbi->s_cluster_ratio > 1) && err &&
4196            get_implied_cluster_alloc(inode->i_sb, map, &ex2, path)) {
4197                ar.len = allocated = map->m_len;
4198                newblock = map->m_pblk;
4199                goto got_allocated_blocks;
4200        }
4201
4202        /*
4203         * See if request is beyond maximum number of blocks we can have in
4204         * a single extent. For an initialized extent this limit is
4205         * EXT_INIT_MAX_LEN and for an unwritten extent this limit is
4206         * EXT_UNWRITTEN_MAX_LEN.
4207         */
4208        if (map->m_len > EXT_INIT_MAX_LEN &&
4209            !(flags & EXT4_GET_BLOCKS_UNWRIT_EXT))
4210                map->m_len = EXT_INIT_MAX_LEN;
4211        else if (map->m_len > EXT_UNWRITTEN_MAX_LEN &&
4212                 (flags & EXT4_GET_BLOCKS_UNWRIT_EXT))
4213                map->m_len = EXT_UNWRITTEN_MAX_LEN;
4214
4215        /* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */
4216        newex.ee_len = cpu_to_le16(map->m_len);
4217        err = ext4_ext_check_overlap(sbi, inode, &newex, path);
4218        if (err)
4219                allocated = ext4_ext_get_actual_len(&newex);
4220        else
4221                allocated = map->m_len;
4222
4223        /* allocate new block */
4224        ar.inode = inode;
4225        ar.goal = ext4_ext_find_goal(inode, path, map->m_lblk);
4226        ar.logical = map->m_lblk;
4227        /*
4228         * We calculate the offset from the beginning of the cluster
4229         * for the logical block number, since when we allocate a
4230         * physical cluster, the physical block should start at the
4231         * same offset from the beginning of the cluster.  This is
4232         * needed so that future calls to get_implied_cluster_alloc()
4233         * work correctly.
4234         */
4235        offset = EXT4_LBLK_COFF(sbi, map->m_lblk);
4236        ar.len = EXT4_NUM_B2C(sbi, offset+allocated);
4237        ar.goal -= offset;
4238        ar.logical -= offset;
4239        if (S_ISREG(inode->i_mode))
4240                ar.flags = EXT4_MB_HINT_DATA;
4241        else
4242                /* disable in-core preallocation for non-regular files */
4243                ar.flags = 0;
4244        if (flags & EXT4_GET_BLOCKS_NO_NORMALIZE)
4245                ar.flags |= EXT4_MB_HINT_NOPREALLOC;
4246        if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
4247                ar.flags |= EXT4_MB_DELALLOC_RESERVED;
4248        if (flags & EXT4_GET_BLOCKS_METADATA_NOFAIL)
4249                ar.flags |= EXT4_MB_USE_RESERVED;
4250        newblock = ext4_mb_new_blocks(handle, &ar, &err);
4251        if (!newblock)
4252                goto out;
4253        allocated_clusters = ar.len;
4254        ar.len = EXT4_C2B(sbi, ar.len) - offset;
4255        ext_debug(inode, "allocate new block: goal %llu, found %llu/%u, requested %u\n",
4256                  ar.goal, newblock, ar.len, allocated);
4257        if (ar.len > allocated)
4258                ar.len = allocated;
4259
4260got_allocated_blocks:
4261        /* try to insert new extent into found leaf and return */
4262        pblk = newblock + offset;
4263        ext4_ext_store_pblock(&newex, pblk);
4264        newex.ee_len = cpu_to_le16(ar.len);
4265        /* Mark unwritten */
4266        if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT) {
4267                ext4_ext_mark_unwritten(&newex);
4268                map->m_flags |= EXT4_MAP_UNWRITTEN;
4269        }
4270
4271        err = ext4_ext_insert_extent(handle, inode, &path, &newex, flags);
4272        if (err) {
4273                if (allocated_clusters) {
4274                        int fb_flags = 0;
4275
4276                        /*
4277                         * free data blocks we just allocated.
4278                         * not a good idea to call discard here directly,
4279                         * but otherwise we'd need to call it every free().
4280                         */
4281                        ext4_discard_preallocations(inode, 0);
4282                        if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
4283                                fb_flags = EXT4_FREE_BLOCKS_NO_QUOT_UPDATE;
4284                        ext4_free_blocks(handle, inode, NULL, newblock,
4285                                         EXT4_C2B(sbi, allocated_clusters),
4286                                         fb_flags);
4287                }
4288                goto out;
4289        }
4290
4291        /*
4292         * Reduce the reserved cluster count to reflect successful deferred
4293         * allocation of delayed allocated clusters or direct allocation of
4294         * clusters discovered to be delayed allocated.  Once allocated, a
4295         * cluster is not included in the reserved count.
4296         */
4297        if (test_opt(inode->i_sb, DELALLOC) && allocated_clusters) {
4298                if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
4299                        /*
4300                         * When allocating delayed allocated clusters, simply
4301                         * reduce the reserved cluster count and claim quota
4302                         */
4303                        ext4_da_update_reserve_space(inode, allocated_clusters,
4304                                                        1);
4305                } else {
4306                        ext4_lblk_t lblk, len;
4307                        unsigned int n;
4308
4309                        /*
4310                         * When allocating non-delayed allocated clusters
4311                         * (from fallocate, filemap, DIO, or clusters
4312                         * allocated when delalloc has been disabled by
4313                         * ext4_nonda_switch), reduce the reserved cluster
4314                         * count by the number of allocated clusters that
4315                         * have previously been delayed allocated.  Quota
4316                         * has been claimed by ext4_mb_new_blocks() above,
4317                         * so release the quota reservations made for any
4318                         * previously delayed allocated clusters.
4319                         */
4320                        lblk = EXT4_LBLK_CMASK(sbi, map->m_lblk);
4321                        len = allocated_clusters << sbi->s_cluster_bits;
4322                        n = ext4_es_delayed_clu(inode, lblk, len);
4323                        if (n > 0)
4324                                ext4_da_update_reserve_space(inode, (int) n, 0);
4325                }
4326        }
4327
4328        /*
4329         * Cache the extent and update transaction to commit on fdatasync only
4330         * when it is _not_ an unwritten extent.
4331         */
4332        if ((flags & EXT4_GET_BLOCKS_UNWRIT_EXT) == 0)
4333                ext4_update_inode_fsync_trans(handle, inode, 1);
4334        else
4335                ext4_update_inode_fsync_trans(handle, inode, 0);
4336
4337        map->m_flags |= (EXT4_MAP_NEW | EXT4_MAP_MAPPED);
4338        map->m_pblk = pblk;
4339        map->m_len = ar.len;
4340        allocated = map->m_len;
4341        ext4_ext_show_leaf(inode, path);
4342out:
4343        ext4_ext_drop_refs(path);
4344        kfree(path);
4345
4346        trace_ext4_ext_map_blocks_exit(inode, flags, map,
4347                                       err ? err : allocated);
4348        return err ? err : allocated;
4349}
4350
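/*
 * Editor's illustrative sketch (not in the original source): how a
 * hypothetical in-kernel caller might consume the return convention
 * documented above, going through the ext4_map_blocks() wrapper which
 * dispatches to ext4_ext_map_blocks() for extent-mapped inodes.
 * handle_hole() and submit_io() are made-up placeholders.
 *
 *	struct ext4_map_blocks map = {
 *		.m_lblk = lblk,
 *		.m_len  = len,
 *	};
 *	int ret = ext4_map_blocks(NULL, inode, &map, 0);	// lookup only
 *
 *	if (ret < 0)
 *		return ret;		// error (e.g. -EFSCORRUPTED, -EIO)
 *	else if (ret == 0)
 *		handle_hole(&map);	// nothing mapped, nothing allocated
 *	else
 *		submit_io(&map);	// map.m_lblk..+ret-1 backed by
 *					// map.m_pblk..+ret-1
 */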
4351int ext4_ext_truncate(handle_t *handle, struct inode *inode)
4352{
4353        struct super_block *sb = inode->i_sb;
4354        ext4_lblk_t last_block;
4355        int err = 0;
4356
4357        /*
4358         * TODO: optimization is possible here.
4359         * Probably we need not scan at all,
4360         * because page truncation is enough.
4361         */
4362
4363        /* we have to know where to truncate from in crash case */
4364        EXT4_I(inode)->i_disksize = inode->i_size;
4365        err = ext4_mark_inode_dirty(handle, inode);
4366        if (err)
4367                return err;
4368
4369        last_block = (inode->i_size + sb->s_blocksize - 1)
4370                        >> EXT4_BLOCK_SIZE_BITS(sb);
4371retry:
4372        err = ext4_es_remove_extent(inode, last_block,
4373                                    EXT_MAX_BLOCKS - last_block);
4374        if (err == -ENOMEM) {
4375                cond_resched();
4376                congestion_wait(BLK_RW_ASYNC, HZ/50);
4377                goto retry;
4378        }
4379        if (err)
4380                return err;
4381retry_remove_space:
4382        err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1);
4383        if (err == -ENOMEM) {
4384                cond_resched();
4385                congestion_wait(BLK_RW_ASYNC, HZ/50);
4386                goto retry_remove_space;
4387        }
4388        return err;
4389}
4390
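/*
 * Editor's illustrative example (not in the original source) of the
 * last_block computation in ext4_ext_truncate() above, assuming a
 * hypothetical 4096-byte block size (EXT4_BLOCK_SIZE_BITS(sb) == 12):
 *
 *	i_size = 10000  =>  last_block = (10000 + 4095) >> 12 = 3
 *
 * so extent-status entries and extents from logical block 3 up to
 * EXT_MAX_BLOCKS - 1 are removed, while blocks 0..2, which still back data
 * up to i_size, are preserved.
 */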
4391static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
4392                                  ext4_lblk_t len, loff_t new_size,
4393                                  int flags)
4394{
4395        struct inode *inode = file_inode(file);
4396        handle_t *handle;
4397        int ret = 0, ret2 = 0, ret3 = 0;
4398        int retries = 0;
4399        int depth = 0;
4400        struct ext4_map_blocks map;
4401        unsigned int credits;
4402        loff_t epos;
4403
4404        BUG_ON(!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS));
4405        map.m_lblk = offset;
4406        map.m_len = len;
4407        /*
4408         * Don't normalize the request if it can fit in one extent so
4409         * that it doesn't get unnecessarily split into multiple
4410         * extents.
4411         */
4412        if (len <= EXT_UNWRITTEN_MAX_LEN)
4413                flags |= EXT4_GET_BLOCKS_NO_NORMALIZE;
4414
4415        /*
4416         * credits to insert 1 extent into extent tree
4417         */
4418        credits = ext4_chunk_trans_blocks(inode, len);
4419        depth = ext_depth(inode);
4420
4421retry:
4422        while (len) {
4423                /*
4424                 * Recalculate credits when extent tree depth changes.
4425                 */
4426                if (depth != ext_depth(inode)) {
4427                        credits = ext4_chunk_trans_blocks(inode, len);
4428                        depth = ext_depth(inode);
4429                }
4430
4431                handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
4432                                            credits);
4433                if (IS_ERR(handle)) {
4434                        ret = PTR_ERR(handle);
4435                        break;
4436                }
4437                ret = ext4_map_blocks(handle, inode, &map, flags);
4438                if (ret <= 0) {
4439                        ext4_debug("inode #%lu: block %u: len %u: "
4440                                   "ext4_ext_map_blocks returned %d",
4441                                   inode->i_ino, map.m_lblk,
4442                                   map.m_len, ret);
4443                        ext4_mark_inode_dirty(handle, inode);
4444                        ext4_journal_stop(handle);
4445                        break;
4446                }
4447                /*
4448                 * allow a full retry cycle for any remaining allocations
4449                 */
4450                retries = 0;
4451                map.m_lblk += ret;
4452                map.m_len = len = len - ret;
4453                epos = (loff_t)map.m_lblk << inode->i_blkbits;
4454                inode->i_ctime = current_time(inode);
4455                if (new_size) {
4456                        if (epos > new_size)
4457                                epos = new_size;
4458                        if (ext4_update_inode_size(inode, epos) & 0x1)
4459                                inode->i_mtime = inode->i_ctime;
4460                }
4461                ret2 = ext4_mark_inode_dirty(handle, inode);
4462                ext4_update_inode_fsync_trans(handle, inode, 1);
4463                ret3 = ext4_journal_stop(handle);
4464                ret2 = ret3 ? ret3 : ret2;
4465                if (unlikely(ret2))
4466                        break;
4467        }
4468        if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
4469                goto retry;
4470
4471        return ret > 0 ? ret2 : ret;
4472}
4473
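/*
 * Editor's illustrative walk-through (not in the original source) of the
 * allocation loop in ext4_alloc_file_blocks() above, with hypothetical
 * numbers.  A single unwritten extent can cover at most
 * EXT_UNWRITTEN_MAX_LEN (32767) blocks, so a request for offset = 0,
 * len = 70000 blocks is satisfied over several iterations:
 *
 *	iteration 1: map.m_lblk = 0,     map.m_len = 70000, ret <= 32767
 *	iteration 2: map.m_lblk = 32767, map.m_len = 37233  (if ret was 32767)
 *	...
 *
 * Each iteration runs in its own transaction, and if the extent tree grows
 * deeper between iterations the journal credits are recomputed via
 * ext4_chunk_trans_blocks() before the next ext4_journal_start().
 */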
4474static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len);
4475
4476static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len);
4477
4478static long ext4_zero_range(struct file *file, loff_t offset,
4479                            loff_t len, int mode)
4480{
4481        struct inode *inode = file_inode(file);
4482        struct address_space *mapping = file->f_mapping;
4483        handle_t *handle = NULL;
4484        unsigned int max_blocks;
4485        loff_t new_size = 0;
4486        int ret = 0;
4487        int flags;
4488        int credits;
4489        int partial_begin, partial_end;
4490        loff_t start, end;
4491        ext4_lblk_t lblk;
4492        unsigned int blkbits = inode->i_blkbits;
4493
4494        trace_ext4_zero_range(inode, offset, len, mode);
4495
4496        /* Call ext4_force_commit to flush all data in case of data=journal. */
4497        if (ext4_should_journal_data(inode)) {
4498                ret = ext4_force_commit(inode->i_sb);
4499                if (ret)
4500                        return ret;
4501        }
4502
4503        /*
4504         * Round up offset. This is not plain fallocate: we need to zero out
4505         * blocks, so convert the interior block-aligned part of the range to
4506         * unwritten and possibly manually zero out the unaligned parts of the
4507         * range.
4508         */
4509        start = round_up(offset, 1 << blkbits);
4510        end = round_down((offset + len), 1 << blkbits);
4511
4512        if (start < offset || end > offset + len)
4513                return -EINVAL;
4514        partial_begin = offset & ((1 << blkbits) - 1);
4515        partial_end = (offset + len) & ((1 << blkbits) - 1);
4516
4517        lblk = start >> blkbits;
4518        max_blocks = (end >> blkbits);
4519        if (max_blocks < lblk)
4520                max_blocks = 0;
4521        else
4522                max_blocks -= lblk;
4523
4524        inode_lock(inode);
4525
4526        /*
4527         * Indirect files do not support unwritten extents
4528         */
4529        if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
4530                ret = -EOPNOTSUPP;
4531                goto out_mutex;
4532        }
4533
4534        if (!(mode & FALLOC_FL_KEEP_SIZE) &&
4535            (offset + len > inode->i_size ||
4536             offset + len > EXT4_I(inode)->i_disksize)) {
4537                new_size = offset + len;
4538                ret = inode_newsize_ok(inode, new_size);
4539                if (ret)
4540                        goto out_mutex;
4541        }
4542
4543        flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT;
4544
4545        /* Wait for all existing dio workers; newcomers will block on i_mutex */
4546        inode_dio_wait(inode);
4547
4548        /* Preallocate the range including the unaligned edges */
4549        if (partial_begin || partial_end) {
4550                ret = ext4_alloc_file_blocks(file,
4551                                round_down(offset, 1 << blkbits) >> blkbits,
4552                                (round_up((offset + len), 1 << blkbits) -
4553                                 round_down(offset, 1 << blkbits)) >> blkbits,
4554                                new_size, flags);
4555                if (ret)
4556                        goto out_mutex;
4557
4558        }
4559
4560        /* Zero range excluding the unaligned edges */
4561        if (max_blocks > 0) {
4562                flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN |
4563                          EXT4_EX_NOCACHE);
4564
4565                /*
4566                 * Prevent page faults from reinstantiating pages we have
4567                 * released from page cache.
4568                 */
4569                filemap_invalidate_lock(mapping);
4570
4571                ret = ext4_break_layouts(inode);
4572                if (ret) {
4573                        filemap_invalidate_unlock(mapping);
4574                        goto out_mutex;
4575                }
4576
4577                ret = ext4_update_disksize_before_punch(inode, offset, len);
4578                if (ret) {
4579                        filemap_invalidate_unlock(mapping);
4580                        goto out_mutex;
4581                }
4582                /* Now release the pages and zero block aligned part of pages */
4583                truncate_pagecache_range(inode, start, end - 1);
4584                inode->i_mtime = inode->i_ctime = current_time(inode);
4585
4586                ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
4587                                             flags);
4588                filemap_invalidate_unlock(mapping);
4589                if (ret)
4590                        goto out_mutex;
4591        }
4592        if (!partial_begin && !partial_end)
4593                goto out_mutex;
4594
4595        /*
4596         * In the worst case we have to write out two nonadjacent unwritten
4597         * blocks and update the inode
4598         */
4599        credits = (2 * ext4_ext_index_trans_blocks(inode, 2)) + 1;
4600        if (ext4_should_journal_data(inode))
4601                credits += 2;
4602        handle = ext4_journal_start(inode, EXT4_HT_MISC, credits);
4603        if (IS_ERR(handle)) {
4604                ret = PTR_ERR(handle);
4605                ext4_std_error(inode->i_sb, ret);
4606                goto out_mutex;
4607        }
4608
4609        inode->i_mtime = inode->i_ctime = current_time(inode);
4610        if (new_size)
4611                ext4_update_inode_size(inode, new_size);
4612        ret = ext4_mark_inode_dirty(handle, inode);
4613        if (unlikely(ret))
4614                goto out_handle;
4615        ext4_fc_track_range(handle, inode, offset >> inode->i_sb->s_blocksize_bits,
4616                        (offset + len - 1) >> inode->i_sb->s_blocksize_bits);
4617        /* Zero out partial block at the edges of the range */
4618        ret = ext4_zero_partial_blocks(handle, inode, offset, len);
4619        if (ret >= 0)
4620                ext4_update_inode_fsync_trans(handle, inode, 1);
4621
4622        if (file->f_flags & O_SYNC)
4623                ext4_handle_sync(handle);
4624
4625out_handle:
4626        ext4_journal_stop(handle);
4627out_mutex:
4628        inode_unlock(inode);
4629        return ret;
4630}
4631
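/*
 * Editor's illustrative example (not in the original source) of the
 * alignment arithmetic in ext4_zero_range() above, assuming a hypothetical
 * 4096-byte block size (blkbits = 12), offset = 5000 and len = 20000:
 *
 *	start         = round_up(5000, 4096)    = 8192
 *	end           = round_down(25000, 4096) = 24576
 *	partial_begin = 5000 & 4095             = 904
 *	partial_end   = 25000 & 4095            = 424
 *	lblk          = 8192 >> 12              = 2
 *	max_blocks    = (24576 >> 12) - 2       = 4
 *
 * Blocks 2..5 (bytes 8192..24575) are converted to unwritten extents, while
 * the unaligned edges, bytes 5000..8191 and 24576..24999, are covered by the
 * preallocation of the surrounding blocks and zeroed explicitly by
 * ext4_zero_partial_blocks().
 */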
4632/*
4633 * Preallocate space for a file. This implements ext4's fallocate file
4634 * operation, which gets called from the sys_fallocate system call.
4635 * For block-mapped files, posix_fallocate should fall back to the method
4636 * of writing zeroes to the required new blocks (the same behavior that is
4637 * expected for file systems which do not support the fallocate() system call).
4638 */
4639long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
4640{
4641        struct inode *inode = file_inode(file);
4642        loff_t new_size = 0;
4643        unsigned int max_blocks;
4644        int ret = 0;
4645        int flags;
4646        ext4_lblk_t lblk;
4647        unsigned int blkbits = inode->i_blkbits;
4648
4649        /*
4650         * Encrypted inodes can't handle collapse range or insert
4651         * range since we would need to re-encrypt blocks with a
4652         * different IV or XTS tweak (which are based on the logical
4653         * block number).
4654         */
4655        if (IS_ENCRYPTED(inode) &&
4656            (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE)))
4657                return -EOPNOTSUPP;
4658
4659        /* Return error if mode is not supported */
4660        if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
4661                     FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |
4662                     FALLOC_FL_INSERT_RANGE))
4663                return -EOPNOTSUPP;
4664
4665        ext4_fc_start_update(inode);
4666
4667        if (mode & FALLOC_FL_PUNCH_HOLE) {
4668                ret = ext4_punch_hole(inode, offset, len);
4669                goto exit;
4670        }
4671
4672        ret = ext4_convert_inline_data(inode);
4673        if (ret)
4674                goto exit;
4675
4676        if (mode & FALLOC_FL_COLLAPSE_RANGE) {
4677                ret = ext4_collapse_range(inode, offset, len);
4678                goto exit;
4679        }
4680
4681        if (mode & FALLOC_FL_INSERT_RANGE) {
4682                ret = ext4_insert_range(inode, offset, len);
4683                goto exit;
4684        }
4685
4686        if (mode & FALLOC_FL_ZERO_RANGE) {
4687                ret = ext4_zero_range(file, offset, len, mode);
4688                goto exit;
4689        }
4690        trace_ext4_fallocate_enter(inode, offset, len, mode);
4691        lblk = offset >> blkbits;
4692
4693        max_blocks = EXT4_MAX_BLOCKS(len, offset, blkbits);
4694        flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT;
4695
4696        inode_lock(inode);
4697
4698        /*
4699         * We only support preallocation for extent-based files
4700         */
4701        if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
4702                ret = -EOPNOTSUPP;
4703                goto out;
4704        }
4705
4706        if (!(mode & FALLOC_FL_KEEP_SIZE) &&
4707            (offset + len > inode->i_size ||
4708             offset + len > EXT4_I(inode)->i_disksize)) {
4709                new_size = offset + len;
4710                ret = inode_newsize_ok(inode, new_size);
4711                if (ret)
4712                        goto out;
4713        }
4714
4715        /* Wait for all existing dio workers; newcomers will block on i_mutex */
4716        inode_dio_wait(inode);
4717
4718        ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags);
4719        if (ret)
4720                goto out;
4721
4722        if (file->f_flags & O_SYNC && EXT4_SB(inode->i_sb)->s_journal) {
4723                ret = ext4_fc_commit(EXT4_SB(inode->i_sb)->s_journal,
4724                                        EXT4_I(inode)->i_sync_tid);
4725        }
4726out:
4727        inode_unlock(inode);
4728        trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
4729exit:
4730        ext4_fc_stop_update(inode);
4731        return ret;
4732}
4733
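/*
 * Editor's illustrative user-space sketch (not in the original source):
 * the mode bits handled by ext4_fallocate() above, as seen from a
 * hypothetical caller of fallocate(2).
 *
 *	#define _GNU_SOURCE
 *	#include <fcntl.h>
 *	#include <linux/falloc.h>
 *
 *	int preallocate_1mib(int fd)
 *	{
 *		// Reserve 1 MiB at offset 0 as unwritten extents without
 *		// changing i_size; reads return zeroes until the range is
 *		// actually written.
 *		return fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 1 << 20);
 *	}
 *
 * The remaining modes map onto the handlers dispatched above:
 * FALLOC_FL_PUNCH_HOLE -> ext4_punch_hole(), FALLOC_FL_COLLAPSE_RANGE ->
 * ext4_collapse_range(), FALLOC_FL_INSERT_RANGE -> ext4_insert_range() and
 * FALLOC_FL_ZERO_RANGE -> ext4_zero_range().
 */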
4734/*
4735 * This function converts a range of blocks to written extents.
4736 * The caller will pass the start offset and the size;
4737 * all unwritten extents within this range will be converted to
4738 * written extents.
4739 *
4740 * This function is called from the direct I/O end_io callback
4741 * function, to convert the fallocated extents after I/O is completed.
4742 * Returns 0 on success.
4743 */
4744int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
4745                                   loff_t offset, ssize_t len)
4746{
4747        unsigned int max_blocks;
4748        int ret = 0, ret2 = 0, ret3 = 0;
4749        struct ext4_map_blocks map;
4750        unsigned int blkbits = inode->i_blkbits;
4751        unsigned int credits = 0;
4752
4753        map.m_lblk = offset >> blkbits;
4754        max_blocks = EXT4_MAX_BLOCKS(len, offset, blkbits);
4755
4756        if (!handle) {
4757                /*
4758                 * credits to insert 1 extent into extent tree
4759                 */
4760                credits = ext4_chunk_trans_blocks(inode, max_blocks);
4761        }
4762        while (ret >= 0 && ret < max_blocks) {
4763                map.m_lblk += ret;
4764                map.m_len = (max_blocks -= ret);
4765                if (credits) {
4766                        handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
4767                                                    credits);
4768                        if (IS_ERR(handle)) {
4769                                ret = PTR_ERR(handle);
4770                                break;
4771                        }
4772                }
4773                ret = ext4_map_blocks(handle, inode, &map,
4774                                      EXT4_GET_BLOCKS_IO_CONVERT_EXT);
4775                if (ret <= 0)
4776                        ext4_warning(inode->i_sb,
4777                                     "inode #%lu: block %u: len %u: "
4778                                     "ext4_ext_map_blocks returned %d",
4779                                     inode->i_ino, map.m_lblk,
4780                                     map.m_len, ret);
4781                ret2 = ext4_mark_inode_dirty(handle, inode);
4782                if (credits) {
4783                        ret3 = ext4_journal_stop(handle);
4784                        if (unlikely(ret3))
4785                                ret2 = ret3;
4786                }
4787
4788                if (ret <= 0 || ret2)
4789                        break;
4790        }
4791        return ret > 0 ? ret2 : ret;
4792}
4793
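/*
 * Editor's illustrative example (not in the original source) of the
 * max_blocks computation in ext4_convert_unwritten_extents() above, assuming
 * a hypothetical 4096-byte block size (blkbits = 12):
 *
 *	offset = 6000, len = 10000      => byte range 6000..15999
 *	map.m_lblk = 6000 >> 12         = 1
 *	max_blocks = EXT4_MAX_BLOCKS(10000, 6000, 12) = (16384 >> 12) - 1 = 3
 *
 * i.e. logical blocks 1..3, every block overlapping the completed I/O range,
 * are converted from unwritten to written extents.
 */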
4794int ext4_convert_unwritten_io_end_vec(handle_t *handle, ext4_io_end_t *io_end)
4795{
4796        int ret = 0, err = 0;
4797        struct ext4_io_end_vec *io_end_vec;
4798
4799        /*
4800         * This is somewhat ugly but the idea is clear: when a transaction is
4801         * reserved, everything goes into it. Otherwise we prefer to start several
4802         * smaller transactions, converting each extent separately.
4803         */
4804        if (handle) {
4805                handle = ext4_journal_start_reserved(handle,
4806                                                     EXT4_HT_EXT_CONVERT);
4807                if (IS_ERR(handle))
4808                        return PTR_ERR(handle);
4809        }
4810
4811        list_for_each_entry(io_end_vec, &io_end->list_vec, list) {
4812                ret = ext4_convert_unwritten_extents(handle, io_end->inode,
4813                                                     io_end_vec->offset,
4814                                                     io_end_vec->size);
4815                if (ret)
4816                        break;
4817        }
4818
4819        if (handle)
4820                err = ext4_journal_stop(handle);
4821
4822        return ret < 0 ? ret : err;
4823}
4824
4825static int ext4_iomap_xattr_fiemap(struct inode *inode, struct iomap *iomap)
4826{
4827        __u64 physical = 0;
4828        __u64 length = 0;
4829        int blockbits = inode->i_sb->s_blocksize_bits;
4830        int error = 0;
4831        u16 iomap_type;
4832
4833        /* in-inode? */
4834        if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
4835                struct ext4_iloc iloc;
4836                int offset;     /* offset of xattr in inode */
4837
4838                error = ext4_get_inode_loc(inode, &iloc);
4839                if (error)
4840                        return error;
4841                physical = (__u64)iloc.bh->b_blocknr << blockbits;
4842                offset = EXT4_GOOD_OLD_INODE_SIZE +
4843                                EXT4_I(inode)->i_extra_isize;
4844                physical += offset;
4845                length = EXT4_SB(inode->i_sb)->s_inode_size - offset;
4846                brelse(iloc.bh);
4847                iomap_type = IOMAP_INLINE;
4848        } else if (EXT4_I(inode)->i_file_acl) { /* external block */
4849                physical = (__u64)EXT4_I(inode)->i_file_acl << blockbits;
4850                length = inode->i_sb->s_blocksize;
4851                iomap_type = IOMAP_MAPPED;
4852        } else {
4853                /* no in-inode or external block for xattr, so return -ENOENT */
4854                error = -ENOENT;
4855                goto out;
4856        }
4857
4858        iomap->addr = physical;
4859        iomap->offset = 0;
4860        iomap->length = length;
4861        iomap->type = iomap_type;
4862        iomap->flags = 0;
4863out:
4864        return error;
4865}
4866
4867static int ext4_iomap_xattr_begin(struct inode *inode, loff_t offset,
4868                                  loff_t length, unsigned flags,
4869                                  struct iomap *iomap, struct iomap *srcmap)
4870{
4871        int error;
4872
4873        error = ext4_iomap_xattr_fiemap(inode, iomap);
4874        if (error == 0 && (offset >= iomap->length))
4875                error = -ENOENT;
4876        return error;
4877}
4878
4879static const struct iomap_ops ext4_iomap_xattr_ops = {
4880        .iomap_begin            = ext4_iomap_xattr_begin,
4881};
4882
4883static int ext4_fiemap_check_ranges(struct inode *inode, u64 start, u64 *len)
4884{
4885        u64 maxbytes;
4886
4887        if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
4888                maxbytes = inode->i_sb->s_maxbytes;
4889        else
4890                maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes;
4891
4892        if (*len == 0)
4893                return -EINVAL;
4894        if (start > maxbytes)
4895                return -EFBIG;
4896
4897        /*
4898         * Shrink request scope to what the fs can actually handle.
4899         */
4900        if (*len > maxbytes || (maxbytes - *len) < start)
4901                *len = maxbytes - start;
4902        return 0;
4903}
4904
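/*
 * Editor's illustrative example (not in the original source) of the clamping
 * in ext4_fiemap_check_ranges() above, with hypothetical numbers: if
 * maxbytes = 1000, start = 800 and *len = 500, then maxbytes - *len = 500 is
 * smaller than start, so *len is shrunk to maxbytes - start = 200 and the
 * request proceeds over bytes 800..999.
 */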
4905int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4906                u64 start, u64 len)
4907{
4908        int error = 0;
4909
4910        if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) {
4911                error = ext4_ext_precache(inode);
4912                if (error)
4913                        return error;
4914                fieinfo->fi_flags &= ~FIEMAP_FLAG_CACHE;
4915        }
4916
4917        /*
4918         * For block-mapped (non-extent) files the maximum size limit could be
4919         * smaller than s_maxbytes, so check len here manually instead of just
4920         * relying on the generic check.
4921         */
4922        error = ext4_fiemap_check_ranges(inode, start, &len);
4923        if (error)
4924                return error;
4925
4926        if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) {
4927                fieinfo->fi_flags &= ~FIEMAP_FLAG_XATTR;
4928                return iomap_fiemap(inode, fieinfo, start, len,
4929                                    &ext4_iomap_xattr_ops);
4930        }
4931
4932        return iomap_fiemap(inode, fieinfo, start, len, &ext4_iomap_report_ops);
4933}
4934
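/*
 * Editor's illustrative user-space sketch (not in the original source):
 * exercising ext4_fiemap() above through the generic FS_IOC_FIEMAP ioctl.
 * Error handling is omitted for brevity.
 *
 *	#include <stdio.h>
 *	#include <stdlib.h>
 *	#include <sys/ioctl.h>
 *	#include <linux/fs.h>
 *	#include <linux/fiemap.h>
 *
 *	static void dump_extents(int fd)
 *	{
 *		struct fiemap *fm = calloc(1, sizeof(*fm) +
 *					   32 * sizeof(struct fiemap_extent));
 *
 *		fm->fm_length = FIEMAP_MAX_OFFSET;	// whole file
 *		fm->fm_extent_count = 32;
 *		ioctl(fd, FS_IOC_FIEMAP, fm);
 *		for (unsigned int i = 0; i < fm->fm_mapped_extents; i++)
 *			printf("logical %llu physical %llu len %llu flags %x\n",
 *			       (unsigned long long)fm->fm_extents[i].fe_logical,
 *			       (unsigned long long)fm->fm_extents[i].fe_physical,
 *			       (unsigned long long)fm->fm_extents[i].fe_length,
 *			       (unsigned int)fm->fm_extents[i].fe_flags);
 *		free(fm);
 *	}
 */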
4935int ext4_get_es_cache(struct inode *inode, struct fiemap_extent_info *fieinfo,
4936                      __u64 start, __u64 len)
4937{
4938        ext4_lblk_t start_blk, len_blks;
4939        __u64 last_blk;
4940        int error = 0;
4941
4942        if (ext4_has_inline_data(inode)) {
4943                int has_inline;
4944
4945                down_read(&EXT4_I(inode)->xattr_sem);
4946                has_inline = ext4_has_inline_data(inode);
4947                up_read(&EXT4_I(inode)->xattr_sem);
4948                if (has_inline)
4949                        return 0;
4950        }
4951
4952        if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) {
4953                error = ext4_ext_precache(inode);
4954                if (error)
4955                        return error;
4956                fieinfo->fi_flags &= ~FIEMAP_FLAG_CACHE;
4957        }
4958
4959        error = fiemap_prep(inode, fieinfo, start, &len, 0);
4960        if (error)
4961                return error;
4962
4963        error = ext4_fiemap_check_ranges(inode, start, &len);
4964        if (error)
4965                return error;
4966
4967        start_blk = start >> inode->i_sb->s_blocksize_bits;
4968        last_blk = (start + len - 1) >> inode->i_sb->s_blocksize_bits;
4969        if (last_blk >= EXT_MAX_BLOCKS)
4970                last_blk = EXT_MAX_BLOCKS-1;
4971        len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1;
4972
4973        /*
4974         * Walk the extent tree gathering extent information
4975         * and pushing extents back to the user.
4976         */
4977        return ext4_fill_es_cache_info(inode, start_blk, len_blks, fieinfo);
4978}
4979
4980/*
4981 * ext4_access_path:
4982 * Function to access the path buffer for marking it dirty.
4983 * It also checks if there are sufficient credits left in the journal handle
4984 * to update the path.
4985 */
4986static int
4987ext4_access_path(handle_t *handle, struct inode *inode,
4988                struct ext4_ext_path *path)
4989{
4990        int credits, err;
4991
4992        if (!ext4_handle_valid(handle))
4993                return 0;
4994
4995        /*
4996         * Check if we need to extend the journal credits:
4997         * 3 for the leaf, sb, and inode, plus 2 (bitmap and group
4998         * descriptor) for each block group; assuming two block
4999         * groups gives the 7 credits requested below.
5000         */
5001        credits = ext4_writepage_trans_blocks(inode);
5002        err = ext4_datasem_ensure_credits(handle, inode, 7, credits, 0);
5003        if (err < 0)
5004                return err;
5005
5006        err = ext4_ext_get_access(handle, inode, path);
5007        return err;
5008}
5009
5010/*
5011 * ext4_ext_shift_path_extents:
5012 * Shift the extents of a path structure lying between path[depth].p_ext
5013 * and EXT_LAST_EXTENT(path[depth].p_hdr), by @shift blocks. @SHIFT tells
5014 * whether it is a right-shift or a left-shift operation.
5015 */
5016static int
5017ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift,
5018                            struct inode *inode, handle_t *handle,
5019                            enum SHIFT_DIRECTION SHIFT)
5020{
5021        int depth, err = 0;
5022        struct ext4_extent *ex_start, *ex_last;
5023        bool update = false;
5024        depth = path->p_depth;
5025
5026        while (depth >= 0) {
5027                if (depth == path->p_depth) {
5028                        ex_start = path[depth].p_ext;
5029                        if (!ex_start)
5030                                return -EFSCORRUPTED;
5031
5032                        ex_last = EXT_LAST_EXTENT(path[depth].p_hdr);
5033
5034                        err = ext4_access_path(handle, inode, path + depth);
5035                        if (err)
5036                                goto out;
5037
5038                        if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr))
5039                                update = true;
5040
5041                        while (ex_start <= ex_last) {
5042                                if (SHIFT == SHIFT_LEFT) {
5043                                        le32_add_cpu(&ex_start->ee_block,
5044                                                -shift);
5045                                        /* Try to merge to the left. */
5046                                        if ((ex_start >
5047                                            EXT_FIRST_EXTENT(path[depth].p_hdr))
5048                                            &&
5049                                            ext4_ext_try_to_merge_right(inode,
5050                                            path, ex_start - 1))
5051                                                ex_last--;
5052                                        else
5053                                                ex_start++;
5054                                } else {
5055                                        le32_add_cpu(&ex_last->ee_block, shift);
5056                                        ext4_ext_try_to_merge_right(inode, path,
5057                                                ex_last);
5058                                        ex_last--;
5059                                }
5060                        }
5061                        err = ext4_ext_dirty(handle, inode, path + depth);
5062                        if (err)
5063                                goto out;
5064
5065                        if (--depth < 0 || !update)
5066                                break;
5067                }
5068
5069                /* Update index too */
5070                err = ext4_access_path(handle, inode, path + depth);
5071                if (err)
5072                        goto out;
5073
5074                if (SHIFT == SHIFT_LEFT)
5075                        le32_add_cpu(&path[depth].p_idx->ei_block, -shift);
5076                else
5077                        le32_add_cpu(&path[depth].p_idx->ei_block, shift);
5078                err = ext4_ext_dirty(handle, inode, path + depth);
5079                if (err)
5080                        goto out;
5081
5082                /* we are done if current index is not a starting index */
5083                if (path[depth].p_idx != EXT_FIRST_INDEX(path[depth].p_hdr))
5084                        break;
5085
5086                depth--;
5087        }
5088
5089out:
5090        return err;
5091}
5092
5093/*
5094 * ext4_ext_shift_extents:
5095 * All the extents which lie in the range from @start to the last allocated
5096 * block for the @inode are shifted either towards the left or the right
5097 * (depending upon @SHIFT) by @shift blocks.
5098 * On success, 0 is returned, error otherwise.
5099 */
5100static int
5101ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
5102                       ext4_lblk_t start, ext4_lblk_t shift,
5103                       enum SHIFT_DIRECTION SHIFT)
5104{
5105        struct ext4_ext_path *path;
5106        int ret = 0, depth;
5107        struct ext4_extent *extent;
5108        ext4_lblk_t stop, *iterator, ex_start, ex_end;
5109
5110        /* Let path point to the last extent */
5111        path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL,
5112                                EXT4_EX_NOCACHE);
5113        if (IS_ERR(path))
5114                return PTR_ERR(path);
5115
5116        depth = path->p_depth;
5117        extent = path[depth].p_ext;
5118        if (!extent)
5119                goto out;
5120
5121        stop = le32_to_cpu(extent->ee_block);
5122
5123        /*
5124         * For left shifts, make sure the hole on the left is big enough to
5125         * accommodate the shift.  For right shifts, make sure the last extent
5126         * won't be shifted beyond EXT_MAX_BLOCKS.
5127         */
5128        if (SHIFT == SHIFT_LEFT) {
5129                path = ext4_find_extent(inode, start - 1, &path,
5130                                        EXT4_EX_NOCACHE);
5131                if (IS_ERR(path))
5132                        return PTR_ERR(path);
5133                depth = path->p_depth;
5134                extent =  path[depth].p_ext;
5135                if (extent) {
5136                        ex_start = le32_to_cpu(extent->ee_block);
5137                        ex_end = le32_to_cpu(extent->ee_block) +
5138                                ext4_ext_get_actual_len(extent);
5139                } else {
5140                        ex_start = 0;
5141                        ex_end = 0;
5142                }
5143
5144                if ((start == ex_start && shift > ex_start) ||
5145                    (shift > start - ex_end)) {
5146                        ret = -EINVAL;
5147                        goto out;
5148                }
5149        } else {
5150                if (shift > EXT_MAX_BLOCKS -
5151                    (stop + ext4_ext_get_actual_len(extent))) {
5152                        ret = -EINVAL;
5153                        goto out;
5154                }
5155        }
5156
5157        /*
5158         * In case of left shift, iterator points to start and it is increased
5159         * till we reach stop. In case of right shift, iterator points to stop
5160         * and it is decreased till we reach start.
5161         */
5162        if (SHIFT == SHIFT_LEFT)
5163                iterator = &start;
5164        else
5165                iterator = &stop;
5166
5167        /*
5168         * It's safe to start updating extents.  Start and stop are unsigned, so
5169         * in the case of a right shift, if an extent at block 0 is reached, the
5170         * iterator becomes NULL to indicate the end of the loop.
5171         */
5172        while (iterator && start <= stop) {
5173                path = ext4_find_extent(inode, *iterator, &path,
5174                                        EXT4_EX_NOCACHE);
5175                if (IS_ERR(path))
5176                        return PTR_ERR(path);
5177                depth = path->p_depth;
5178                extent = path[depth].p_ext;
5179                if (!extent) {
5180                        EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
5181                                         (unsigned long) *iterator);
5182                        return -EFSCORRUPTED;
5183                }
5184                if (SHIFT == SHIFT_LEFT && *iterator >
5185                    le32_to_cpu(extent->ee_block)) {
5186                        /* Hole, move to the next extent */
5187                        if (extent < EXT_LAST_EXTENT(path[depth].p_hdr)) {
5188                                path[depth].p_ext++;
5189                        } else {
5190                                *iterator = ext4_ext_next_allocated_block(path);
5191                                continue;
5192                        }
5193                }
5194
5195                if (SHIFT == SHIFT_LEFT) {
5196                        extent = EXT_LAST_EXTENT(path[depth].p_hdr);
5197                        *iterator = le32_to_cpu(extent->ee_block) +
5198                                        ext4_ext_get_actual_len(extent);
5199                } else {
5200                        extent = EXT_FIRST_EXTENT(path[depth].p_hdr);
5201                        if (le32_to_cpu(extent->ee_block) > 0)
5202                                *iterator = le32_to_cpu(extent->ee_block) - 1;
5203                        else
5204                                /* Beginning is reached, end of the loop */
5205                                iterator = NULL;
5206                        /* Update path extent in case we need to stop */
5207                        while (le32_to_cpu(extent->ee_block) < start)
5208                                extent++;
5209                        path[depth].p_ext = extent;
5210                }
5211                ret = ext4_ext_shift_path_extents(path, shift, inode,
5212                                handle, SHIFT);
5213                if (ret)
5214                        break;
5215        }
5216out:
5217        ext4_ext_drop_refs(path);
5218        kfree(path);
5219        return ret;
5220}
5221
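/*
 * Editor's illustrative example (not in the original source) of the shift
 * semantics implemented above, with hypothetical extents starting at logical
 * blocks 100 and 300:
 *
 *	SHIFT_LEFT,  start = 100, shift = 40:
 *		every extent from block 100 onwards starts 40 blocks earlier
 *		(100 -> 60, 300 -> 260); the hole to the left of block 100 must
 *		be at least 40 blocks wide, otherwise -EINVAL is returned.
 *
 *	SHIFT_RIGHT, start = 100, shift = 40:
 *		the same extents start 40 blocks later (100 -> 140, 300 -> 340);
 *		this fails with -EINVAL if the last extent would end beyond
 *		EXT_MAX_BLOCKS.
 *
 * Collapse range uses the left shift to close the punched-out gap, and
 * insert range uses the right shift to open up a new hole.
 */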
5222/*
5223 * ext4_collapse_range:
5224 * This implements the fallocate's collapse range functionality for ext4.
5225 * Returns: 0 on success, non-zero on error.
5226 */
5227static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
5228{
5229        struct super_block *sb = inode->i_sb;
5230        struct address_space *mapping = inode->i_mapping;
5231        ext4_lblk_t punch_start, punch_stop;
5232        handle_t *handle;
5233        unsigned int credits;
5234        loff_t new_size, ioffset;
5235        int ret;
5236
5237        /*
5238         * We need to test this early because xfstests assumes that a
5239         * collapse range of (0, 1) will return EOPNOTSUPP if the file
5240         * system does not support collapse range.
5241         */
5242        if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
5243                return -EOPNOTSUPP;
5244
5245        /* Collapse range works only on fs cluster size aligned regions. */
5246        if (!IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb)))
5247                return -EINVAL;
5248
5249        trace_ext4_collapse_range(inode, offset, len);
5250
5251        punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb);
5252        punch_stop = (offset + len) >> EXT4_BLOCK_SIZE_BITS(sb);
5253
5254        /* Call ext4_force_commit to flush all data in case of data=journal. */
5255        if (ext4_should_journal_data(inode)) {
5256                ret = ext4_force_commit(inode->i_sb);
5257                if (ret)
5258                        return ret;
5259        }
5260
5261        inode_lock(inode);
5262        /*
5263         * There is no need for the collapse range to overlap EOF; in that case
5264         * it would effectively be a truncate operation
5265         */
5266        if (offset + len >= inode->i_size) {
5267                ret = -EINVAL;
5268                goto out_mutex;
5269        }
5270
5271        /* Currently just for extent based files */
5272        if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
5273                ret = -EOPNOTSUPP;
5274                goto out_mutex;
5275        }
5276
5277        /* Wait for existing dio to complete */
5278        inode_dio_wait(inode);
5279
5280        /*
5281         * Prevent page faults from reinstantiating pages we have released from
5282         * page cache.
5283         */
5284        filemap_invalidate_lock(mapping);
5285
5286        ret = ext4_break_layouts(inode);
5287        if (ret)
5288                goto out_mmap;
5289
5290        /*
5291         * Need to round down the offset to align it with a page size boundary
5292         * for page size > block size.
5293         */
5294        ioffset = round_down(offset, PAGE_SIZE);
5295        /*
5296         * Write out the tail of the last page before the removed range, since
5297         * it will get removed from the page cache below.
5298         */
5299        ret = filemap_write_and_wait_range(mapping, ioffset, offset);
5300        if (ret)
5301                goto out_mmap;
5302        /*
5303         * Write out the data that will be shifted, to preserve it when
5304         * discarding the page cache below. We are also protected from pages
5305         * becoming dirty by i_rwsem and invalidate_lock.
5306         */
5307        ret = filemap_write_and_wait_range(mapping, offset + len,
5308                                           LLONG_MAX);
5309        if (ret)
5310                goto out_mmap;
5311        truncate_pagecache(inode, ioffset);
5312
5313        credits = ext4_writepage_trans_blocks(inode);
5314        handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
5315        if (IS_ERR(handle)) {
5316                ret = PTR_ERR(handle);
5317                goto out_mmap;
5318        }
5319        ext4_fc_start_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE);
5320
5321        down_write(&EXT4_I(inode)->i_data_sem);
5322        ext4_discard_preallocations(inode, 0);
5323
5324        ret = ext4_es_remove_extent(inode, punch_start,
5325                                    EXT_MAX_BLOCKS - punch_start);
5326        if (ret) {
5327                up_write(&EXT4_I(inode)->i_data_sem);
5328                goto out_stop;
5329        }
5330
5331        ret = ext4_ext_remove_space(inode, punch_start, punch_stop - 1);
5332        if (ret) {
5333                up_write(&EXT4_I(inode)->i_data_sem);
5334                goto out_stop;
5335        }
5336        ext4_discard_preallocations(inode, 0);
5337
5338        ret = ext4_ext_shift_extents(inode, handle, punch_stop,
5339                                     punch_stop - punch_start, SHIFT_LEFT);
5340        if (ret) {
5341                up_write(&EXT4_I(inode)->i_data_sem);
5342                goto out_stop;
5343        }
5344
5345        new_size = inode->i_size - len;
5346        i_size_write(inode, new_size);
5347        EXT4_I(inode)->i_disksize = new_size;
5348
5349        up_write(&EXT4_I(inode)->i_data_sem);
5350        if (IS_SYNC(inode))
5351                ext4_handle_sync(handle);
5352        inode->i_mtime = inode->i_ctime = current_time(inode);
5353        ret = ext4_mark_inode_dirty(handle, inode);
5354        ext4_update_inode_fsync_trans(handle, inode, 1);
5355
5356out_stop:
5357        ext4_journal_stop(handle);
5358        ext4_fc_stop_ineligible(sb);
5359out_mmap:
5360        filemap_invalidate_unlock(mapping);
5361out_mutex:
5362        inode_unlock(inode);
5363        return ret;
5364}
5365
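/*
 * Editor's illustrative user-space sketch (not in the original source):
 * collapsing a cluster-aligned range with fallocate(2).  Both offset and
 * length must be multiples of the filesystem cluster size (4096 bytes in
 * this hypothetical example) and the range must end before EOF, otherwise
 * ext4_collapse_range() above returns -EINVAL.
 *
 *	#define _GNU_SOURCE
 *	#include <fcntl.h>
 *	#include <linux/falloc.h>
 *
 *	int drop_8k_at_8k(int fd)
 *	{
 *		// Remove bytes 8192..16383; the data beyond them shifts down
 *		// by 8192 bytes and i_size shrinks by the same amount.
 *		return fallocate(fd, FALLOC_FL_COLLAPSE_RANGE, 8192, 8192);
 *	}
 */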
5366/*
5367 * ext4_insert_range:
5368 * This function implements the FALLOC_FL_INSERT_RANGE flag of fallocate.
5369 * The data blocks starting from @offset to the EOF are shifted by @len
5370 * towards the right to create a hole in the @inode. The inode size is
5371 * increased by len bytes.
5372 * Returns 0 on success, error otherwise.
5373 */
5374static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
5375{
5376        struct super_block *sb = inode->i_sb;
5377        struct address_space *mapping = inode->i_mapping;
5378        handle_t *handle;
5379        struct ext4_ext_path *path;
5380        struct ext4_extent *extent;
5381        ext4_lblk_t offset_lblk, len_lblk, ee_start_lblk = 0;
5382        unsigned int credits, ee_len;
5383        int ret = 0, depth, split_flag = 0;
5384        loff_t ioffset;
5385
5386        /*
5387         * We need to test this early because xfstests assumes that an
5388         * insert range of (0, 1) will return EOPNOTSUPP if the file
5389         * system does not support insert range.
5390         */
5391        if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
5392                return -EOPNOTSUPP;
5393
5394        /* Insert range works only on fs cluster size aligned regions. */
5395        if (!IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb)))
5396                return -EINVAL;
5397
5398        trace_ext4_insert_range(inode, offset, len);
5399
5400        offset_lblk = offset >> EXT4_BLOCK_SIZE_BITS(sb);
5401        len_lblk = len >> EXT4_BLOCK_SIZE_BITS(sb);
5402
5403        /* Call ext4_force_commit to flush all data in case of data=journal */
5404        if (ext4_should_journal_data(inode)) {
5405                ret = ext4_force_commit(inode->i_sb);
5406                if (ret)
5407                        return ret;
5408        }
5409
5410        inode_lock(inode);
5411        /* Currently just for extent based files */
5412        if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
5413                ret = -EOPNOTSUPP;
5414                goto out_mutex;
5415        }
5416
5417        /* Check whether the maximum file size would be exceeded */
5418        if (len > inode->i_sb->s_maxbytes - inode->i_size) {
5419                ret = -EFBIG;
5420                goto out_mutex;
5421        }
5422
5423        /* Offset must be less than i_size */
5424        if (offset >= inode->i_size) {
5425                ret = -EINVAL;
5426                goto out_mutex;
5427        }
5428
5429        /* Wait for existing dio to complete */
5430        inode_dio_wait(inode);
5431
5432        /*
5433         * Prevent page faults from reinstantiating pages we have released from
5434         * page cache.
5435         */
5436        filemap_invalidate_lock(mapping);
5437
5438        ret = ext4_break_layouts(inode);
5439        if (ret)
5440                goto out_mmap;
5441
5442        /*
5443         * Need to round down to align start offset to page size boundary
5444         * for page size > block size.
5445         */
5446        ioffset = round_down(offset, PAGE_SIZE);
5447        /* Write out all dirty pages */
5448        ret = filemap_write_and_wait_range(inode->i_mapping, ioffset,
5449                        LLONG_MAX);
5450        if (ret)
5451                goto out_mmap;
5452        truncate_pagecache(inode, ioffset);
5453
5454        credits = ext4_writepage_trans_blocks(inode);
5455        handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
5456        if (IS_ERR(handle)) {
5457                ret = PTR_ERR(handle);
5458                goto out_mmap;
5459        }
5460        ext4_fc_start_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE);
5461
5462        /* Expand file to avoid data loss if there is an error while shifting */
5463        inode->i_size += len;
5464        EXT4_I(inode)->i_disksize += len;
5465        inode->i_mtime = inode->i_ctime = current_time(inode);
5466        ret = ext4_mark_inode_dirty(handle, inode);
5467        if (ret)
5468                goto out_stop;
5469
5470        down_write(&EXT4_I(inode)->i_data_sem);
5471        ext4_discard_preallocations(inode, 0);
5472
5473        path = ext4_find_extent(inode, offset_lblk, NULL, 0);
5474        if (IS_ERR(path)) {
5475                up_write(&EXT4_I(inode)->i_data_sem);
5476                goto out_stop;
5477        }
5478
5479        depth = ext_depth(inode);
5480        extent = path[depth].p_ext;
5481        if (extent) {
5482                ee_start_lblk = le32_to_cpu(extent->ee_block);
5483                ee_len = ext4_ext_get_actual_len(extent);
5484
5485                /*
5486                 * If offset_lblk is not the starting block of the extent,
5487                 * split the extent at @offset_lblk
5488                 */
5489                if ((offset_lblk > ee_start_lblk) &&
5490                                (offset_lblk < (ee_start_lblk + ee_len))) {
5491                        if (ext4_ext_is_unwritten(extent))
5492                                split_flag = EXT4_EXT_MARK_UNWRIT1 |
5493                                        EXT4_EXT_MARK_UNWRIT2;
5494                        ret = ext4_split_extent_at(handle, inode, &path,
5495                                        offset_lblk, split_flag,
5496                                        EXT4_EX_NOCACHE |
5497                                        EXT4_GET_BLOCKS_PRE_IO |
5498                                        EXT4_GET_BLOCKS_METADATA_NOFAIL);
5499                }
5500
5501                ext4_ext_drop_refs(path);
5502                kfree(path);
5503                if (ret < 0) {
5504                        up_write(&EXT4_I(inode)->i_data_sem);
5505                        goto out_stop;
5506                }
5507        } else {
5508                ext4_ext_drop_refs(path);
5509                kfree(path);
5510        }
5511
5512        ret = ext4_es_remove_extent(inode, offset_lblk,
5513                        EXT_MAX_BLOCKS - offset_lblk);
5514        if (ret) {
5515                up_write(&EXT4_I(inode)->i_data_sem);
5516                goto out_stop;
5517        }
5518
5519        /*
5520         * If offset_lblk lies in a hole at the start of the file, use
5521         * ee_start_lblk to shift extents
5522         */
5523        ret = ext4_ext_shift_extents(inode, handle,
5524                ee_start_lblk > offset_lblk ? ee_start_lblk : offset_lblk,
5525                len_lblk, SHIFT_RIGHT);
5526
5527        up_write(&EXT4_I(inode)->i_data_sem);
5528        if (IS_SYNC(inode))
5529                ext4_handle_sync(handle);
5530        if (ret >= 0)
5531                ext4_update_inode_fsync_trans(handle, inode, 1);
5532
5533out_stop:
5534        ext4_journal_stop(handle);
5535        ext4_fc_stop_ineligible(sb);
5536out_mmap:
5537        filemap_invalidate_unlock(mapping);
5538out_mutex:
5539        inode_unlock(inode);
5540        return ret;
5541}
5542
5543/**
5544 * ext4_swap_extents() - Swap extents between two inodes
5545 * @handle: handle for this transaction
5546 * @inode1:     First inode
5547 * @inode2:     Second inode
5548 * @lblk1:      Start block for first inode
5549 * @lblk2:      Start block for second inode
5550 * @count:      Number of blocks to swap
5551 * @unwritten: Mark second inode's extents as unwritten after swap
5552 * @erp:        Pointer to save error value
5553 *
5554 * This helper routine does exactly what it promises: it swaps extents. All
5555 * other concerns, such as page-cache locking consistency, bh mapping
5556 * consistency or copying of the extents' data, must be handled by the caller.
5557 * Locking:
5558 *              i_mutex is held for both inodes
5559 *              i_data_sem is locked for write for both inodes
5560 * Assumptions:
5561 *              All pages from requested range are locked for both inodes
5562 */
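    /*
     * The online defragmentation ioctl (EXT4_IOC_MOVE_EXT, see move_extent.c)
     * is the typical caller and takes care of the locking and data copying
     * requirements listed above.
     */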
5563int
5564ext4_swap_extents(handle_t *handle, struct inode *inode1,
5565                  struct inode *inode2, ext4_lblk_t lblk1, ext4_lblk_t lblk2,
5566                  ext4_lblk_t count, int unwritten, int *erp)
5567{
5568        struct ext4_ext_path *path1 = NULL;
5569        struct ext4_ext_path *path2 = NULL;
5570        int replaced_count = 0;
5571
5572        BUG_ON(!rwsem_is_locked(&EXT4_I(inode1)->i_data_sem));
5573        BUG_ON(!rwsem_is_locked(&EXT4_I(inode2)->i_data_sem));
5574        BUG_ON(!inode_is_locked(inode1));
5575        BUG_ON(!inode_is_locked(inode2));
5576
5577        *erp = ext4_es_remove_extent(inode1, lblk1, count);
5578        if (unlikely(*erp))
5579                return 0;
5580        *erp = ext4_es_remove_extent(inode2, lblk2, count);
5581        if (unlikely(*erp))
5582                return 0;
5583
5584        while (count) {
5585                struct ext4_extent *ex1, *ex2, tmp_ex;
5586                ext4_lblk_t e1_blk, e2_blk;
5587                int e1_len, e2_len, len;
5588                int split = 0;
5589
5590                path1 = ext4_find_extent(inode1, lblk1, NULL, EXT4_EX_NOCACHE);
5591                if (IS_ERR(path1)) {
5592                        *erp = PTR_ERR(path1);
5593                        path1 = NULL;
5594                finish:
5595                        count = 0;
5596                        goto repeat;
5597                }
5598                path2 = ext4_find_extent(inode2, lblk2, NULL, EXT4_EX_NOCACHE);
5599                if (IS_ERR(path2)) {
5600                        *erp = PTR_ERR(path2);
5601                        path2 = NULL;
5602                        goto finish;
5603                }
5604                ex1 = path1[path1->p_depth].p_ext;
5605                ex2 = path2[path2->p_depth].p_ext;
5606                /* Do we have something to swap ? */
5607                if (unlikely(!ex2 || !ex1))
5608                        goto finish;
5609
5610                e1_blk = le32_to_cpu(ex1->ee_block);
5611                e2_blk = le32_to_cpu(ex2->ee_block);
5612                e1_len = ext4_ext_get_actual_len(ex1);
5613                e2_len = ext4_ext_get_actual_len(ex2);
5614
5615                /* Hole handling */
5616                if (!in_range(lblk1, e1_blk, e1_len) ||
5617                    !in_range(lblk2, e2_blk, e2_len)) {
5618                        ext4_lblk_t next1, next2;
5619
5620                        /* if hole after extent, then go to next extent */
5621                        next1 = ext4_ext_next_allocated_block(path1);
5622                        next2 = ext4_ext_next_allocated_block(path2);
5623                        /* If hole before extent, then shift to that extent */
5624                        if (e1_blk > lblk1)
5625                                next1 = e1_blk;
5626                        if (e2_blk > lblk2)
5627                                next2 = e2_blk;
5628                        /* Do we have something to swap */
5629                        if (next1 == EXT_MAX_BLOCKS || next2 == EXT_MAX_BLOCKS)
5630                                goto finish;
5631                        /* Move to the rightmost boundary */
5632                        len = next1 - lblk1;
5633                        if (len < next2 - lblk2)
5634                                len = next2 - lblk2;
5635                        if (len > count)
5636                                len = count;
5637                        lblk1 += len;
5638                        lblk2 += len;
5639                        count -= len;
5640                        goto repeat;
5641                }
5642
5643                /* Prepare left boundary */
5644                if (e1_blk < lblk1) {
5645                        split = 1;
5646                        *erp = ext4_force_split_extent_at(handle, inode1,
5647                                                &path1, lblk1, 0);
5648                        if (unlikely(*erp))
5649                                goto finish;
5650                }
5651                if (e2_blk < lblk2) {
5652                        split = 1;
5653                        *erp = ext4_force_split_extent_at(handle, inode2,
5654                                                &path2,  lblk2, 0);
5655                        if (unlikely(*erp))
5656                                goto finish;
5657                }
5658                /* ext4_split_extent_at() may result in leaf extent split,
5659                 * path must be revalidated. */
5660                if (split)
5661                        goto repeat;
5662
5663                /* Prepare right boundary */
5664                len = count;
5665                if (len > e1_blk + e1_len - lblk1)
5666                        len = e1_blk + e1_len - lblk1;
5667                if (len > e2_blk + e2_len - lblk2)
5668                        len = e2_blk + e2_len - lblk2;
5669
5670                if (len != e1_len) {
5671                        split = 1;
5672                        *erp = ext4_force_split_extent_at(handle, inode1,
5673                                                &path1, lblk1 + len, 0);
5674                        if (unlikely(*erp))
5675                                goto finish;
5676                }
5677                if (len != e2_len) {
5678                        split = 1;
5679                        *erp = ext4_force_split_extent_at(handle, inode2,
5680                                                &path2, lblk2 + len, 0);
5681                        if (*erp)
5682                                goto finish;
5683                }
5684                /* ext4_split_extent_at() may result in leaf extent split,
5685                 * path must be revalidated. */
5686                if (split)
5687                        goto repeat;
5688
5689                BUG_ON(e2_len != e1_len);
5690                *erp = ext4_ext_get_access(handle, inode1, path1 + path1->p_depth);
5691                if (unlikely(*erp))
5692                        goto finish;
5693                *erp = ext4_ext_get_access(handle, inode2, path2 + path2->p_depth);
5694                if (unlikely(*erp))
5695                        goto finish;
5696
5697                /* Both extents are fully inside boundaries. Swap them now */
5698                tmp_ex = *ex1;
5699                ext4_ext_store_pblock(ex1, ext4_ext_pblock(ex2));
5700                ext4_ext_store_pblock(ex2, ext4_ext_pblock(&tmp_ex));
5701                ex1->ee_len = cpu_to_le16(e2_len);
5702                ex2->ee_len = cpu_to_le16(e1_len);
5703                if (unwritten)
5704                        ext4_ext_mark_unwritten(ex2);
5705                if (ext4_ext_is_unwritten(&tmp_ex))
5706                        ext4_ext_mark_unwritten(ex1);
5707
5708                ext4_ext_try_to_merge(handle, inode2, path2, ex2);
5709                ext4_ext_try_to_merge(handle, inode1, path1, ex1);
5710                *erp = ext4_ext_dirty(handle, inode2, path2 +
5711                                      path2->p_depth);
5712                if (unlikely(*erp))
5713                        goto finish;
5714                *erp = ext4_ext_dirty(handle, inode1, path1 +
5715                                      path1->p_depth);
5716                /*
5717                 * This looks scary: the second inode already points to the new
5718                 * blocks and was successfully dirtied. Luckily, an error here
5719                 * can only be a journal error, so the whole transaction will
5720                 * be aborted anyway.
5721                 */
5722                if (unlikely(*erp))
5723                        goto finish;
5724                lblk1 += len;
5725                lblk2 += len;
5726                replaced_count += len;
5727                count -= len;
5728
5729        repeat:
5730                ext4_ext_drop_refs(path1);
5731                kfree(path1);
5732                ext4_ext_drop_refs(path2);
5733                kfree(path2);
5734                path1 = path2 = NULL;
5735        }
5736        return replaced_count;
5737}
5738
5739/*
5740 * ext4_clu_mapped - determine whether any block in a logical cluster has
5741 *                   been mapped to a physical cluster
5742 *
5743 * @inode - file containing the logical cluster
5744 * @lclu - logical cluster of interest
5745 *
5746 * Returns 1 if any block in the logical cluster is mapped, signifying
5747 * that a physical cluster has been allocated for it.  Otherwise,
5748 * returns 0.  Can also return negative error codes.  Derived from
5749 * ext4_ext_map_blocks().
5750 */
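    /*
     * For instance (a sketch assuming a 4k block size and a bigalloc cluster
     * of 16 blocks): logical cluster 3 covers logical blocks 48..63, so this
     * returns 1 if any extent maps one of those blocks and 0 otherwise.
     */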
5751int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu)
5752{
5753        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
5754        struct ext4_ext_path *path;
5755        int depth, mapped = 0, err = 0;
5756        struct ext4_extent *extent;
5757        ext4_lblk_t first_lblk, first_lclu, last_lclu;
5758
5759        /* search for the extent closest to the first block in the cluster */
5760        path = ext4_find_extent(inode, EXT4_C2B(sbi, lclu), NULL, 0);
5761        if (IS_ERR(path)) {
5762                err = PTR_ERR(path);
5763                path = NULL;
5764                goto out;
5765        }
5766
5767        depth = ext_depth(inode);
5768
5769        /*
5770         * A consistent leaf must not be empty.  This situation is possible,
5771         * though, _during_ tree modification, and it's why an assert can't
5772         * be put in ext4_find_extent().
5773         */
5774        if (unlikely(path[depth].p_ext == NULL && depth != 0)) {
5775                EXT4_ERROR_INODE(inode,
5776                    "bad extent address - lblock: %lu, depth: %d, pblock: %lld",
5777                                 (unsigned long) EXT4_C2B(sbi, lclu),
5778                                 depth, path[depth].p_block);
5779                err = -EFSCORRUPTED;
5780                goto out;
5781        }
5782
5783        extent = path[depth].p_ext;
5784
5785        /* can't be mapped if the extent tree is empty */
5786        if (extent == NULL)
5787                goto out;
5788
5789        first_lblk = le32_to_cpu(extent->ee_block);
5790        first_lclu = EXT4_B2C(sbi, first_lblk);
5791
5792        /*
5793         * Three possible outcomes at this point - found extent spanning
5794         * the target cluster, to the left of the target cluster, or to the
5795         * right of the target cluster.  The first two cases are handled here.
5796         * The last case indicates the target cluster is not mapped.
5797         */
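            /*
             * In the left-of case the found extent ends before the target
             * cluster, but the next allocated extent may still begin within
             * that cluster, so its starting cluster is checked as well.
             */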
5798        if (lclu >= first_lclu) {
5799                last_lclu = EXT4_B2C(sbi, first_lblk +
5800                                     ext4_ext_get_actual_len(extent) - 1);
5801                if (lclu <= last_lclu) {
5802                        mapped = 1;
5803                } else {
5804                        first_lblk = ext4_ext_next_allocated_block(path);
5805                        first_lclu = EXT4_B2C(sbi, first_lblk);
5806                        if (lclu == first_lclu)
5807                                mapped = 1;
5808                }
5809        }
5810
5811out:
5812        ext4_ext_drop_refs(path);
5813        kfree(path);
5814
5815        return err ? err : mapped;
5816}
5817
5818/*
5819 * Update the physical block address and unwritten status of the extent
5820 * starting at logical block @start and spanning @len blocks. If such an
5821 * extent doesn't exist, this function splits the extent tree so that one
5822 * does.  This function is called in the fast commit replay path.
5823 * Returns 0 on success and an error on failure.
5824 */
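    /*
     * For example (a sketch): a replay update with start=100, len=4,
     * pblk=5000 first splits any covering extent so that an extent starting
     * exactly at block 100 with length 4 exists, then rewrites its physical
     * block and unwritten bit in place and dirties the modified path.
     */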
5825int ext4_ext_replay_update_ex(struct inode *inode, ext4_lblk_t start,
5826                              int len, int unwritten, ext4_fsblk_t pblk)
5827{
5828        struct ext4_ext_path *path = NULL, *ppath;
5829        struct ext4_extent *ex;
5830        int ret;
5831
5832        path = ext4_find_extent(inode, start, NULL, 0);
5833        if (IS_ERR(path))
5834                return PTR_ERR(path);
5835        ex = path[path->p_depth].p_ext;
5836        if (!ex) {
5837                ret = -EFSCORRUPTED;
5838                goto out;
5839        }
5840
5841        if (le32_to_cpu(ex->ee_block) != start ||
5842                ext4_ext_get_actual_len(ex) != len) {
5843                /* We need to split this extent to match our extent first */
5844                ppath = path;
5845                down_write(&EXT4_I(inode)->i_data_sem);
5846                ret = ext4_force_split_extent_at(NULL, inode, &ppath, start, 1);
5847                up_write(&EXT4_I(inode)->i_data_sem);
5848                if (ret)
5849                        goto out;
5850                kfree(path);
5851                path = ext4_find_extent(inode, start, NULL, 0);
5852                if (IS_ERR(path))
5853                        return -1;
5854                ppath = path;
5855                ex = path[path->p_depth].p_ext;
5856                WARN_ON(le32_to_cpu(ex->ee_block) != start);
5857                if (ext4_ext_get_actual_len(ex) != len) {
5858                        down_write(&EXT4_I(inode)->i_data_sem);
5859                        ret = ext4_force_split_extent_at(NULL, inode, &ppath,
5860                                                         start + len, 1);
5861                        up_write(&EXT4_I(inode)->i_data_sem);
5862                        if (ret)
5863                                goto out;
5864                        kfree(path);
5865                        path = ext4_find_extent(inode, start, NULL, 0);
5866                        if (IS_ERR(path))
5867                                return -EINVAL;
5868                        ex = path[path->p_depth].p_ext;
5869                }
5870        }
5871        if (unwritten)
5872                ext4_ext_mark_unwritten(ex);
5873        else
5874                ext4_ext_mark_initialized(ex);
5875        ext4_ext_store_pblock(ex, pblk);
5876        down_write(&EXT4_I(inode)->i_data_sem);
5877        ret = ext4_ext_dirty(NULL, inode, &path[path->p_depth]);
5878        up_write(&EXT4_I(inode)->i_data_sem);
5879out:
5880        ext4_ext_drop_refs(path);
5881        kfree(path);
5882        ext4_mark_inode_dirty(NULL, inode);
5883        return ret;
5884}
5885
5886/* Try to shrink the extent tree by merging the extents up to @end */
5887void ext4_ext_replay_shrink_inode(struct inode *inode, ext4_lblk_t end)
5888{
5889        struct ext4_ext_path *path = NULL;
5890        struct ext4_extent *ex;
5891        ext4_lblk_t old_cur, cur = 0;
5892
5893        while (cur < end) {
5894                path = ext4_find_extent(inode, cur, NULL, 0);
5895                if (IS_ERR(path))
5896                        return;
5897                ex = path[path->p_depth].p_ext;
5898                if (!ex) {
5899                        ext4_ext_drop_refs(path);
5900                        kfree(path);
5901                        ext4_mark_inode_dirty(NULL, inode);
5902                        return;
5903                }
5904                old_cur = cur;
5905                cur = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex);
5906                if (cur <= old_cur)
5907                        cur = old_cur + 1;
5908                ext4_ext_try_to_merge(NULL, inode, path, ex);
5909                down_write(&EXT4_I(inode)->i_data_sem);
5910                ext4_ext_dirty(NULL, inode, &path[path->p_depth]);
5911                up_write(&EXT4_I(inode)->i_data_sem);
5912                ext4_mark_inode_dirty(NULL, inode);
5913                ext4_ext_drop_refs(path);
5914                kfree(path);
5915        }
5916}
5917
5918/* Check if *cur is a hole and if it is, skip it */
5919static int skip_hole(struct inode *inode, ext4_lblk_t *cur)
5920{
5921        int ret;
5922        struct ext4_map_blocks map;
5923
5924        map.m_lblk = *cur;
5925        map.m_len = ((inode->i_size) >> inode->i_sb->s_blocksize_bits) - *cur;
5926
5927        ret = ext4_map_blocks(NULL, inode, &map, 0);
5928        if (ret < 0)
5929                return ret;
5930        if (ret != 0)
5931                return 0;
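            /*
             * A return of 0 from ext4_map_blocks() means *cur sits in a hole
             * and m_len has been set to the hole's length, so the whole hole
             * can be skipped in one step.
             */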
5932        *cur = *cur + map.m_len;
5933        return 0;
5934}
5935
5936/* Count number of blocks used by this inode and update i_blocks */
5937int ext4_ext_replay_set_iblocks(struct inode *inode)
5938{
5939        struct ext4_ext_path *path = NULL, *path2 = NULL;
5940        struct ext4_extent *ex;
5941        ext4_lblk_t cur = 0, end;
5942        int numblks = 0, i, ret = 0;
5943        ext4_fsblk_t cmp1, cmp2;
5944        struct ext4_map_blocks map;
5945
5946        /* Determine the size of the file first */
5947        path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL,
5948                                        EXT4_EX_NOCACHE);
5949        if (IS_ERR(path))
5950                return PTR_ERR(path);
5951        ex = path[path->p_depth].p_ext;
5952        if (!ex) {
5953                ext4_ext_drop_refs(path);
5954                kfree(path);
5955                goto out;
5956        }
5957        end = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex);
5958        ext4_ext_drop_refs(path);
5959        kfree(path);
5960
5961        /* Count the number of data blocks */
5962        cur = 0;
5963        while (cur < end) {
5964                map.m_lblk = cur;
5965                map.m_len = end - cur;
5966                ret = ext4_map_blocks(NULL, inode, &map, 0);
5967                if (ret < 0)
5968                        break;
5969                if (ret > 0)
5970                        numblks += ret;
5971                cur = cur + map.m_len;
5972        }
5973
5974        /*
5975         * Count the number of extent tree blocks. We do it by looking up
5976         * two successive extents and determining the difference between
5977         * their paths. When the paths differ for two successive extents,
5978         * we compare the blocks in the path at each level and increment
5979         * iblocks by the total number of differences found.
5980         */
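            /*
             * For example, if two successive extents live in different leaf
             * blocks under the same index block, only the leaf block number
             * differs between the two paths, so numblks grows by one for the
             * newly visited leaf.
             */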
5981        cur = 0;
5982        ret = skip_hole(inode, &cur);
5983        if (ret < 0)
5984                goto out;
5985        path = ext4_find_extent(inode, cur, NULL, 0);
5986        if (IS_ERR(path))
5987                goto out;
5988        numblks += path->p_depth;
5989        ext4_ext_drop_refs(path);
5990        kfree(path);
5991        while (cur < end) {
5992                path = ext4_find_extent(inode, cur, NULL, 0);
5993                if (IS_ERR(path))
5994                        break;
5995                ex = path[path->p_depth].p_ext;
5996                if (!ex) {
5997                        ext4_ext_drop_refs(path);
5998                        kfree(path);
5999                        return 0;
6000                }
6001                cur = max(cur + 1, le32_to_cpu(ex->ee_block) +
6002                                        ext4_ext_get_actual_len(ex));
6003                ret = skip_hole(inode, &cur);
6004                if (ret < 0) {
6005                        ext4_ext_drop_refs(path);
6006                        kfree(path);
6007                        break;
6008                }
6009                path2 = ext4_find_extent(inode, cur, NULL, 0);
6010                if (IS_ERR(path2)) {
6011                        ext4_ext_drop_refs(path);
6012                        kfree(path);
6013                        break;
6014                }
6015                for (i = 0; i <= max(path->p_depth, path2->p_depth); i++) {
6016                        cmp1 = cmp2 = 0;
6017                        if (i <= path->p_depth)
6018                                cmp1 = path[i].p_bh ?
6019                                        path[i].p_bh->b_blocknr : 0;
6020                        if (i <= path2->p_depth)
6021                                cmp2 = path2[i].p_bh ?
6022                                        path2[i].p_bh->b_blocknr : 0;
6023                        if (cmp1 != cmp2 && cmp2 != 0)
6024                                numblks++;
6025                }
6026                ext4_ext_drop_refs(path);
6027                ext4_ext_drop_refs(path2);
6028                kfree(path);
6029                kfree(path2);
6030        }
6031
6032out:
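            /* i_blocks is in units of 512-byte sectors, hence the shift */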
6033        inode->i_blocks = numblks << (inode->i_sb->s_blocksize_bits - 9);
6034        ext4_mark_inode_dirty(NULL, inode);
6035        return 0;
6036}
6037
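    /*
     * Walk the inode's mapped ranges and clear the corresponding block bitmap
     * bits, including the bits for the extent tree blocks along each path;
     * used by the fast commit replay path.
     */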
6038int ext4_ext_clear_bb(struct inode *inode)
6039{
6040        struct ext4_ext_path *path = NULL;
6041        struct ext4_extent *ex;
6042        ext4_lblk_t cur = 0, end;
6043        int j, ret = 0;
6044        struct ext4_map_blocks map;
6045
6046        /* Determine the size of the file first */
6047        path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL,
6048                                        EXT4_EX_NOCACHE);
6049        if (IS_ERR(path))
6050                return PTR_ERR(path);
6051        ex = path[path->p_depth].p_ext;
6052        if (!ex) {
6053                ext4_ext_drop_refs(path);
6054                kfree(path);
6055                return 0;
6056        }
6057        end = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex);
6058        ext4_ext_drop_refs(path);
6059        kfree(path);
6060
6061        cur = 0;
6062        while (cur < end) {
6063                map.m_lblk = cur;
6064                map.m_len = end - cur;
6065                ret = ext4_map_blocks(NULL, inode, &map, 0);
6066                if (ret < 0)
6067                        break;
6068                if (ret > 0) {
6069                        path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
6070                        if (!IS_ERR_OR_NULL(path)) {
6071                                for (j = 0; j < path->p_depth; j++) {
6072
6073                                        ext4_mb_mark_bb(inode->i_sb,
6074                                                        path[j].p_block, 1, 0);
6075                                }
6076                                ext4_ext_drop_refs(path);
6077                                kfree(path);
6078                        }
6079                        ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
6080                }
6081                cur = cur + map.m_len;
6082        }
6083
6084        return 0;
6085}
6086