linux/fs/ext4/extents.c
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
   4 * Written by Alex Tomas <alex@clusterfs.com>
   5 *
   6 * Architecture independence:
   7 *   Copyright (c) 2005, Bull S.A.
   8 *   Written by Pierre Peiffer <pierre.peiffer@bull.net>
   9 */
  10
  11/*
  12 * Extents support for EXT4
  13 *
  14 * TODO:
  15 *   - ext4*_error() should be used in some situations
  16 *   - analyze all BUG()/BUG_ON(), use -EIO where appropriate
  17 *   - smart tree reduction
  18 */
  19
  20#include <linux/fs.h>
  21#include <linux/time.h>
  22#include <linux/jbd2.h>
  23#include <linux/highuid.h>
  24#include <linux/pagemap.h>
  25#include <linux/quotaops.h>
  26#include <linux/string.h>
  27#include <linux/slab.h>
  28#include <linux/uaccess.h>
  29#include <linux/fiemap.h>
  30#include <linux/backing-dev.h>
  31#include "ext4_jbd2.h"
  32#include "ext4_extents.h"
  33#include "xattr.h"
  34
  35#include <trace/events/ext4.h>
  36
  37/*
   38 * Flags used by extent splitting.
  39 */
  40#define EXT4_EXT_MAY_ZEROOUT    0x1  /* safe to zeroout if split fails \
  41                                        due to ENOSPC */
  42#define EXT4_EXT_MARK_UNWRIT1   0x2  /* mark first half unwritten */
  43#define EXT4_EXT_MARK_UNWRIT2   0x4  /* mark second half unwritten */
  44
  45#define EXT4_EXT_DATA_VALID1    0x8  /* first half contains valid data */
  46#define EXT4_EXT_DATA_VALID2    0x10 /* second half contains valid data */
  47
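/*
 * Extent block checksum helpers: ext4_extent_block_csum() computes the
 * checksum over the header and entries, seeded with the inode's checksum
 * seed; the verify/set helpers below read or update the value stored in
 * the block's extent tail and are no-ops when metadata checksums are
 * disabled on the filesystem.
 */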
  48static __le32 ext4_extent_block_csum(struct inode *inode,
  49                                     struct ext4_extent_header *eh)
  50{
  51        struct ext4_inode_info *ei = EXT4_I(inode);
  52        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
  53        __u32 csum;
  54
  55        csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)eh,
  56                           EXT4_EXTENT_TAIL_OFFSET(eh));
  57        return cpu_to_le32(csum);
  58}
  59
  60static int ext4_extent_block_csum_verify(struct inode *inode,
  61                                         struct ext4_extent_header *eh)
  62{
  63        struct ext4_extent_tail *et;
  64
  65        if (!ext4_has_metadata_csum(inode->i_sb))
  66                return 1;
  67
  68        et = find_ext4_extent_tail(eh);
  69        if (et->et_checksum != ext4_extent_block_csum(inode, eh))
  70                return 0;
  71        return 1;
  72}
  73
  74static void ext4_extent_block_csum_set(struct inode *inode,
  75                                       struct ext4_extent_header *eh)
  76{
  77        struct ext4_extent_tail *et;
  78
  79        if (!ext4_has_metadata_csum(inode->i_sb))
  80                return;
  81
  82        et = find_ext4_extent_tail(eh);
  83        et->et_checksum = ext4_extent_block_csum(inode, eh);
  84}
  85
  86static int ext4_split_extent(handle_t *handle,
  87                                struct inode *inode,
  88                                struct ext4_ext_path **ppath,
  89                                struct ext4_map_blocks *map,
  90                                int split_flag,
  91                                int flags);
  92
  93static int ext4_split_extent_at(handle_t *handle,
  94                             struct inode *inode,
  95                             struct ext4_ext_path **ppath,
  96                             ext4_lblk_t split,
  97                             int split_flag,
  98                             int flags);
  99
 100static int ext4_find_delayed_extent(struct inode *inode,
 101                                    struct extent_status *newes);
 102
 103static int ext4_ext_trunc_restart_fn(struct inode *inode, int *dropped)
 104{
 105        /*
 106         * Drop i_data_sem to avoid deadlock with ext4_map_blocks.  At this
 107         * moment, get_block can be called only for blocks inside i_size since
 108         * page cache has been already dropped and writes are blocked by
 109         * i_mutex. So we can safely drop the i_data_sem here.
 110         */
 111        BUG_ON(EXT4_JOURNAL(inode) == NULL);
 112        ext4_discard_preallocations(inode);
 113        up_write(&EXT4_I(inode)->i_data_sem);
 114        *dropped = 1;
 115        return 0;
 116}
 117
 118/*
 119 * Make sure 'handle' has at least 'check_cred' credits. If not, restart
 120 * transaction with 'restart_cred' credits. The function drops i_data_sem
 121 * when restarting transaction and gets it after transaction is restarted.
 122 *
 123 * The function returns 0 on success, 1 if transaction had to be restarted,
 124 * and < 0 in case of fatal error.
 125 */
 126int ext4_datasem_ensure_credits(handle_t *handle, struct inode *inode,
 127                                int check_cred, int restart_cred,
 128                                int revoke_cred)
 129{
 130        int ret;
 131        int dropped = 0;
 132
 133        ret = ext4_journal_ensure_credits_fn(handle, check_cred, restart_cred,
 134                revoke_cred, ext4_ext_trunc_restart_fn(inode, &dropped));
 135        if (dropped)
 136                down_write(&EXT4_I(inode)->i_data_sem);
 137        return ret;
 138}
 139
 140/*
 141 * could return:
 142 *  - EROFS
 143 *  - ENOMEM
 144 */
 145static int ext4_ext_get_access(handle_t *handle, struct inode *inode,
 146                                struct ext4_ext_path *path)
 147{
 148        if (path->p_bh) {
 149                /* path points to block */
 150                BUFFER_TRACE(path->p_bh, "get_write_access");
 151                return ext4_journal_get_write_access(handle, path->p_bh);
 152        }
 153        /* path points to leaf/index in inode body */
 154        /* we use in-core data, no need to protect them */
 155        return 0;
 156}
 157
 158/*
 159 * could return:
 160 *  - EROFS
 161 *  - ENOMEM
 162 *  - EIO
 163 */
 164static int __ext4_ext_dirty(const char *where, unsigned int line,
 165                            handle_t *handle, struct inode *inode,
 166                            struct ext4_ext_path *path)
 167{
 168        int err;
 169
 170        WARN_ON(!rwsem_is_locked(&EXT4_I(inode)->i_data_sem));
 171        if (path->p_bh) {
 172                ext4_extent_block_csum_set(inode, ext_block_hdr(path->p_bh));
 173                /* path points to block */
 174                err = __ext4_handle_dirty_metadata(where, line, handle,
 175                                                   inode, path->p_bh);
 176        } else {
 177                /* path points to leaf/index in inode body */
 178                err = ext4_mark_inode_dirty(handle, inode);
 179        }
 180        return err;
 181}
 182
 183#define ext4_ext_dirty(handle, inode, path) \
 184                __ext4_ext_dirty(__func__, __LINE__, (handle), (inode), (path))
 185
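/*
 * ext4_ext_find_goal:
 * return a preferred physical block (allocation goal) for @block, derived
 * from the extent found in @path, from the index block holding it, or,
 * failing both, from the inode's block group.
 */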
 186static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
 187                              struct ext4_ext_path *path,
 188                              ext4_lblk_t block)
 189{
 190        if (path) {
 191                int depth = path->p_depth;
 192                struct ext4_extent *ex;
 193
 194                /*
 195                 * Try to predict block placement assuming that we are
 196                 * filling in a file which will eventually be
 197                 * non-sparse --- i.e., in the case of libbfd writing
  198                 * an ELF object's sections out-of-order but in a way
  199                 * that eventually results in a contiguous object or
 200                 * executable file, or some database extending a table
 201                 * space file.  However, this is actually somewhat
 202                 * non-ideal if we are writing a sparse file such as
 203                 * qemu or KVM writing a raw image file that is going
 204                 * to stay fairly sparse, since it will end up
 205                 * fragmenting the file system's free space.  Maybe we
  206                 * should have some heuristics or some way to allow
  207                 * userspace to pass a hint to the file system,
 208                 * especially if the latter case turns out to be
 209                 * common.
 210                 */
 211                ex = path[depth].p_ext;
 212                if (ex) {
 213                        ext4_fsblk_t ext_pblk = ext4_ext_pblock(ex);
 214                        ext4_lblk_t ext_block = le32_to_cpu(ex->ee_block);
 215
 216                        if (block > ext_block)
 217                                return ext_pblk + (block - ext_block);
 218                        else
 219                                return ext_pblk - (ext_block - block);
 220                }
 221
  222                /* it looks like the index is empty;
  223                 * try to find the starting block from the index itself */
 224                if (path[depth].p_bh)
 225                        return path[depth].p_bh->b_blocknr;
 226        }
 227
 228        /* OK. use inode's group */
 229        return ext4_inode_to_goal_block(inode);
 230}
 231
 232/*
  233 * Allocation for a metadata block
 234 */
 235static ext4_fsblk_t
 236ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
 237                        struct ext4_ext_path *path,
 238                        struct ext4_extent *ex, int *err, unsigned int flags)
 239{
 240        ext4_fsblk_t goal, newblock;
 241
 242        goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
 243        newblock = ext4_new_meta_blocks(handle, inode, goal, flags,
 244                                        NULL, err);
 245        return newblock;
 246}
 247
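/*
 * The ext4_ext_space_*() helpers below return how many extent or index
 * entries fit in an on-disk extent tree block or in the inode body (the
 * tree root).  With AGGRESSIVE_TEST defined the limits are capped at a
 * handful of entries, presumably so that tree growth can be exercised
 * with small files.
 */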
 248static inline int ext4_ext_space_block(struct inode *inode, int check)
 249{
 250        int size;
 251
 252        size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
 253                        / sizeof(struct ext4_extent);
 254#ifdef AGGRESSIVE_TEST
 255        if (!check && size > 6)
 256                size = 6;
 257#endif
 258        return size;
 259}
 260
 261static inline int ext4_ext_space_block_idx(struct inode *inode, int check)
 262{
 263        int size;
 264
 265        size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
 266                        / sizeof(struct ext4_extent_idx);
 267#ifdef AGGRESSIVE_TEST
 268        if (!check && size > 5)
 269                size = 5;
 270#endif
 271        return size;
 272}
 273
 274static inline int ext4_ext_space_root(struct inode *inode, int check)
 275{
 276        int size;
 277
 278        size = sizeof(EXT4_I(inode)->i_data);
 279        size -= sizeof(struct ext4_extent_header);
 280        size /= sizeof(struct ext4_extent);
 281#ifdef AGGRESSIVE_TEST
 282        if (!check && size > 3)
 283                size = 3;
 284#endif
 285        return size;
 286}
 287
 288static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
 289{
 290        int size;
 291
 292        size = sizeof(EXT4_I(inode)->i_data);
 293        size -= sizeof(struct ext4_extent_header);
 294        size /= sizeof(struct ext4_extent_idx);
 295#ifdef AGGRESSIVE_TEST
 296        if (!check && size > 4)
 297                size = 4;
 298#endif
 299        return size;
 300}
 301
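/*
 * Split the extent covering @lblk at @lblk, keeping both halves unwritten
 * if the original extent was unwritten.  When @nofail is set,
 * EXT4_GET_BLOCKS_METADATA_NOFAIL is passed so that the metadata
 * allocation needed for the split must not fail.
 */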
 302static inline int
 303ext4_force_split_extent_at(handle_t *handle, struct inode *inode,
 304                           struct ext4_ext_path **ppath, ext4_lblk_t lblk,
 305                           int nofail)
 306{
 307        struct ext4_ext_path *path = *ppath;
 308        int unwritten = ext4_ext_is_unwritten(path[path->p_depth].p_ext);
 309
 310        return ext4_split_extent_at(handle, inode, ppath, lblk, unwritten ?
 311                        EXT4_EXT_MARK_UNWRIT1|EXT4_EXT_MARK_UNWRIT2 : 0,
 312                        EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_PRE_IO |
 313                        (nofail ? EXT4_GET_BLOCKS_METADATA_NOFAIL:0));
 314}
 315
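/*
 * Return the maximum number of entries permitted in the tree node at
 * @depth, distinguishing the root stored in the inode body from on-disk
 * blocks.
 */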
 316static int
 317ext4_ext_max_entries(struct inode *inode, int depth)
 318{
 319        int max;
 320
 321        if (depth == ext_depth(inode)) {
 322                if (depth == 0)
 323                        max = ext4_ext_space_root(inode, 1);
 324                else
 325                        max = ext4_ext_space_root_idx(inode, 1);
 326        } else {
 327                if (depth == 0)
 328                        max = ext4_ext_space_block(inode, 1);
 329                else
 330                        max = ext4_ext_space_block_idx(inode, 1);
 331        }
 332
 333        return max;
 334}
 335
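/*
 * Sanity checks for a single leaf extent (non-zero length, no logical
 * block wrap-around, physical range valid on this filesystem) and for a
 * single index entry (physical block valid on this filesystem).
 */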
 336static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
 337{
 338        ext4_fsblk_t block = ext4_ext_pblock(ext);
 339        int len = ext4_ext_get_actual_len(ext);
 340        ext4_lblk_t lblock = le32_to_cpu(ext->ee_block);
 341
 342        /*
 343         * We allow neither:
 344         *  - zero length
 345         *  - overflow/wrap-around
 346         */
 347        if (lblock + len <= lblock)
 348                return 0;
 349        return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len);
 350}
 351
 352static int ext4_valid_extent_idx(struct inode *inode,
 353                                struct ext4_extent_idx *ext_idx)
 354{
 355        ext4_fsblk_t block = ext4_idx_pblock(ext_idx);
 356
 357        return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, 1);
 358}
 359
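/*
 * Validate every entry of an extent tree node: each extent or index must
 * pass the single-entry checks above, and leaf extents must not overlap
 * the extent that precedes them.
 */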
 360static int ext4_valid_extent_entries(struct inode *inode,
 361                                struct ext4_extent_header *eh,
 362                                int depth)
 363{
 364        unsigned short entries;
 365        if (eh->eh_entries == 0)
 366                return 1;
 367
 368        entries = le16_to_cpu(eh->eh_entries);
 369
 370        if (depth == 0) {
 371                /* leaf entries */
 372                struct ext4_extent *ext = EXT_FIRST_EXTENT(eh);
 373                struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
 374                ext4_fsblk_t pblock = 0;
 375                ext4_lblk_t lblock = 0;
 376                ext4_lblk_t prev = 0;
 377                int len = 0;
 378                while (entries) {
 379                        if (!ext4_valid_extent(inode, ext))
 380                                return 0;
 381
 382                        /* Check for overlapping extents */
 383                        lblock = le32_to_cpu(ext->ee_block);
 384                        len = ext4_ext_get_actual_len(ext);
 385                        if ((lblock <= prev) && prev) {
 386                                pblock = ext4_ext_pblock(ext);
 387                                es->s_last_error_block = cpu_to_le64(pblock);
 388                                return 0;
 389                        }
 390                        ext++;
 391                        entries--;
 392                        prev = lblock + len - 1;
 393                }
 394        } else {
 395                struct ext4_extent_idx *ext_idx = EXT_FIRST_INDEX(eh);
 396                while (entries) {
 397                        if (!ext4_valid_extent_idx(inode, ext_idx))
 398                                return 0;
 399                        ext_idx++;
 400                        entries--;
 401                }
 402        }
 403        return 1;
 404}
 405
 406static int __ext4_ext_check(const char *function, unsigned int line,
 407                            struct inode *inode, struct ext4_extent_header *eh,
 408                            int depth, ext4_fsblk_t pblk)
 409{
 410        const char *error_msg;
 411        int max = 0, err = -EFSCORRUPTED;
 412
 413        if (unlikely(eh->eh_magic != EXT4_EXT_MAGIC)) {
 414                error_msg = "invalid magic";
 415                goto corrupted;
 416        }
 417        if (unlikely(le16_to_cpu(eh->eh_depth) != depth)) {
 418                error_msg = "unexpected eh_depth";
 419                goto corrupted;
 420        }
 421        if (unlikely(eh->eh_max == 0)) {
 422                error_msg = "invalid eh_max";
 423                goto corrupted;
 424        }
 425        max = ext4_ext_max_entries(inode, depth);
 426        if (unlikely(le16_to_cpu(eh->eh_max) > max)) {
 427                error_msg = "too large eh_max";
 428                goto corrupted;
 429        }
 430        if (unlikely(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max))) {
 431                error_msg = "invalid eh_entries";
 432                goto corrupted;
 433        }
 434        if (!ext4_valid_extent_entries(inode, eh, depth)) {
 435                error_msg = "invalid extent entries";
 436                goto corrupted;
 437        }
 438        if (unlikely(depth > 32)) {
 439                error_msg = "too large eh_depth";
 440                goto corrupted;
 441        }
 442        /* Verify checksum on non-root extent tree nodes */
 443        if (ext_depth(inode) != depth &&
 444            !ext4_extent_block_csum_verify(inode, eh)) {
 445                error_msg = "extent tree corrupted";
 446                err = -EFSBADCRC;
 447                goto corrupted;
 448        }
 449        return 0;
 450
 451corrupted:
 452        ext4_set_errno(inode->i_sb, -err);
 453        ext4_error_inode(inode, function, line, 0,
 454                         "pblk %llu bad header/extent: %s - magic %x, "
 455                         "entries %u, max %u(%u), depth %u(%u)",
 456                         (unsigned long long) pblk, error_msg,
 457                         le16_to_cpu(eh->eh_magic),
 458                         le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
 459                         max, le16_to_cpu(eh->eh_depth), depth);
 460        return err;
 461}
 462
 463#define ext4_ext_check(inode, eh, depth, pblk)                  \
 464        __ext4_ext_check(__func__, __LINE__, (inode), (eh), (depth), (pblk))
 465
 466int ext4_ext_check_inode(struct inode *inode)
 467{
 468        return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode), 0);
 469}
 470
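/*
 * Populate the extent status tree from a leaf node: each extent is cached
 * as written or unwritten, and the gaps between extents are cached as
 * holes.
 */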
 471static void ext4_cache_extents(struct inode *inode,
 472                               struct ext4_extent_header *eh)
 473{
 474        struct ext4_extent *ex = EXT_FIRST_EXTENT(eh);
 475        ext4_lblk_t prev = 0;
 476        int i;
 477
 478        for (i = le16_to_cpu(eh->eh_entries); i > 0; i--, ex++) {
 479                unsigned int status = EXTENT_STATUS_WRITTEN;
 480                ext4_lblk_t lblk = le32_to_cpu(ex->ee_block);
 481                int len = ext4_ext_get_actual_len(ex);
 482
 483                if (prev && (prev != lblk))
 484                        ext4_es_cache_extent(inode, prev, lblk - prev, ~0,
 485                                             EXTENT_STATUS_HOLE);
 486
 487                if (ext4_ext_is_unwritten(ex))
 488                        status = EXTENT_STATUS_UNWRITTEN;
 489                ext4_es_cache_extent(inode, lblk, len,
 490                                     ext4_ext_pblock(ex), status);
 491                prev = lblk + len;
 492        }
 493}
 494
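/*
 * Read the extent tree block at @pblk and verify its header and checksum
 * (the check is skipped for the journal inode).  Leaf entries are cached
 * in the extent status tree unless EXT4_EX_NOCACHE is set.
 */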
 495static struct buffer_head *
 496__read_extent_tree_block(const char *function, unsigned int line,
 497                         struct inode *inode, ext4_fsblk_t pblk, int depth,
 498                         int flags)
 499{
 500        struct buffer_head              *bh;
 501        int                             err;
 502
 503        bh = sb_getblk_gfp(inode->i_sb, pblk, __GFP_MOVABLE | GFP_NOFS);
 504        if (unlikely(!bh))
 505                return ERR_PTR(-ENOMEM);
 506
 507        if (!bh_uptodate_or_lock(bh)) {
 508                trace_ext4_ext_load_extent(inode, pblk, _RET_IP_);
 509                err = bh_submit_read(bh);
 510                if (err < 0)
 511                        goto errout;
 512        }
 513        if (buffer_verified(bh) && !(flags & EXT4_EX_FORCE_CACHE))
 514                return bh;
 515        if (!ext4_has_feature_journal(inode->i_sb) ||
 516            (inode->i_ino !=
 517             le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum))) {
 518                err = __ext4_ext_check(function, line, inode,
 519                                       ext_block_hdr(bh), depth, pblk);
 520                if (err)
 521                        goto errout;
 522        }
 523        set_buffer_verified(bh);
 524        /*
 525         * If this is a leaf block, cache all of its entries
 526         */
 527        if (!(flags & EXT4_EX_NOCACHE) && depth == 0) {
 528                struct ext4_extent_header *eh = ext_block_hdr(bh);
 529                ext4_cache_extents(inode, eh);
 530        }
 531        return bh;
 532errout:
 533        put_bh(bh);
 534        return ERR_PTR(err);
 535
 536}
 537
 538#define read_extent_tree_block(inode, pblk, depth, flags)               \
 539        __read_extent_tree_block(__func__, __LINE__, (inode), (pblk),   \
 540                                 (depth), (flags))
 541
 542/*
 543 * This function is called to cache a file's extent information in the
 544 * extent status tree
 545 */
 546int ext4_ext_precache(struct inode *inode)
 547{
 548        struct ext4_inode_info *ei = EXT4_I(inode);
 549        struct ext4_ext_path *path = NULL;
 550        struct buffer_head *bh;
 551        int i = 0, depth, ret = 0;
 552
 553        if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
 554                return 0;       /* not an extent-mapped inode */
 555
 556        down_read(&ei->i_data_sem);
 557        depth = ext_depth(inode);
 558
 559        path = kcalloc(depth + 1, sizeof(struct ext4_ext_path),
 560                       GFP_NOFS);
 561        if (path == NULL) {
 562                up_read(&ei->i_data_sem);
 563                return -ENOMEM;
 564        }
 565
 566        /* Don't cache anything if there are no external extent blocks */
 567        if (depth == 0)
 568                goto out;
 569        path[0].p_hdr = ext_inode_hdr(inode);
 570        ret = ext4_ext_check(inode, path[0].p_hdr, depth, 0);
 571        if (ret)
 572                goto out;
 573        path[0].p_idx = EXT_FIRST_INDEX(path[0].p_hdr);
 574        while (i >= 0) {
 575                /*
 576                 * If this is a leaf block or we've reached the end of
 577                 * the index block, go up
 578                 */
 579                if ((i == depth) ||
 580                    path[i].p_idx > EXT_LAST_INDEX(path[i].p_hdr)) {
 581                        brelse(path[i].p_bh);
 582                        path[i].p_bh = NULL;
 583                        i--;
 584                        continue;
 585                }
 586                bh = read_extent_tree_block(inode,
 587                                            ext4_idx_pblock(path[i].p_idx++),
 588                                            depth - i - 1,
 589                                            EXT4_EX_FORCE_CACHE);
 590                if (IS_ERR(bh)) {
 591                        ret = PTR_ERR(bh);
 592                        break;
 593                }
 594                i++;
 595                path[i].p_bh = bh;
 596                path[i].p_hdr = ext_block_hdr(bh);
 597                path[i].p_idx = EXT_FIRST_INDEX(path[i].p_hdr);
 598        }
 599        ext4_set_inode_state(inode, EXT4_STATE_EXT_PRECACHED);
 600out:
 601        up_read(&ei->i_data_sem);
 602        ext4_ext_drop_refs(path);
 603        kfree(path);
 604        return ret;
 605}
 606
 607#ifdef EXT_DEBUG
 608static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
 609{
 610        int k, l = path->p_depth;
 611
 612        ext_debug("path:");
 613        for (k = 0; k <= l; k++, path++) {
 614                if (path->p_idx) {
 615                        ext_debug("  %d->%llu",
 616                                  le32_to_cpu(path->p_idx->ei_block),
 617                                  ext4_idx_pblock(path->p_idx));
 618                } else if (path->p_ext) {
 619                        ext_debug("  %d:[%d]%d:%llu ",
 620                                  le32_to_cpu(path->p_ext->ee_block),
 621                                  ext4_ext_is_unwritten(path->p_ext),
 622                                  ext4_ext_get_actual_len(path->p_ext),
 623                                  ext4_ext_pblock(path->p_ext));
 624                } else
 625                        ext_debug("  []");
 626        }
 627        ext_debug("\n");
 628}
 629
 630static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
 631{
 632        int depth = ext_depth(inode);
 633        struct ext4_extent_header *eh;
 634        struct ext4_extent *ex;
 635        int i;
 636
 637        if (!path)
 638                return;
 639
 640        eh = path[depth].p_hdr;
 641        ex = EXT_FIRST_EXTENT(eh);
 642
 643        ext_debug("Displaying leaf extents for inode %lu\n", inode->i_ino);
 644
 645        for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) {
 646                ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block),
 647                          ext4_ext_is_unwritten(ex),
 648                          ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex));
 649        }
 650        ext_debug("\n");
 651}
 652
 653static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path,
 654                        ext4_fsblk_t newblock, int level)
 655{
 656        int depth = ext_depth(inode);
 657        struct ext4_extent *ex;
 658
 659        if (depth != level) {
 660                struct ext4_extent_idx *idx;
 661                idx = path[level].p_idx;
 662                while (idx <= EXT_MAX_INDEX(path[level].p_hdr)) {
 663                        ext_debug("%d: move %d:%llu in new index %llu\n", level,
 664                                        le32_to_cpu(idx->ei_block),
 665                                        ext4_idx_pblock(idx),
 666                                        newblock);
 667                        idx++;
 668                }
 669
 670                return;
 671        }
 672
 673        ex = path[depth].p_ext;
 674        while (ex <= EXT_MAX_EXTENT(path[depth].p_hdr)) {
 675                ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n",
 676                                le32_to_cpu(ex->ee_block),
 677                                ext4_ext_pblock(ex),
 678                                ext4_ext_is_unwritten(ex),
 679                                ext4_ext_get_actual_len(ex),
 680                                newblock);
 681                ex++;
 682        }
 683}
 684
 685#else
 686#define ext4_ext_show_path(inode, path)
 687#define ext4_ext_show_leaf(inode, path)
 688#define ext4_ext_show_move(inode, path, newblock, level)
 689#endif
 690
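/*
 * Release the buffer_head reference held at each level of @path.
 */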
 691void ext4_ext_drop_refs(struct ext4_ext_path *path)
 692{
 693        int depth, i;
 694
 695        if (!path)
 696                return;
 697        depth = path->p_depth;
 698        for (i = 0; i <= depth; i++, path++) {
 699                if (path->p_bh) {
 700                        brelse(path->p_bh);
 701                        path->p_bh = NULL;
 702                }
 703        }
 704}
 705
 706/*
 707 * ext4_ext_binsearch_idx:
 708 * binary search for the closest index of the given block
 709 * the header must be checked before calling this
 710 */
 711static void
 712ext4_ext_binsearch_idx(struct inode *inode,
 713                        struct ext4_ext_path *path, ext4_lblk_t block)
 714{
 715        struct ext4_extent_header *eh = path->p_hdr;
 716        struct ext4_extent_idx *r, *l, *m;
 717
 718
 719        ext_debug("binsearch for %u(idx):  ", block);
 720
 721        l = EXT_FIRST_INDEX(eh) + 1;
 722        r = EXT_LAST_INDEX(eh);
 723        while (l <= r) {
 724                m = l + (r - l) / 2;
 725                if (block < le32_to_cpu(m->ei_block))
 726                        r = m - 1;
 727                else
 728                        l = m + 1;
 729                ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ei_block),
 730                                m, le32_to_cpu(m->ei_block),
 731                                r, le32_to_cpu(r->ei_block));
 732        }
 733
 734        path->p_idx = l - 1;
 735        ext_debug("  -> %u->%lld ", le32_to_cpu(path->p_idx->ei_block),
 736                  ext4_idx_pblock(path->p_idx));
 737
 738#ifdef CHECK_BINSEARCH
 739        {
 740                struct ext4_extent_idx *chix, *ix;
 741                int k;
 742
 743                chix = ix = EXT_FIRST_INDEX(eh);
 744                for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) {
 745                        if (k != 0 && le32_to_cpu(ix->ei_block) <=
 746                            le32_to_cpu(ix[-1].ei_block)) {
 747                                printk(KERN_DEBUG "k=%d, ix=0x%p, "
 748                                       "first=0x%p\n", k,
 749                                       ix, EXT_FIRST_INDEX(eh));
 750                                printk(KERN_DEBUG "%u <= %u\n",
 751                                       le32_to_cpu(ix->ei_block),
 752                                       le32_to_cpu(ix[-1].ei_block));
 753                        }
 754                        BUG_ON(k && le32_to_cpu(ix->ei_block)
 755                                           <= le32_to_cpu(ix[-1].ei_block));
 756                        if (block < le32_to_cpu(ix->ei_block))
 757                                break;
 758                        chix = ix;
 759                }
 760                BUG_ON(chix != path->p_idx);
 761        }
 762#endif
 763
 764}
 765
 766/*
 767 * ext4_ext_binsearch:
 768 * binary search for closest extent of the given block
 769 * the header must be checked before calling this
 770 */
 771static void
 772ext4_ext_binsearch(struct inode *inode,
 773                struct ext4_ext_path *path, ext4_lblk_t block)
 774{
 775        struct ext4_extent_header *eh = path->p_hdr;
 776        struct ext4_extent *r, *l, *m;
 777
 778        if (eh->eh_entries == 0) {
 779                /*
 780                 * this leaf is empty:
 781                 * we get such a leaf in split/add case
 782                 */
 783                return;
 784        }
 785
 786        ext_debug("binsearch for %u:  ", block);
 787
 788        l = EXT_FIRST_EXTENT(eh) + 1;
 789        r = EXT_LAST_EXTENT(eh);
 790
 791        while (l <= r) {
 792                m = l + (r - l) / 2;
 793                if (block < le32_to_cpu(m->ee_block))
 794                        r = m - 1;
 795                else
 796                        l = m + 1;
 797                ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ee_block),
 798                                m, le32_to_cpu(m->ee_block),
 799                                r, le32_to_cpu(r->ee_block));
 800        }
 801
 802        path->p_ext = l - 1;
 803        ext_debug("  -> %d:%llu:[%d]%d ",
 804                        le32_to_cpu(path->p_ext->ee_block),
 805                        ext4_ext_pblock(path->p_ext),
 806                        ext4_ext_is_unwritten(path->p_ext),
 807                        ext4_ext_get_actual_len(path->p_ext));
 808
 809#ifdef CHECK_BINSEARCH
 810        {
 811                struct ext4_extent *chex, *ex;
 812                int k;
 813
 814                chex = ex = EXT_FIRST_EXTENT(eh);
 815                for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ex++) {
 816                        BUG_ON(k && le32_to_cpu(ex->ee_block)
 817                                          <= le32_to_cpu(ex[-1].ee_block));
 818                        if (block < le32_to_cpu(ex->ee_block))
 819                                break;
 820                        chex = ex;
 821                }
 822                BUG_ON(chex != path->p_ext);
 823        }
 824#endif
 825
 826}
 827
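/*
 * Initialize an empty extent tree root in the inode body and mark the
 * inode dirty.
 */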
 828int ext4_ext_tree_init(handle_t *handle, struct inode *inode)
 829{
 830        struct ext4_extent_header *eh;
 831
 832        eh = ext_inode_hdr(inode);
 833        eh->eh_depth = 0;
 834        eh->eh_entries = 0;
 835        eh->eh_magic = EXT4_EXT_MAGIC;
 836        eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0));
 837        ext4_mark_inode_dirty(handle, inode);
 838        return 0;
 839}
 840
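/*
 * ext4_find_extent:
 * walk the extent tree from the root down to the leaf covering @block,
 * recording the header and the matching index/extent entry at each level.
 * A path passed in via @orig_path is reused if it is deep enough,
 * otherwise it is freed and a new one is allocated.
 */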
 841struct ext4_ext_path *
 842ext4_find_extent(struct inode *inode, ext4_lblk_t block,
 843                 struct ext4_ext_path **orig_path, int flags)
 844{
 845        struct ext4_extent_header *eh;
 846        struct buffer_head *bh;
 847        struct ext4_ext_path *path = orig_path ? *orig_path : NULL;
 848        short int depth, i, ppos = 0;
 849        int ret;
 850
 851        eh = ext_inode_hdr(inode);
 852        depth = ext_depth(inode);
 853        if (depth < 0 || depth > EXT4_MAX_EXTENT_DEPTH) {
 854                EXT4_ERROR_INODE(inode, "inode has invalid extent depth: %d",
 855                                 depth);
 856                ret = -EFSCORRUPTED;
 857                goto err;
 858        }
 859
 860        if (path) {
 861                ext4_ext_drop_refs(path);
 862                if (depth > path[0].p_maxdepth) {
 863                        kfree(path);
 864                        *orig_path = path = NULL;
 865                }
 866        }
 867        if (!path) {
 868                /* account possible depth increase */
 869                path = kcalloc(depth + 2, sizeof(struct ext4_ext_path),
 870                                GFP_NOFS);
 871                if (unlikely(!path))
 872                        return ERR_PTR(-ENOMEM);
 873                path[0].p_maxdepth = depth + 1;
 874        }
 875        path[0].p_hdr = eh;
 876        path[0].p_bh = NULL;
 877
 878        i = depth;
 879        if (!(flags & EXT4_EX_NOCACHE) && depth == 0)
 880                ext4_cache_extents(inode, eh);
 881        /* walk through the tree */
 882        while (i) {
 883                ext_debug("depth %d: num %d, max %d\n",
 884                          ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
 885
 886                ext4_ext_binsearch_idx(inode, path + ppos, block);
 887                path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx);
 888                path[ppos].p_depth = i;
 889                path[ppos].p_ext = NULL;
 890
 891                bh = read_extent_tree_block(inode, path[ppos].p_block, --i,
 892                                            flags);
 893                if (IS_ERR(bh)) {
 894                        ret = PTR_ERR(bh);
 895                        goto err;
 896                }
 897
 898                eh = ext_block_hdr(bh);
 899                ppos++;
 900                path[ppos].p_bh = bh;
 901                path[ppos].p_hdr = eh;
 902        }
 903
 904        path[ppos].p_depth = i;
 905        path[ppos].p_ext = NULL;
 906        path[ppos].p_idx = NULL;
 907
 908        /* find extent */
 909        ext4_ext_binsearch(inode, path + ppos, block);
 910        /* if not an empty leaf */
 911        if (path[ppos].p_ext)
 912                path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext);
 913
 914        ext4_ext_show_path(inode, path);
 915
 916        return path;
 917
 918err:
 919        ext4_ext_drop_refs(path);
 920        kfree(path);
 921        if (orig_path)
 922                *orig_path = NULL;
 923        return ERR_PTR(ret);
 924}
 925
 926/*
 927 * ext4_ext_insert_index:
 928 * insert new index [@logical;@ptr] into the block at @curp;
 929 * check where to insert: before @curp or after @curp
 930 */
 931static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
 932                                 struct ext4_ext_path *curp,
 933                                 int logical, ext4_fsblk_t ptr)
 934{
 935        struct ext4_extent_idx *ix;
 936        int len, err;
 937
 938        err = ext4_ext_get_access(handle, inode, curp);
 939        if (err)
 940                return err;
 941
 942        if (unlikely(logical == le32_to_cpu(curp->p_idx->ei_block))) {
 943                EXT4_ERROR_INODE(inode,
 944                                 "logical %d == ei_block %d!",
 945                                 logical, le32_to_cpu(curp->p_idx->ei_block));
 946                return -EFSCORRUPTED;
 947        }
 948
 949        if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries)
 950                             >= le16_to_cpu(curp->p_hdr->eh_max))) {
 951                EXT4_ERROR_INODE(inode,
 952                                 "eh_entries %d >= eh_max %d!",
 953                                 le16_to_cpu(curp->p_hdr->eh_entries),
 954                                 le16_to_cpu(curp->p_hdr->eh_max));
 955                return -EFSCORRUPTED;
 956        }
 957
 958        if (logical > le32_to_cpu(curp->p_idx->ei_block)) {
 959                /* insert after */
 960                ext_debug("insert new index %d after: %llu\n", logical, ptr);
 961                ix = curp->p_idx + 1;
 962        } else {
 963                /* insert before */
 964                ext_debug("insert new index %d before: %llu\n", logical, ptr);
 965                ix = curp->p_idx;
 966        }
 967
 968        len = EXT_LAST_INDEX(curp->p_hdr) - ix + 1;
 969        BUG_ON(len < 0);
 970        if (len > 0) {
 971                ext_debug("insert new index %d: "
 972                                "move %d indices from 0x%p to 0x%p\n",
 973                                logical, len, ix, ix + 1);
 974                memmove(ix + 1, ix, len * sizeof(struct ext4_extent_idx));
 975        }
 976
 977        if (unlikely(ix > EXT_MAX_INDEX(curp->p_hdr))) {
 978                EXT4_ERROR_INODE(inode, "ix > EXT_MAX_INDEX!");
 979                return -EFSCORRUPTED;
 980        }
 981
 982        ix->ei_block = cpu_to_le32(logical);
 983        ext4_idx_store_pblock(ix, ptr);
 984        le16_add_cpu(&curp->p_hdr->eh_entries, 1);
 985
 986        if (unlikely(ix > EXT_LAST_INDEX(curp->p_hdr))) {
 987                EXT4_ERROR_INODE(inode, "ix > EXT_LAST_INDEX!");
 988                return -EFSCORRUPTED;
 989        }
 990
 991        err = ext4_ext_dirty(handle, inode, curp);
 992        ext4_std_error(inode->i_sb, err);
 993
 994        return err;
 995}
 996
 997/*
 998 * ext4_ext_split:
 999 * inserts new subtree into the path, using free index entry
1000 * at depth @at:
1001 * - allocates all needed blocks (new leaf and all intermediate index blocks)
1002 * - makes decision where to split
1003 * - moves remaining extents and index entries (right to the split point)
1004 *   into the newly allocated blocks
1005 * - initializes subtree
1006 */
1007static int ext4_ext_split(handle_t *handle, struct inode *inode,
1008                          unsigned int flags,
1009                          struct ext4_ext_path *path,
1010                          struct ext4_extent *newext, int at)
1011{
1012        struct buffer_head *bh = NULL;
1013        int depth = ext_depth(inode);
1014        struct ext4_extent_header *neh;
1015        struct ext4_extent_idx *fidx;
1016        int i = at, k, m, a;
1017        ext4_fsblk_t newblock, oldblock;
1018        __le32 border;
1019        ext4_fsblk_t *ablocks = NULL; /* array of allocated blocks */
1020        int err = 0;
1021        size_t ext_size = 0;
1022
1023        /* make decision: where to split? */
1024        /* FIXME: now decision is simplest: at current extent */
1025
 1026        /* if the current leaf will be split, then we should use
 1027         * the border from the split point */
1028        if (unlikely(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr))) {
1029                EXT4_ERROR_INODE(inode, "p_ext > EXT_MAX_EXTENT!");
1030                return -EFSCORRUPTED;
1031        }
1032        if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) {
1033                border = path[depth].p_ext[1].ee_block;
1034                ext_debug("leaf will be split."
1035                                " next leaf starts at %d\n",
1036                                  le32_to_cpu(border));
1037        } else {
1038                border = newext->ee_block;
1039                ext_debug("leaf will be added."
1040                                " next leaf starts at %d\n",
1041                                le32_to_cpu(border));
1042        }
1043
1044        /*
 1045         * If an error occurs, we stop processing and mark the
 1046         * filesystem read-only. The new index won't be inserted
 1047         * and the tree will remain consistent. The next mount
 1048         * will repair the buffers too.
1049         */
1050
1051        /*
 1052         * Get an array to track all allocated blocks.
 1053         * We need this to handle errors and to free the
 1054         * allocated blocks on failure.
1055         */
1056        ablocks = kcalloc(depth, sizeof(ext4_fsblk_t), GFP_NOFS);
1057        if (!ablocks)
1058                return -ENOMEM;
1059
1060        /* allocate all needed blocks */
1061        ext_debug("allocate %d blocks for indexes/leaf\n", depth - at);
1062        for (a = 0; a < depth - at; a++) {
1063                newblock = ext4_ext_new_meta_block(handle, inode, path,
1064                                                   newext, &err, flags);
1065                if (newblock == 0)
1066                        goto cleanup;
1067                ablocks[a] = newblock;
1068        }
1069
1070        /* initialize new leaf */
1071        newblock = ablocks[--a];
1072        if (unlikely(newblock == 0)) {
1073                EXT4_ERROR_INODE(inode, "newblock == 0!");
1074                err = -EFSCORRUPTED;
1075                goto cleanup;
1076        }
1077        bh = sb_getblk_gfp(inode->i_sb, newblock, __GFP_MOVABLE | GFP_NOFS);
1078        if (unlikely(!bh)) {
1079                err = -ENOMEM;
1080                goto cleanup;
1081        }
1082        lock_buffer(bh);
1083
1084        err = ext4_journal_get_create_access(handle, bh);
1085        if (err)
1086                goto cleanup;
1087
1088        neh = ext_block_hdr(bh);
1089        neh->eh_entries = 0;
1090        neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
1091        neh->eh_magic = EXT4_EXT_MAGIC;
1092        neh->eh_depth = 0;
1093
1094        /* move remainder of path[depth] to the new leaf */
1095        if (unlikely(path[depth].p_hdr->eh_entries !=
1096                     path[depth].p_hdr->eh_max)) {
1097                EXT4_ERROR_INODE(inode, "eh_entries %d != eh_max %d!",
1098                                 path[depth].p_hdr->eh_entries,
1099                                 path[depth].p_hdr->eh_max);
1100                err = -EFSCORRUPTED;
1101                goto cleanup;
1102        }
1103        /* start copy from next extent */
1104        m = EXT_MAX_EXTENT(path[depth].p_hdr) - path[depth].p_ext++;
1105        ext4_ext_show_move(inode, path, newblock, depth);
1106        if (m) {
1107                struct ext4_extent *ex;
1108                ex = EXT_FIRST_EXTENT(neh);
1109                memmove(ex, path[depth].p_ext, sizeof(struct ext4_extent) * m);
1110                le16_add_cpu(&neh->eh_entries, m);
1111        }
1112
1113        /* zero out unused area in the extent block */
1114        ext_size = sizeof(struct ext4_extent_header) +
1115                sizeof(struct ext4_extent) * le16_to_cpu(neh->eh_entries);
1116        memset(bh->b_data + ext_size, 0, inode->i_sb->s_blocksize - ext_size);
1117        ext4_extent_block_csum_set(inode, neh);
1118        set_buffer_uptodate(bh);
1119        unlock_buffer(bh);
1120
1121        err = ext4_handle_dirty_metadata(handle, inode, bh);
1122        if (err)
1123                goto cleanup;
1124        brelse(bh);
1125        bh = NULL;
1126
1127        /* correct old leaf */
1128        if (m) {
1129                err = ext4_ext_get_access(handle, inode, path + depth);
1130                if (err)
1131                        goto cleanup;
1132                le16_add_cpu(&path[depth].p_hdr->eh_entries, -m);
1133                err = ext4_ext_dirty(handle, inode, path + depth);
1134                if (err)
1135                        goto cleanup;
1136
1137        }
1138
1139        /* create intermediate indexes */
1140        k = depth - at - 1;
1141        if (unlikely(k < 0)) {
1142                EXT4_ERROR_INODE(inode, "k %d < 0!", k);
1143                err = -EFSCORRUPTED;
1144                goto cleanup;
1145        }
1146        if (k)
1147                ext_debug("create %d intermediate indices\n", k);
1148        /* insert new index into current index block */
1149        /* current depth stored in i var */
1150        i = depth - 1;
1151        while (k--) {
1152                oldblock = newblock;
1153                newblock = ablocks[--a];
1154                bh = sb_getblk(inode->i_sb, newblock);
1155                if (unlikely(!bh)) {
1156                        err = -ENOMEM;
1157                        goto cleanup;
1158                }
1159                lock_buffer(bh);
1160
1161                err = ext4_journal_get_create_access(handle, bh);
1162                if (err)
1163                        goto cleanup;
1164
1165                neh = ext_block_hdr(bh);
1166                neh->eh_entries = cpu_to_le16(1);
1167                neh->eh_magic = EXT4_EXT_MAGIC;
1168                neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
1169                neh->eh_depth = cpu_to_le16(depth - i);
1170                fidx = EXT_FIRST_INDEX(neh);
1171                fidx->ei_block = border;
1172                ext4_idx_store_pblock(fidx, oldblock);
1173
1174                ext_debug("int.index at %d (block %llu): %u -> %llu\n",
1175                                i, newblock, le32_to_cpu(border), oldblock);
1176
1177                /* move remainder of path[i] to the new index block */
1178                if (unlikely(EXT_MAX_INDEX(path[i].p_hdr) !=
1179                                        EXT_LAST_INDEX(path[i].p_hdr))) {
1180                        EXT4_ERROR_INODE(inode,
1181                                         "EXT_MAX_INDEX != EXT_LAST_INDEX ee_block %d!",
1182                                         le32_to_cpu(path[i].p_ext->ee_block));
1183                        err = -EFSCORRUPTED;
1184                        goto cleanup;
1185                }
1186                /* start copy indexes */
1187                m = EXT_MAX_INDEX(path[i].p_hdr) - path[i].p_idx++;
1188                ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx,
1189                                EXT_MAX_INDEX(path[i].p_hdr));
1190                ext4_ext_show_move(inode, path, newblock, i);
1191                if (m) {
1192                        memmove(++fidx, path[i].p_idx,
1193                                sizeof(struct ext4_extent_idx) * m);
1194                        le16_add_cpu(&neh->eh_entries, m);
1195                }
1196                /* zero out unused area in the extent block */
1197                ext_size = sizeof(struct ext4_extent_header) +
1198                   (sizeof(struct ext4_extent) * le16_to_cpu(neh->eh_entries));
1199                memset(bh->b_data + ext_size, 0,
1200                        inode->i_sb->s_blocksize - ext_size);
1201                ext4_extent_block_csum_set(inode, neh);
1202                set_buffer_uptodate(bh);
1203                unlock_buffer(bh);
1204
1205                err = ext4_handle_dirty_metadata(handle, inode, bh);
1206                if (err)
1207                        goto cleanup;
1208                brelse(bh);
1209                bh = NULL;
1210
1211                /* correct old index */
1212                if (m) {
1213                        err = ext4_ext_get_access(handle, inode, path + i);
1214                        if (err)
1215                                goto cleanup;
1216                        le16_add_cpu(&path[i].p_hdr->eh_entries, -m);
1217                        err = ext4_ext_dirty(handle, inode, path + i);
1218                        if (err)
1219                                goto cleanup;
1220                }
1221
1222                i--;
1223        }
1224
1225        /* insert new index */
1226        err = ext4_ext_insert_index(handle, inode, path + at,
1227                                    le32_to_cpu(border), newblock);
1228
1229cleanup:
1230        if (bh) {
1231                if (buffer_locked(bh))
1232                        unlock_buffer(bh);
1233                brelse(bh);
1234        }
1235
1236        if (err) {
1237                /* free all allocated blocks in error case */
1238                for (i = 0; i < depth; i++) {
1239                        if (!ablocks[i])
1240                                continue;
1241                        ext4_free_blocks(handle, inode, NULL, ablocks[i], 1,
1242                                         EXT4_FREE_BLOCKS_METADATA);
1243                }
1244        }
1245        kfree(ablocks);
1246
1247        return err;
1248}
1249
1250/*
1251 * ext4_ext_grow_indepth:
1252 * implements tree growing procedure:
1253 * - allocates new block
1254 * - moves top-level data (index block or leaf) into the new block
1255 * - initializes new top-level, creating index that points to the
1256 *   just created block
1257 */
1258static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
1259                                 unsigned int flags)
1260{
1261        struct ext4_extent_header *neh;
1262        struct buffer_head *bh;
1263        ext4_fsblk_t newblock, goal = 0;
1264        struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
1265        int err = 0;
1266        size_t ext_size = 0;
1267
1268        /* Try to prepend new index to old one */
1269        if (ext_depth(inode))
1270                goal = ext4_idx_pblock(EXT_FIRST_INDEX(ext_inode_hdr(inode)));
1271        if (goal > le32_to_cpu(es->s_first_data_block)) {
1272                flags |= EXT4_MB_HINT_TRY_GOAL;
1273                goal--;
1274        } else
1275                goal = ext4_inode_to_goal_block(inode);
1276        newblock = ext4_new_meta_blocks(handle, inode, goal, flags,
1277                                        NULL, &err);
1278        if (newblock == 0)
1279                return err;
1280
1281        bh = sb_getblk_gfp(inode->i_sb, newblock, __GFP_MOVABLE | GFP_NOFS);
1282        if (unlikely(!bh))
1283                return -ENOMEM;
1284        lock_buffer(bh);
1285
1286        err = ext4_journal_get_create_access(handle, bh);
1287        if (err) {
1288                unlock_buffer(bh);
1289                goto out;
1290        }
1291
1292        ext_size = sizeof(EXT4_I(inode)->i_data);
1293        /* move top-level index/leaf into new block */
1294        memmove(bh->b_data, EXT4_I(inode)->i_data, ext_size);
1295        /* zero out unused area in the extent block */
1296        memset(bh->b_data + ext_size, 0, inode->i_sb->s_blocksize - ext_size);
1297
1298        /* set size of new block */
1299        neh = ext_block_hdr(bh);
1300        /* old root could have indexes or leaves
 1301         * so calculate eh_max the right way */
1302        if (ext_depth(inode))
1303                neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
1304        else
1305                neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
1306        neh->eh_magic = EXT4_EXT_MAGIC;
1307        ext4_extent_block_csum_set(inode, neh);
1308        set_buffer_uptodate(bh);
1309        unlock_buffer(bh);
1310
1311        err = ext4_handle_dirty_metadata(handle, inode, bh);
1312        if (err)
1313                goto out;
1314
1315        /* Update top-level index: num,max,pointer */
1316        neh = ext_inode_hdr(inode);
1317        neh->eh_entries = cpu_to_le16(1);
1318        ext4_idx_store_pblock(EXT_FIRST_INDEX(neh), newblock);
1319        if (neh->eh_depth == 0) {
1320                /* Root extent block becomes index block */
1321                neh->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0));
1322                EXT_FIRST_INDEX(neh)->ei_block =
1323                        EXT_FIRST_EXTENT(neh)->ee_block;
1324        }
1325        ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n",
1326                  le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max),
1327                  le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block),
1328                  ext4_idx_pblock(EXT_FIRST_INDEX(neh)));
1329
1330        le16_add_cpu(&neh->eh_depth, 1);
1331        ext4_mark_inode_dirty(handle, inode);
1332out:
1333        brelse(bh);
1334
1335        return err;
1336}
1337
1338/*
1339 * ext4_ext_create_new_leaf:
 1340 * finds an empty index and adds a new leaf.
 1341 * if no free index is found, it requests growing the tree in depth.
1342 */
1343static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode,
1344                                    unsigned int mb_flags,
1345                                    unsigned int gb_flags,
1346                                    struct ext4_ext_path **ppath,
1347                                    struct ext4_extent *newext)
1348{
1349        struct ext4_ext_path *path = *ppath;
1350        struct ext4_ext_path *curp;
1351        int depth, i, err = 0;
1352
1353repeat:
1354        i = depth = ext_depth(inode);
1355
 1356        /* walk up the tree and look for a free index entry */
1357        curp = path + depth;
1358        while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) {
1359                i--;
1360                curp--;
1361        }
1362
 1363        /* we use an already allocated block for the index block,
1364         * so subsequent data blocks should be contiguous */
1365        if (EXT_HAS_FREE_INDEX(curp)) {
 1366                /* if we found an index with a free entry, then use that
 1367                 * entry: create the needed subtree and add the new leaf */
1368                err = ext4_ext_split(handle, inode, mb_flags, path, newext, i);
1369                if (err)
1370                        goto out;
1371
1372                /* refill path */
1373                path = ext4_find_extent(inode,
1374                                    (ext4_lblk_t)le32_to_cpu(newext->ee_block),
1375                                    ppath, gb_flags);
1376                if (IS_ERR(path))
1377                        err = PTR_ERR(path);
1378        } else {
1379                /* tree is full, time to grow in depth */
1380                err = ext4_ext_grow_indepth(handle, inode, mb_flags);
1381                if (err)
1382                        goto out;
1383
1384                /* refill path */
1385                path = ext4_find_extent(inode,
1386                                   (ext4_lblk_t)le32_to_cpu(newext->ee_block),
1387                                    ppath, gb_flags);
1388                if (IS_ERR(path)) {
1389                        err = PTR_ERR(path);
1390                        goto out;
1391                }
1392
1393                /*
 1394                 * only the first grow (depth 0 -> 1) produces free space;
1395                 * in all other cases we have to split the grown tree
1396                 */
1397                depth = ext_depth(inode);
1398                if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) {
1399                        /* now we need to split */
1400                        goto repeat;
1401                }
1402        }
1403
1404out:
1405        return err;
1406}
1407
1408/*
 1409 * search for the closest allocated block to the left of *logical
 1410 * and return it at @logical, with its physical address at @phys.
 1411 * if *logical is the smallest allocated block, the function
 1412 * returns 0 at @phys.
 1413 * the return value is 0 (success) or an error code
1414 */
1415static int ext4_ext_search_left(struct inode *inode,
1416                                struct ext4_ext_path *path,
1417                                ext4_lblk_t *logical, ext4_fsblk_t *phys)
1418{
1419        struct ext4_extent_idx *ix;
1420        struct ext4_extent *ex;
1421        int depth, ee_len;
1422
1423        if (unlikely(path == NULL)) {
1424                EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
1425                return -EFSCORRUPTED;
1426        }
1427        depth = path->p_depth;
1428        *phys = 0;
1429
1430        if (depth == 0 && path->p_ext == NULL)
1431                return 0;
1432
 1433        /* usually the extent in the path covers blocks smaller
 1434         * than *logical, but it can be that the extent is the
1435         * first one in the file */
1436
1437        ex = path[depth].p_ext;
1438        ee_len = ext4_ext_get_actual_len(ex);
1439        if (*logical < le32_to_cpu(ex->ee_block)) {
1440                if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
1441                        EXT4_ERROR_INODE(inode,
1442                                         "EXT_FIRST_EXTENT != ex *logical %d ee_block %d!",
1443                                         *logical, le32_to_cpu(ex->ee_block));
1444                        return -EFSCORRUPTED;
1445                }
1446                while (--depth >= 0) {
1447                        ix = path[depth].p_idx;
1448                        if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
1449                                EXT4_ERROR_INODE(inode,
1450                                  "ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!",
1451                                  ix != NULL ? le32_to_cpu(ix->ei_block) : 0,
1452                                  EXT_FIRST_INDEX(path[depth].p_hdr) != NULL ?
1453                le32_to_cpu(EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block) : 0,
1454                                  depth);
1455                                return -EFSCORRUPTED;
1456                        }
1457                }
1458                return 0;
1459        }
1460
1461        if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
1462                EXT4_ERROR_INODE(inode,
1463                                 "logical %d < ee_block %d + ee_len %d!",
1464                                 *logical, le32_to_cpu(ex->ee_block), ee_len);
1465                return -EFSCORRUPTED;
1466        }
1467
1468        *logical = le32_to_cpu(ex->ee_block) + ee_len - 1;
1469        *phys = ext4_ext_pblock(ex) + ee_len - 1;
1470        return 0;
1471}
1472
1473/*
1474 * search the closest allocated block to the right of *logical
1475 * and return it at @logical together with its physical address at @phys
1476 * if *logical is the largest allocated block, the function
1477 * returns 0 at @phys
1478 * return value contains 0 (success) or error code
1479 */
1480static int ext4_ext_search_right(struct inode *inode,
1481                                 struct ext4_ext_path *path,
1482                                 ext4_lblk_t *logical, ext4_fsblk_t *phys,
1483                                 struct ext4_extent **ret_ex)
1484{
1485        struct buffer_head *bh = NULL;
1486        struct ext4_extent_header *eh;
1487        struct ext4_extent_idx *ix;
1488        struct ext4_extent *ex;
1489        ext4_fsblk_t block;
1490        int depth;      /* Note, NOT eh_depth; depth from top of tree */
1491        int ee_len;
1492
1493        if (unlikely(path == NULL)) {
1494                EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
1495                return -EFSCORRUPTED;
1496        }
1497        depth = path->p_depth;
1498        *phys = 0;
1499
1500        if (depth == 0 && path->p_ext == NULL)
1501                return 0;
1502
1503        /* usually the extent in the path covers blocks smaller
1504         * than *logical, but it can be that the extent is the
1505         * first one in the file */
1506
1507        ex = path[depth].p_ext;
1508        ee_len = ext4_ext_get_actual_len(ex);
1509        if (*logical < le32_to_cpu(ex->ee_block)) {
1510                if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
1511                        EXT4_ERROR_INODE(inode,
1512                                         "first_extent(path[%d].p_hdr) != ex",
1513                                         depth);
1514                        return -EFSCORRUPTED;
1515                }
1516                while (--depth >= 0) {
1517                        ix = path[depth].p_idx;
1518                        if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
1519                                EXT4_ERROR_INODE(inode,
1520                                                 "ix != EXT_FIRST_INDEX *logical %d!",
1521                                                 *logical);
1522                                return -EFSCORRUPTED;
1523                        }
1524                }
1525                goto found_extent;
1526        }
1527
1528        if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
1529                EXT4_ERROR_INODE(inode,
1530                                 "logical %d < ee_block %d + ee_len %d!",
1531                                 *logical, le32_to_cpu(ex->ee_block), ee_len);
1532                return -EFSCORRUPTED;
1533        }
1534
1535        if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) {
1536                /* next allocated block in this leaf */
1537                ex++;
1538                goto found_extent;
1539        }
1540
1541        /* go up and search for index to the right */
1542        while (--depth >= 0) {
1543                ix = path[depth].p_idx;
1544                if (ix != EXT_LAST_INDEX(path[depth].p_hdr))
1545                        goto got_index;
1546        }
1547
1548        /* we've gone up to the root and found no index to the right */
1549        return 0;
1550
1551got_index:
1552        /* we've found index to the right, let's
1553         * follow it and find the closest allocated
1554         * block to the right */
1555        ix++;
1556        block = ext4_idx_pblock(ix);
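        /*
         * Descend through the index blocks, always following the
         * leftmost entry, until we reach the level just above the
         * leaf; the final read below then fetches the leaf itself.
         */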
1557        while (++depth < path->p_depth) {
1558                /* subtract from p_depth to get proper eh_depth */
1559                bh = read_extent_tree_block(inode, block,
1560                                            path->p_depth - depth, 0);
1561                if (IS_ERR(bh))
1562                        return PTR_ERR(bh);
1563                eh = ext_block_hdr(bh);
1564                ix = EXT_FIRST_INDEX(eh);
1565                block = ext4_idx_pblock(ix);
1566                put_bh(bh);
1567        }
1568
1569        bh = read_extent_tree_block(inode, block, path->p_depth - depth, 0);
1570        if (IS_ERR(bh))
1571                return PTR_ERR(bh);
1572        eh = ext_block_hdr(bh);
1573        ex = EXT_FIRST_EXTENT(eh);
1574found_extent:
1575        *logical = le32_to_cpu(ex->ee_block);
1576        *phys = ext4_ext_pblock(ex);
1577        *ret_ex = ex;
1578        if (bh)
1579                put_bh(bh);
1580        return 0;
1581}
1582
1583/*
1584 * ext4_ext_next_allocated_block:
1585 * returns allocated block in subsequent extent or EXT_MAX_BLOCKS.
1586 * NOTE: it considers the block number from an index entry as an
1587 * allocated block. Thus, index entries have to be consistent
1588 * with the leaves.
1589 */
1590ext4_lblk_t
1591ext4_ext_next_allocated_block(struct ext4_ext_path *path)
1592{
1593        int depth;
1594
1595        BUG_ON(path == NULL);
1596        depth = path->p_depth;
1597
1598        if (depth == 0 && path->p_ext == NULL)
1599                return EXT_MAX_BLOCKS;
1600
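        /*
         * Walk from the leaf back toward the root: the first level that
         * still has an entry to the right of the current position gives
         * the next allocated block.
         */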
1601        while (depth >= 0) {
1602                struct ext4_ext_path *p = &path[depth];
1603
1604                if (depth == path->p_depth) {
1605                        /* leaf */
1606                        if (p->p_ext && p->p_ext != EXT_LAST_EXTENT(p->p_hdr))
1607                                return le32_to_cpu(p->p_ext[1].ee_block);
1608                } else {
1609                        /* index */
1610                        if (p->p_idx != EXT_LAST_INDEX(p->p_hdr))
1611                                return le32_to_cpu(p->p_idx[1].ei_block);
1612                }
1613                depth--;
1614        }
1615
1616        return EXT_MAX_BLOCKS;
1617}
1618
1619/*
1620 * ext4_ext_next_leaf_block:
1621 * returns first allocated block from next leaf or EXT_MAX_BLOCKS
1622 */
1623static ext4_lblk_t ext4_ext_next_leaf_block(struct ext4_ext_path *path)
1624{
1625        int depth;
1626
1627        BUG_ON(path == NULL);
1628        depth = path->p_depth;
1629
1630        /* a zero-depth tree has no leaf blocks at all */
1631        if (depth == 0)
1632                return EXT_MAX_BLOCKS;
1633
1634        /* go to index block */
1635        depth--;
1636
1637        while (depth >= 0) {
1638                if (path[depth].p_idx !=
1639                                EXT_LAST_INDEX(path[depth].p_hdr))
1640                        return (ext4_lblk_t)
1641                                le32_to_cpu(path[depth].p_idx[1].ei_block);
1642                depth--;
1643        }
1644
1645        return EXT_MAX_BLOCKS;
1646}
1647
1648/*
1649 * ext4_ext_correct_indexes:
1650 * if a leaf gets modified and the modified extent is the first in the leaf,
1651 * then we have to correct all indexes above it.
1652 * TODO: do we need to correct the tree in all cases?
1653 */
1654static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
1655                                struct ext4_ext_path *path)
1656{
1657        struct ext4_extent_header *eh;
1658        int depth = ext_depth(inode);
1659        struct ext4_extent *ex;
1660        __le32 border;
1661        int k, err = 0;
1662
1663        eh = path[depth].p_hdr;
1664        ex = path[depth].p_ext;
1665
1666        if (unlikely(ex == NULL || eh == NULL)) {
1667                EXT4_ERROR_INODE(inode,
1668                                 "ex %p == NULL or eh %p == NULL", ex, eh);
1669                return -EFSCORRUPTED;
1670        }
1671
1672        if (depth == 0) {
1673                /* there is no tree at all */
1674                return 0;
1675        }
1676
1677        if (ex != EXT_FIRST_EXTENT(eh)) {
1678                /* we only correct the tree if the first extent in the leaf got modified */
1679                return 0;
1680        }
1681
1682        /*
1683         * TODO: we need correction if border is smaller than current one
1684         */
1685        k = depth - 1;
1686        border = path[depth].p_ext->ee_block;
1687        err = ext4_ext_get_access(handle, inode, path + k);
1688        if (err)
1689                return err;
1690        path[k].p_idx->ei_block = border;
1691        err = ext4_ext_dirty(handle, inode, path + k);
1692        if (err)
1693                return err;
1694
1695        while (k--) {
1696                /* change all left-side indexes */
1697                if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr))
1698                        break;
1699                err = ext4_ext_get_access(handle, inode, path + k);
1700                if (err)
1701                        break;
1702                path[k].p_idx->ei_block = border;
1703                err = ext4_ext_dirty(handle, inode, path + k);
1704                if (err)
1705                        break;
1706        }
1707
1708        return err;
1709}
1710
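/*
 * ext4_can_extents_be_merged:
 * Two extents can be merged only if they have the same written/unwritten
 * state, are logically and physically contiguous, and their combined
 * length stays within EXT_INIT_MAX_LEN (EXT_UNWRITTEN_MAX_LEN for
 * unwritten extents).
 */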
1711static int ext4_can_extents_be_merged(struct inode *inode,
1712                                      struct ext4_extent *ex1,
1713                                      struct ext4_extent *ex2)
1714{
1715        unsigned short ext1_ee_len, ext2_ee_len;
1716
1717        if (ext4_ext_is_unwritten(ex1) != ext4_ext_is_unwritten(ex2))
1718                return 0;
1719
1720        ext1_ee_len = ext4_ext_get_actual_len(ex1);
1721        ext2_ee_len = ext4_ext_get_actual_len(ex2);
1722
1723        if (le32_to_cpu(ex1->ee_block) + ext1_ee_len !=
1724                        le32_to_cpu(ex2->ee_block))
1725                return 0;
1726
1727        if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN)
1728                return 0;
1729
1730        if (ext4_ext_is_unwritten(ex1) &&
1731            ext1_ee_len + ext2_ee_len > EXT_UNWRITTEN_MAX_LEN)
1732                return 0;
1733#ifdef AGGRESSIVE_TEST
1734        if (ext1_ee_len >= 4)
1735                return 0;
1736#endif
1737
1738        if (ext4_ext_pblock(ex1) + ext1_ee_len == ext4_ext_pblock(ex2))
1739                return 1;
1740        return 0;
1741}
1742
1743/*
1744 * This function tries to merge the "ex" extent to the next extent in the tree.
1745 * It always tries to merge towards right. If you want to merge towards
1746 * left, pass "ex - 1" as argument instead of "ex".
1747 * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns
1748 * 1 if they got merged.
1749 */
1750static int ext4_ext_try_to_merge_right(struct inode *inode,
1751                                 struct ext4_ext_path *path,
1752                                 struct ext4_extent *ex)
1753{
1754        struct ext4_extent_header *eh;
1755        unsigned int depth, len;
1756        int merge_done = 0, unwritten;
1757
1758        depth = ext_depth(inode);
1759        BUG_ON(path[depth].p_hdr == NULL);
1760        eh = path[depth].p_hdr;
1761
1762        while (ex < EXT_LAST_EXTENT(eh)) {
1763                if (!ext4_can_extents_be_merged(inode, ex, ex + 1))
1764                        break;
1765                /* merge with next extent! */
1766                unwritten = ext4_ext_is_unwritten(ex);
1767                ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
1768                                + ext4_ext_get_actual_len(ex + 1));
1769                if (unwritten)
1770                        ext4_ext_mark_unwritten(ex);
1771
1772                if (ex + 1 < EXT_LAST_EXTENT(eh)) {
1773                        len = (EXT_LAST_EXTENT(eh) - ex - 1)
1774                                * sizeof(struct ext4_extent);
1775                        memmove(ex + 1, ex + 2, len);
1776                }
1777                le16_add_cpu(&eh->eh_entries, -1);
1778                merge_done = 1;
1779                WARN_ON(eh->eh_entries == 0);
1780                if (!eh->eh_entries)
1781                        EXT4_ERROR_INODE(inode, "eh->eh_entries = 0!");
1782        }
1783
1784        return merge_done;
1785}
1786
1787/*
1788 * This function does a very simple check to see if we can collapse
1789 * an extent tree with a single extent tree leaf block into the inode.
1790 */
1791static void ext4_ext_try_to_merge_up(handle_t *handle,
1792                                     struct inode *inode,
1793                                     struct ext4_ext_path *path)
1794{
1795        size_t s;
1796        unsigned max_root = ext4_ext_space_root(inode, 0);
1797        ext4_fsblk_t blk;
1798
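        /*
         * Only collapse a depth-1 tree whose root has a single index
         * entry and whose leaf extents all fit in the in-inode root.
         */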
1799        if ((path[0].p_depth != 1) ||
1800            (le16_to_cpu(path[0].p_hdr->eh_entries) != 1) ||
1801            (le16_to_cpu(path[1].p_hdr->eh_entries) > max_root))
1802                return;
1803
1804        /*
1805         * We need to modify the block allocation bitmap and the block
1806         * group descriptor to release the extent tree block.  If we
1807         * can't get the journal credits, give up.
1808         */
1809        if (ext4_journal_extend(handle, 2,
1810                        ext4_free_metadata_revoke_credits(inode->i_sb, 1)))
1811                return;
1812
1813        /*
1814         * Copy the extent data up to the inode
1815         */
1816        blk = ext4_idx_pblock(path[0].p_idx);
1817        s = le16_to_cpu(path[1].p_hdr->eh_entries) *
1818                sizeof(struct ext4_extent_idx);
1819        s += sizeof(struct ext4_extent_header);
1820
1821        path[1].p_maxdepth = path[0].p_maxdepth;
1822        memcpy(path[0].p_hdr, path[1].p_hdr, s);
1823        path[0].p_depth = 0;
1824        path[0].p_ext = EXT_FIRST_EXTENT(path[0].p_hdr) +
1825                (path[1].p_ext - EXT_FIRST_EXTENT(path[1].p_hdr));
1826        path[0].p_hdr->eh_max = cpu_to_le16(max_root);
1827
1828        brelse(path[1].p_bh);
1829        ext4_free_blocks(handle, inode, NULL, blk, 1,
1830                         EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
1831}
1832
1833/*
1834 * This function tries to merge the @ex extent to neighbours in the tree, then
1835 * tries to collapse the extent tree into the inode.
1836 */
1837static void ext4_ext_try_to_merge(handle_t *handle,
1838                                  struct inode *inode,
1839                                  struct ext4_ext_path *path,
1840                                  struct ext4_extent *ex)
1841{
1842        struct ext4_extent_header *eh;
1843        unsigned int depth;
1844        int merge_done = 0;
1845
1846        depth = ext_depth(inode);
1847        BUG_ON(path[depth].p_hdr == NULL);
1848        eh = path[depth].p_hdr;
1849
1850        if (ex > EXT_FIRST_EXTENT(eh))
1851                merge_done = ext4_ext_try_to_merge_right(inode, path, ex - 1);
1852
1853        if (!merge_done)
1854                (void) ext4_ext_try_to_merge_right(inode, path, ex);
1855
1856        ext4_ext_try_to_merge_up(handle, inode, path);
1857}
1858
1859/*
1860 * check if a portion of the "newext" extent overlaps with an
1861 * existing extent.
1862 *
1863 * If there is an overlap discovered, it updates the length of the newext
1864 * such that there will be no overlap, and then returns 1.
1865 * If there is no overlap found, it returns 0.
1866 */
1867static unsigned int ext4_ext_check_overlap(struct ext4_sb_info *sbi,
1868                                           struct inode *inode,
1869                                           struct ext4_extent *newext,
1870                                           struct ext4_ext_path *path)
1871{
1872        ext4_lblk_t b1, b2;
1873        unsigned int depth, len1;
1874        unsigned int ret = 0;
1875
1876        b1 = le32_to_cpu(newext->ee_block);
1877        len1 = ext4_ext_get_actual_len(newext);
1878        depth = ext_depth(inode);
1879        if (!path[depth].p_ext)
1880                goto out;
1881        b2 = EXT4_LBLK_CMASK(sbi, le32_to_cpu(path[depth].p_ext->ee_block));
1882
1883        /*
1884         * get the next allocated block if the extent in the path
1885         * is before the requested block(s)
1886         */
1887        if (b2 < b1) {
1888                b2 = ext4_ext_next_allocated_block(path);
1889                if (b2 == EXT_MAX_BLOCKS)
1890                        goto out;
1891                b2 = EXT4_LBLK_CMASK(sbi, b2);
1892        }
1893
1894        /* check for wrap through zero on extent logical start block */
1895        if (b1 + len1 < b1) {
1896                len1 = EXT_MAX_BLOCKS - b1;
1897                newext->ee_len = cpu_to_le16(len1);
1898                ret = 1;
1899        }
1900
1901        /* check for overlap */
1902        if (b1 + len1 > b2) {
1903                newext->ee_len = cpu_to_le16(b2 - b1);
1904                ret = 1;
1905        }
1906out:
1907        return ret;
1908}
1909
1910/*
1911 * ext4_ext_insert_extent:
1912 * tries to merge the requested extent into an existing extent or
1913 * inserts the requested extent as a new one into the tree,
1914 * creating a new leaf in the no-space case.
1915 */
1916int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1917                                struct ext4_ext_path **ppath,
1918                                struct ext4_extent *newext, int gb_flags)
1919{
1920        struct ext4_ext_path *path = *ppath;
1921        struct ext4_extent_header *eh;
1922        struct ext4_extent *ex, *fex;
1923        struct ext4_extent *nearex; /* nearest extent */
1924        struct ext4_ext_path *npath = NULL;
1925        int depth, len, err;
1926        ext4_lblk_t next;
1927        int mb_flags = 0, unwritten;
1928
1929        if (gb_flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
1930                mb_flags |= EXT4_MB_DELALLOC_RESERVED;
1931        if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {
1932                EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0");
1933                return -EFSCORRUPTED;
1934        }
1935        depth = ext_depth(inode);
1936        ex = path[depth].p_ext;
1937        eh = path[depth].p_hdr;
1938        if (unlikely(path[depth].p_hdr == NULL)) {
1939                EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
1940                return -EFSCORRUPTED;
1941        }
1942
1943        /* try to insert block into found extent and return */
1944        if (ex && !(gb_flags & EXT4_GET_BLOCKS_PRE_IO)) {
1945
1946                /*
1947                 * Try to see whether we should rather test the extent to
1948                 * the right of ex, or to the left of ex. This is because
1949                 * ext4_find_extent() can return either the extent on the
1950                 * left or on the right of the searched position. This
1951                 * will make merging more effective.
1952                 */
1953                if (ex < EXT_LAST_EXTENT(eh) &&
1954                    (le32_to_cpu(ex->ee_block) +
1955                    ext4_ext_get_actual_len(ex) <
1956                    le32_to_cpu(newext->ee_block))) {
1957                        ex += 1;
1958                        goto prepend;
1959                } else if ((ex > EXT_FIRST_EXTENT(eh)) &&
1960                           (le32_to_cpu(newext->ee_block) +
1961                           ext4_ext_get_actual_len(newext) <
1962                           le32_to_cpu(ex->ee_block)))
1963                        ex -= 1;
1964
1965                /* Try to append newex to the ex */
1966                if (ext4_can_extents_be_merged(inode, ex, newext)) {
1967                        ext_debug("append [%d]%d block to %u:[%d]%d"
1968                                  "(from %llu)\n",
1969                                  ext4_ext_is_unwritten(newext),
1970                                  ext4_ext_get_actual_len(newext),
1971                                  le32_to_cpu(ex->ee_block),
1972                                  ext4_ext_is_unwritten(ex),
1973                                  ext4_ext_get_actual_len(ex),
1974                                  ext4_ext_pblock(ex));
1975                        err = ext4_ext_get_access(handle, inode,
1976                                                  path + depth);
1977                        if (err)
1978                                return err;
1979                        unwritten = ext4_ext_is_unwritten(ex);
1980                        ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
1981                                        + ext4_ext_get_actual_len(newext));
1982                        if (unwritten)
1983                                ext4_ext_mark_unwritten(ex);
1984                        eh = path[depth].p_hdr;
1985                        nearex = ex;
1986                        goto merge;
1987                }
1988
1989prepend:
1990                /* Try to prepend newex to the ex */
1991                if (ext4_can_extents_be_merged(inode, newext, ex)) {
1992                        ext_debug("prepend %u[%d]%d block to %u:[%d]%d"
1993                                  "(from %llu)\n",
1994                                  le32_to_cpu(newext->ee_block),
1995                                  ext4_ext_is_unwritten(newext),
1996                                  ext4_ext_get_actual_len(newext),
1997                                  le32_to_cpu(ex->ee_block),
1998                                  ext4_ext_is_unwritten(ex),
1999                                  ext4_ext_get_actual_len(ex),
2000                                  ext4_ext_pblock(ex));
2001                        err = ext4_ext_get_access(handle, inode,
2002                                                  path + depth);
2003                        if (err)
2004                                return err;
2005
2006                        unwritten = ext4_ext_is_unwritten(ex);
2007                        ex->ee_block = newext->ee_block;
2008                        ext4_ext_store_pblock(ex, ext4_ext_pblock(newext));
2009                        ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
2010                                        + ext4_ext_get_actual_len(newext));
2011                        if (unwritten)
2012                                ext4_ext_mark_unwritten(ex);
2013                        eh = path[depth].p_hdr;
2014                        nearex = ex;
2015                        goto merge;
2016                }
2017        }
2018
2019        depth = ext_depth(inode);
2020        eh = path[depth].p_hdr;
2021        if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max))
2022                goto has_space;
2023
2024        /* probably next leaf has space for us? */
2025        fex = EXT_LAST_EXTENT(eh);
2026        next = EXT_MAX_BLOCKS;
2027        if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block))
2028                next = ext4_ext_next_leaf_block(path);
2029        if (next != EXT_MAX_BLOCKS) {
2030                ext_debug("next leaf block - %u\n", next);
2031                BUG_ON(npath != NULL);
2032                npath = ext4_find_extent(inode, next, NULL, 0);
2033                if (IS_ERR(npath))
2034                        return PTR_ERR(npath);
2035                BUG_ON(npath->p_depth != path->p_depth);
2036                eh = npath[depth].p_hdr;
2037                if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) {
2038                        ext_debug("next leaf isn't full(%d)\n",
2039                                  le16_to_cpu(eh->eh_entries));
2040                        path = npath;
2041                        goto has_space;
2042                }
2043                ext_debug("next leaf has no free space(%d,%d)\n",
2044                          le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
2045        }
2046
2047        /*
2048         * There is no free space in the found leaf.
2049         * We're going to add a new leaf to the tree.
2050         */
2051        if (gb_flags & EXT4_GET_BLOCKS_METADATA_NOFAIL)
2052                mb_flags |= EXT4_MB_USE_RESERVED;
2053        err = ext4_ext_create_new_leaf(handle, inode, mb_flags, gb_flags,
2054                                       ppath, newext);
2055        if (err)
2056                goto cleanup;
2057        depth = ext_depth(inode);
2058        eh = path[depth].p_hdr;
2059
2060has_space:
2061        nearex = path[depth].p_ext;
2062
2063        err = ext4_ext_get_access(handle, inode, path + depth);
2064        if (err)
2065                goto cleanup;
2066
2067        if (!nearex) {
2068                /* there is no extent in this leaf, create first one */
2069                ext_debug("first extent in the leaf: %u:%llu:[%d]%d\n",
2070                                le32_to_cpu(newext->ee_block),
2071                                ext4_ext_pblock(newext),
2072                                ext4_ext_is_unwritten(newext),
2073                                ext4_ext_get_actual_len(newext));
2074                nearex = EXT_FIRST_EXTENT(eh);
2075        } else {
2076                if (le32_to_cpu(newext->ee_block)
2077                           > le32_to_cpu(nearex->ee_block)) {
2078                        /* Insert after */
2079                        ext_debug("insert %u:%llu:[%d]%d before: "
2080                                        "nearest %p\n",
2081                                        le32_to_cpu(newext->ee_block),
2082                                        ext4_ext_pblock(newext),
2083                                        ext4_ext_is_unwritten(newext),
2084                                        ext4_ext_get_actual_len(newext),
2085                                        nearex);
2086                        nearex++;
2087                } else {
2088                        /* Insert before */
2089                        BUG_ON(newext->ee_block == nearex->ee_block);
2090                        ext_debug("insert %u:%llu:[%d]%d after: "
2091                                        "nearest %p\n",
2092                                        le32_to_cpu(newext->ee_block),
2093                                        ext4_ext_pblock(newext),
2094                                        ext4_ext_is_unwritten(newext),
2095                                        ext4_ext_get_actual_len(newext),
2096                                        nearex);
2097                }
2098                len = EXT_LAST_EXTENT(eh) - nearex + 1;
2099                if (len > 0) {
2100                        ext_debug("insert %u:%llu:[%d]%d: "
2101                                        "move %d extents from 0x%p to 0x%p\n",
2102                                        le32_to_cpu(newext->ee_block),
2103                                        ext4_ext_pblock(newext),
2104                                        ext4_ext_is_unwritten(newext),
2105                                        ext4_ext_get_actual_len(newext),
2106                                        len, nearex, nearex + 1);
2107                        memmove(nearex + 1, nearex,
2108                                len * sizeof(struct ext4_extent));
2109                }
2110        }
2111
2112        le16_add_cpu(&eh->eh_entries, 1);
2113        path[depth].p_ext = nearex;
2114        nearex->ee_block = newext->ee_block;
2115        ext4_ext_store_pblock(nearex, ext4_ext_pblock(newext));
2116        nearex->ee_len = newext->ee_len;
2117
2118merge:
2119        /* try to merge extents */
2120        if (!(gb_flags & EXT4_GET_BLOCKS_PRE_IO))
2121                ext4_ext_try_to_merge(handle, inode, path, nearex);
2122
2123
2124        /* time to correct all indexes above */
2125        err = ext4_ext_correct_indexes(handle, inode, path);
2126        if (err)
2127                goto cleanup;
2128
2129        err = ext4_ext_dirty(handle, inode, path + path->p_depth);
2130
2131cleanup:
2132        ext4_ext_drop_refs(npath);
2133        kfree(npath);
2134        return err;
2135}
2136
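/*
 * ext4_fill_fiemap_extents:
 * Walk the extent tree for the range [block, block + num) and report each
 * mapped or delayed region to fiemap via fiemap_fill_next_extent(),
 * using ext4_find_delayed_extent() to pick up delayed allocations.
 */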
2137static int ext4_fill_fiemap_extents(struct inode *inode,
2138                                    ext4_lblk_t block, ext4_lblk_t num,
2139                                    struct fiemap_extent_info *fieinfo)
2140{
2141        struct ext4_ext_path *path = NULL;
2142        struct ext4_extent *ex;
2143        struct extent_status es;
2144        ext4_lblk_t next, next_del, start = 0, end = 0;
2145        ext4_lblk_t last = block + num;
2146        int exists, depth = 0, err = 0;
2147        unsigned int flags = 0;
2148        unsigned char blksize_bits = inode->i_sb->s_blocksize_bits;
2149
2150        while (block < last && block != EXT_MAX_BLOCKS) {
2151                num = last - block;
2152                /* find extent for this block */
2153                down_read(&EXT4_I(inode)->i_data_sem);
2154
2155                path = ext4_find_extent(inode, block, &path, 0);
2156                if (IS_ERR(path)) {
2157                        up_read(&EXT4_I(inode)->i_data_sem);
2158                        err = PTR_ERR(path);
2159                        path = NULL;
2160                        break;
2161                }
2162
2163                depth = ext_depth(inode);
2164                if (unlikely(path[depth].p_hdr == NULL)) {
2165                        up_read(&EXT4_I(inode)->i_data_sem);
2166                        EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
2167                        err = -EFSCORRUPTED;
2168                        break;
2169                }
2170                ex = path[depth].p_ext;
2171                next = ext4_ext_next_allocated_block(path);
2172
2173                flags = 0;
2174                exists = 0;
2175                if (!ex) {
2176                        /* there is no extent yet, so try to allocate
2177                         * all requested space */
2178                        start = block;
2179                        end = block + num;
2180                } else if (le32_to_cpu(ex->ee_block) > block) {
2181                        /* need to allocate space before found extent */
2182                        start = block;
2183                        end = le32_to_cpu(ex->ee_block);
2184                        if (block + num < end)
2185                                end = block + num;
2186                } else if (block >= le32_to_cpu(ex->ee_block)
2187                                        + ext4_ext_get_actual_len(ex)) {
2188                        /* need to allocate space after found extent */
2189                        start = block;
2190                        end = block + num;
2191                        if (end >= next)
2192                                end = next;
2193                } else if (block >= le32_to_cpu(ex->ee_block)) {
2194                        /*
2195                         * some part of requested space is covered
2196                         * by found extent
2197                         */
2198                        start = block;
2199                        end = le32_to_cpu(ex->ee_block)
2200                                + ext4_ext_get_actual_len(ex);
2201                        if (block + num < end)
2202                                end = block + num;
2203                        exists = 1;
2204                } else {
2205                        BUG();
2206                }
2207                BUG_ON(end <= start);
2208
2209                if (!exists) {
2210                        es.es_lblk = start;
2211                        es.es_len = end - start;
2212                        es.es_pblk = 0;
2213                } else {
2214                        es.es_lblk = le32_to_cpu(ex->ee_block);
2215                        es.es_len = ext4_ext_get_actual_len(ex);
2216                        es.es_pblk = ext4_ext_pblock(ex);
2217                        if (ext4_ext_is_unwritten(ex))
2218                                flags |= FIEMAP_EXTENT_UNWRITTEN;
2219                }
2220
2221                /*
2222                 * Find delayed extent and update es accordingly. We call
2223                 * it even in !exists case to find out whether es is the
2224                 * last existing extent or not.
2225                 */
2226                next_del = ext4_find_delayed_extent(inode, &es);
2227                if (!exists && next_del) {
2228                        exists = 1;
2229                        flags |= (FIEMAP_EXTENT_DELALLOC |
2230                                  FIEMAP_EXTENT_UNKNOWN);
2231                }
2232                up_read(&EXT4_I(inode)->i_data_sem);
2233
2234                if (unlikely(es.es_len == 0)) {
2235                        EXT4_ERROR_INODE(inode, "es.es_len == 0");
2236                        err = -EFSCORRUPTED;
2237                        break;
2238                }
2239
2240                /*
2241                 * This is possible iff next == next_del == EXT_MAX_BLOCKS.
2242                 * We need to check next == EXT_MAX_BLOCKS because an
2243                 * extent can carry both unwritten and delayed status:
2244                 * when a delayed-allocated extent is later allocated by
2245                 * fallocate, the extent status tree tracks both states
2246                 * in one extent.
2247                 *
2248                 * So we could return an unwritten and delayed extent
2249                 * whose block is equal to 'next'.
2250                 */
2251                if (next == next_del && next == EXT_MAX_BLOCKS) {
2252                        flags |= FIEMAP_EXTENT_LAST;
2253                        if (unlikely(next_del != EXT_MAX_BLOCKS ||
2254                                     next != EXT_MAX_BLOCKS)) {
2255                                EXT4_ERROR_INODE(inode,
2256                                                 "next extent == %u, next "
2257                                                 "delalloc extent = %u",
2258                                                 next, next_del);
2259                                err = -EFSCORRUPTED;
2260                                break;
2261                        }
2262                }
2263
2264                if (exists) {
2265                        err = fiemap_fill_next_extent(fieinfo,
2266                                (__u64)es.es_lblk << blksize_bits,
2267                                (__u64)es.es_pblk << blksize_bits,
2268                                (__u64)es.es_len << blksize_bits,
2269                                flags);
2270                        if (err < 0)
2271                                break;
2272                        if (err == 1) {
2273                                err = 0;
2274                                break;
2275                        }
2276                }
2277
2278                block = es.es_lblk + es.es_len;
2279        }
2280
2281        ext4_ext_drop_refs(path);
2282        kfree(path);
2283        return err;
2284}
2285
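/*
 * ext4_fill_es_cache_info:
 * Like ext4_fill_fiemap_extents(), but reports what is currently cached
 * in the extent status tree (including holes and delayed extents) rather
 * than walking the on-disk extent tree.
 */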
2286static int ext4_fill_es_cache_info(struct inode *inode,
2287                                   ext4_lblk_t block, ext4_lblk_t num,
2288                                   struct fiemap_extent_info *fieinfo)
2289{
2290        ext4_lblk_t next, end = block + num - 1;
2291        struct extent_status es;
2292        unsigned char blksize_bits = inode->i_sb->s_blocksize_bits;
2293        unsigned int flags;
2294        int err;
2295
2296        while (block <= end) {
2297                next = 0;
2298                flags = 0;
2299                if (!ext4_es_lookup_extent(inode, block, &next, &es))
2300                        break;
2301                if (ext4_es_is_unwritten(&es))
2302                        flags |= FIEMAP_EXTENT_UNWRITTEN;
2303                if (ext4_es_is_delayed(&es))
2304                        flags |= (FIEMAP_EXTENT_DELALLOC |
2305                                  FIEMAP_EXTENT_UNKNOWN);
2306                if (ext4_es_is_hole(&es))
2307                        flags |= EXT4_FIEMAP_EXTENT_HOLE;
2308                if (next == 0)
2309                        flags |= FIEMAP_EXTENT_LAST;
2310                if (flags & (FIEMAP_EXTENT_DELALLOC|
2311                             EXT4_FIEMAP_EXTENT_HOLE))
2312                        es.es_pblk = 0;
2313                else
2314                        es.es_pblk = ext4_es_pblock(&es);
2315                err = fiemap_fill_next_extent(fieinfo,
2316                                (__u64)es.es_lblk << blksize_bits,
2317                                (__u64)es.es_pblk << blksize_bits,
2318                                (__u64)es.es_len << blksize_bits,
2319                                flags);
2320                if (next == 0)
2321                        break;
2322                block = next;
2323                if (err < 0)
2324                        return err;
2325                if (err == 1)
2326                        return 0;
2327        }
2328        return 0;
2329}
2330
2331
2332/*
2333 * ext4_ext_determine_hole - determine hole around given block
2334 * @inode:      inode we lookup in
2335 * @path:       path in extent tree to @lblk
2336 * @lblk:       pointer to logical block around which we want to determine hole
2337 *
2338 * Determine hole length (and start if easily possible) around given logical
2339 * block. We don't try too hard to find the beginning of the hole but if @path
2340 * actually points to the extent before @lblk, we provide it.
2341 *
2342 * The function returns the length of a hole starting at @lblk. We update @lblk
2343 * to the beginning of the hole if we managed to find it.
2344 */
2345static ext4_lblk_t ext4_ext_determine_hole(struct inode *inode,
2346                                           struct ext4_ext_path *path,
2347                                           ext4_lblk_t *lblk)
2348{
2349        int depth = ext_depth(inode);
2350        struct ext4_extent *ex;
2351        ext4_lblk_t len;
2352
2353        ex = path[depth].p_ext;
2354        if (ex == NULL) {
2355                /* there is no extent yet, so gap is [0;-] */
2356                *lblk = 0;
2357                len = EXT_MAX_BLOCKS;
2358        } else if (*lblk < le32_to_cpu(ex->ee_block)) {
2359                len = le32_to_cpu(ex->ee_block) - *lblk;
2360        } else if (*lblk >= le32_to_cpu(ex->ee_block)
2361                        + ext4_ext_get_actual_len(ex)) {
2362                ext4_lblk_t next;
2363
2364                *lblk = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex);
2365                next = ext4_ext_next_allocated_block(path);
2366                BUG_ON(next == *lblk);
2367                len = next - *lblk;
2368        } else {
2369                BUG();
2370        }
2371        return len;
2372}
2373
2374/*
2375 * ext4_ext_put_gap_in_cache:
2376 * calculate boundaries of the gap that the requested block fits into
2377 * and cache this gap
2378 */
2379static void
2380ext4_ext_put_gap_in_cache(struct inode *inode, ext4_lblk_t hole_start,
2381                          ext4_lblk_t hole_len)
2382{
2383        struct extent_status es;
2384
2385        ext4_es_find_extent_range(inode, &ext4_es_is_delayed, hole_start,
2386                                  hole_start + hole_len - 1, &es);
2387        if (es.es_len) {
2388                /* Is there a delayed extent containing lblock? */
2389                if (es.es_lblk <= hole_start)
2390                        return;
2391                hole_len = min(es.es_lblk - hole_start, hole_len);
2392        }
2393        ext_debug(" -> %u:%u\n", hole_start, hole_len);
2394        ext4_es_insert_extent(inode, hole_start, hole_len, ~0,
2395                              EXTENT_STATUS_HOLE);
2396}
2397
2398/*
2399 * ext4_ext_rm_idx:
2400 * removes index from the index block.
2401 */
2402static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
2403                        struct ext4_ext_path *path, int depth)
2404{
2405        int err;
2406        ext4_fsblk_t leaf;
2407
2408        /* free index block */
2409        depth--;
2410        path = path + depth;
2411        leaf = ext4_idx_pblock(path->p_idx);
2412        if (unlikely(path->p_hdr->eh_entries == 0)) {
2413                EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0");
2414                return -EFSCORRUPTED;
2415        }
2416        err = ext4_ext_get_access(handle, inode, path);
2417        if (err)
2418                return err;
2419
2420        if (path->p_idx != EXT_LAST_INDEX(path->p_hdr)) {
2421                int len = EXT_LAST_INDEX(path->p_hdr) - path->p_idx;
2422                len *= sizeof(struct ext4_extent_idx);
2423                memmove(path->p_idx, path->p_idx + 1, len);
2424        }
2425
2426        le16_add_cpu(&path->p_hdr->eh_entries, -1);
2427        err = ext4_ext_dirty(handle, inode, path);
2428        if (err)
2429                return err;
2430        ext_debug("index is empty, remove it, free block %llu\n", leaf);
2431        trace_ext4_ext_rm_idx(inode, leaf);
2432
2433        ext4_free_blocks(handle, inode, NULL, leaf, 1,
2434                         EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
2435
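        /*
         * If the index we just updated is the first entry in its block,
         * the parent's index entry must be updated to the child's new
         * starting block; keep propagating upward while we stay on the
         * first index at each level.
         */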
2436        while (--depth >= 0) {
2437                if (path->p_idx != EXT_FIRST_INDEX(path->p_hdr))
2438                        break;
2439                path--;
2440                err = ext4_ext_get_access(handle, inode, path);
2441                if (err)
2442                        break;
2443                path->p_idx->ei_block = (path+1)->p_idx->ei_block;
2444                err = ext4_ext_dirty(handle, inode, path);
2445                if (err)
2446                        break;
2447        }
2448        return err;
2449}
2450
2451/*
2452 * ext4_ext_calc_credits_for_single_extent:
2453 * This routine returns the max. credits needed to insert an extent
2454 * into the extent tree.
2455 * When passing the actual path, the caller should calculate credits
2456 * under i_data_sem.
2457 */
2458int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks,
2459                                                struct ext4_ext_path *path)
2460{
2461        if (path) {
2462                int depth = ext_depth(inode);
2463                int ret = 0;
2464
2465                /* probably there is space in leaf? */
2466                if (le16_to_cpu(path[depth].p_hdr->eh_entries)
2467                                < le16_to_cpu(path[depth].p_hdr->eh_max)) {
2468
2469                        /*
2470                         *  There is some space in the leaf, so there is
2471                         *  no need to account for a leaf block credit.
2472                         *
2473                         *  Bitmaps, block group descriptor blocks
2474                         *  and other metadata blocks still need to be
2475                         *  accounted for.
2476                         */
2477                        /* 1 bitmap, 1 block group descriptor */
2478                        ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb);
2479                        return ret;
2480                }
2481        }
2482
2483        return ext4_chunk_trans_blocks(inode, nrblocks);
2484}
2485
2486/*
2487 * How many index/leaf blocks need to be changed/allocated to add @extents extents?
2488 *
2489 * If we add a single extent, then in the worst case each tree level's
2490 * index/leaf needs to be changed in case the tree splits.
2491 *
2492 * If more extents are inserted, they could cause the whole tree to split more
2493 * than once, but this is really rare.
2494 */
2495int ext4_ext_index_trans_blocks(struct inode *inode, int extents)
2496{
2497        int index;
2498        int depth;
2499
2500        /* If we are converting the inline data, only one is needed here. */
2501        if (ext4_has_inline_data(inode))
2502                return 1;
2503
2504        depth = ext_depth(inode);
2505
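        /*
         * Rough heuristic: allow two blocks per tree level for a single
         * extent (each level may be both modified and split), and three
         * per level when inserting multiple extents.
         */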
2506        if (extents <= 1)
2507                index = depth * 2;
2508        else
2509                index = depth * 3;
2510
2511        return index;
2512}
2513
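/*
 * get_default_free_blocks_flags:
 * Pick the ext4_free_blocks() flags for this inode's blocks: directory,
 * symlink and EA-inode blocks are treated as metadata (METADATA | FORGET),
 * data blocks in data=journal mode only need FORGET, and ordinary data
 * blocks need no special handling.
 */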
2514static inline int get_default_free_blocks_flags(struct inode *inode)
2515{
2516        if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode) ||
2517            ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE))
2518                return EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET;
2519        else if (ext4_should_journal_data(inode))
2520                return EXT4_FREE_BLOCKS_FORGET;
2521        return 0;
2522}
2523
2524/*
2525 * ext4_rereserve_cluster - increment the reserved cluster count when
2526 *                          freeing a cluster with a pending reservation
2527 *
2528 * @inode - file containing the cluster
2529 * @lblk - logical block in cluster to be reserved
2530 *
2531 * Increments the reserved cluster count and adjusts quota in a bigalloc
2532 * file system when freeing a partial cluster containing at least one
2533 * delayed and unwritten block.  A partial cluster meeting that
2534 * requirement will have a pending reservation.  If so, the
2535 * RERESERVE_CLUSTER flag is used when calling ext4_free_blocks() to
2536 * defer reserved and allocated space accounting to a subsequent call
2537 * to this function.
2538 */
2539static void ext4_rereserve_cluster(struct inode *inode, ext4_lblk_t lblk)
2540{
2541        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2542        struct ext4_inode_info *ei = EXT4_I(inode);
2543
2544        dquot_reclaim_block(inode, EXT4_C2B(sbi, 1));
2545
2546        spin_lock(&ei->i_block_reservation_lock);
2547        ei->i_reserved_data_blocks++;
2548        percpu_counter_add(&sbi->s_dirtyclusters_counter, 1);
2549        spin_unlock(&ei->i_block_reservation_lock);
2550
2551        percpu_counter_add(&sbi->s_freeclusters_counter, 1);
2552        ext4_remove_pending(inode, lblk);
2553}
2554
2555static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
2556                              struct ext4_extent *ex,
2557                              struct partial_cluster *partial,
2558                              ext4_lblk_t from, ext4_lblk_t to)
2559{
2560        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2561        unsigned short ee_len = ext4_ext_get_actual_len(ex);
2562        ext4_fsblk_t last_pblk, pblk;
2563        ext4_lblk_t num;
2564        int flags;
2565
2566        /* only extent tail removal is allowed */
2567        if (from < le32_to_cpu(ex->ee_block) ||
2568            to != le32_to_cpu(ex->ee_block) + ee_len - 1) {
2569                ext4_error(sbi->s_sb,
2570                           "strange request: removal(2) %u-%u from %u:%u",
2571                           from, to, le32_to_cpu(ex->ee_block), ee_len);
2572                return 0;
2573        }
2574
2575#ifdef EXTENTS_STATS
2576        spin_lock(&sbi->s_ext_stats_lock);
2577        sbi->s_ext_blocks += ee_len;
2578        sbi->s_ext_extents++;
2579        if (ee_len < sbi->s_ext_min)
2580                sbi->s_ext_min = ee_len;
2581        if (ee_len > sbi->s_ext_max)
2582                sbi->s_ext_max = ee_len;
2583        if (ext_depth(inode) > sbi->s_depth_max)
2584                sbi->s_depth_max = ext_depth(inode);
2585        spin_unlock(&sbi->s_ext_stats_lock);
2586#endif
2587
2588        trace_ext4_remove_blocks(inode, ex, from, to, partial);
2589
2590        /*
2591         * if we have a partial cluster, and it's different from the
2592         * cluster of the last block in the extent, we free it
2593         */
2594        last_pblk = ext4_ext_pblock(ex) + ee_len - 1;
2595
2596        if (partial->state != initial &&
2597            partial->pclu != EXT4_B2C(sbi, last_pblk)) {
2598                if (partial->state == tofree) {
2599                        flags = get_default_free_blocks_flags(inode);
2600                        if (ext4_is_pending(inode, partial->lblk))
2601                                flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER;
2602                        ext4_free_blocks(handle, inode, NULL,
2603                                         EXT4_C2B(sbi, partial->pclu),
2604                                         sbi->s_cluster_ratio, flags);
2605                        if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)
2606                                ext4_rereserve_cluster(inode, partial->lblk);
2607                }
2608                partial->state = initial;
2609        }
2610
2611        num = le32_to_cpu(ex->ee_block) + ee_len - from;
2612        pblk = ext4_ext_pblock(ex) + ee_len - num;
2613
2614        /*
2615         * We free the partial cluster at the end of the extent (if any),
2616         * unless the cluster is used by another extent (partial_cluster
2617         * state is nofree).  If a partial cluster exists here, it must be
2618         * shared with the last block in the extent.
2619         */
2620        flags = get_default_free_blocks_flags(inode);
2621
2622        /* partial, left end cluster aligned, right end unaligned */
2623        if ((EXT4_LBLK_COFF(sbi, to) != sbi->s_cluster_ratio - 1) &&
2624            (EXT4_LBLK_CMASK(sbi, to) >= from) &&
2625            (partial->state != nofree)) {
2626                if (ext4_is_pending(inode, to))
2627                        flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER;
2628                ext4_free_blocks(handle, inode, NULL,
2629                                 EXT4_PBLK_CMASK(sbi, last_pblk),
2630                                 sbi->s_cluster_ratio, flags);
2631                if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)
2632                        ext4_rereserve_cluster(inode, to);
2633                partial->state = initial;
2634                flags = get_default_free_blocks_flags(inode);
2635        }
2636
2637        flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER;
2638
2639        /*
2640         * For bigalloc file systems, we never free a partial cluster
2641         * at the beginning of the extent.  Instead, we check to see if we
2642         * need to free it on a subsequent call to ext4_remove_blocks,
2643         * or at the end of ext4_ext_rm_leaf or ext4_ext_remove_space.
2644         */
2645        flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER;
2646        ext4_free_blocks(handle, inode, NULL, pblk, num, flags);
2647
2648        /* reset the partial cluster if we've freed past it */
2649        if (partial->state != initial && partial->pclu != EXT4_B2C(sbi, pblk))
2650                partial->state = initial;
2651
2652        /*
2653         * If we've freed the entire extent but the beginning is not left
2654         * cluster aligned and is not marked as ineligible for freeing we
2655         * record the partial cluster at the beginning of the extent.  It
2656         * wasn't freed by the preceding ext4_free_blocks() call, and we
2657         * need to look farther to the left to determine if it's to be freed
2658         * (not shared with another extent). Else, reset the partial
2659         * cluster - we're either done freeing or the beginning of the
2660         * extent is left cluster aligned.
2661         */
2662        if (EXT4_LBLK_COFF(sbi, from) && num == ee_len) {
2663                if (partial->state == initial) {
2664                        partial->pclu = EXT4_B2C(sbi, pblk);
2665                        partial->lblk = from;
2666                        partial->state = tofree;
2667                }
2668        } else {
2669                partial->state = initial;
2670        }
2671
2672        return 0;
2673}
2674
2675/*
2676 * ext4_ext_rm_leaf() removes the extents associated with the
2677 * blocks appearing between "start" and "end".  Both "start"
2678 * and "end" must appear in the same extent or EIO is returned.
2679 *
2680 * @handle: The journal handle
2681 * @inode:  The file's inode
2682 * @path:   The path to the leaf
2683 * @partial_cluster: The cluster which we'll have to free if all extents
2684 *                   have been released from it.  However, if this value is
2685 *                   negative, it's a cluster just to the right of the
2686 *                   punched region and it must not be freed.
2687 * @start:  The first block to remove
2688 * @end:    The last block to remove
2689 */
2690static int
2691ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2692                 struct ext4_ext_path *path,
2693                 struct partial_cluster *partial,
2694                 ext4_lblk_t start, ext4_lblk_t end)
2695{
2696        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2697        int err = 0, correct_index = 0;
2698        int depth = ext_depth(inode), credits, revoke_credits;
2699        struct ext4_extent_header *eh;
2700        ext4_lblk_t a, b;
2701        unsigned num;
2702        ext4_lblk_t ex_ee_block;
2703        unsigned short ex_ee_len;
2704        unsigned unwritten = 0;
2705        struct ext4_extent *ex;
2706        ext4_fsblk_t pblk;
2707
2708        /* the header must already have been checked in ext4_ext_remove_space() */
2709        ext_debug("truncate since %u in leaf to %u\n", start, end);
2710        if (!path[depth].p_hdr)
2711                path[depth].p_hdr = ext_block_hdr(path[depth].p_bh);
2712        eh = path[depth].p_hdr;
2713        if (unlikely(path[depth].p_hdr == NULL)) {
2714                EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
2715                return -EFSCORRUPTED;
2716        }
2717        /* find where to start removing */
2718        ex = path[depth].p_ext;
2719        if (!ex)
2720                ex = EXT_LAST_EXTENT(eh);
2721
2722        ex_ee_block = le32_to_cpu(ex->ee_block);
2723        ex_ee_len = ext4_ext_get_actual_len(ex);
2724
2725        trace_ext4_ext_rm_leaf(inode, start, ex, partial);
2726
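        /*
         * Walk the leaf backwards, from the last extent that overlaps the
         * removed range toward the first, trimming or removing each
         * extent that intersects [start, end].
         */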
2727        while (ex >= EXT_FIRST_EXTENT(eh) &&
2728                        ex_ee_block + ex_ee_len > start) {
2729
2730                if (ext4_ext_is_unwritten(ex))
2731                        unwritten = 1;
2732                else
2733                        unwritten = 0;
2734
2735                ext_debug("remove ext %u:[%d]%d\n", ex_ee_block,
2736                          unwritten, ex_ee_len);
2737                path[depth].p_ext = ex;
2738
2739                a = ex_ee_block > start ? ex_ee_block : start;
2740                b = ex_ee_block+ex_ee_len - 1 < end ?
2741                        ex_ee_block+ex_ee_len - 1 : end;
2742
2743                ext_debug("  border %u:%u\n", a, b);
2744
2745                /* If this extent is beyond the end of the hole, skip it */
2746                if (end < ex_ee_block) {
2747                        /*
2748                         * We're going to skip this extent and move to another,
2749                         * so note that its first cluster is in use to avoid
2750                         * freeing it when removing blocks.  Eventually, the
2751                         * right edge of the truncated/punched region will
2752                         * be just to the left.
2753                         */
2754                        if (sbi->s_cluster_ratio > 1) {
2755                                pblk = ext4_ext_pblock(ex);
2756                                partial->pclu = EXT4_B2C(sbi, pblk);
2757                                partial->state = nofree;
2758                        }
2759                        ex--;
2760                        ex_ee_block = le32_to_cpu(ex->ee_block);
2761                        ex_ee_len = ext4_ext_get_actual_len(ex);
2762                        continue;
2763                } else if (b != ex_ee_block + ex_ee_len - 1) {
2764                        EXT4_ERROR_INODE(inode,
2765                                         "can not handle truncate %u:%u "
2766                                         "on extent %u:%u",
2767                                         start, end, ex_ee_block,
2768                                         ex_ee_block + ex_ee_len - 1);
2769                        err = -EFSCORRUPTED;
2770                        goto out;
2771                } else if (a != ex_ee_block) {
2772                        /* remove tail of the extent */
2773                        num = a - ex_ee_block;
2774                } else {
2775                        /* remove whole extent: excellent! */
2776                        num = 0;
2777                }
2778                /*
2779                 * 3 for leaf, sb, and inode plus 2 (bmap and group
2780                 * descriptor) for each block group; assume two block
2781                 * groups plus ex_ee_len/blocks_per_block_group for
2782                 * the worst case
2783                 */
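                    /*
                     * Worked example (illustrative numbers only): with 4KiB
                     * blocks there are 32768 blocks per group, so an extent
                     * of 40000 blocks gives credits = 7 + 2 * (40000 / 32768)
                     * = 9 before the index and quota credits are added below.
                     */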
2784                credits = 7 + 2*(ex_ee_len/EXT4_BLOCKS_PER_GROUP(inode->i_sb));
2785                if (ex == EXT_FIRST_EXTENT(eh)) {
2786                        correct_index = 1;
2787                        credits += (ext_depth(inode)) + 1;
2788                }
2789                credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
2790                /*
2791                 * We may end up freeing some index blocks and data from the
2792                 * punched range. Note that partial clusters are accounted for
2793                 * by ext4_free_data_revoke_credits().
2794                 */
2795                revoke_credits =
2796                        ext4_free_metadata_revoke_credits(inode->i_sb,
2797                                                          ext_depth(inode)) +
2798                        ext4_free_data_revoke_credits(inode, b - a + 1);
2799
2800                err = ext4_datasem_ensure_credits(handle, inode, credits,
2801                                                  credits, revoke_credits);
2802                if (err) {
2803                        if (err > 0)
2804                                err = -EAGAIN;
2805                        goto out;
2806                }
2807
2808                err = ext4_ext_get_access(handle, inode, path + depth);
2809                if (err)
2810                        goto out;
2811
2812                err = ext4_remove_blocks(handle, inode, ex, partial, a, b);
2813                if (err)
2814                        goto out;
2815
2816                if (num == 0)
2817                        /* this extent is removed; mark slot entirely unused */
2818                        ext4_ext_store_pblock(ex, 0);
2819
2820                ex->ee_len = cpu_to_le16(num);
2821                /*
2822                 * Do not mark unwritten if all the blocks in the
2823                 * extent have been removed.
2824                 */
2825                if (unwritten && num)
2826                        ext4_ext_mark_unwritten(ex);
2827                /*
2828                 * If the extent was completely released,
2829                 * we need to remove it from the leaf
2830                 */
2831                if (num == 0) {
2832                        if (end != EXT_MAX_BLOCKS - 1) {
2833                                /*
2834                                 * For hole punching, we need to scoot all the
2835                                 * extents up when an extent is removed so that
2836                                 * we don't have blank extents in the middle
2837                                 */
2838                                memmove(ex, ex+1, (EXT_LAST_EXTENT(eh) - ex) *
2839                                        sizeof(struct ext4_extent));
2840
2841                                /* Now get rid of the one at the end */
2842                                memset(EXT_LAST_EXTENT(eh), 0,
2843                                        sizeof(struct ext4_extent));
2844                        }
2845                        le16_add_cpu(&eh->eh_entries, -1);
2846                }
2847
2848                err = ext4_ext_dirty(handle, inode, path + depth);
2849                if (err)
2850                        goto out;
2851
2852                ext_debug("new extent: %u:%u:%llu\n", ex_ee_block, num,
2853                                ext4_ext_pblock(ex));
2854                ex--;
2855                ex_ee_block = le32_to_cpu(ex->ee_block);
2856                ex_ee_len = ext4_ext_get_actual_len(ex);
2857        }
2858
2859        if (correct_index && eh->eh_entries)
2860                err = ext4_ext_correct_indexes(handle, inode, path);
2861
2862        /*
2863         * If there's a partial cluster and at least one extent remains in
2864         * the leaf, free the partial cluster if it isn't shared with the
2865         * current extent.  If it is shared with the current extent
2866         * we reset the partial cluster because we've reached the start of the
2867         * truncated/punched region and we're done removing blocks.
2868         */
2869        if (partial->state == tofree && ex >= EXT_FIRST_EXTENT(eh)) {
2870                pblk = ext4_ext_pblock(ex) + ex_ee_len - 1;
2871                if (partial->pclu != EXT4_B2C(sbi, pblk)) {
2872                        int flags = get_default_free_blocks_flags(inode);
2873
2874                        if (ext4_is_pending(inode, partial->lblk))
2875                                flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER;
2876                        ext4_free_blocks(handle, inode, NULL,
2877                                         EXT4_C2B(sbi, partial->pclu),
2878                                         sbi->s_cluster_ratio, flags);
2879                        if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)
2880                                ext4_rereserve_cluster(inode, partial->lblk);
2881                }
2882                partial->state = initial;
2883        }
2884
2885        /* if this leaf is free, then we should
2886         * remove it from the index block above */
2887        if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL)
2888                err = ext4_ext_rm_idx(handle, inode, path, depth);
2889
2890out:
2891        return err;
2892}
2893
2894/*
2895 * ext4_ext_more_to_rm:
2896 * returns 1 if the current index has to be freed (even partially)
2897 */
2898static int
2899ext4_ext_more_to_rm(struct ext4_ext_path *path)
2900{
2901        BUG_ON(path->p_idx == NULL);
2902
2903        if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr))
2904                return 0;
2905
2906        /*
2907         * if truncate on deeper level happened, it wasn't partial,
2908         * so we have to consider current index for truncation
2909         */
2910        if (le16_to_cpu(path->p_hdr->eh_entries) == path->p_block)
2911                return 0;
2912        return 1;
2913}
2914
2915int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
2916                          ext4_lblk_t end)
2917{
2918        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2919        int depth = ext_depth(inode);
2920        struct ext4_ext_path *path = NULL;
2921        struct partial_cluster partial;
2922        handle_t *handle;
2923        int i = 0, err = 0;
2924
2925        partial.pclu = 0;
2926        partial.lblk = 0;
2927        partial.state = initial;
2928
2929        ext_debug("truncate since %u to %u\n", start, end);
2930
2931        /* the first extent we are going to free will probably be the last in the block */
2932        handle = ext4_journal_start_with_revoke(inode, EXT4_HT_TRUNCATE,
2933                        depth + 1,
2934                        ext4_free_metadata_revoke_credits(inode->i_sb, depth));
2935        if (IS_ERR(handle))
2936                return PTR_ERR(handle);
2937
2938again:
2939        trace_ext4_ext_remove_space(inode, start, end, depth);
2940
2941        /*
2942         * Check if we are removing extents inside the extent tree. If that
2943         * is the case, we are going to punch a hole inside the extent tree
2944         * so we have to check whether we need to split the extent covering
2945         * the last block to remove so we can easily remove the part of it
2946         * in ext4_ext_rm_leaf().
2947         */
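            /*
             * Example (hypothetical layout): punching blocks 10..20 out of an
             * extent covering 0..99 splits it at block 21 into [0..20] and
             * [21..99], so ext4_ext_rm_leaf() only has to trim the tail of
             * the first half.
             */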
2948        if (end < EXT_MAX_BLOCKS - 1) {
2949                struct ext4_extent *ex;
2950                ext4_lblk_t ee_block, ex_end, lblk;
2951                ext4_fsblk_t pblk;
2952
2953                /* find extent for or closest extent to this block */
2954                path = ext4_find_extent(inode, end, NULL, EXT4_EX_NOCACHE);
2955                if (IS_ERR(path)) {
2956                        ext4_journal_stop(handle);
2957                        return PTR_ERR(path);
2958                }
2959                depth = ext_depth(inode);
2960                /* Leaf may not exist only if the inode has no blocks at all */
2961                ex = path[depth].p_ext;
2962                if (!ex) {
2963                        if (depth) {
2964                                EXT4_ERROR_INODE(inode,
2965                                                 "path[%d].p_hdr == NULL",
2966                                                 depth);
2967                                err = -EFSCORRUPTED;
2968                        }
2969                        goto out;
2970                }
2971
2972                ee_block = le32_to_cpu(ex->ee_block);
2973                ex_end = ee_block + ext4_ext_get_actual_len(ex) - 1;
2974
2975                /*
2976                 * See if the last block is inside the extent, if so split
2977                 * the extent at 'end' block so we can easily remove the
2978                 * tail of the first part of the split extent in
2979                 * ext4_ext_rm_leaf().
2980                 */
2981                if (end >= ee_block && end < ex_end) {
2982
2983                        /*
2984                         * If we're going to split the extent, note that
2985                         * the cluster containing the block after 'end' is
2986                         * in use to avoid freeing it when removing blocks.
2987                         */
2988                        if (sbi->s_cluster_ratio > 1) {
2989                                pblk = ext4_ext_pblock(ex) + end - ee_block + 1;
2990                                partial.pclu = EXT4_B2C(sbi, pblk);
2991                                partial.state = nofree;
2992                        }
2993
2994                        /*
2995                         * Split the extent in two so that 'end' is the last
2996                         * block in the first new extent. Also we should not
2997                         * fail removing space due to ENOSPC so try to use
2998                         * reserved block if that happens.
2999                         */
3000                        err = ext4_force_split_extent_at(handle, inode, &path,
3001                                                         end + 1, 1);
3002                        if (err < 0)
3003                                goto out;
3004
3005                } else if (sbi->s_cluster_ratio > 1 && end >= ex_end &&
3006                           partial.state == initial) {
3007                        /*
3008                         * If we're punching, there's an extent to the right.
3009                         * If the partial cluster hasn't been set, set it to
3010                         * that extent's first cluster and its state to nofree
3011                         * so it won't be freed should it contain blocks to be
3012                         * removed. If it's already set (tofree/nofree), we're
3013                         * retrying and keep the original partial cluster info
3014                         * so a cluster marked tofree as a result of earlier
3015                         * extent removal is not lost.
3016                         */
3017                        lblk = ex_end + 1;
3018                        err = ext4_ext_search_right(inode, path, &lblk, &pblk,
3019                                                    &ex);
3020                        if (err)
3021                                goto out;
3022                        if (pblk) {
3023                                partial.pclu = EXT4_B2C(sbi, pblk);
3024                                partial.state = nofree;
3025                        }
3026                }
3027        }
3028        /*
3029         * We start scanning from the right side, freeing all the blocks
3030         * after i_size and walking into the tree depth-wise.
3031         */
3032        depth = ext_depth(inode);
3033        if (path) {
3034                int k = i = depth;
3035                while (--k > 0)
3036                        path[k].p_block =
3037                                le16_to_cpu(path[k].p_hdr->eh_entries)+1;
3038        } else {
3039                path = kcalloc(depth + 1, sizeof(struct ext4_ext_path),
3040                               GFP_NOFS);
3041                if (path == NULL) {
3042                        ext4_journal_stop(handle);
3043                        return -ENOMEM;
3044                }
3045                path[0].p_maxdepth = path[0].p_depth = depth;
3046                path[0].p_hdr = ext_inode_hdr(inode);
3047                i = 0;
3048
3049                if (ext4_ext_check(inode, path[0].p_hdr, depth, 0)) {
3050                        err = -EFSCORRUPTED;
3051                        goto out;
3052                }
3053        }
3054        err = 0;
3055
3056        while (i >= 0 && err == 0) {
3057                if (i == depth) {
3058                        /* this is leaf block */
3059                        err = ext4_ext_rm_leaf(handle, inode, path,
3060                                               &partial, start, end);
3061                        /* root level has p_bh == NULL, brelse() eats this */
3062                        brelse(path[i].p_bh);
3063                        path[i].p_bh = NULL;
3064                        i--;
3065                        continue;
3066                }
3067
3068                /* this is index block */
3069                if (!path[i].p_hdr) {
3070                        ext_debug("initialize header\n");
3071                        path[i].p_hdr = ext_block_hdr(path[i].p_bh);
3072                }
3073
3074                if (!path[i].p_idx) {
3075                        /* this level hasn't been touched yet */
3076                        path[i].p_idx = EXT_LAST_INDEX(path[i].p_hdr);
3077                        path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries)+1;
3078                        ext_debug("init index ptr: hdr 0x%p, num %d\n",
3079                                  path[i].p_hdr,
3080                                  le16_to_cpu(path[i].p_hdr->eh_entries));
3081                } else {
3082                        /* we were already here, move on to the next index */
3083                        path[i].p_idx--;
3084                }
3085
3086                ext_debug("level %d - index, first 0x%p, cur 0x%p\n",
3087                                i, EXT_FIRST_INDEX(path[i].p_hdr),
3088                                path[i].p_idx);
3089                if (ext4_ext_more_to_rm(path + i)) {
3090                        struct buffer_head *bh;
3091                        /* go to the next level */
3092                        ext_debug("move to level %d (block %llu)\n",
3093                                  i + 1, ext4_idx_pblock(path[i].p_idx));
3094                        memset(path + i + 1, 0, sizeof(*path));
3095                        bh = read_extent_tree_block(inode,
3096                                ext4_idx_pblock(path[i].p_idx), depth - i - 1,
3097                                EXT4_EX_NOCACHE);
3098                        if (IS_ERR(bh)) {
3099                                /* should we reset i_size? */
3100                                err = PTR_ERR(bh);
3101                                break;
3102                        }
3103                        /* Yield here to deal with large extent trees.
3104                         * Should be a no-op if we did IO above. */
3105                        cond_resched();
3106                        if (WARN_ON(i + 1 > depth)) {
3107                                err = -EFSCORRUPTED;
3108                                break;
3109                        }
3110                        path[i + 1].p_bh = bh;
3111
3112                        /* save actual number of indexes since this
3113                         * number is changed at the next iteration */
3114                        path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries);
3115                        i++;
3116                } else {
3117                        /* we finished processing this index, go up */
3118                        if (path[i].p_hdr->eh_entries == 0 && i > 0) {
3119                                /* index is empty, remove it;
3120                                 * the handle has already been prepared
3121                                 * when the leaf was truncated */
3122                                err = ext4_ext_rm_idx(handle, inode, path, i);
3123                        }
3124                        /* root level has p_bh == NULL, brelse() eats this */
3125                        brelse(path[i].p_bh);
3126                        path[i].p_bh = NULL;
3127                        i--;
3128                        ext_debug("return to level %d\n", i);
3129                }
3130        }
3131
3132        trace_ext4_ext_remove_space_done(inode, start, end, depth, &partial,
3133                                         path->p_hdr->eh_entries);
3134
3135        /*
3136         * if there's a partial cluster and we have removed the first extent
3137         * in the file, then we also free the partial cluster, if any
3138         */
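            /*
             * E.g. (illustrative): on a bigalloc filesystem with
             * s_cluster_ratio = 16, this frees the 16 blocks of cluster
             * partial.pclu, starting at block EXT4_C2B(sbi, partial.pclu).
             */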
3139        if (partial.state == tofree && err == 0) {
3140                int flags = get_default_free_blocks_flags(inode);
3141
3142                if (ext4_is_pending(inode, partial.lblk))
3143                        flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER;
3144                ext4_free_blocks(handle, inode, NULL,
3145                                 EXT4_C2B(sbi, partial.pclu),
3146                                 sbi->s_cluster_ratio, flags);
3147                if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)
3148                        ext4_rereserve_cluster(inode, partial.lblk);
3149                partial.state = initial;
3150        }
3151
3152        /* TODO: flexible tree reduction should be here */
3153        if (path->p_hdr->eh_entries == 0) {
3154                /*
3155                 * truncate to zero freed all the tree,
3156                 * so we need to correct eh_depth
3157                 */
3158                err = ext4_ext_get_access(handle, inode, path);
3159                if (err == 0) {
3160                        ext_inode_hdr(inode)->eh_depth = 0;
3161                        ext_inode_hdr(inode)->eh_max =
3162                                cpu_to_le16(ext4_ext_space_root(inode, 0));
3163                        err = ext4_ext_dirty(handle, inode, path);
3164                }
3165        }
3166out:
3167        ext4_ext_drop_refs(path);
3168        kfree(path);
3169        path = NULL;
3170        if (err == -EAGAIN)
3171                goto again;
3172        ext4_journal_stop(handle);
3173
3174        return err;
3175}
3176
3177/*
3178 * called at mount time
3179 */
3180void ext4_ext_init(struct super_block *sb)
3181{
3182        /*
3183         * possible initialization would be here
3184         */
3185
3186        if (ext4_has_feature_extents(sb)) {
3187#if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS)
3188                printk(KERN_INFO "EXT4-fs: file extents enabled"
3189#ifdef AGGRESSIVE_TEST
3190                       ", aggressive tests"
3191#endif
3192#ifdef CHECK_BINSEARCH
3193                       ", check binsearch"
3194#endif
3195#ifdef EXTENTS_STATS
3196                       ", stats"
3197#endif
3198                       "\n");
3199#endif
3200#ifdef EXTENTS_STATS
3201                spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock);
3202                EXT4_SB(sb)->s_ext_min = 1 << 30;
3203                EXT4_SB(sb)->s_ext_max = 0;
3204#endif
3205        }
3206}
3207
3208/*
3209 * called at umount time
3210 */
3211void ext4_ext_release(struct super_block *sb)
3212{
3213        if (!ext4_has_feature_extents(sb))
3214                return;
3215
3216#ifdef EXTENTS_STATS
3217        if (EXT4_SB(sb)->s_ext_blocks && EXT4_SB(sb)->s_ext_extents) {
3218                struct ext4_sb_info *sbi = EXT4_SB(sb);
3219                printk(KERN_ERR "EXT4-fs: %lu blocks in %lu extents (%lu ave)\n",
3220                        sbi->s_ext_blocks, sbi->s_ext_extents,
3221                        sbi->s_ext_blocks / sbi->s_ext_extents);
3222                printk(KERN_ERR "EXT4-fs: extents: %lu min, %lu max, max depth %lu\n",
3223                        sbi->s_ext_min, sbi->s_ext_max, sbi->s_depth_max);
3224        }
3225#endif
3226}
3227
3228static int ext4_zeroout_es(struct inode *inode, struct ext4_extent *ex)
3229{
3230        ext4_lblk_t  ee_block;
3231        ext4_fsblk_t ee_pblock;
3232        unsigned int ee_len;
3233
3234        ee_block  = le32_to_cpu(ex->ee_block);
3235        ee_len    = ext4_ext_get_actual_len(ex);
3236        ee_pblock = ext4_ext_pblock(ex);
3237
3238        if (ee_len == 0)
3239                return 0;
3240
3241        return ext4_es_insert_extent(inode, ee_block, ee_len, ee_pblock,
3242                                     EXTENT_STATUS_WRITTEN);
3243}
3244
3245/* FIXME!! we need to try to merge to left or right after zero-out  */
3246static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
3247{
3248        ext4_fsblk_t ee_pblock;
3249        unsigned int ee_len;
3250
3251        ee_len    = ext4_ext_get_actual_len(ex);
3252        ee_pblock = ext4_ext_pblock(ex);
3253        return ext4_issue_zeroout(inode, le32_to_cpu(ex->ee_block), ee_pblock,
3254                                  ee_len);
3255}
3256
3257/*
3258 * ext4_split_extent_at() splits an extent at given block.
3259 *
3260 * @handle: the journal handle
3261 * @inode: the file inode
3262 * @path: the path to the extent
3263 * @split: the logical block where the extent is split.
3264 * @split_flag: indicates whether the extent may be zeroed out if the split
3265 *              fails, and the states (initialized or unwritten) of the new extents.
3266 * @flags: flags used to insert the new extent into the extent tree.
3267 *
3268 *
3269 * Splits extent [a, b] into two extents [a, @split) and [@split, b], the states
3270 * of which are determined by split_flag.
3271 *
3272 * There are two cases:
3273 *  a> the extent is split into two extents.
3274 *  b> no split is needed, and the extent is simply marked.
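     * Example (hypothetical extent, for illustration only): an extent covering
     * logical blocks 100..199 mapped at physical block 5000, split at
     * @split = 150, becomes [100..149] still at 5000 (ee_len trimmed to 50)
     * plus a new extent [150..199] starting at physical block 5050.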
3275 *
3276 * return 0 on success.
3277 */
3278static int ext4_split_extent_at(handle_t *handle,
3279                             struct inode *inode,
3280                             struct ext4_ext_path **ppath,
3281                             ext4_lblk_t split,
3282                             int split_flag,
3283                             int flags)
3284{
3285        struct ext4_ext_path *path = *ppath;
3286        ext4_fsblk_t newblock;
3287        ext4_lblk_t ee_block;
3288        struct ext4_extent *ex, newex, orig_ex, zero_ex;
3289        struct ext4_extent *ex2 = NULL;
3290        unsigned int ee_len, depth;
3291        int err = 0;
3292
3293        BUG_ON((split_flag & (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2)) ==
3294               (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2));
3295
3296        ext_debug("ext4_split_extent_at: inode %lu, logical "
3297                "block %llu\n", inode->i_ino, (unsigned long long)split);
3298
3299        ext4_ext_show_leaf(inode, path);
3300
3301        depth = ext_depth(inode);
3302        ex = path[depth].p_ext;
3303        ee_block = le32_to_cpu(ex->ee_block);
3304        ee_len = ext4_ext_get_actual_len(ex);
3305        newblock = split - ee_block + ext4_ext_pblock(ex);
3306
3307        BUG_ON(split < ee_block || split >= (ee_block + ee_len));
3308        BUG_ON(!ext4_ext_is_unwritten(ex) &&
3309               split_flag & (EXT4_EXT_MAY_ZEROOUT |
3310                             EXT4_EXT_MARK_UNWRIT1 |
3311                             EXT4_EXT_MARK_UNWRIT2));
3312
3313        err = ext4_ext_get_access(handle, inode, path + depth);
3314        if (err)
3315                goto out;
3316
3317        if (split == ee_block) {
3318                /*
3319                 * case b: block @split is the block that the extent begins with
3320                 * then we just change the state of the extent, and splitting
3321                 * is not needed.
3322                 */
3323                if (split_flag & EXT4_EXT_MARK_UNWRIT2)
3324                        ext4_ext_mark_unwritten(ex);
3325                else
3326                        ext4_ext_mark_initialized(ex);
3327
3328                if (!(flags & EXT4_GET_BLOCKS_PRE_IO))
3329                        ext4_ext_try_to_merge(handle, inode, path, ex);
3330
3331                err = ext4_ext_dirty(handle, inode, path + path->p_depth);
3332                goto out;
3333        }
3334
3335        /* case a */
3336        memcpy(&orig_ex, ex, sizeof(orig_ex));
3337        ex->ee_len = cpu_to_le16(split - ee_block);
3338        if (split_flag & EXT4_EXT_MARK_UNWRIT1)
3339                ext4_ext_mark_unwritten(ex);
3340
3341        /*
3342         * path may lead to a new leaf, not to the original leaf any more,
3343         * after ext4_ext_insert_extent() returns.
3344         */
3345        err = ext4_ext_dirty(handle, inode, path + depth);
3346        if (err)
3347                goto fix_extent_len;
3348
3349        ex2 = &newex;
3350        ex2->ee_block = cpu_to_le32(split);
3351        ex2->ee_len   = cpu_to_le16(ee_len - (split - ee_block));
3352        ext4_ext_store_pblock(ex2, newblock);
3353        if (split_flag & EXT4_EXT_MARK_UNWRIT2)
3354                ext4_ext_mark_unwritten(ex2);
3355
3356        err = ext4_ext_insert_extent(handle, inode, ppath, &newex, flags);
3357        if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
3358                if (split_flag & (EXT4_EXT_DATA_VALID1|EXT4_EXT_DATA_VALID2)) {
3359                        if (split_flag & EXT4_EXT_DATA_VALID1) {
3360                                err = ext4_ext_zeroout(inode, ex2);
3361                                zero_ex.ee_block = ex2->ee_block;
3362                                zero_ex.ee_len = cpu_to_le16(
3363                                                ext4_ext_get_actual_len(ex2));
3364                                ext4_ext_store_pblock(&zero_ex,
3365                                                      ext4_ext_pblock(ex2));
3366                        } else {
3367                                err = ext4_ext_zeroout(inode, ex);
3368                                zero_ex.ee_block = ex->ee_block;
3369                                zero_ex.ee_len = cpu_to_le16(
3370                                                ext4_ext_get_actual_len(ex));
3371                                ext4_ext_store_pblock(&zero_ex,
3372                                                      ext4_ext_pblock(ex));
3373                        }
3374                } else {
3375                        err = ext4_ext_zeroout(inode, &orig_ex);
3376                        zero_ex.ee_block = orig_ex.ee_block;
3377                        zero_ex.ee_len = cpu_to_le16(
3378                                                ext4_ext_get_actual_len(&orig_ex));
3379                        ext4_ext_store_pblock(&zero_ex,
3380                                              ext4_ext_pblock(&orig_ex));
3381                }
3382
3383                if (err)
3384                        goto fix_extent_len;
3385                /* update the extent length and mark as initialized */
3386                ex->ee_len = cpu_to_le16(ee_len);
3387                ext4_ext_try_to_merge(handle, inode, path, ex);
3388                err = ext4_ext_dirty(handle, inode, path + path->p_depth);
3389                if (err)
3390                        goto fix_extent_len;
3391
3392                /* update extent status tree */
3393                err = ext4_zeroout_es(inode, &zero_ex);
3394
3395                goto out;
3396        } else if (err)
3397                goto fix_extent_len;
3398
3399out:
3400        ext4_ext_show_leaf(inode, path);
3401        return err;
3402
3403fix_extent_len:
3404        ex->ee_len = orig_ex.ee_len;
3405        ext4_ext_dirty(handle, inode, path + path->p_depth);
3406        return err;
3407}
3408
3409/*
3410 * ext4_split_extent() splits an extent and marks the extent covered
3411 * by @map as split_flag indicates.
3412 *
3413 * It may result in splitting the extent into multiple extents (up to three).
3414 * There are three possibilities:
3415 *   a> There is no split required
3416 *   b> Splits in two extents: Split is happening at either end of the extent
3417 *   c> Splits in three extents: Someone is splitting in the middle of the extent
3418 *
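     * Example (illustrative): with an unwritten extent covering blocks 100..199
     * and @map asking for 140..159, the extent is first split at block 160 and
     * then at block 140, leaving three pieces: [100..139], [140..159] (the
     * range described by @map) and [160..199].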
3419 */
3420static int ext4_split_extent(handle_t *handle,
3421                              struct inode *inode,
3422                              struct ext4_ext_path **ppath,
3423                              struct ext4_map_blocks *map,
3424                              int split_flag,
3425                              int flags)
3426{
3427        struct ext4_ext_path *path = *ppath;
3428        ext4_lblk_t ee_block;
3429        struct ext4_extent *ex;
3430        unsigned int ee_len, depth;
3431        int err = 0;
3432        int unwritten;
3433        int split_flag1, flags1;
3434        int allocated = map->m_len;
3435
3436        depth = ext_depth(inode);
3437        ex = path[depth].p_ext;
3438        ee_block = le32_to_cpu(ex->ee_block);
3439        ee_len = ext4_ext_get_actual_len(ex);
3440        unwritten = ext4_ext_is_unwritten(ex);
3441
3442        if (map->m_lblk + map->m_len < ee_block + ee_len) {
3443                split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT;
3444                flags1 = flags | EXT4_GET_BLOCKS_PRE_IO;
3445                if (unwritten)
3446                        split_flag1 |= EXT4_EXT_MARK_UNWRIT1 |
3447                                       EXT4_EXT_MARK_UNWRIT2;
3448                if (split_flag & EXT4_EXT_DATA_VALID2)
3449                        split_flag1 |= EXT4_EXT_DATA_VALID1;
3450                err = ext4_split_extent_at(handle, inode, ppath,
3451                                map->m_lblk + map->m_len, split_flag1, flags1);
3452                if (err)
3453                        goto out;
3454        } else {
3455                allocated = ee_len - (map->m_lblk - ee_block);
3456        }
3457        /*
3458         * Update path is required because previous ext4_split_extent_at() may
3459         * result in split of original leaf or extent zeroout.
3460         */
3461        path = ext4_find_extent(inode, map->m_lblk, ppath, 0);
3462        if (IS_ERR(path))
3463                return PTR_ERR(path);
3464        depth = ext_depth(inode);
3465        ex = path[depth].p_ext;
3466        if (!ex) {
3467                EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
3468                                 (unsigned long) map->m_lblk);
3469                return -EFSCORRUPTED;
3470        }
3471        unwritten = ext4_ext_is_unwritten(ex);
3472        split_flag1 = 0;
3473
3474        if (map->m_lblk >= ee_block) {
3475                split_flag1 = split_flag & EXT4_EXT_DATA_VALID2;
3476                if (unwritten) {
3477                        split_flag1 |= EXT4_EXT_MARK_UNWRIT1;
3478                        split_flag1 |= split_flag & (EXT4_EXT_MAY_ZEROOUT |
3479                                                     EXT4_EXT_MARK_UNWRIT2);
3480                }
3481                err = ext4_split_extent_at(handle, inode, ppath,
3482                                map->m_lblk, split_flag1, flags);
3483                if (err)
3484                        goto out;
3485        }
3486
3487        ext4_ext_show_leaf(inode, path);
3488out:
3489        return err ? err : allocated;
3490}
3491
3492/*
3493 * This function is called by ext4_ext_map_blocks() if someone tries to write
3494 * to an unwritten extent. It may result in splitting the unwritten
3495 * extent into multiple extents (up to three - one initialized and two
3496 * unwritten).
3497 * There are three possibilities:
3498 *   a> There is no split required: Entire extent should be initialized
3499 *   b> Splits in two extents: Write is happening at either end of the extent
3500 *   c> Splits in three extents: Someone is writing in the middle of the extent
3501 *
3502 * Pre-conditions:
3503 *  - The extent pointed to by 'path' is unwritten.
3504 *  - The extent pointed to by 'path' contains a superset
3505 *    of the logical span [map->m_lblk, map->m_lblk + map->m_len).
3506 *
3507 * Post-conditions on success:
3508 *  - the returned value is the number of blocks beyond map->m_lblk
3509 *    that are allocated and initialized.
3510 *    It is guaranteed to be >= map->m_len.
3511 */
3512static int ext4_ext_convert_to_initialized(handle_t *handle,
3513                                           struct inode *inode,
3514                                           struct ext4_map_blocks *map,
3515                                           struct ext4_ext_path **ppath,
3516                                           int flags)
3517{
3518        struct ext4_ext_path *path = *ppath;
3519        struct ext4_sb_info *sbi;
3520        struct ext4_extent_header *eh;
3521        struct ext4_map_blocks split_map;
3522        struct ext4_extent zero_ex1, zero_ex2;
3523        struct ext4_extent *ex, *abut_ex;
3524        ext4_lblk_t ee_block, eof_block;
3525        unsigned int ee_len, depth, map_len = map->m_len;
3526        int allocated = 0, max_zeroout = 0;
3527        int err = 0;
3528        int split_flag = EXT4_EXT_DATA_VALID2;
3529
3530        ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical "
3531                "block %llu, max_blocks %u\n", inode->i_ino,
3532                (unsigned long long)map->m_lblk, map_len);
3533
3534        sbi = EXT4_SB(inode->i_sb);
3535        eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
3536                inode->i_sb->s_blocksize_bits;
3537        if (eof_block < map->m_lblk + map_len)
3538                eof_block = map->m_lblk + map_len;
3539
3540        depth = ext_depth(inode);
3541        eh = path[depth].p_hdr;
3542        ex = path[depth].p_ext;
3543        ee_block = le32_to_cpu(ex->ee_block);
3544        ee_len = ext4_ext_get_actual_len(ex);
3545        zero_ex1.ee_len = 0;
3546        zero_ex2.ee_len = 0;
3547
3548        trace_ext4_ext_convert_to_initialized_enter(inode, map, ex);
3549
3550        /* Pre-conditions */
3551        BUG_ON(!ext4_ext_is_unwritten(ex));
3552        BUG_ON(!in_range(map->m_lblk, ee_block, ee_len));
3553
3554        /*
3555         * Attempt to transfer newly initialized blocks from the currently
3556         * unwritten extent to its neighbor. This is much cheaper
3557         * than an insertion followed by a merge as those involve costly
3558         * memmove() calls. Transferring to the left is the common case in
3559         * steady state for workloads doing fallocate(FALLOC_FL_KEEP_SIZE)
3560         * followed by append writes.
3561         *
3562         * Limitations of the current logic:
3563         *  - L1: we do not deal with writes covering the whole extent.
3564         *    This would require removing the extent if the transfer
3565         *    is possible.
3566         *  - L2: we only attempt to merge with an extent stored in the
3567         *    same extent tree node.
3568         */
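            /*
             * Example (hypothetical, assuming physical contiguity): abut_ex
             * covers blocks 50..99 initialized and ex covers 100..199
             * unwritten; a 10-block write starting at block 100 simply grows
             * abut_ex to 50..109 and shrinks ex to 110..199, still unwritten.
             */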
3569        if ((map->m_lblk == ee_block) &&
3570                /* See if we can merge left */
3571                (map_len < ee_len) &&           /*L1*/
3572                (ex > EXT_FIRST_EXTENT(eh))) {  /*L2*/
3573                ext4_lblk_t prev_lblk;
3574                ext4_fsblk_t prev_pblk, ee_pblk;
3575                unsigned int prev_len;
3576
3577                abut_ex = ex - 1;
3578                prev_lblk = le32_to_cpu(abut_ex->ee_block);
3579                prev_len = ext4_ext_get_actual_len(abut_ex);
3580                prev_pblk = ext4_ext_pblock(abut_ex);
3581                ee_pblk = ext4_ext_pblock(ex);
3582
3583                /*
3584                 * A transfer of blocks from 'ex' to 'abut_ex' is allowed
3585                 * upon those conditions:
3586                 * - C1: abut_ex is initialized,
3587                 * - C2: abut_ex is logically abutting ex,
3588                 * - C3: abut_ex is physically abutting ex,
3589                 * - C4: abut_ex can receive the additional blocks without
3590                 *   overflowing the (initialized) length limit.
3591                 */
3592                if ((!ext4_ext_is_unwritten(abut_ex)) &&                /*C1*/
3593                        ((prev_lblk + prev_len) == ee_block) &&         /*C2*/
3594                        ((prev_pblk + prev_len) == ee_pblk) &&          /*C3*/
3595                        (prev_len < (EXT_INIT_MAX_LEN - map_len))) {    /*C4*/
3596                        err = ext4_ext_get_access(handle, inode, path + depth);
3597                        if (err)
3598                                goto out;
3599
3600                        trace_ext4_ext_convert_to_initialized_fastpath(inode,
3601                                map, ex, abut_ex);
3602
3603                        /* Shift the start of ex by 'map_len' blocks */
3604                        ex->ee_block = cpu_to_le32(ee_block + map_len);
3605                        ext4_ext_store_pblock(ex, ee_pblk + map_len);
3606                        ex->ee_len = cpu_to_le16(ee_len - map_len);
3607                        ext4_ext_mark_unwritten(ex); /* Restore the flag */
3608
3609                        /* Extend abut_ex by 'map_len' blocks */
3610                        abut_ex->ee_len = cpu_to_le16(prev_len + map_len);
3611
3612                        /* Result: number of initialized blocks past m_lblk */
3613                        allocated = map_len;
3614                }
3615        } else if (((map->m_lblk + map_len) == (ee_block + ee_len)) &&
3616                   (map_len < ee_len) &&        /*L1*/
3617                   ex < EXT_LAST_EXTENT(eh)) {  /*L2*/
3618                /* See if we can merge right */
3619                ext4_lblk_t next_lblk;
3620                ext4_fsblk_t next_pblk, ee_pblk;
3621                unsigned int next_len;
3622
3623                abut_ex = ex + 1;
3624                next_lblk = le32_to_cpu(abut_ex->ee_block);
3625                next_len = ext4_ext_get_actual_len(abut_ex);
3626                next_pblk = ext4_ext_pblock(abut_ex);
3627                ee_pblk = ext4_ext_pblock(ex);
3628
3629                /*
3630                 * A transfer of blocks from 'ex' to 'abut_ex' is allowed
3631                 * upon those conditions:
3632                 * - C1: abut_ex is initialized,
3633                 * - C2: abut_ex is logically abutting ex,
3634                 * - C3: abut_ex is physically abutting ex,
3635                 * - C4: abut_ex can receive the additional blocks without
3636                 *   overflowing the (initialized) length limit.
3637                 */
3638                if ((!ext4_ext_is_unwritten(abut_ex)) &&                /*C1*/
3639                    ((map->m_lblk + map_len) == next_lblk) &&           /*C2*/
3640                    ((ee_pblk + ee_len) == next_pblk) &&                /*C3*/
3641                    (next_len < (EXT_INIT_MAX_LEN - map_len))) {        /*C4*/
3642                        err = ext4_ext_get_access(handle, inode, path + depth);
3643                        if (err)
3644                                goto out;
3645
3646                        trace_ext4_ext_convert_to_initialized_fastpath(inode,
3647                                map, ex, abut_ex);
3648
3649                        /* Shift the start of abut_ex by 'map_len' blocks */
3650                        abut_ex->ee_block = cpu_to_le32(next_lblk - map_len);
3651                        ext4_ext_store_pblock(abut_ex, next_pblk - map_len);
3652                        ex->ee_len = cpu_to_le16(ee_len - map_len);
3653                        ext4_ext_mark_unwritten(ex); /* Restore the flag */
3654
3655                        /* Extend abut_ex by 'map_len' blocks */
3656                        abut_ex->ee_len = cpu_to_le16(next_len + map_len);
3657
3658                        /* Result: number of initialized blocks past m_lblk */
3659                        allocated = map_len;
3660                }
3661        }
3662        if (allocated) {
3663                /* Mark the block containing both extents as dirty */
3664                ext4_ext_dirty(handle, inode, path + depth);
3665
3666                /* Update path to point to the right extent */
3667                path[depth].p_ext = abut_ex;
3668                goto out;
3669        } else
3670                allocated = ee_len - (map->m_lblk - ee_block);
3671
3672        WARN_ON(map->m_lblk < ee_block);
3673        /*
3674         * It is safe to convert extent to initialized via explicit
3675         * zeroout only if extent is fully inside i_size or new_size.
3676         */
3677        split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
3678
3679        if (EXT4_EXT_MAY_ZEROOUT & split_flag)
3680                max_zeroout = sbi->s_extent_max_zeroout_kb >>
3681                        (inode->i_sb->s_blocksize_bits - 10);
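            /*
             * E.g. (illustrative values): with 4KiB blocks (blocksize_bits = 12)
             * and s_extent_max_zeroout_kb = 32, max_zeroout = 32 >> 2 = 8 blocks.
             */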
3682
3683        /*
3684         * five cases:
3685         * 1. split the extent into three extents.
3686         * 2. split the extent into two extents, zeroout the head of the first
3687         *    extent.
3688         * 3. split the extent into two extents, zeroout the tail of the second
3689         *    extent.
3690         * 4. split the extent into two extents without zeroout.
3691         * 5. no splitting needed, just possibly zeroout the head and / or the
3692         *    tail of the extent.
3693         */
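            /*
             * For instance (illustrative): a 10-block write at the head of a
             * 90-block unwritten extent whose full length fits within
             * max_zeroout zeroes out the remaining 80 blocks and marks the
             * whole extent initialized without any split (case 5).
             */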
3694        split_map.m_lblk = map->m_lblk;
3695        split_map.m_len = map->m_len;
3696
3697        if (max_zeroout && (allocated > split_map.m_len)) {
3698                if (allocated <= max_zeroout) {
3699                        /* case 3 or 5 */
3700                        zero_ex1.ee_block =
3701                                 cpu_to_le32(split_map.m_lblk +
3702                                             split_map.m_len);
3703                        zero_ex1.ee_len =
3704                                cpu_to_le16(allocated - split_map.m_len);
3705                        ext4_ext_store_pblock(&zero_ex1,
3706                                ext4_ext_pblock(ex) + split_map.m_lblk +
3707                                split_map.m_len - ee_block);
3708                        err = ext4_ext_zeroout(inode, &zero_ex1);
3709                        if (err)
3710                                goto out;
3711                        split_map.m_len = allocated;
3712                }
3713                if (split_map.m_lblk - ee_block + split_map.m_len <
3714                                                                max_zeroout) {
3715                        /* case 2 or 5 */
3716                        if (split_map.m_lblk != ee_block) {
3717                                zero_ex2.ee_block = ex->ee_block;
3718                                zero_ex2.ee_len = cpu_to_le16(split_map.m_lblk -
3719                                                        ee_block);
3720                                ext4_ext_store_pblock(&zero_ex2,
3721                                                      ext4_ext_pblock(ex));
3722                                err = ext4_ext_zeroout(inode, &zero_ex2);
3723                                if (err)
3724                                        goto out;
3725                        }
3726
3727                        split_map.m_len += split_map.m_lblk - ee_block;
3728                        split_map.m_lblk = ee_block;
3729                        allocated = map->m_len;
3730                }
3731        }
3732
3733        err = ext4_split_extent(handle, inode, ppath, &split_map, split_flag,
3734                                flags);
3735        if (err > 0)
3736                err = 0;
3737out:
3738        /* If we have gotten a failure, don't zero out status tree */
3739        if (!err) {
3740                err = ext4_zeroout_es(inode, &zero_ex1);
3741                if (!err)
3742                        err = ext4_zeroout_es(inode, &zero_ex2);
3743        }
3744        return err ? err : allocated;
3745}
3746
3747/*
3748 * This function is called by ext4_ext_map_blocks() from
3749 * ext4_get_blocks_dio_write() when DIO needs to write
3750 * to an unwritten extent.
3751 *
3752 * Writing to an unwritten extent may result in splitting the unwritten
3753 * extent into multiple initialized/unwritten extents (up to three)
3754 * There are three possibilities:
3755 *   a> There is no split required: Entire extent should be unwritten
3756 *   b> Splits in two extents: Write is happening at either end of the extent
3757 *   c> Splits in three extents: Someone is writing in the middle of the extent
3758 *
3759 * This works the same way in the case of initialized -> unwritten conversion.
3760 *
3761 * One or more index blocks may be needed if the extent tree grows after
3762 * the unwritten extent is split. To prevent ENOSPC from occurring when the
3763 * I/O completes, we need to split the unwritten extent before submitting
3764 * the DIO. The unwritten extent will be split
3765 * into (at most) three unwritten extents. After the I/O completes, the part
3766 * that was filled will be converted to initialized by the end_io callback function
3767 * via ext4_convert_unwritten_extents().
3768 *
3769 * Returns the size of the unwritten extent to be written on success.
3770 */
3771static int ext4_split_convert_extents(handle_t *handle,
3772                                        struct inode *inode,
3773                                        struct ext4_map_blocks *map,
3774                                        struct ext4_ext_path **ppath,
3775                                        int flags)
3776{
3777        struct ext4_ext_path *path = *ppath;
3778        ext4_lblk_t eof_block;
3779        ext4_lblk_t ee_block;
3780        struct ext4_extent *ex;
3781        unsigned int ee_len;
3782        int split_flag = 0, depth;
3783
3784        ext_debug("%s: inode %lu, logical block %llu, max_blocks %u\n",
3785                  __func__, inode->i_ino,
3786                  (unsigned long long)map->m_lblk, map->m_len);
3787
3788        eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
3789                inode->i_sb->s_blocksize_bits;
3790        if (eof_block < map->m_lblk + map->m_len)
3791                eof_block = map->m_lblk + map->m_len;
3792        /*
3793         * It is safe to convert extent to initialized via explicit
3794         * zeroout only if the extent is fully inside i_size or new_size.
3795         */
3796        depth = ext_depth(inode);
3797        ex = path[depth].p_ext;
3798        ee_block = le32_to_cpu(ex->ee_block);
3799        ee_len = ext4_ext_get_actual_len(ex);
3800
3801        /* Convert to unwritten */
3802        if (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN) {
3803                split_flag |= EXT4_EXT_DATA_VALID1;
3804        /* Convert to initialized */
3805        } else if (flags & EXT4_GET_BLOCKS_CONVERT) {
3806                split_flag |= ee_block + ee_len <= eof_block ?
3807                              EXT4_EXT_MAY_ZEROOUT : 0;
3808                split_flag |= (EXT4_EXT_MARK_UNWRIT2 | EXT4_EXT_DATA_VALID2);
3809        }
3810        flags |= EXT4_GET_BLOCKS_PRE_IO;
3811        return ext4_split_extent(handle, inode, ppath, map, split_flag, flags);
3812}
3813
3814static int ext4_convert_unwritten_extents_endio(handle_t *handle,
3815                                                struct inode *inode,
3816                                                struct ext4_map_blocks *map,
3817                                                struct ext4_ext_path **ppath)
3818{
3819        struct ext4_ext_path *path = *ppath;
3820        struct ext4_extent *ex;
3821        ext4_lblk_t ee_block;
3822        unsigned int ee_len;
3823        int depth;
3824        int err = 0;
3825
3826        depth = ext_depth(inode);
3827        ex = path[depth].p_ext;
3828        ee_block = le32_to_cpu(ex->ee_block);
3829        ee_len = ext4_ext_get_actual_len(ex);
3830
3831        ext_debug("ext4_convert_unwritten_extents_endio: inode %lu, logical "
3832                "block %llu, max_blocks %u\n", inode->i_ino,
3833                  (unsigned long long)ee_block, ee_len);
3834
3835        /* If the extent is larger than requested, it is a clear sign that we still
3836         * have some extent state machine issues left. So an extent split is still
3837         * required.
3838         * TODO: once all related issues are fixed, this situation should be
3839         * illegal.
3840         */
3841        if (ee_block != map->m_lblk || ee_len > map->m_len) {
3842#ifdef CONFIG_EXT4_DEBUG
3843                ext4_warning(inode->i_sb, "Inode (%ld) finished: extent logical block %llu,"
3844                             " len %u; IO logical block %llu, len %u",
3845                             inode->i_ino, (unsigned long long)ee_block, ee_len,
3846                             (unsigned long long)map->m_lblk, map->m_len);
3847#endif
3848                err = ext4_split_convert_extents(handle, inode, map, ppath,
3849                                                 EXT4_GET_BLOCKS_CONVERT);
3850                if (err < 0)
3851                        return err;
3852                path = ext4_find_extent(inode, map->m_lblk, ppath, 0);
3853                if (IS_ERR(path))
3854                        return PTR_ERR(path);
3855                depth = ext_depth(inode);
3856                ex = path[depth].p_ext;
3857        }
3858
3859        err = ext4_ext_get_access(handle, inode, path + depth);
3860        if (err)
3861                goto out;
3862        /* first mark the extent as initialized */
3863        ext4_ext_mark_initialized(ex);
3864
3865        /* note: ext4_ext_correct_indexes() isn't needed here because
3866         * borders are not changed
3867         */
3868        ext4_ext_try_to_merge(handle, inode, path, ex);
3869
3870        /* Mark modified extent as dirty */
3871        err = ext4_ext_dirty(handle, inode, path + path->p_depth);
3872out:
3873        ext4_ext_show_leaf(inode, path);
3874        return err;
3875}
3876
3877/*
3878 * Handle EOFBLOCKS_FL flag, clearing it if necessary
3879 */
3880static int check_eofblocks_fl(handle_t *handle, struct inode *inode,
3881                              ext4_lblk_t lblk,
3882                              struct ext4_ext_path *path,
3883                              unsigned int len)
3884{
3885        int i, depth;
3886        struct ext4_extent_header *eh;
3887        struct ext4_extent *last_ex;
3888
3889        if (!ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))
3890                return 0;
3891
3892        depth = ext_depth(inode);
3893        eh = path[depth].p_hdr;
3894
3895        /*
3896         * We're going to remove EOFBLOCKS_FL entirely in the future, so we
3897         * do not care about this case anymore. Simply remove the flag
3898         * if there are no extents.
3899         */
3900        if (unlikely(!eh->eh_entries))
3901                goto out;
3902        last_ex = EXT_LAST_EXTENT(eh);
3903        /*
3904         * We should clear the EOFBLOCKS_FL flag if we are writing the
3905         * last block in the last extent in the file.  We test this by
3906         * first checking to see if the caller to
3907         * ext4_ext_get_blocks() was interested in the last block (or
3908         * a block beyond the last block) in the current extent.  If
3909         * this turns out to be false, we can bail out from this
3910         * function immediately.
3911         */
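            /*
             * E.g. (illustrative): if the last extent covers blocks 550..600
             * and the caller wrote blocks 500..509, then 510 < 601 and we
             * return without touching the flag.
             */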
3912        if (lblk + len < le32_to_cpu(last_ex->ee_block) +
3913            ext4_ext_get_actual_len(last_ex))
3914                return 0;
3915        /*
3916         * If the caller does appear to be planning to write at or
3917         * beyond the end of the current extent, we then test to see
3918         * if the current extent is the last extent in the file, by
3919         * checking to make sure it was reached via the rightmost node
3920         * at each level of the tree.
3921         */
3922        for (i = depth-1; i >= 0; i--)
3923                if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr))
3924                        return 0;
3925out:
3926        ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
3927        return ext4_mark_inode_dirty(handle, inode);
3928}
3929
3930static int
3931convert_initialized_extent(handle_t *handle, struct inode *inode,
3932                           struct ext4_map_blocks *map,
3933                           struct ext4_ext_path **ppath,
3934                           unsigned int allocated)
3935{
3936        struct ext4_ext_path *path = *ppath;
3937        struct ext4_extent *ex;
3938        ext4_lblk_t ee_block;
3939        unsigned int ee_len;
3940        int depth;
3941        int err = 0;
3942
3943        /*
3944         * Make sure that the extent is no bigger than we support with
3945         * unwritten extents
3946         */
3947        if (map->m_len > EXT_UNWRITTEN_MAX_LEN)
3948                map->m_len = EXT_UNWRITTEN_MAX_LEN / 2;
3949
3950        depth = ext_depth(inode);
3951        ex = path[depth].p_ext;
3952        ee_block = le32_to_cpu(ex->ee_block);
3953        ee_len = ext4_ext_get_actual_len(ex);
3954
3955        ext_debug("%s: inode %lu, logical "
3956                "block %llu, max_blocks %u\n", __func__, inode->i_ino,
3957                  (unsigned long long)ee_block, ee_len);
3958
3959        if (ee_block != map->m_lblk || ee_len > map->m_len) {
3960                err = ext4_split_convert_extents(handle, inode, map, ppath,
3961                                EXT4_GET_BLOCKS_CONVERT_UNWRITTEN);
3962                if (err < 0)
3963                        return err;
3964                path = ext4_find_extent(inode, map->m_lblk, ppath, 0);
3965                if (IS_ERR(path))
3966                        return PTR_ERR(path);
3967                depth = ext_depth(inode);
3968                ex = path[depth].p_ext;
3969                if (!ex) {
3970                        EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
3971                                         (unsigned long) map->m_lblk);
3972                        return -EFSCORRUPTED;
3973                }
3974        }
3975
3976        err = ext4_ext_get_access(handle, inode, path + depth);
3977        if (err)
3978                return err;
3979        /* first mark the extent as unwritten */
3980        ext4_ext_mark_unwritten(ex);
3981
3982        /* note: ext4_ext_correct_indexes() isn't needed here because
3983         * borders are not changed
3984         */
3985        ext4_ext_try_to_merge(handle, inode, path, ex);
3986
3987        /* Mark modified extent as dirty */
3988        err = ext4_ext_dirty(handle, inode, path + path->p_depth);
3989        if (err)
3990                return err;
3991        ext4_ext_show_leaf(inode, path);
3992
3993        ext4_update_inode_fsync_trans(handle, inode, 1);
3994        err = check_eofblocks_fl(handle, inode, map->m_lblk, path, map->m_len);
3995        if (err)
3996                return err;
3997        map->m_flags |= EXT4_MAP_UNWRITTEN;
3998        if (allocated > map->m_len)
3999                allocated = map->m_len;
4000        map->m_len = allocated;
4001        return allocated;
4002}
4003
4004static int
4005ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode,
4006                        struct ext4_map_blocks *map,
4007                        struct ext4_ext_path **ppath, int flags,
4008                        unsigned int allocated, ext4_fsblk_t newblock)
4009{
4010        struct ext4_ext_path *path = *ppath;
4011        int ret = 0;
4012        int err = 0;
4013
4014        ext_debug("ext4_ext_handle_unwritten_extents: inode %lu, logical "
4015                  "block %llu, max_blocks %u, flags %x, allocated %u\n",
4016                  inode->i_ino, (unsigned long long)map->m_lblk, map->m_len,
4017                  flags, allocated);
4018        ext4_ext_show_leaf(inode, path);
4019
4020        /*
4021         * When writing into unwritten space, we should not fail to
4022         * allocate metadata blocks for the new extent block if needed.
4023         */
4024        flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL;
4025
4026        trace_ext4_ext_handle_unwritten_extents(inode, map, flags,
4027                                                    allocated, newblock);
4028
4029        /* get_block() before submitting the IO, split the extent */
4030        if (flags & EXT4_GET_BLOCKS_PRE_IO) {
4031                ret = ext4_split_convert_extents(handle, inode, map, ppath,
4032                                         flags | EXT4_GET_BLOCKS_CONVERT);
4033                if (ret <= 0)
4034                        goto out;
4035                map->m_flags |= EXT4_MAP_UNWRITTEN;
4036                goto out;
4037        }
4038        /* IO end_io complete, convert the filled extent to written */
4039        if (flags & EXT4_GET_BLOCKS_CONVERT) {
4040                if (flags & EXT4_GET_BLOCKS_ZERO) {
4041                        if (allocated > map->m_len)
4042                                allocated = map->m_len;
4043                        err = ext4_issue_zeroout(inode, map->m_lblk, newblock,
4044                                                 allocated);
4045                        if (err < 0)
4046                                goto out2;
4047                }
4048                ret = ext4_convert_unwritten_extents_endio(handle, inode, map,
4049                                                           ppath);
4050                if (ret >= 0) {
4051                        ext4_update_inode_fsync_trans(handle, inode, 1);
4052                        err = check_eofblocks_fl(handle, inode, map->m_lblk,
4053                                                 path, map->m_len);
4054                } else
4055                        err = ret;
4056                map->m_flags |= EXT4_MAP_MAPPED;
4057                map->m_pblk = newblock;
4058                if (allocated > map->m_len)
4059                        allocated = map->m_len;
4060                map->m_len = allocated;
4061                goto out2;
4062        }
4063        /* buffered IO case */
4064        /*
4065         * repeat fallocate creation request
4066         * we already have an unwritten extent
4067         */
4068        if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT) {
4069                map->m_flags |= EXT4_MAP_UNWRITTEN;
4070                goto map_out;
4071        }
4072
4073        /* buffered READ or buffered write_begin() lookup */
4074        if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
4075                /*
4076                 * We have blocks reserved already.  We
4077                 * return allocated blocks so that delalloc
4078                 * won't do block reservation for us.  But
4079                 * the buffer head will be unmapped so that
4080                 * a read from the block returns 0s.
4081                 */
4082                map->m_flags |= EXT4_MAP_UNWRITTEN;
4083                goto out1;
4084        }
4085
4086        /* buffered write, writepage time, convert */
4087        ret = ext4_ext_convert_to_initialized(handle, inode, map, ppath, flags);
4088        if (ret >= 0)
4089                ext4_update_inode_fsync_trans(handle, inode, 1);
4090out:
4091        if (ret <= 0) {
4092                err = ret;
4093                goto out2;
4094        } else
4095                allocated = ret;
4096        map->m_flags |= EXT4_MAP_NEW;
4097        if (allocated > map->m_len)
4098                allocated = map->m_len;
4099        map->m_len = allocated;
4100
4101map_out:
4102        map->m_flags |= EXT4_MAP_MAPPED;
4103        if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0) {
4104                err = check_eofblocks_fl(handle, inode, map->m_lblk, path,
4105                                         map->m_len);
4106                if (err < 0)
4107                        goto out2;
4108        }
4109out1:
4110        if (allocated > map->m_len)
4111                allocated = map->m_len;
4112        ext4_ext_show_leaf(inode, path);
4113        map->m_pblk = newblock;
4114        map->m_len = allocated;
4115out2:
4116        return err ? err : allocated;
4117}
4118
4119/*
4120 * get_implied_cluster_alloc - check to see if the requested
4121 * allocation (in the map structure) overlaps with a cluster already
4122 * allocated in an extent.
4123 *      @sb     The filesystem superblock structure
4124 *      @map    The requested lblk->pblk mapping
4125 *      @ex     The extent structure which might contain an implied
4126 *                      cluster allocation
4127 *
4128 * This function is called by ext4_ext_map_blocks() after we failed to
4129 * find blocks that were already in the inode's extent tree.  Hence,
4130 * we know that the beginning of the requested region cannot overlap
4131 * the extent from the inode's extent tree.  There are three cases we
4132 * want to catch.  The first is this case:
4133 *
4134 *               |--- cluster # N--|
4135 *    |--- extent ---|  |---- requested region ---|
4136 *                      |==========|
4137 *
4138 * The second case that we need to test for is this one:
4139 *
4140 *   |--------- cluster # N ----------------|
4141 *         |--- requested region --|   |------- extent ----|
4142 *         |=======================|
4143 *
4144 * The third case is when the requested region lies between two extents
4145 * within the same cluster:
4146 *          |------------- cluster # N-------------|
4147 * |----- ex -----|                  |---- ex_right ----|
4148 *                  |------ requested region ------|
4149 *                  |================|
4150 *
4151 * In each of the above cases, we need to set map->m_pblk and
4152 * map->m_len so they correspond to the extent labelled as
4153 * "|====|" from cluster #N, since it is already in use for data in
4154 * cluster EXT4_B2C(sbi, map->m_lblk).  We will then return 1 to
4155 * signal to ext4_ext_map_blocks() that map->m_pblk should be treated
4156 * as a new "allocated" block region.  Otherwise, we will return 0 and
4157 * ext4_ext_map_blocks() will then allocate one or more new clusters
4158 * by calling ext4_mb_new_blocks().
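     *
     * Illustrative example (not from the original source): assume a bigalloc
     * file system with s_cluster_ratio == 16, an extent mapping logical
     * blocks 0..9 to physical blocks 640..649, and a request starting at
     * logical block 12.  EXT4_B2C(sbi, 12) == 0 lands in the same cluster as
     * the extent, and the in-cluster offset is 12, so map->m_pblk becomes
     * 640 + 12 == 652 (the physical start of cluster 0 plus the offset) and
     * map->m_len is capped at 16 - 12 == 4 blocks, the remainder of the
     * cluster (possibly trimmed further against the next allocated extent).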
4159 */
4160static int get_implied_cluster_alloc(struct super_block *sb,
4161                                     struct ext4_map_blocks *map,
4162                                     struct ext4_extent *ex,
4163                                     struct ext4_ext_path *path)
4164{
4165        struct ext4_sb_info *sbi = EXT4_SB(sb);
4166        ext4_lblk_t c_offset = EXT4_LBLK_COFF(sbi, map->m_lblk);
4167        ext4_lblk_t ex_cluster_start, ex_cluster_end;
4168        ext4_lblk_t rr_cluster_start;
4169        ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
4170        ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
4171        unsigned short ee_len = ext4_ext_get_actual_len(ex);
4172
4173        /* The extent passed in that we are trying to match */
4174        ex_cluster_start = EXT4_B2C(sbi, ee_block);
4175        ex_cluster_end = EXT4_B2C(sbi, ee_block + ee_len - 1);
4176
4177        /* The requested region passed into ext4_map_blocks() */
4178        rr_cluster_start = EXT4_B2C(sbi, map->m_lblk);
4179
4180        if ((rr_cluster_start == ex_cluster_end) ||
4181            (rr_cluster_start == ex_cluster_start)) {
4182                if (rr_cluster_start == ex_cluster_end)
4183                        ee_start += ee_len - 1;
4184                map->m_pblk = EXT4_PBLK_CMASK(sbi, ee_start) + c_offset;
4185                map->m_len = min(map->m_len,
4186                                 (unsigned) sbi->s_cluster_ratio - c_offset);
4187                /*
4188                 * Check for and handle this case:
4189                 *
4190                 *   |--------- cluster # N-------------|
4191                 *                     |------- extent ----|
4192                 *         |--- requested region ---|
4193                 *         |===========|
4194                 */
4195
4196                if (map->m_lblk < ee_block)
4197                        map->m_len = min(map->m_len, ee_block - map->m_lblk);
4198
4199                /*
4200                 * Check for the case where there is already another allocated
4201                 * block to the right of 'ex' but before the end of the cluster.
4202                 *
4203                 *          |------------- cluster # N-------------|
4204                 * |----- ex -----|                  |---- ex_right ----|
4205                 *                  |------ requested region ------|
4206                 *                  |================|
4207                 */
4208                if (map->m_lblk > ee_block) {
4209                        ext4_lblk_t next = ext4_ext_next_allocated_block(path);
4210                        map->m_len = min(map->m_len, next - map->m_lblk);
4211                }
4212
4213                trace_ext4_get_implied_cluster_alloc_exit(sb, map, 1);
4214                return 1;
4215        }
4216
4217        trace_ext4_get_implied_cluster_alloc_exit(sb, map, 0);
4218        return 0;
4219}
4220
4221
4222/*
4223 * Block allocation/map/preallocation routine for extents based files
4224 * Block allocation/map/preallocation routine for extent-based files
4225 *
4226 * Need to be called with
4227 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block
4228 * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem)
4229 *
4230 * return > 0, number of blocks already mapped/allocated
4231 *          if create == 0 and these are pre-allocated blocks
4232 *              buffer head is unmapped
4233 *          otherwise blocks are mapped
4234 *
4235 * return = 0, if plain look up failed (blocks have not been allocated)
4236 *          buffer head is unmapped
4237 *
4238 * return < 0, error case.
4239 */
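    /*
     * Illustrative caller sketch (not part of the original source; "lblk",
     * "len" and "retval" below are placeholder names):
     *
     *	struct ext4_map_blocks map = { .m_lblk = lblk, .m_len = len };
     *	int retval = ext4_ext_map_blocks(handle, inode, &map, flags);
     *
     * A positive retval means retval blocks starting at map.m_pblk now back
     * the logical range; 0 means a hole (map.m_len is trimmed to the size of
     * the hole after map.m_lblk); a negative value is an errno.
     */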
4240int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
4241                        struct ext4_map_blocks *map, int flags)
4242{
4243        struct ext4_ext_path *path = NULL;
4244        struct ext4_extent newex, *ex, *ex2;
4245        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
4246        ext4_fsblk_t newblock = 0;
4247        int free_on_err = 0, err = 0, depth, ret;
4248        unsigned int allocated = 0, offset = 0;
4249        unsigned int allocated_clusters = 0;
4250        struct ext4_allocation_request ar;
4251        ext4_lblk_t cluster_offset;
4252        bool map_from_cluster = false;
4253
4254        ext_debug("blocks %u/%u requested for inode %lu\n",
4255                  map->m_lblk, map->m_len, inode->i_ino);
4256        trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
4257
4258        /* find extent for this block */
4259        path = ext4_find_extent(inode, map->m_lblk, NULL, 0);
4260        if (IS_ERR(path)) {
4261                err = PTR_ERR(path);
4262                path = NULL;
4263                goto out2;
4264        }
4265
4266        depth = ext_depth(inode);
4267
4268        /*
4269         * consistent leaf must not be empty;
4270         * this situation is possible, though, _during_ tree modification;
4271         * this is why assert can't be put in ext4_find_extent()
4272         */
4273        if (unlikely(path[depth].p_ext == NULL && depth != 0)) {
4274                EXT4_ERROR_INODE(inode, "bad extent address "
4275                                 "lblock: %lu, depth: %d pblock %lld",
4276                                 (unsigned long) map->m_lblk, depth,
4277                                 path[depth].p_block);
4278                err = -EFSCORRUPTED;
4279                goto out2;
4280        }
4281
4282        ex = path[depth].p_ext;
4283        if (ex) {
4284                ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
4285                ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
4286                unsigned short ee_len;
4287
4288
4289                /*
4290                 * unwritten extents are treated as holes, except that
4291                 * we split out initialized portions during a write.
4292                 */
4293                ee_len = ext4_ext_get_actual_len(ex);
4294
4295                trace_ext4_ext_show_extent(inode, ee_block, ee_start, ee_len);
4296
4297                /* if found extent covers block, simply return it */
4298                if (in_range(map->m_lblk, ee_block, ee_len)) {
4299                        newblock = map->m_lblk - ee_block + ee_start;
4300                        /* number of remaining blocks in the extent */
4301                        allocated = ee_len - (map->m_lblk - ee_block);
4302                        ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk,
4303                                  ee_block, ee_len, newblock);
4304
4305                        /*
4306                         * If the extent is initialized check whether the
4307                         * caller wants to convert it to unwritten.
4308                         */
4309                        if ((!ext4_ext_is_unwritten(ex)) &&
4310                            (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) {
4311                                allocated = convert_initialized_extent(
4312                                                handle, inode, map, &path,
4313                                                allocated);
4314                                goto out2;
4315                        } else if (!ext4_ext_is_unwritten(ex))
4316                                goto out;
4317
4318                        ret = ext4_ext_handle_unwritten_extents(
4319                                handle, inode, map, &path, flags,
4320                                allocated, newblock);
4321                        if (ret < 0)
4322                                err = ret;
4323                        else
4324                                allocated = ret;
4325                        goto out2;
4326                }
4327        }
4328
4329        /*
4330         * requested block isn't allocated yet;
4331         * we can't try to create blocks if the create flag is zero
4332         */
4333        if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
4334                ext4_lblk_t hole_start, hole_len;
4335
4336                hole_start = map->m_lblk;
4337                hole_len = ext4_ext_determine_hole(inode, path, &hole_start);
4338                /*
4339                 * put just found gap into cache to speed up
4340                 * subsequent requests
4341                 */
4342                ext4_ext_put_gap_in_cache(inode, hole_start, hole_len);
4343
4344                /* Update hole_len to reflect hole size after map->m_lblk */
4345                if (hole_start != map->m_lblk)
4346                        hole_len -= map->m_lblk - hole_start;
4347                map->m_pblk = 0;
4348                map->m_len = min_t(unsigned int, map->m_len, hole_len);
4349
4350                goto out2;
4351        }
4352
4353        /*
4354         * Okay, we need to do block allocation.
4355         */
4356        newex.ee_block = cpu_to_le32(map->m_lblk);
4357        cluster_offset = EXT4_LBLK_COFF(sbi, map->m_lblk);
4358
4359        /*
4360         * If we are doing bigalloc, check to see if the extent returned
4361         * by ext4_find_extent() implies a cluster we can use.
4362         */
4363        if (cluster_offset && ex &&
4364            get_implied_cluster_alloc(inode->i_sb, map, ex, path)) {
4365                ar.len = allocated = map->m_len;
4366                newblock = map->m_pblk;
4367                map_from_cluster = true;
4368                goto got_allocated_blocks;
4369        }
4370
4371        /* find neighbour allocated blocks */
4372        ar.lleft = map->m_lblk;
4373        err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft);
4374        if (err)
4375                goto out2;
4376        ar.lright = map->m_lblk;
4377        ex2 = NULL;
4378        err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright, &ex2);
4379        if (err)
4380                goto out2;
4381
4382        /* Check if the extent after searching to the right implies a
4383         * cluster we can use. */
4384        if ((sbi->s_cluster_ratio > 1) && ex2 &&
4385            get_implied_cluster_alloc(inode->i_sb, map, ex2, path)) {
4386                ar.len = allocated = map->m_len;
4387                newblock = map->m_pblk;
4388                map_from_cluster = true;
4389                goto got_allocated_blocks;
4390        }
4391
4392        /*
4393         * See if request is beyond maximum number of blocks we can have in
4394         * a single extent. For an initialized extent this limit is
4395         * EXT_INIT_MAX_LEN and for an unwritten extent this limit is
4396         * EXT_UNWRITTEN_MAX_LEN.
4397         */
4398        if (map->m_len > EXT_INIT_MAX_LEN &&
4399            !(flags & EXT4_GET_BLOCKS_UNWRIT_EXT))
4400                map->m_len = EXT_INIT_MAX_LEN;
4401        else if (map->m_len > EXT_UNWRITTEN_MAX_LEN &&
4402                 (flags & EXT4_GET_BLOCKS_UNWRIT_EXT))
4403                map->m_len = EXT_UNWRITTEN_MAX_LEN;
4404
4405        /* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */
4406        newex.ee_len = cpu_to_le16(map->m_len);
4407        err = ext4_ext_check_overlap(sbi, inode, &newex, path);
4408        if (err)
4409                allocated = ext4_ext_get_actual_len(&newex);
4410        else
4411                allocated = map->m_len;
4412
4413        /* allocate new block */
4414        ar.inode = inode;
4415        ar.goal = ext4_ext_find_goal(inode, path, map->m_lblk);
4416        ar.logical = map->m_lblk;
4417        /*
4418         * We calculate the offset from the beginning of the cluster
4419         * for the logical block number, since when we allocate a
4420         * physical cluster, the physical block should start at the
4421         * same offset from the beginning of the cluster.  This is
4422         * needed so that future calls to get_implied_cluster_alloc()
4423         * work correctly.
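             *
             * Illustrative example (not from the original source): with
             * s_cluster_ratio == 16, map->m_lblk == 18 and allocated == 5,
             * offset below is 2, ar.len becomes EXT4_NUM_B2C(sbi, 7) == 1
             * cluster, and ar.goal/ar.logical are pulled back by 2 blocks to
             * the cluster boundary.  After a successful allocation ar.len is
             * recomputed as EXT4_C2B(sbi, 1) - 2 == 14 and then capped at
             * the 5 blocks actually requested.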
4424         */
4425        offset = EXT4_LBLK_COFF(sbi, map->m_lblk);
4426        ar.len = EXT4_NUM_B2C(sbi, offset+allocated);
4427        ar.goal -= offset;
4428        ar.logical -= offset;
4429        if (S_ISREG(inode->i_mode))
4430                ar.flags = EXT4_MB_HINT_DATA;
4431        else
4432                /* disable in-core preallocation for non-regular files */
4433                ar.flags = 0;
4434        if (flags & EXT4_GET_BLOCKS_NO_NORMALIZE)
4435                ar.flags |= EXT4_MB_HINT_NOPREALLOC;
4436        if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
4437                ar.flags |= EXT4_MB_DELALLOC_RESERVED;
4438        if (flags & EXT4_GET_BLOCKS_METADATA_NOFAIL)
4439                ar.flags |= EXT4_MB_USE_RESERVED;
4440        newblock = ext4_mb_new_blocks(handle, &ar, &err);
4441        if (!newblock)
4442                goto out2;
4443        ext_debug("allocate new block: goal %llu, found %llu/%u\n",
4444                  ar.goal, newblock, allocated);
4445        free_on_err = 1;
4446        allocated_clusters = ar.len;
4447        ar.len = EXT4_C2B(sbi, ar.len) - offset;
4448        if (ar.len > allocated)
4449                ar.len = allocated;
4450
4451got_allocated_blocks:
4452        /* try to insert new extent into found leaf and return */
4453        ext4_ext_store_pblock(&newex, newblock + offset);
4454        newex.ee_len = cpu_to_le16(ar.len);
4455        /* Mark unwritten */
4456        if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT) {
4457                ext4_ext_mark_unwritten(&newex);
4458                map->m_flags |= EXT4_MAP_UNWRITTEN;
4459        }
4460
4461        err = 0;
4462        if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0)
4463                err = check_eofblocks_fl(handle, inode, map->m_lblk,
4464                                         path, ar.len);
4465        if (!err)
4466                err = ext4_ext_insert_extent(handle, inode, &path,
4467                                             &newex, flags);
4468
4469        if (err && free_on_err) {
4470                int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ?
4471                        EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0;
4472                /* free data blocks we just allocated */
4473                /* not a good idea to call discard here directly,
4474                 * but otherwise we'd need to call it every free() */
4475                ext4_discard_preallocations(inode);
4476                ext4_free_blocks(handle, inode, NULL, newblock,
4477                                 EXT4_C2B(sbi, allocated_clusters), fb_flags);
4478                goto out2;
4479        }
4480
4481        /* previous routine could use block we allocated */
4482        newblock = ext4_ext_pblock(&newex);
4483        allocated = ext4_ext_get_actual_len(&newex);
4484        if (allocated > map->m_len)
4485                allocated = map->m_len;
4486        map->m_flags |= EXT4_MAP_NEW;
4487
4488        /*
4489         * Reduce the reserved cluster count to reflect successful deferred
4490         * allocation of delayed allocated clusters or direct allocation of
4491         * clusters discovered to be delayed allocated.  Once allocated, a
4492         * cluster is not included in the reserved count.
4493         */
4494        if (test_opt(inode->i_sb, DELALLOC) && !map_from_cluster) {
4495                if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
4496                        /*
4497                         * When allocating delayed allocated clusters, simply
4498                         * reduce the reserved cluster count and claim quota
4499                         */
4500                        ext4_da_update_reserve_space(inode, allocated_clusters,
4501                                                        1);
4502                } else {
4503                        ext4_lblk_t lblk, len;
4504                        unsigned int n;
4505
4506                        /*
4507                         * When allocating non-delayed allocated clusters
4508                         * (from fallocate, filemap, DIO, or clusters
4509                         * allocated when delalloc has been disabled by
4510                         * ext4_nonda_switch), reduce the reserved cluster
4511                         * count by the number of allocated clusters that
4512                         * have previously been delayed allocated.  Quota
4513                         * has been claimed by ext4_mb_new_blocks() above,
4514                         * so release the quota reservations made for any
4515                         * previously delayed allocated clusters.
4516                         */
4517                        lblk = EXT4_LBLK_CMASK(sbi, map->m_lblk);
4518                        len = allocated_clusters << sbi->s_cluster_bits;
4519                        n = ext4_es_delayed_clu(inode, lblk, len);
4520                        if (n > 0)
4521                                ext4_da_update_reserve_space(inode, (int) n, 0);
4522                }
4523        }
4524
4525        /*
4526         * Cache the extent and update transaction to commit on fdatasync only
4527         * when it is _not_ an unwritten extent.
4528         */
4529        if ((flags & EXT4_GET_BLOCKS_UNWRIT_EXT) == 0)
4530                ext4_update_inode_fsync_trans(handle, inode, 1);
4531        else
4532                ext4_update_inode_fsync_trans(handle, inode, 0);
4533out:
4534        if (allocated > map->m_len)
4535                allocated = map->m_len;
4536        ext4_ext_show_leaf(inode, path);
4537        map->m_flags |= EXT4_MAP_MAPPED;
4538        map->m_pblk = newblock;
4539        map->m_len = allocated;
4540out2:
4541        ext4_ext_drop_refs(path);
4542        kfree(path);
4543
4544        trace_ext4_ext_map_blocks_exit(inode, flags, map,
4545                                       err ? err : allocated);
4546        return err ? err : allocated;
4547}
4548
4549int ext4_ext_truncate(handle_t *handle, struct inode *inode)
4550{
4551        struct super_block *sb = inode->i_sb;
4552        ext4_lblk_t last_block;
4553        int err = 0;
4554
4555        /*
4556         * TODO: optimization is possible here.
4557         * Probably we need not scan at all,
4558         * because page truncation is enough.
4559         */
4560
4561        /* we have to know where to truncate from in crash case */
4562        EXT4_I(inode)->i_disksize = inode->i_size;
4563        err = ext4_mark_inode_dirty(handle, inode);
4564        if (err)
4565                return err;
4566
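            /*
             * Illustrative example (not from the original source): with a
             * 4 KiB block size and i_size == 10000, last_block below is
             * (10000 + 4095) >> 12 == 3, so extents from logical block 3 up
             * to EXT_MAX_BLOCKS are removed while blocks 0..2, which still
             * back bytes 0..9999, are kept.
             */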
4567        last_block = (inode->i_size + sb->s_blocksize - 1)
4568                        >> EXT4_BLOCK_SIZE_BITS(sb);
4569retry:
4570        err = ext4_es_remove_extent(inode, last_block,
4571                                    EXT_MAX_BLOCKS - last_block);
4572        if (err == -ENOMEM) {
4573                cond_resched();
4574                congestion_wait(BLK_RW_ASYNC, HZ/50);
4575                goto retry;
4576        }
4577        if (err)
4578                return err;
4579        return ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1);
4580}
4581
4582static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
4583                                  ext4_lblk_t len, loff_t new_size,
4584                                  int flags)
4585{
4586        struct inode *inode = file_inode(file);
4587        handle_t *handle;
4588        int ret = 0;
4589        int ret2 = 0;
4590        int retries = 0;
4591        int depth = 0;
4592        struct ext4_map_blocks map;
4593        unsigned int credits;
4594        loff_t epos;
4595
4596        BUG_ON(!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS));
4597        map.m_lblk = offset;
4598        map.m_len = len;
4599        /*
4600         * Don't normalize the request if it can fit in one extent so
4601         * that it doesn't get unnecessarily split into multiple
4602         * extents.
4603         */
4604        if (len <= EXT_UNWRITTEN_MAX_LEN)
4605                flags |= EXT4_GET_BLOCKS_NO_NORMALIZE;
4606
4607        /*
4608         * credits to insert 1 extent into extent tree
4609         */
4610        credits = ext4_chunk_trans_blocks(inode, len);
4611        depth = ext_depth(inode);
4612
4613retry:
4614        while (ret >= 0 && len) {
4615                /*
4616                 * Recalculate credits when extent tree depth changes.
4617                 */
4618                if (depth != ext_depth(inode)) {
4619                        credits = ext4_chunk_trans_blocks(inode, len);
4620                        depth = ext_depth(inode);
4621                }
4622
4623                handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
4624                                            credits);
4625                if (IS_ERR(handle)) {
4626                        ret = PTR_ERR(handle);
4627                        break;
4628                }
4629                ret = ext4_map_blocks(handle, inode, &map, flags);
4630                if (ret <= 0) {
4631                        ext4_debug("inode #%lu: block %u: len %u: "
4632                                   "ext4_ext_map_blocks returned %d",
4633                                   inode->i_ino, map.m_lblk,
4634                                   map.m_len, ret);
4635                        ext4_mark_inode_dirty(handle, inode);
4636                        ret2 = ext4_journal_stop(handle);
4637                        break;
4638                }
4639                map.m_lblk += ret;
4640                map.m_len = len = len - ret;
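                    /*
                     * map.m_lblk now points at the first block not yet
                     * allocated, so epos below is the byte offset just past
                     * the range mapped so far; when new_size is set it is
                     * used (capped at new_size) to grow the inode size as
                     * the allocation makes progress.
                     */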
4641                epos = (loff_t)map.m_lblk << inode->i_blkbits;
4642                inode->i_ctime = current_time(inode);
4643                if (new_size) {
4644                        if (epos > new_size)
4645                                epos = new_size;
4646                        if (ext4_update_inode_size(inode, epos) & 0x1)
4647                                inode->i_mtime = inode->i_ctime;
4648                } else {
4649                        if (epos > inode->i_size)
4650                                ext4_set_inode_flag(inode,
4651                                                    EXT4_INODE_EOFBLOCKS);
4652                }
4653                ext4_mark_inode_dirty(handle, inode);
4654                ext4_update_inode_fsync_trans(handle, inode, 1);
4655                ret2 = ext4_journal_stop(handle);
4656                if (ret2)
4657                        break;
4658        }
4659        if (ret == -ENOSPC &&
4660                        ext4_should_retry_alloc(inode->i_sb, &retries)) {
4661                ret = 0;
4662                goto retry;
4663        }
4664
4665        return ret > 0 ? ret2 : ret;
4666}
4667
4668static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len);
4669
4670static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len);
4671
4672static long ext4_zero_range(struct file *file, loff_t offset,
4673                            loff_t len, int mode)
4674{
4675        struct inode *inode = file_inode(file);
4676        handle_t *handle = NULL;
4677        unsigned int max_blocks;
4678        loff_t new_size = 0;
4679        int ret = 0;
4680        int flags;
4681        int credits;
4682        int partial_begin, partial_end;
4683        loff_t start, end;
4684        ext4_lblk_t lblk;
4685        unsigned int blkbits = inode->i_blkbits;
4686
4687        trace_ext4_zero_range(inode, offset, len, mode);
4688
4689        /* Call ext4_force_commit to flush all data in case of data=journal. */
4690        if (ext4_should_journal_data(inode)) {
4691                ret = ext4_force_commit(inode->i_sb);
4692                if (ret)
4693                        return ret;
4694        }
4695
4696        /*
4697         * Round up offset. This is not fallocate, we need to zero out
4698         * blocks, so convert interior block aligned part of the range to
4699         * unwritten and possibly manually zero out unaligned parts of the
4700         * range.
4701         */
4702        start = round_up(offset, 1 << blkbits);
4703        end = round_down((offset + len), 1 << blkbits);
4704
4705        if (start < offset || end > offset + len)
4706                return -EINVAL;
4707        partial_begin = offset & ((1 << blkbits) - 1);
4708        partial_end = (offset + len) & ((1 << blkbits) - 1);
4709
4710        lblk = start >> blkbits;
4711        max_blocks = (end >> blkbits);
4712        if (max_blocks < lblk)
4713                max_blocks = 0;
4714        else
4715                max_blocks -= lblk;
4716
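            /*
             * Worked example (illustrative, not from the original source):
             * with 4 KiB blocks (blkbits == 12), offset == 1000 and
             * len == 10000, start == 4096 and end == 8192, so
             * partial_begin == 1000 and partial_end == 2808 are both
             * non-zero, lblk == 1 and max_blocks == 1.  Block 1 is handled
             * through the block-aligned path below, while the two unaligned
             * edges are zeroed out at the end via
             * ext4_zero_partial_blocks().
             */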
4717        inode_lock(inode);
4718
4719        /*
4720         * Indirect files do not support unwritten extents
4721         */
4722        if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
4723                ret = -EOPNOTSUPP;
4724                goto out_mutex;
4725        }
4726
4727        if (!(mode & FALLOC_FL_KEEP_SIZE) &&
4728            (offset + len > inode->i_size ||
4729             offset + len > EXT4_I(inode)->i_disksize)) {
4730                new_size = offset + len;
4731                ret = inode_newsize_ok(inode, new_size);
4732                if (ret)
4733                        goto out_mutex;
4734        }
4735
4736        flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT;
4737        if (mode & FALLOC_FL_KEEP_SIZE)
4738                flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
4739
4740        /* Wait for all existing DIO workers; newcomers will block on i_mutex */
4741        inode_dio_wait(inode);
4742
4743        /* Preallocate the range including the unaligned edges */
4744        if (partial_begin || partial_end) {
4745                ret = ext4_alloc_file_blocks(file,
4746                                round_down(offset, 1 << blkbits) >> blkbits,
4747                                (round_up((offset + len), 1 << blkbits) -
4748                                 round_down(offset, 1 << blkbits)) >> blkbits,
4749                                new_size, flags);
4750                if (ret)
4751                        goto out_mutex;
4752
4753        }
4754
4755        /* Zero range excluding the unaligned edges */
4756        if (max_blocks > 0) {
4757                flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN |
4758                          EXT4_EX_NOCACHE);
4759
4760                /*
4761                 * Prevent page faults from reinstantiating pages we have
4762                 * released from page cache.
4763                 */
4764                down_write(&EXT4_I(inode)->i_mmap_sem);
4765
4766                ret = ext4_break_layouts(inode);
4767                if (ret) {
4768                        up_write(&EXT4_I(inode)->i_mmap_sem);
4769                        goto out_mutex;
4770                }
4771
4772                ret = ext4_update_disksize_before_punch(inode, offset, len);
4773                if (ret) {
4774                        up_write(&EXT4_I(inode)->i_mmap_sem);
4775                        goto out_mutex;
4776                }
4777                /* Now release the pages and zero block aligned part of pages */
4778                truncate_pagecache_range(inode, start, end - 1);
4779                inode->i_mtime = inode->i_ctime = current_time(inode);
4780
4781                ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
4782                                             flags);
4783                up_write(&EXT4_I(inode)->i_mmap_sem);
4784                if (ret)
4785                        goto out_mutex;
4786        }
4787        if (!partial_begin && !partial_end)
4788                goto out_mutex;
4789
4790        /*
4791         * In worst case we have to writeout two nonadjacent unwritten
4792         * blocks and update the inode
4793         */
4794        credits = (2 * ext4_ext_index_trans_blocks(inode, 2)) + 1;
4795        if (ext4_should_journal_data(inode))
4796                credits += 2;
4797        handle = ext4_journal_start(inode, EXT4_HT_MISC, credits);
4798        if (IS_ERR(handle)) {
4799                ret = PTR_ERR(handle);
4800                ext4_std_error(inode->i_sb, ret);
4801                goto out_mutex;
4802        }
4803
4804        inode->i_mtime = inode->i_ctime = current_time(inode);
4805        if (new_size) {
4806                ext4_update_inode_size(inode, new_size);
4807        } else {
4808                /*
4809                 * Mark that we allocate beyond EOF so the subsequent truncate
4810                 * can proceed even if the new size is the same as i_size.
4811                 */
4812                if (offset + len > inode->i_size)
4813                        ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
4814        }
4815        ext4_mark_inode_dirty(handle, inode);
4816
4817        /* Zero out partial block at the edges of the range */
4818        ret = ext4_zero_partial_blocks(handle, inode, offset, len);
4819        if (ret >= 0)
4820                ext4_update_inode_fsync_trans(handle, inode, 1);
4821
4822        if (file->f_flags & O_SYNC)
4823                ext4_handle_sync(handle);
4824
4825        ext4_journal_stop(handle);
4826out_mutex:
4827        inode_unlock(inode);
4828        return ret;
4829}
4830
4831/*
4832 * Preallocate space for a file. This implements ext4's fallocate file
4833 * operation, which gets called from the sys_fallocate() system call.
4834 * For block-mapped files, posix_fallocate should fall back to the method
4835 * of writing zeroes to the required new blocks (the same behavior which is
4836 * expected for file systems which do not support fallocate() system call).
4837 */
4838long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
4839{
4840        struct inode *inode = file_inode(file);
4841        loff_t new_size = 0;
4842        unsigned int max_blocks;
4843        int ret = 0;
4844        int flags;
4845        ext4_lblk_t lblk;
4846        unsigned int blkbits = inode->i_blkbits;
4847
4848        /*
4849         * Encrypted inodes can't handle collapse range or insert
4850         * range since we would need to re-encrypt blocks with a
4851         * different IV or XTS tweak (which are based on the logical
4852         * block number).
4853         */
4854        if (IS_ENCRYPTED(inode) &&
4855            (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE)))
4856                return -EOPNOTSUPP;
4857
4858        /* Return error if mode is not supported */
4859        if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
4860                     FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |
4861                     FALLOC_FL_INSERT_RANGE))
4862                return -EOPNOTSUPP;
4863
4864        if (mode & FALLOC_FL_PUNCH_HOLE)
4865                return ext4_punch_hole(inode, offset, len);
4866
4867        ret = ext4_convert_inline_data(inode);
4868        if (ret)
4869                return ret;
4870
4871        if (mode & FALLOC_FL_COLLAPSE_RANGE)
4872                return ext4_collapse_range(inode, offset, len);
4873
4874        if (mode & FALLOC_FL_INSERT_RANGE)
4875                return ext4_insert_range(inode, offset, len);
4876
4877        if (mode & FALLOC_FL_ZERO_RANGE)
4878                return ext4_zero_range(file, offset, len, mode);
4879
4880        trace_ext4_fallocate_enter(inode, offset, len, mode);
4881        lblk = offset >> blkbits;
4882
4883        max_blocks = EXT4_MAX_BLOCKS(len, offset, blkbits);
4884        flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT;
4885        if (mode & FALLOC_FL_KEEP_SIZE)
4886                flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
4887
4888        inode_lock(inode);
4889
4890        /*
4891         * We only support preallocation for extent-based files
4892         */
4893        if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
4894                ret = -EOPNOTSUPP;
4895                goto out;
4896        }
4897
4898        if (!(mode & FALLOC_FL_KEEP_SIZE) &&
4899            (offset + len > inode->i_size ||
4900             offset + len > EXT4_I(inode)->i_disksize)) {
4901                new_size = offset + len;
4902                ret = inode_newsize_ok(inode, new_size);
4903                if (ret)
4904                        goto out;
4905        }
4906
4907        /* Wait for all existing DIO workers; newcomers will block on i_mutex */
4908        inode_dio_wait(inode);
4909
4910        ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags);
4911        if (ret)
4912                goto out;
4913
4914        if (file->f_flags & O_SYNC && EXT4_SB(inode->i_sb)->s_journal) {
4915                ret = jbd2_complete_transaction(EXT4_SB(inode->i_sb)->s_journal,
4916                                                EXT4_I(inode)->i_sync_tid);
4917        }
4918out:
4919        inode_unlock(inode);
4920        trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
4921        return ret;
4922}
4923
4924/*
4925 * This function converts a range of blocks to written extents.
4926 * The caller of this function will pass the start offset and the size.
4927 * All unwritten extents within this range will be converted to
4928 * written extents.
4929 *
4930 * This function is called from the direct IO end_io callback
4931 * function, to convert the fallocated extents after IO is completed.
4932 * Returns 0 on success.
4933 */
4934int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
4935                                   loff_t offset, ssize_t len)
4936{
4937        unsigned int max_blocks;
4938        int ret = 0;
4939        int ret2 = 0;
4940        struct ext4_map_blocks map;
4941        unsigned int blkbits = inode->i_blkbits;
4942        unsigned int credits = 0;
4943
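            /*
             * Illustrative example (not from the original source), assuming
             * EXT4_MAX_BLOCKS() rounds the end of the byte range up to a
             * block boundary: with 4 KiB blocks, offset == 5000 and
             * len == 3000, the range [5000, 8000) lies entirely inside
             * logical block 1, so map.m_lblk == 1 and max_blocks == 1.
             */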
4944        map.m_lblk = offset >> blkbits;
4945        max_blocks = EXT4_MAX_BLOCKS(len, offset, blkbits);
4946
4947        if (!handle) {
4948                /*
4949                 * credits to insert 1 extent into extent tree
4950                 */
4951                credits = ext4_chunk_trans_blocks(inode, max_blocks);
4952        }
4953        while (ret >= 0 && ret < max_blocks) {
4954                map.m_lblk += ret;
4955                map.m_len = (max_blocks -= ret);
4956                if (credits) {
4957                        handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
4958                                                    credits);
4959                        if (IS_ERR(handle)) {
4960                                ret = PTR_ERR(handle);
4961                                break;
4962                        }
4963                }
4964                ret = ext4_map_blocks(handle, inode, &map,
4965                                      EXT4_GET_BLOCKS_IO_CONVERT_EXT);
4966                if (ret <= 0)
4967                        ext4_warning(inode->i_sb,
4968                                     "inode #%lu: block %u: len %u: "
4969                                     "ext4_ext_map_blocks returned %d",
4970                                     inode->i_ino, map.m_lblk,
4971                                     map.m_len, ret);
4972                ext4_mark_inode_dirty(handle, inode);
4973                if (credits)
4974                        ret2 = ext4_journal_stop(handle);
4975                if (ret <= 0 || ret2)
4976                        break;
4977        }
4978        return ret > 0 ? ret2 : ret;
4979}
4980
4981int ext4_convert_unwritten_io_end_vec(handle_t *handle, ext4_io_end_t *io_end)
4982{
4983        int ret, err = 0;
4984        struct ext4_io_end_vec *io_end_vec;
4985
4986        /*
4987         * This is somewhat ugly but the idea is clear: when a transaction is
4988         * reserved, everything goes into it. Otherwise we would rather start
4989         * several smaller transactions, converting each extent separately.
4990         */
4991        if (handle) {
4992                handle = ext4_journal_start_reserved(handle,
4993                                                     EXT4_HT_EXT_CONVERT);
4994                if (IS_ERR(handle))
4995                        return PTR_ERR(handle);
4996        }
4997
4998        list_for_each_entry(io_end_vec, &io_end->list_vec, list) {
4999                ret = ext4_convert_unwritten_extents(handle, io_end->inode,
5000                                                     io_end_vec->offset,
5001                                                     io_end_vec->size);
5002                if (ret)
5003                        break;
5004        }
5005
5006        if (handle)
5007                err = ext4_journal_stop(handle);
5008
5009        return ret < 0 ? ret : err;
5010}
5011
5012/*
5013 * If newes is not an existing extent (newes->es_pblk equals zero), find
5014 * delayed extent at start of newes and update newes accordingly and
5015 * return start of the next delayed extent.
5016 *
5017 * If newes is an existing extent (newes->es_pblk is not zero),
5018 * return start of next delayed extent or EXT_MAX_BLOCKS if no delayed
5019 * extent found. Leave newes unmodified.
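     *
     * Illustrative example (not from the original source): if newes
     * describes a hole covering blocks 100..199 (es_pblk == 0) and the
     * first delayed extent in that range starts at block 150, newes->es_len
     * is trimmed to 50 and 0 is returned, signalling that a hole was found.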
5020 */
5021static int ext4_find_delayed_extent(struct inode *inode,
5022                                    struct extent_status *newes)
5023{
5024        struct extent_status es;
5025        ext4_lblk_t block, next_del;
5026
5027        if (newes->es_pblk == 0) {
5028                ext4_es_find_extent_range(inode, &ext4_es_is_delayed,
5029                                          newes->es_lblk,
5030                                          newes->es_lblk + newes->es_len - 1,
5031                                          &es);
5032
5033                /*
5034                 * No extent in the extent tree contains block @newes->es_lblk,
5035                 * so the block may lie in 1) a hole or 2) a delayed extent.
5036                 */
5037                if (es.es_len == 0)
5038                        /* A hole found. */
5039                        return 0;
5040
5041                if (es.es_lblk > newes->es_lblk) {
5042                        /* A hole found. */
5043                        newes->es_len = min(es.es_lblk - newes->es_lblk,
5044                                            newes->es_len);
5045                        return 0;
5046                }
5047
5048                newes->es_len = es.es_lblk + es.es_len - newes->es_lblk;
5049        }
5050
5051        block = newes->es_lblk + newes->es_len;
5052        ext4_es_find_extent_range(inode, &ext4_es_is_delayed, block,
5053                                  EXT_MAX_BLOCKS, &es);
5054        if (es.es_len == 0)
5055                next_del = EXT_MAX_BLOCKS;
5056        else
5057                next_del = es.es_lblk;
5058
5059        return next_del;
5060}
5061
5062static int ext4_xattr_fiemap(struct inode *inode,
5063                                struct fiemap_extent_info *fieinfo)
5064{
5065        __u64 physical = 0;
5066        __u64 length;
5067        __u32 flags = FIEMAP_EXTENT_LAST;
5068        int blockbits = inode->i_sb->s_blocksize_bits;
5069        int error = 0;
5070
5071        /* in-inode? */
5072        if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
5073                struct ext4_iloc iloc;
5074                int offset;     /* offset of xattr in inode */
5075
5076                error = ext4_get_inode_loc(inode, &iloc);
5077                if (error)
5078                        return error;
5079                physical = (__u64)iloc.bh->b_blocknr << blockbits;
5080                offset = EXT4_GOOD_OLD_INODE_SIZE +
5081                                EXT4_I(inode)->i_extra_isize;
5082                physical += offset;
5083                length = EXT4_SB(inode->i_sb)->s_inode_size - offset;
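                    /*
                     * Illustrative example (not from the original source):
                     * with 256-byte on-disk inodes and i_extra_isize == 32,
                     * offset above is 128 + 32 == 160, so the in-inode
                     * xattr area reported below is the trailing 96 bytes of
                     * the inode record.
                     */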
5084                flags |= FIEMAP_EXTENT_DATA_INLINE;
5085                brelse(iloc.bh);
5086        } else { /* external block */
5087                physical = (__u64)EXT4_I(inode)->i_file_acl << blockbits;
5088                length = inode->i_sb->s_blocksize;
5089        }
5090
5091        if (physical)
5092                error = fiemap_fill_next_extent(fieinfo, 0, physical,
5093                                                length, flags);
5094        return (error < 0 ? error : 0);
5095}
5096
5097static int _ext4_fiemap(struct inode *inode,
5098                        struct fiemap_extent_info *fieinfo,
5099                        __u64 start, __u64 len,
5100                        int (*fill)(struct inode *, ext4_lblk_t,
5101                                    ext4_lblk_t,
5102                                    struct fiemap_extent_info *))
5103{
5104        ext4_lblk_t start_blk;
5105        u32 ext4_fiemap_flags = FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR;
5106
5107        int error = 0;
5108
5109        if (ext4_has_inline_data(inode)) {
5110                int has_inline = 1;
5111
5112                error = ext4_inline_data_fiemap(inode, fieinfo, &has_inline,
5113                                                start, len);
5114
5115                if (has_inline)
5116                        return error;
5117        }
5118
5119        if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) {
5120                error = ext4_ext_precache(inode);
5121                if (error)
5122                        return error;
5123                fieinfo->fi_flags &= ~FIEMAP_FLAG_CACHE;
5124        }
5125
5126        /* fallback to generic here if not in extents fmt */
5127        if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) &&
5128            fill == ext4_fill_fiemap_extents)
5129                return generic_block_fiemap(inode, fieinfo, start, len,
5130                        ext4_get_block);
5131
5132        if (fill == ext4_fill_es_cache_info)
5133                ext4_fiemap_flags &= FIEMAP_FLAG_XATTR;
5134        if (fiemap_check_flags(fieinfo, ext4_fiemap_flags))
5135                return -EBADR;
5136
5137        if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) {
5138                error = ext4_xattr_fiemap(inode, fieinfo);
5139        } else {
5140                ext4_lblk_t len_blks;
5141                __u64 last_blk;
5142
5143                start_blk = start >> inode->i_sb->s_blocksize_bits;
5144                last_blk = (start + len - 1) >> inode->i_sb->s_blocksize_bits;
5145                if (last_blk >= EXT_MAX_BLOCKS)
5146                        last_blk = EXT_MAX_BLOCKS-1;
5147                len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1;
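                    /*
                     * Illustrative example (not from the original source):
                     * with 4 KiB blocks, start == 5000 and len == 3000, the
                     * byte range [5000, 8000) gives start_blk == 1,
                     * last_blk == 1 and len_blks == 1, so exactly one block
                     * is walked below.
                     */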
5148
5149                /*
5150                 * Walk the extent tree gathering extent information
5151                 * and pushing extents back to the user.
5152                 */
5153                error = fill(inode, start_blk, len_blks, fieinfo);
5154        }
5155        return error;
5156}
5157
5158int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
5159                __u64 start, __u64 len)
5160{
5161        return _ext4_fiemap(inode, fieinfo, start, len,
5162                            ext4_fill_fiemap_extents);
5163}
5164
5165int ext4_get_es_cache(struct inode *inode, struct fiemap_extent_info *fieinfo,
5166                      __u64 start, __u64 len)
5167{
5168        if (ext4_has_inline_data(inode)) {
5169                int has_inline;
5170
5171                down_read(&EXT4_I(inode)->xattr_sem);
5172                has_inline = ext4_has_inline_data(inode);
5173                up_read(&EXT4_I(inode)->xattr_sem);
5174                if (has_inline)
5175                        return 0;
5176        }
5177
5178        return _ext4_fiemap(inode, fieinfo, start, len,
5179                            ext4_fill_es_cache_info);
5180}
5181
5182
5183/*
5184 * ext4_access_path:
5185 * Function to access the path buffer for marking it dirty.
5186 * It also checks if there are sufficient credits left in the journal handle
5187 * to update path.
5188 */
5189static int
5190ext4_access_path(handle_t *handle, struct inode *inode,
5191                struct ext4_ext_path *path)
5192{
5193        int credits, err;
5194
5195        if (!ext4_handle_valid(handle))
5196                return 0;
5197
5198        /*
5199         * Check if we need to extend the journal credits:
5200         * 3 for leaf, sb, and inode plus 2 (bmap and group
5201         * descriptor) for each block group; assume two block
5202         * groups
5203         */
5204        credits = ext4_writepage_trans_blocks(inode);
5205        err = ext4_datasem_ensure_credits(handle, inode, 7, credits, 0);
5206        if (err < 0)
5207                return err;
5208
5209        err = ext4_ext_get_access(handle, inode, path);
5210        return err;
5211}
5212
5213/*
5214 * ext4_ext_shift_path_extents:
5215 * Shift the extents of a path structure lying between path[depth].p_ext
5216 * and EXT_LAST_EXTENT(path[depth].p_hdr), by @shift blocks. @SHIFT tells
5217 * and EXT_LAST_EXTENT(path[depth].p_hdr), by @shift blocks. @SHIFT tells
5218 * whether it is a right-shift or a left-shift operation.
5219static int
5220ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift,
5221                            struct inode *inode, handle_t *handle,
5222                            enum SHIFT_DIRECTION SHIFT)
5223{
5224        int depth, err = 0;
5225        struct ext4_extent *ex_start, *ex_last;
5226        bool update = false;
5227        depth = path->p_depth;
5228
5229        while (depth >= 0) {
5230                if (depth == path->p_depth) {
5231                        ex_start = path[depth].p_ext;
5232                        if (!ex_start)
5233                                return -EFSCORRUPTED;
5234
5235                        ex_last = EXT_LAST_EXTENT(path[depth].p_hdr);
5236
5237                        err = ext4_access_path(handle, inode, path + depth);
5238                        if (err)
5239                                goto out;
5240
5241                        if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr))
5242                                update = true;
5243
5244                        while (ex_start <= ex_last) {
5245                                if (SHIFT == SHIFT_LEFT) {
5246                                        le32_add_cpu(&ex_start->ee_block,
5247                                                -shift);
5248                                        /* Try to merge to the left. */
5249                                        if ((ex_start >
5250                                            EXT_FIRST_EXTENT(path[depth].p_hdr))
5251                                            &&
5252                                            ext4_ext_try_to_merge_right(inode,
5253                                            path, ex_start - 1))
5254                                                ex_last--;
5255                                        else
5256                                                ex_start++;
5257                                } else {
5258                                        le32_add_cpu(&ex_last->ee_block, shift);
5259                                        ext4_ext_try_to_merge_right(inode, path,
5260                                                ex_last);
5261                                        ex_last--;
5262                                }
5263                        }
5264                        err = ext4_ext_dirty(handle, inode, path + depth);
5265                        if (err)
5266                                goto out;
5267
5268                        if (--depth < 0 || !update)
5269                                break;
5270                }
5271
5272                /* Update index too */
5273                err = ext4_access_path(handle, inode, path + depth);
5274                if (err)
5275                        goto out;
5276
5277                if (SHIFT == SHIFT_LEFT)
5278                        le32_add_cpu(&path[depth].p_idx->ei_block, -shift);
5279                else
5280                        le32_add_cpu(&path[depth].p_idx->ei_block, shift);
5281                err = ext4_ext_dirty(handle, inode, path + depth);
5282                if (err)
5283                        goto out;
5284
5285                /* we are done if current index is not a starting index */
5286                if (path[depth].p_idx != EXT_FIRST_INDEX(path[depth].p_hdr))
5287                        break;
5288
5289                depth--;
5290        }
5291
5292out:
5293        return err;
5294}
5295
5296/*
5297 * ext4_ext_shift_extents:
5298 * All the extents which lie in the range from @start to the last allocated
5299 * block for the @inode are shifted either towards left or right (depending
5300 * upon @SHIFT) by @shift blocks.
5301 * On success, 0 is returned, error otherwise.
5302 */
5303static int
5304ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
5305                       ext4_lblk_t start, ext4_lblk_t shift,
5306                       enum SHIFT_DIRECTION SHIFT)
5307{
5308        struct ext4_ext_path *path;
5309        int ret = 0, depth;
5310        struct ext4_extent *extent;
5311        ext4_lblk_t stop, *iterator, ex_start, ex_end;
5312
5313        /* Let path point to the last extent */
5314        path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL,
5315                                EXT4_EX_NOCACHE);
5316        if (IS_ERR(path))
5317                return PTR_ERR(path);
5318
5319        depth = path->p_depth;
5320        extent = path[depth].p_ext;
5321        if (!extent)
5322                goto out;
5323
5324        stop = le32_to_cpu(extent->ee_block);
5325
5326        /*
5327         * For left shifts, make sure the hole on the left is big enough to
5328         * accommodate the shift.  For right shifts, make sure the last extent
5329         * won't be shifted beyond EXT_MAX_BLOCKS.
5330         */
5331        if (SHIFT == SHIFT_LEFT) {
5332                path = ext4_find_extent(inode, start - 1, &path,
5333                                        EXT4_EX_NOCACHE);
5334                if (IS_ERR(path))
5335                        return PTR_ERR(path);
5336                depth = path->p_depth;
5337                extent =  path[depth].p_ext;
5338                if (extent) {
5339                        ex_start = le32_to_cpu(extent->ee_block);
5340                        ex_end = le32_to_cpu(extent->ee_block) +
5341                                ext4_ext_get_actual_len(extent);
5342                } else {
5343                        ex_start = 0;
5344                        ex_end = 0;
5345                }
5346
5347                if ((start == ex_start && shift > ex_start) ||
5348                    (shift > start - ex_end)) {
5349                        ret = -EINVAL;
5350                        goto out;
5351                }
5352        } else {
5353                if (shift > EXT_MAX_BLOCKS -
5354                    (stop + ext4_ext_get_actual_len(extent))) {
5355                        ret = -EINVAL;
5356                        goto out;
5357                }
5358        }
5359
5360        /*
5361         * In case of left shift, iterator points to start and it is increased
5362         * till we reach stop. In case of right shift, iterator points to stop
5363         * and it is decreased till we reach start.
5364         */
5365        if (SHIFT == SHIFT_LEFT)
5366                iterator = &start;
5367        else
5368                iterator = &stop;
5369
5370        /*
5371         * It's safe to start updating extents.  start and stop are unsigned,
5372         * so in case of a right shift, once the extent at block 0 is reached,
5373         * iterator is set to NULL to indicate the end of the loop.
5374         */
5375        while (iterator && start <= stop) {
5376                path = ext4_find_extent(inode, *iterator, &path,
5377                                        EXT4_EX_NOCACHE);
5378                if (IS_ERR(path))
5379                        return PTR_ERR(path);
5380                depth = path->p_depth;
5381                extent = path[depth].p_ext;
5382                if (!extent) {
5383                        EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
5384                                         (unsigned long) *iterator);
5385                        ret = -EFSCORRUPTED;
                            break;	/* free path at out: instead of leaking it */
5386                }
5387                if (SHIFT == SHIFT_LEFT && *iterator >
5388                    le32_to_cpu(extent->ee_block)) {
5389                        /* Hole, move to the next extent */
5390                        if (extent < EXT_LAST_EXTENT(path[depth].p_hdr)) {
5391                                path[depth].p_ext++;
5392                        } else {
5393                                *iterator = ext4_ext_next_allocated_block(path);
5394                                continue;
5395                        }
5396                }
5397
5398                if (SHIFT == SHIFT_LEFT) {
5399                        extent = EXT_LAST_EXTENT(path[depth].p_hdr);
5400                        *iterator = le32_to_cpu(extent->ee_block) +
5401                                        ext4_ext_get_actual_len(extent);
5402                } else {
5403                        extent = EXT_FIRST_EXTENT(path[depth].p_hdr);
5404                        if (le32_to_cpu(extent->ee_block) > 0)
5405                                *iterator = le32_to_cpu(extent->ee_block) - 1;
5406                        else
5407                                /* Beginning is reached, end of the loop */
5408                                iterator = NULL;
5409                        /* Update path extent in case we need to stop */
5410                        while (le32_to_cpu(extent->ee_block) < start)
5411                                extent++;
5412                        path[depth].p_ext = extent;
5413                }
5414                ret = ext4_ext_shift_path_extents(path, shift, inode,
5415                                handle, SHIFT);
5416                if (ret)
5417                        break;
5418        }
5419out:
5420        ext4_ext_drop_refs(path);
5421        kfree(path);
5422        return ret;
5423}
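
/*
 * A standalone sketch (not kernel code) of the left-shift validity check
 * performed above: the hole between the end of the extent preceding @start
 * and @start itself must be at least @shift blocks wide, otherwise shifted
 * extents would collide with existing ones.  The function name and types
 * below are illustrative only.
 */
#include <stdbool.h>

bool left_shift_fits(unsigned int start, unsigned int shift,
		     unsigned int prev_ex_start, unsigned int prev_ex_end)
{
	/* mirrors the two -EINVAL conditions in ext4_ext_shift_extents() */
	if (start == prev_ex_start && shift > prev_ex_start)
		return false;
	if (shift > start - prev_ex_end)
		return false;
	return true;
}

/*
 * Example: with a preceding extent covering blocks [0, 10) and start == 16,
 * left_shift_fits(16, 6, 0, 10) is true while left_shift_fits(16, 7, 0, 10)
 * is false -- a 7-block shift would overlap the preceding extent.
 */
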
5424
5425/*
5426 * ext4_collapse_range:
5427 * This implements fallocate's collapse-range functionality for ext4.
5428 * Returns 0 on success and a negative error code on failure.
5429 */
5430static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
5431{
5432        struct super_block *sb = inode->i_sb;
5433        ext4_lblk_t punch_start, punch_stop;
5434        handle_t *handle;
5435        unsigned int credits;
5436        loff_t new_size, ioffset;
5437        int ret;
5438
5439        /*
5440         * We need to test this early because xfstests assumes that a
5441         * collapse range of (0, 1) will return EOPNOTSUPP if the file
5442         * system does not support collapse range.
5443         */
5444        if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
5445                return -EOPNOTSUPP;
5446
5447        /* Collapse range works only on fs cluster size aligned regions. */
5448        if (!IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb)))
5449                return -EINVAL;
5450
5451        trace_ext4_collapse_range(inode, offset, len);
5452
5453        punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb);
5454        punch_stop = (offset + len) >> EXT4_BLOCK_SIZE_BITS(sb);
5455
5456        /* Call ext4_force_commit to flush all data in case of data=journal. */
5457        if (ext4_should_journal_data(inode)) {
5458                ret = ext4_force_commit(inode->i_sb);
5459                if (ret)
5460                        return ret;
5461        }
5462
5463        inode_lock(inode);
5464        /*
5465         * The collapse range must not reach or cross EOF; collapsing up to or
5466         * beyond EOF would effectively be a truncate operation, so reject it.
5467         */
5468        if (offset + len >= inode->i_size) {
5469                ret = -EINVAL;
5470                goto out_mutex;
5471        }
5472
5473        /* Currently just for extent based files */
5474        if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
5475                ret = -EOPNOTSUPP;
5476                goto out_mutex;
5477        }
5478
5479        /* Wait for existing dio to complete */
5480        inode_dio_wait(inode);
5481
5482        /*
5483         * Prevent page faults from reinstantiating pages we have released from
5484         * page cache.
5485         */
5486        down_write(&EXT4_I(inode)->i_mmap_sem);
5487
5488        ret = ext4_break_layouts(inode);
5489        if (ret)
5490                goto out_mmap;
5491
5492        /*
5493         * Need to round down offset to be aligned with page size boundary
5494         * for page size > block size.
5495         */
5496        ioffset = round_down(offset, PAGE_SIZE);
5497        /*
5498         * Write tail of the last page before removed range since it will get
5499         * removed from the page cache below.
5500         */
5501        ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, offset);
5502        if (ret)
5503                goto out_mmap;
5504        /*
5505         * Write out the data that will be shifted so that it is preserved
5506         * when the page cache is discarded below. i_mmap_sem also protects
5507         * us from pages becoming dirty in the meantime.
5508         */
5509        ret = filemap_write_and_wait_range(inode->i_mapping, offset + len,
5510                                           LLONG_MAX);
5511        if (ret)
5512                goto out_mmap;
5513        truncate_pagecache(inode, ioffset);
5514
5515        credits = ext4_writepage_trans_blocks(inode);
5516        handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
5517        if (IS_ERR(handle)) {
5518                ret = PTR_ERR(handle);
5519                goto out_mmap;
5520        }
5521
5522        down_write(&EXT4_I(inode)->i_data_sem);
5523        ext4_discard_preallocations(inode);
5524
5525        ret = ext4_es_remove_extent(inode, punch_start,
5526                                    EXT_MAX_BLOCKS - punch_start);
5527        if (ret) {
5528                up_write(&EXT4_I(inode)->i_data_sem);
5529                goto out_stop;
5530        }
5531
5532        ret = ext4_ext_remove_space(inode, punch_start, punch_stop - 1);
5533        if (ret) {
5534                up_write(&EXT4_I(inode)->i_data_sem);
5535                goto out_stop;
5536        }
5537        ext4_discard_preallocations(inode);
5538
5539        ret = ext4_ext_shift_extents(inode, handle, punch_stop,
5540                                     punch_stop - punch_start, SHIFT_LEFT);
5541        if (ret) {
5542                up_write(&EXT4_I(inode)->i_data_sem);
5543                goto out_stop;
5544        }
5545
5546        new_size = inode->i_size - len;
5547        i_size_write(inode, new_size);
5548        EXT4_I(inode)->i_disksize = new_size;
5549
5550        up_write(&EXT4_I(inode)->i_data_sem);
5551        if (IS_SYNC(inode))
5552                ext4_handle_sync(handle);
5553        inode->i_mtime = inode->i_ctime = current_time(inode);
5554        ext4_mark_inode_dirty(handle, inode);
5555        ext4_update_inode_fsync_trans(handle, inode, 1);
5556
5557out_stop:
5558        ext4_journal_stop(handle);
5559out_mmap:
5560        up_write(&EXT4_I(inode)->i_mmap_sem);
5561out_mutex:
5562        inode_unlock(inode);
5563        return ret;
5564}
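
/*
 * How this path is typically reached from userspace: a minimal sketch using
 * fallocate(2) with FALLOC_FL_COLLAPSE_RANGE.  The file path and sizes are
 * made up for illustration; offset and length must be cluster-size aligned
 * or the call fails with EINVAL, as checked above.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/mnt/ext4/testfile", O_RDWR);	/* hypothetical file */

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* remove bytes [1 MiB, 2 MiB) and shift the rest of the file left */
	if (fallocate(fd, FALLOC_FL_COLLAPSE_RANGE, 1 << 20, 1 << 20) < 0)
		perror("fallocate(FALLOC_FL_COLLAPSE_RANGE)");
	close(fd);
	return 0;
}
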
5565
5566/*
5567 * ext4_insert_range:
5568 * This function implements the FALLOC_FL_INSERT_RANGE flag of fallocate.
5569 * The data blocks from @offset up to EOF are shifted to the right by @len
5570 * to create a hole in @inode. The inode size is increased
5571 * by @len bytes.
5572 * Returns 0 on success, error otherwise.
5573 */
5574static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
5575{
5576        struct super_block *sb = inode->i_sb;
5577        handle_t *handle;
5578        struct ext4_ext_path *path;
5579        struct ext4_extent *extent;
5580        ext4_lblk_t offset_lblk, len_lblk, ee_start_lblk = 0;
5581        unsigned int credits, ee_len;
5582        int ret = 0, depth, split_flag = 0;
5583        loff_t ioffset;
5584
5585        /*
5586         * We need to test this early because xfstests assumes that an
5587         * insert range of (0, 1) will return EOPNOTSUPP if the file
5588         * system does not support insert range.
5589         */
5590        if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
5591                return -EOPNOTSUPP;
5592
5593        /* Insert range works only on fs cluster size aligned regions. */
5594        if (!IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb)))
5595                return -EINVAL;
5596
5597        trace_ext4_insert_range(inode, offset, len);
5598
5599        offset_lblk = offset >> EXT4_BLOCK_SIZE_BITS(sb);
5600        len_lblk = len >> EXT4_BLOCK_SIZE_BITS(sb);
5601
5602        /* Call ext4_force_commit to flush all data in case of data=journal */
5603        if (ext4_should_journal_data(inode)) {
5604                ret = ext4_force_commit(inode->i_sb);
5605                if (ret)
5606                        return ret;
5607        }
5608
5609        inode_lock(inode);
5610        /* Currently just for extent based files */
5611        if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
5612                ret = -EOPNOTSUPP;
5613                goto out_mutex;
5614        }
5615
5616        /* Check whether the maximum file size would be exceeded */
5617        if (len > inode->i_sb->s_maxbytes - inode->i_size) {
5618                ret = -EFBIG;
5619                goto out_mutex;
5620        }
5621
5622        /* Offset must be less than i_size */
5623        if (offset >= inode->i_size) {
5624                ret = -EINVAL;
5625                goto out_mutex;
5626        }
5627
5628        /* Wait for existing dio to complete */
5629        inode_dio_wait(inode);
5630
5631        /*
5632         * Prevent page faults from reinstantiating pages we have released from
5633         * page cache.
5634         */
5635        down_write(&EXT4_I(inode)->i_mmap_sem);
5636
5637        ret = ext4_break_layouts(inode);
5638        if (ret)
5639                goto out_mmap;
5640
5641        /*
5642         * Need to round down to align start offset to page size boundary
5643         * for page size > block size.
5644         */
5645        ioffset = round_down(offset, PAGE_SIZE);
5646        /* Write out all dirty pages */
5647        ret = filemap_write_and_wait_range(inode->i_mapping, ioffset,
5648                        LLONG_MAX);
5649        if (ret)
5650                goto out_mmap;
5651        truncate_pagecache(inode, ioffset);
5652
5653        credits = ext4_writepage_trans_blocks(inode);
5654        handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
5655        if (IS_ERR(handle)) {
5656                ret = PTR_ERR(handle);
5657                goto out_mmap;
5658        }
5659
5660        /* Expand the file first to avoid data loss if the shift fails */
5661        inode->i_size += len;
5662        EXT4_I(inode)->i_disksize += len;
5663        inode->i_mtime = inode->i_ctime = current_time(inode);
5664        ret = ext4_mark_inode_dirty(handle, inode);
5665        if (ret)
5666                goto out_stop;
5667
5668        down_write(&EXT4_I(inode)->i_data_sem);
5669        ext4_discard_preallocations(inode);
5670
5671        path = ext4_find_extent(inode, offset_lblk, NULL, 0);
5672        if (IS_ERR(path)) {
                    ret = PTR_ERR(path);	/* don't report success if the lookup failed */
5673                up_write(&EXT4_I(inode)->i_data_sem);
5674                goto out_stop;
5675        }
5676
5677        depth = ext_depth(inode);
5678        extent = path[depth].p_ext;
5679        if (extent) {
5680                ee_start_lblk = le32_to_cpu(extent->ee_block);
5681                ee_len = ext4_ext_get_actual_len(extent);
5682
5683                /*
5684                 * If offset_lblk is not the starting block of the extent,
5685                 * split the extent at @offset_lblk.
5686                 */
5687                if ((offset_lblk > ee_start_lblk) &&
5688                                (offset_lblk < (ee_start_lblk + ee_len))) {
5689                        if (ext4_ext_is_unwritten(extent))
5690                                split_flag = EXT4_EXT_MARK_UNWRIT1 |
5691                                        EXT4_EXT_MARK_UNWRIT2;
5692                        ret = ext4_split_extent_at(handle, inode, &path,
5693                                        offset_lblk, split_flag,
5694                                        EXT4_EX_NOCACHE |
5695                                        EXT4_GET_BLOCKS_PRE_IO |
5696                                        EXT4_GET_BLOCKS_METADATA_NOFAIL);
5697                }
5698
5699                ext4_ext_drop_refs(path);
5700                kfree(path);
5701                if (ret < 0) {
5702                        up_write(&EXT4_I(inode)->i_data_sem);
5703                        goto out_stop;
5704                }
5705        } else {
5706                ext4_ext_drop_refs(path);
5707                kfree(path);
5708        }
5709
5710        ret = ext4_es_remove_extent(inode, offset_lblk,
5711                        EXT_MAX_BLOCKS - offset_lblk);
5712        if (ret) {
5713                up_write(&EXT4_I(inode)->i_data_sem);
5714                goto out_stop;
5715        }
5716
5717        /*
5718         * If offset_lblk lies in a hole at the start of the file, use
5719         * ee_start_lblk to shift the extents.
5720         */
5721        ret = ext4_ext_shift_extents(inode, handle,
5722                ee_start_lblk > offset_lblk ? ee_start_lblk : offset_lblk,
5723                len_lblk, SHIFT_RIGHT);
5724
5725        up_write(&EXT4_I(inode)->i_data_sem);
5726        if (IS_SYNC(inode))
5727                ext4_handle_sync(handle);
5728        if (ret >= 0)
5729                ext4_update_inode_fsync_trans(handle, inode, 1);
5730
5731out_stop:
5732        ext4_journal_stop(handle);
5733out_mmap:
5734        up_write(&EXT4_I(inode)->i_mmap_sem);
5735out_mutex:
5736        inode_unlock(inode);
5737        return ret;
5738}
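
/*
 * The userspace entry point for this function, under the same assumptions as
 * the collapse-range sketch above: fallocate(2) with FALLOC_FL_INSERT_RANGE
 * inserts a cluster-aligned hole at the given offset and grows i_size by len.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>

/* insert a 1 MiB hole at offset 4 MiB; fd refers to an open ext4 file */
static inline int insert_hole_example(int fd)
{
	return fallocate(fd, FALLOC_FL_INSERT_RANGE, 4 << 20, 1 << 20);
}
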
5739
5740/**
5741 * ext4_swap_extents() - Swap extents between two inodes
5742 * @handle: handle for this transaction
5743 * @inode1:     First inode
5744 * @inode2:     Second inode
5745 * @lblk1:      Start block for first inode
5746 * @lblk2:      Start block for second inode
5747 * @count:      Number of blocks to swap
5748 * @unwritten: Mark second inode's extents as unwritten after swap
5749 * @erp:        Pointer to save error value
5750 *
5751 * This helper routine does exactly what its name promises: it swaps extents.
5752 * Everything else, such as page-cache locking consistency, bh mapping
5753 * consistency or copying of extent data, must be performed by the caller.
5754 * Locking:
5755 *              i_mutex is held for both inodes
5756 *              i_data_sem is locked for write for both inodes
5757 * Assumptions:
5758 *              All pages from requested range are locked for both inodes
5759 */
5760int
5761ext4_swap_extents(handle_t *handle, struct inode *inode1,
5762                  struct inode *inode2, ext4_lblk_t lblk1, ext4_lblk_t lblk2,
5763                  ext4_lblk_t count, int unwritten, int *erp)
5764{
5765        struct ext4_ext_path *path1 = NULL;
5766        struct ext4_ext_path *path2 = NULL;
5767        int replaced_count = 0;
5768
5769        BUG_ON(!rwsem_is_locked(&EXT4_I(inode1)->i_data_sem));
5770        BUG_ON(!rwsem_is_locked(&EXT4_I(inode2)->i_data_sem));
5771        BUG_ON(!inode_is_locked(inode1));
5772        BUG_ON(!inode_is_locked(inode2));
5773
5774        *erp = ext4_es_remove_extent(inode1, lblk1, count);
5775        if (unlikely(*erp))
5776                return 0;
5777        *erp = ext4_es_remove_extent(inode2, lblk2, count);
5778        if (unlikely(*erp))
5779                return 0;
5780
5781        while (count) {
5782                struct ext4_extent *ex1, *ex2, tmp_ex;
5783                ext4_lblk_t e1_blk, e2_blk;
5784                int e1_len, e2_len, len;
5785                int split = 0;
5786
5787                path1 = ext4_find_extent(inode1, lblk1, NULL, EXT4_EX_NOCACHE);
5788                if (IS_ERR(path1)) {
5789                        *erp = PTR_ERR(path1);
5790                        path1 = NULL;
5791                finish:
5792                        count = 0;
5793                        goto repeat;
5794                }
5795                path2 = ext4_find_extent(inode2, lblk2, NULL, EXT4_EX_NOCACHE);
5796                if (IS_ERR(path2)) {
5797                        *erp = PTR_ERR(path2);
5798                        path2 = NULL;
5799                        goto finish;
5800                }
5801                ex1 = path1[path1->p_depth].p_ext;
5802                ex2 = path2[path2->p_depth].p_ext;
5803                /* Do we have something to swap? */
5804                if (unlikely(!ex2 || !ex1))
5805                        goto finish;
5806
5807                e1_blk = le32_to_cpu(ex1->ee_block);
5808                e2_blk = le32_to_cpu(ex2->ee_block);
5809                e1_len = ext4_ext_get_actual_len(ex1);
5810                e2_len = ext4_ext_get_actual_len(ex2);
5811
5812                /* Hole handling */
5813                if (!in_range(lblk1, e1_blk, e1_len) ||
5814                    !in_range(lblk2, e2_blk, e2_len)) {
5815                        ext4_lblk_t next1, next2;
5816
5817                        /* if hole after extent, then go to next extent */
5818                        next1 = ext4_ext_next_allocated_block(path1);
5819                        next2 = ext4_ext_next_allocated_block(path2);
5820                        /* If hole before extent, then shift to that extent */
5821                        if (e1_blk > lblk1)
5822                                next1 = e1_blk;
5823                        if (e2_blk > lblk2)
5824                                next2 = e2_blk;
5825                        /* Do we have something to swap */
5826                        if (next1 == EXT_MAX_BLOCKS || next2 == EXT_MAX_BLOCKS)
5827                                goto finish;
5828                        /* Move to the rightmost boundary */
5829                        len = next1 - lblk1;
5830                        if (len < next2 - lblk2)
5831                                len = next2 - lblk2;
5832                        if (len > count)
5833                                len = count;
5834                        lblk1 += len;
5835                        lblk2 += len;
5836                        count -= len;
5837                        goto repeat;
5838                }
5839
5840                /* Prepare left boundary */
5841                if (e1_blk < lblk1) {
5842                        split = 1;
5843                        *erp = ext4_force_split_extent_at(handle, inode1,
5844                                                &path1, lblk1, 0);
5845                        if (unlikely(*erp))
5846                                goto finish;
5847                }
5848                if (e2_blk < lblk2) {
5849                        split = 1;
5850                        *erp = ext4_force_split_extent_at(handle, inode2,
5851                                                &path2,  lblk2, 0);
5852                        if (unlikely(*erp))
5853                                goto finish;
5854                }
5855                /* ext4_split_extent_at() may result in a leaf extent split,
5856                 * so the path must be revalidated. */
5857                if (split)
5858                        goto repeat;
5859
5860                /* Prepare right boundary */
5861                len = count;
5862                if (len > e1_blk + e1_len - lblk1)
5863                        len = e1_blk + e1_len - lblk1;
5864                if (len > e2_blk + e2_len - lblk2)
5865                        len = e2_blk + e2_len - lblk2;
5866
5867                if (len != e1_len) {
5868                        split = 1;
5869                        *erp = ext4_force_split_extent_at(handle, inode1,
5870                                                &path1, lblk1 + len, 0);
5871                        if (unlikely(*erp))
5872                                goto finish;
5873                }
5874                if (len != e2_len) {
5875                        split = 1;
5876                        *erp = ext4_force_split_extent_at(handle, inode2,
5877                                                &path2, lblk2 + len, 0);
5878                        if (*erp)
5879                                goto finish;
5880                }
5881                /* ext4_split_extent_at() may result in a leaf extent split,
5882                 * so the path must be revalidated. */
5883                if (split)
5884                        goto repeat;
5885
5886                BUG_ON(e2_len != e1_len);
5887                *erp = ext4_ext_get_access(handle, inode1, path1 + path1->p_depth);
5888                if (unlikely(*erp))
5889                        goto finish;
5890                *erp = ext4_ext_get_access(handle, inode2, path2 + path2->p_depth);
5891                if (unlikely(*erp))
5892                        goto finish;
5893
5894                /* Both extents are fully inside the boundaries. Swap them now */
5895                tmp_ex = *ex1;
5896                ext4_ext_store_pblock(ex1, ext4_ext_pblock(ex2));
5897                ext4_ext_store_pblock(ex2, ext4_ext_pblock(&tmp_ex));
5898                ex1->ee_len = cpu_to_le16(e2_len);
5899                ex2->ee_len = cpu_to_le16(e1_len);
5900                if (unwritten)
5901                        ext4_ext_mark_unwritten(ex2);
5902                if (ext4_ext_is_unwritten(&tmp_ex))
5903                        ext4_ext_mark_unwritten(ex1);
5904
5905                ext4_ext_try_to_merge(handle, inode2, path2, ex2);
5906                ext4_ext_try_to_merge(handle, inode1, path1, ex1);
5907                *erp = ext4_ext_dirty(handle, inode2, path2 +
5908                                      path2->p_depth);
5909                if (unlikely(*erp))
5910                        goto finish;
5911                *erp = ext4_ext_dirty(handle, inode1, path1 +
5912                                      path1->p_depth);
5913                /*
5914                 * This may look scary: the second inode already points to the
5915                 * new blocks and was successfully dirtied. But luckily an error
5916                 * here can only be caused by a journal error, in which case the
5917                 * whole transaction will be aborted anyway.
5918                 */
5919                if (unlikely(*erp))
5920                        goto finish;
5921                lblk1 += len;
5922                lblk2 += len;
5923                replaced_count += len;
5924                count -= len;
5925
5926        repeat:
5927                ext4_ext_drop_refs(path1);
5928                kfree(path1);
5929                ext4_ext_drop_refs(path2);
5930                kfree(path2);
5931                path1 = path2 = NULL;
5932        }
5933        return replaced_count;
5934}
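
/*
 * ext4_swap_extents() is driven by the online defragmentation ioctl
 * (EXT4_IOC_MOVE_EXT) via ext4_move_extents(); e4defrag from e2fsprogs is the
 * usual userspace caller.  A hedged sketch of such a caller follows.  Note
 * that struct move_extent and EXT4_IOC_MOVE_EXT are not exported in a uapi
 * header, so a real program has to copy their definitions from fs/ext4/ext4.h;
 * here they are simply assumed to be visible.
 */
#include <sys/ioctl.h>

/* fd_orig/fd_donor: open fds for the fragmented file and the donor file */
static int move_blocks_example(int fd_orig, int fd_donor,
			       unsigned long long nr_blocks)
{
	struct move_extent me = {
		.donor_fd    = fd_donor,	/* donor file descriptor */
		.orig_start  = 0,		/* logical start block, original file */
		.donor_start = 0,		/* logical start block, donor file */
		.len         = nr_blocks,	/* number of blocks to move */
	};

	/* on success, me.moved_len reports how many blocks were swapped */
	return ioctl(fd_orig, EXT4_IOC_MOVE_EXT, &me);
}
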
5935
5936/*
5937 * ext4_clu_mapped - determine whether any block in a logical cluster has
5938 *                   been mapped to a physical cluster
5939 *
5940 * @inode - file containing the logical cluster
5941 * @lclu - logical cluster of interest
5942 *
5943 * Returns 1 if any block in the logical cluster is mapped, signifying
5944 * that a physical cluster has been allocated for it.  Otherwise,
5945 * returns 0.  Can also return negative error codes.  Derived from
5946 * ext4_ext_map_blocks().
5947 */
5948int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu)
5949{
5950        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
5951        struct ext4_ext_path *path;
5952        int depth, mapped = 0, err = 0;
5953        struct ext4_extent *extent;
5954        ext4_lblk_t first_lblk, first_lclu, last_lclu;
5955
5956        /* search for the extent closest to the first block in the cluster */
5957        path = ext4_find_extent(inode, EXT4_C2B(sbi, lclu), NULL, 0);
5958        if (IS_ERR(path)) {
5959                err = PTR_ERR(path);
5960                path = NULL;
5961                goto out;
5962        }
5963
5964        depth = ext_depth(inode);
5965
5966        /*
5967         * A consistent leaf must not be empty.  This situation is possible,
5968         * though, _during_ tree modification, and it's why an assert can't
5969         * be put in ext4_find_extent().
5970         */
5971        if (unlikely(path[depth].p_ext == NULL && depth != 0)) {
5972                EXT4_ERROR_INODE(inode,
5973                    "bad extent address - lblock: %lu, depth: %d, pblock: %lld",
5974                                 (unsigned long) EXT4_C2B(sbi, lclu),
5975                                 depth, path[depth].p_block);
5976                err = -EFSCORRUPTED;
5977                goto out;
5978        }
5979
5980        extent = path[depth].p_ext;
5981
5982        /* can't be mapped if the extent tree is empty */
5983        if (extent == NULL)
5984                goto out;
5985
5986        first_lblk = le32_to_cpu(extent->ee_block);
5987        first_lclu = EXT4_B2C(sbi, first_lblk);
5988
5989        /*
5990         * Three possible outcomes at this point - found extent spanning
5991         * the target cluster, to the left of the target cluster, or to the
5992         * right of the target cluster.  The first two cases are handled here.
5993         * The last case indicates the target cluster is not mapped.
5994         */
5995        if (lclu >= first_lclu) {
5996                last_lclu = EXT4_B2C(sbi, first_lblk +
5997                                     ext4_ext_get_actual_len(extent) - 1);
5998                if (lclu <= last_lclu) {
5999                        mapped = 1;
6000                } else {
6001                        first_lblk = ext4_ext_next_allocated_block(path);
6002                        first_lclu = EXT4_B2C(sbi, first_lblk);
6003                        if (lclu == first_lclu)
6004                                mapped = 1;
6005                }
6006        }
6007
6008out:
6009        ext4_ext_drop_refs(path);
6010        kfree(path);
6011
6012        return err ? err : mapped;
6013}
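
/*
 * A standalone illustration (not kernel code) of the block/cluster arithmetic
 * used above.  With bigalloc, a cluster is 2^cluster_bits blocks, so
 * EXT4_B2C() is a right shift and EXT4_C2B() a left shift; the cluster_bits
 * value below is made up for the example.
 */
#include <stdio.h>

int main(void)
{
	unsigned int cluster_bits = 4;		/* 16 blocks per cluster (example) */
	unsigned int first_lblk = 100;		/* extent starts at logical block 100 */
	unsigned int ext_len = 40;		/* extent covers 40 blocks */

	unsigned int first_lclu = first_lblk >> cluster_bits;			/* B2C -> 6 */
	unsigned int last_lclu = (first_lblk + ext_len - 1) >> cluster_bits;	/* B2C -> 8 */

	/* any logical cluster in [6, 8] has at least one mapped block */
	printf("clusters %u..%u are mapped\n", first_lclu, last_lclu);
	return 0;
}
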
6014