linux/fs/ext4/extents.c
<<
>>
Prefs
   1/*
   2 * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
   3 * Written by Alex Tomas <alex@clusterfs.com>
   4 *
   5 * Architecture independence:
   6 *   Copyright (c) 2005, Bull S.A.
   7 *   Written by Pierre Peiffer <pierre.peiffer@bull.net>
   8 *
   9 * This program is free software; you can redistribute it and/or modify
  10 * it under the terms of the GNU General Public License version 2 as
  11 * published by the Free Software Foundation.
  12 *
  13 * This program is distributed in the hope that it will be useful,
  14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 * GNU General Public License for more details.
  17 *
  18 * You should have received a copy of the GNU General Public Licens
  19 * along with this program; if not, write to the Free Software
  20 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-
  21 */
  22
  23/*
  24 * Extents support for EXT4
  25 *
  26 * TODO:
  27 *   - ext4*_error() should be used in some situations
  28 *   - analyze all BUG()/BUG_ON(), use -EIO where appropriate
  29 *   - smart tree reduction
  30 */
  31
  32#include <linux/fs.h>
  33#include <linux/time.h>
  34#include <linux/jbd2.h>
  35#include <linux/highuid.h>
  36#include <linux/pagemap.h>
  37#include <linux/quotaops.h>
  38#include <linux/string.h>
  39#include <linux/slab.h>
  40#include <asm/uaccess.h>
  41#include <linux/fiemap.h>
  42#include "ext4_jbd2.h"
  43#include "ext4_extents.h"
  44#include "xattr.h"
  45
  46#include <trace/events/ext4.h>
  47
  48/*
  49 * used by extent splitting.
  50 */
  51#define EXT4_EXT_MAY_ZEROOUT    0x1  /* safe to zeroout if split fails \
  52                                        due to ENOSPC */
  53#define EXT4_EXT_MARK_UNWRIT1   0x2  /* mark first half unwritten */
  54#define EXT4_EXT_MARK_UNWRIT2   0x4  /* mark second half unwritten */
  55
  56#define EXT4_EXT_DATA_VALID1    0x8  /* first half contains valid data */
  57#define EXT4_EXT_DATA_VALID2    0x10 /* second half contains valid data */
  58
  59static __le32 ext4_extent_block_csum(struct inode *inode,
  60                                     struct ext4_extent_header *eh)
  61{
  62        struct ext4_inode_info *ei = EXT4_I(inode);
  63        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
  64        __u32 csum;
  65
  66        csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)eh,
  67                           EXT4_EXTENT_TAIL_OFFSET(eh));
  68        return cpu_to_le32(csum);
  69}
  70
  71static int ext4_extent_block_csum_verify(struct inode *inode,
  72                                         struct ext4_extent_header *eh)
  73{
  74        struct ext4_extent_tail *et;
  75
  76        if (!ext4_has_metadata_csum(inode->i_sb))
  77                return 1;
  78
  79        et = find_ext4_extent_tail(eh);
  80        if (et->et_checksum != ext4_extent_block_csum(inode, eh))
  81                return 0;
  82        return 1;
  83}
  84
  85static void ext4_extent_block_csum_set(struct inode *inode,
  86                                       struct ext4_extent_header *eh)
  87{
  88        struct ext4_extent_tail *et;
  89
  90        if (!ext4_has_metadata_csum(inode->i_sb))
  91                return;
  92
  93        et = find_ext4_extent_tail(eh);
  94        et->et_checksum = ext4_extent_block_csum(inode, eh);
  95}
  96
  97static int ext4_split_extent(handle_t *handle,
  98                                struct inode *inode,
  99                                struct ext4_ext_path *path,
 100                                struct ext4_map_blocks *map,
 101                                int split_flag,
 102                                int flags);
 103
 104static int ext4_split_extent_at(handle_t *handle,
 105                             struct inode *inode,
 106                             struct ext4_ext_path *path,
 107                             ext4_lblk_t split,
 108                             int split_flag,
 109                             int flags);
 110
 111static int ext4_find_delayed_extent(struct inode *inode,
 112                                    struct extent_status *newes);
 113
 114static int ext4_ext_truncate_extend_restart(handle_t *handle,
 115                                            struct inode *inode,
 116                                            int needed)
 117{
 118        int err;
 119
 120        if (!ext4_handle_valid(handle))
 121                return 0;
 122        if (handle->h_buffer_credits > needed)
 123                return 0;
 124        err = ext4_journal_extend(handle, needed);
 125        if (err <= 0)
 126                return err;
 127        err = ext4_truncate_restart_trans(handle, inode, needed);
 128        if (err == 0)
 129                err = -EAGAIN;
 130
 131        return err;
 132}
 133
 134/*
 135 * could return:
 136 *  - EROFS
 137 *  - ENOMEM
 138 */
 139static int ext4_ext_get_access(handle_t *handle, struct inode *inode,
 140                                struct ext4_ext_path *path)
 141{
 142        if (path->p_bh) {
 143                /* path points to block */
 144                BUFFER_TRACE(path->p_bh, "get_write_access");
 145                return ext4_journal_get_write_access(handle, path->p_bh);
 146        }
 147        /* path points to leaf/index in inode body */
 148        /* we use in-core data, no need to protect them */
 149        return 0;
 150}
 151
 152/*
 153 * could return:
 154 *  - EROFS
 155 *  - ENOMEM
 156 *  - EIO
 157 */
 158int __ext4_ext_dirty(const char *where, unsigned int line, handle_t *handle,
 159                     struct inode *inode, struct ext4_ext_path *path)
 160{
 161        int err;
 162
 163        WARN_ON(!rwsem_is_locked(&EXT4_I(inode)->i_data_sem));
 164        if (path->p_bh) {
 165                ext4_extent_block_csum_set(inode, ext_block_hdr(path->p_bh));
 166                /* path points to block */
 167                err = __ext4_handle_dirty_metadata(where, line, handle,
 168                                                   inode, path->p_bh);
 169        } else {
 170                /* path points to leaf/index in inode body */
 171                err = ext4_mark_inode_dirty(handle, inode);
 172        }
 173        return err;
 174}
 175
 176static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
 177                              struct ext4_ext_path *path,
 178                              ext4_lblk_t block)
 179{
 180        if (path) {
 181                int depth = path->p_depth;
 182                struct ext4_extent *ex;
 183
 184                /*
 185                 * Try to predict block placement assuming that we are
 186                 * filling in a file which will eventually be
 187                 * non-sparse --- i.e., in the case of libbfd writing
 188                 * an ELF object sections out-of-order but in a way
 189                 * the eventually results in a contiguous object or
 190                 * executable file, or some database extending a table
 191                 * space file.  However, this is actually somewhat
 192                 * non-ideal if we are writing a sparse file such as
 193                 * qemu or KVM writing a raw image file that is going
 194                 * to stay fairly sparse, since it will end up
 195                 * fragmenting the file system's free space.  Maybe we
 196                 * should have some hueristics or some way to allow
 197                 * userspace to pass a hint to file system,
 198                 * especially if the latter case turns out to be
 199                 * common.
 200                 */
 201                ex = path[depth].p_ext;
 202                if (ex) {
 203                        ext4_fsblk_t ext_pblk = ext4_ext_pblock(ex);
 204                        ext4_lblk_t ext_block = le32_to_cpu(ex->ee_block);
 205
 206                        if (block > ext_block)
 207                                return ext_pblk + (block - ext_block);
 208                        else
 209                                return ext_pblk - (ext_block - block);
 210                }
 211
 212                /* it looks like index is empty;
 213                 * try to find starting block from index itself */
 214                if (path[depth].p_bh)
 215                        return path[depth].p_bh->b_blocknr;
 216        }
 217
 218        /* OK. use inode's group */
 219        return ext4_inode_to_goal_block(inode);
 220}
 221
 222/*
 223 * Allocation for a meta data block
 224 */
 225static ext4_fsblk_t
 226ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
 227                        struct ext4_ext_path *path,
 228                        struct ext4_extent *ex, int *err, unsigned int flags)
 229{
 230        ext4_fsblk_t goal, newblock;
 231
 232        goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
 233        newblock = ext4_new_meta_blocks(handle, inode, goal, flags,
 234                                        NULL, err);
 235        return newblock;
 236}
 237
 238static inline int ext4_ext_space_block(struct inode *inode, int check)
 239{
 240        int size;
 241
 242        size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
 243                        / sizeof(struct ext4_extent);
 244#ifdef AGGRESSIVE_TEST
 245        if (!check && size > 6)
 246                size = 6;
 247#endif
 248        return size;
 249}
 250
 251static inline int ext4_ext_space_block_idx(struct inode *inode, int check)
 252{
 253        int size;
 254
 255        size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
 256                        / sizeof(struct ext4_extent_idx);
 257#ifdef AGGRESSIVE_TEST
 258        if (!check && size > 5)
 259                size = 5;
 260#endif
 261        return size;
 262}
 263
 264static inline int ext4_ext_space_root(struct inode *inode, int check)
 265{
 266        int size;
 267
 268        size = sizeof(EXT4_I(inode)->i_data);
 269        size -= sizeof(struct ext4_extent_header);
 270        size /= sizeof(struct ext4_extent);
 271#ifdef AGGRESSIVE_TEST
 272        if (!check && size > 3)
 273                size = 3;
 274#endif
 275        return size;
 276}
 277
 278static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
 279{
 280        int size;
 281
 282        size = sizeof(EXT4_I(inode)->i_data);
 283        size -= sizeof(struct ext4_extent_header);
 284        size /= sizeof(struct ext4_extent_idx);
 285#ifdef AGGRESSIVE_TEST
 286        if (!check && size > 4)
 287                size = 4;
 288#endif
 289        return size;
 290}
 291
 292/*
 293 * Calculate the number of metadata blocks needed
 294 * to allocate @blocks
 295 * Worse case is one block per extent
 296 */
 297int ext4_ext_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)
 298{
 299        struct ext4_inode_info *ei = EXT4_I(inode);
 300        int idxs;
 301
 302        idxs = ((inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
 303                / sizeof(struct ext4_extent_idx));
 304
 305        /*
 306         * If the new delayed allocation block is contiguous with the
 307         * previous da block, it can share index blocks with the
 308         * previous block, so we only need to allocate a new index
 309         * block every idxs leaf blocks.  At ldxs**2 blocks, we need
 310         * an additional index block, and at ldxs**3 blocks, yet
 311         * another index blocks.
 312         */
 313        if (ei->i_da_metadata_calc_len &&
 314            ei->i_da_metadata_calc_last_lblock+1 == lblock) {
 315                int num = 0;
 316
 317                if ((ei->i_da_metadata_calc_len % idxs) == 0)
 318                        num++;
 319                if ((ei->i_da_metadata_calc_len % (idxs*idxs)) == 0)
 320                        num++;
 321                if ((ei->i_da_metadata_calc_len % (idxs*idxs*idxs)) == 0) {
 322                        num++;
 323                        ei->i_da_metadata_calc_len = 0;
 324                } else
 325                        ei->i_da_metadata_calc_len++;
 326                ei->i_da_metadata_calc_last_lblock++;
 327                return num;
 328        }
 329
 330        /*
 331         * In the worst case we need a new set of index blocks at
 332         * every level of the inode's extent tree.
 333         */
 334        ei->i_da_metadata_calc_len = 1;
 335        ei->i_da_metadata_calc_last_lblock = lblock;
 336        return ext_depth(inode) + 1;
 337}
 338
 339static int
 340ext4_ext_max_entries(struct inode *inode, int depth)
 341{
 342        int max;
 343
 344        if (depth == ext_depth(inode)) {
 345                if (depth == 0)
 346                        max = ext4_ext_space_root(inode, 1);
 347                else
 348                        max = ext4_ext_space_root_idx(inode, 1);
 349        } else {
 350                if (depth == 0)
 351                        max = ext4_ext_space_block(inode, 1);
 352                else
 353                        max = ext4_ext_space_block_idx(inode, 1);
 354        }
 355
 356        return max;
 357}
 358
 359static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
 360{
 361        ext4_fsblk_t block = ext4_ext_pblock(ext);
 362        int len = ext4_ext_get_actual_len(ext);
 363        ext4_lblk_t lblock = le32_to_cpu(ext->ee_block);
 364
 365        /*
 366         * We allow neither:
 367         *  - zero length
 368         *  - overflow/wrap-around
 369         */
 370        if (lblock + len <= lblock)
 371                return 0;
 372        return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len);
 373}
 374
 375static int ext4_valid_extent_idx(struct inode *inode,
 376                                struct ext4_extent_idx *ext_idx)
 377{
 378        ext4_fsblk_t block = ext4_idx_pblock(ext_idx);
 379
 380        return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, 1);
 381}
 382
 383static int ext4_valid_extent_entries(struct inode *inode,
 384                                struct ext4_extent_header *eh,
 385                                int depth)
 386{
 387        unsigned short entries;
 388        if (eh->eh_entries == 0)
 389                return 1;
 390
 391        entries = le16_to_cpu(eh->eh_entries);
 392
 393        if (depth == 0) {
 394                /* leaf entries */
 395                struct ext4_extent *ext = EXT_FIRST_EXTENT(eh);
 396                struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
 397                ext4_fsblk_t pblock = 0;
 398                ext4_lblk_t lblock = 0;
 399                ext4_lblk_t prev = 0;
 400                int len = 0;
 401                while (entries) {
 402                        if (!ext4_valid_extent(inode, ext))
 403                                return 0;
 404
 405                        /* Check for overlapping extents */
 406                        lblock = le32_to_cpu(ext->ee_block);
 407                        len = ext4_ext_get_actual_len(ext);
 408                        if ((lblock <= prev) && prev) {
 409                                pblock = ext4_ext_pblock(ext);
 410                                es->s_last_error_block = cpu_to_le64(pblock);
 411                                return 0;
 412                        }
 413                        ext++;
 414                        entries--;
 415                        prev = lblock + len - 1;
 416                }
 417        } else {
 418                struct ext4_extent_idx *ext_idx = EXT_FIRST_INDEX(eh);
 419                while (entries) {
 420                        if (!ext4_valid_extent_idx(inode, ext_idx))
 421                                return 0;
 422                        ext_idx++;
 423                        entries--;
 424                }
 425        }
 426        return 1;
 427}
 428
 429static int __ext4_ext_check(const char *function, unsigned int line,
 430                            struct inode *inode, struct ext4_extent_header *eh,
 431                            int depth, ext4_fsblk_t pblk)
 432{
 433        const char *error_msg;
 434        int max = 0;
 435
 436        if (unlikely(eh->eh_magic != EXT4_EXT_MAGIC)) {
 437                error_msg = "invalid magic";
 438                goto corrupted;
 439        }
 440        if (unlikely(le16_to_cpu(eh->eh_depth) != depth)) {
 441                error_msg = "unexpected eh_depth";
 442                goto corrupted;
 443        }
 444        if (unlikely(eh->eh_max == 0)) {
 445                error_msg = "invalid eh_max";
 446                goto corrupted;
 447        }
 448        max = ext4_ext_max_entries(inode, depth);
 449        if (unlikely(le16_to_cpu(eh->eh_max) > max)) {
 450                error_msg = "too large eh_max";
 451                goto corrupted;
 452        }
 453        if (unlikely(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max))) {
 454                error_msg = "invalid eh_entries";
 455                goto corrupted;
 456        }
 457        if (!ext4_valid_extent_entries(inode, eh, depth)) {
 458                error_msg = "invalid extent entries";
 459                goto corrupted;
 460        }
 461        /* Verify checksum on non-root extent tree nodes */
 462        if (ext_depth(inode) != depth &&
 463            !ext4_extent_block_csum_verify(inode, eh)) {
 464                error_msg = "extent tree corrupted";
 465                goto corrupted;
 466        }
 467        return 0;
 468
 469corrupted:
 470        ext4_error_inode(inode, function, line, 0,
 471                         "pblk %llu bad header/extent: %s - magic %x, "
 472                         "entries %u, max %u(%u), depth %u(%u)",
 473                         (unsigned long long) pblk, error_msg,
 474                         le16_to_cpu(eh->eh_magic),
 475                         le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
 476                         max, le16_to_cpu(eh->eh_depth), depth);
 477        return -EIO;
 478}
 479
 480#define ext4_ext_check(inode, eh, depth, pblk)                  \
 481        __ext4_ext_check(__func__, __LINE__, (inode), (eh), (depth), (pblk))
 482
 483int ext4_ext_check_inode(struct inode *inode)
 484{
 485        return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode), 0);
 486}
 487
 488static struct buffer_head *
 489__read_extent_tree_block(const char *function, unsigned int line,
 490                         struct inode *inode, ext4_fsblk_t pblk, int depth,
 491                         int flags)
 492{
 493        struct buffer_head              *bh;
 494        int                             err;
 495
 496        bh = sb_getblk(inode->i_sb, pblk);
 497        if (unlikely(!bh))
 498                return ERR_PTR(-ENOMEM);
 499
 500        if (!bh_uptodate_or_lock(bh)) {
 501                trace_ext4_ext_load_extent(inode, pblk, _RET_IP_);
 502                err = bh_submit_read(bh);
 503                if (err < 0)
 504                        goto errout;
 505        }
 506        if (buffer_verified(bh) && !(flags & EXT4_EX_FORCE_CACHE))
 507                return bh;
 508        err = __ext4_ext_check(function, line, inode,
 509                               ext_block_hdr(bh), depth, pblk);
 510        if (err)
 511                goto errout;
 512        set_buffer_verified(bh);
 513        /*
 514         * If this is a leaf block, cache all of its entries
 515         */
 516        if (!(flags & EXT4_EX_NOCACHE) && depth == 0) {
 517                struct ext4_extent_header *eh = ext_block_hdr(bh);
 518                struct ext4_extent *ex = EXT_FIRST_EXTENT(eh);
 519                ext4_lblk_t prev = 0;
 520                int i;
 521
 522                for (i = le16_to_cpu(eh->eh_entries); i > 0; i--, ex++) {
 523                        unsigned int status = EXTENT_STATUS_WRITTEN;
 524                        ext4_lblk_t lblk = le32_to_cpu(ex->ee_block);
 525                        int len = ext4_ext_get_actual_len(ex);
 526
 527                        if (prev && (prev != lblk))
 528                                ext4_es_cache_extent(inode, prev,
 529                                                     lblk - prev, ~0,
 530                                                     EXTENT_STATUS_HOLE);
 531
 532                        if (ext4_ext_is_unwritten(ex))
 533                                status = EXTENT_STATUS_UNWRITTEN;
 534                        ext4_es_cache_extent(inode, lblk, len,
 535                                             ext4_ext_pblock(ex), status);
 536                        prev = lblk + len;
 537                }
 538        }
 539        return bh;
 540errout:
 541        put_bh(bh);
 542        return ERR_PTR(err);
 543
 544}
 545
 546#define read_extent_tree_block(inode, pblk, depth, flags)               \
 547        __read_extent_tree_block(__func__, __LINE__, (inode), (pblk),   \
 548                                 (depth), (flags))
 549
 550/*
 551 * This function is called to cache a file's extent information in the
 552 * extent status tree
 553 */
 554int ext4_ext_precache(struct inode *inode)
 555{
 556        struct ext4_inode_info *ei = EXT4_I(inode);
 557        struct ext4_ext_path *path = NULL;
 558        struct buffer_head *bh;
 559        int i = 0, depth, ret = 0;
 560
 561        if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
 562                return 0;       /* not an extent-mapped inode */
 563
 564        down_read(&ei->i_data_sem);
 565        depth = ext_depth(inode);
 566
 567        path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1),
 568                       GFP_NOFS);
 569        if (path == NULL) {
 570                up_read(&ei->i_data_sem);
 571                return -ENOMEM;
 572        }
 573
 574        /* Don't cache anything if there are no external extent blocks */
 575        if (depth == 0)
 576                goto out;
 577        path[0].p_hdr = ext_inode_hdr(inode);
 578        ret = ext4_ext_check(inode, path[0].p_hdr, depth, 0);
 579        if (ret)
 580                goto out;
 581        path[0].p_idx = EXT_FIRST_INDEX(path[0].p_hdr);
 582        while (i >= 0) {
 583                /*
 584                 * If this is a leaf block or we've reached the end of
 585                 * the index block, go up
 586                 */
 587                if ((i == depth) ||
 588                    path[i].p_idx > EXT_LAST_INDEX(path[i].p_hdr)) {
 589                        brelse(path[i].p_bh);
 590                        path[i].p_bh = NULL;
 591                        i--;
 592                        continue;
 593                }
 594                bh = read_extent_tree_block(inode,
 595                                            ext4_idx_pblock(path[i].p_idx++),
 596                                            depth - i - 1,
 597                                            EXT4_EX_FORCE_CACHE);
 598                if (IS_ERR(bh)) {
 599                        ret = PTR_ERR(bh);
 600                        break;
 601                }
 602                i++;
 603                path[i].p_bh = bh;
 604                path[i].p_hdr = ext_block_hdr(bh);
 605                path[i].p_idx = EXT_FIRST_INDEX(path[i].p_hdr);
 606        }
 607        ext4_set_inode_state(inode, EXT4_STATE_EXT_PRECACHED);
 608out:
 609        up_read(&ei->i_data_sem);
 610        ext4_ext_drop_refs(path);
 611        kfree(path);
 612        return ret;
 613}
 614
 615#ifdef EXT_DEBUG
 616static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
 617{
 618        int k, l = path->p_depth;
 619
 620        ext_debug("path:");
 621        for (k = 0; k <= l; k++, path++) {
 622                if (path->p_idx) {
 623                  ext_debug("  %d->%llu", le32_to_cpu(path->p_idx->ei_block),
 624                            ext4_idx_pblock(path->p_idx));
 625                } else if (path->p_ext) {
 626                        ext_debug("  %d:[%d]%d:%llu ",
 627                                  le32_to_cpu(path->p_ext->ee_block),
 628                                  ext4_ext_is_unwritten(path->p_ext),
 629                                  ext4_ext_get_actual_len(path->p_ext),
 630                                  ext4_ext_pblock(path->p_ext));
 631                } else
 632                        ext_debug("  []");
 633        }
 634        ext_debug("\n");
 635}
 636
 637static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
 638{
 639        int depth = ext_depth(inode);
 640        struct ext4_extent_header *eh;
 641        struct ext4_extent *ex;
 642        int i;
 643
 644        if (!path)
 645                return;
 646
 647        eh = path[depth].p_hdr;
 648        ex = EXT_FIRST_EXTENT(eh);
 649
 650        ext_debug("Displaying leaf extents for inode %lu\n", inode->i_ino);
 651
 652        for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) {
 653                ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block),
 654                          ext4_ext_is_unwritten(ex),
 655                          ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex));
 656        }
 657        ext_debug("\n");
 658}
 659
 660static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path,
 661                        ext4_fsblk_t newblock, int level)
 662{
 663        int depth = ext_depth(inode);
 664        struct ext4_extent *ex;
 665
 666        if (depth != level) {
 667                struct ext4_extent_idx *idx;
 668                idx = path[level].p_idx;
 669                while (idx <= EXT_MAX_INDEX(path[level].p_hdr)) {
 670                        ext_debug("%d: move %d:%llu in new index %llu\n", level,
 671                                        le32_to_cpu(idx->ei_block),
 672                                        ext4_idx_pblock(idx),
 673                                        newblock);
 674                        idx++;
 675                }
 676
 677                return;
 678        }
 679
 680        ex = path[depth].p_ext;
 681        while (ex <= EXT_MAX_EXTENT(path[depth].p_hdr)) {
 682                ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n",
 683                                le32_to_cpu(ex->ee_block),
 684                                ext4_ext_pblock(ex),
 685                                ext4_ext_is_unwritten(ex),
 686                                ext4_ext_get_actual_len(ex),
 687                                newblock);
 688                ex++;
 689        }
 690}
 691
 692#else
 693#define ext4_ext_show_path(inode, path)
 694#define ext4_ext_show_leaf(inode, path)
 695#define ext4_ext_show_move(inode, path, newblock, level)
 696#endif
 697
 698void ext4_ext_drop_refs(struct ext4_ext_path *path)
 699{
 700        int depth = path->p_depth;
 701        int i;
 702
 703        for (i = 0; i <= depth; i++, path++)
 704                if (path->p_bh) {
 705                        brelse(path->p_bh);
 706                        path->p_bh = NULL;
 707                }
 708}
 709
 710/*
 711 * ext4_ext_binsearch_idx:
 712 * binary search for the closest index of the given block
 713 * the header must be checked before calling this
 714 */
 715static void
 716ext4_ext_binsearch_idx(struct inode *inode,
 717                        struct ext4_ext_path *path, ext4_lblk_t block)
 718{
 719        struct ext4_extent_header *eh = path->p_hdr;
 720        struct ext4_extent_idx *r, *l, *m;
 721
 722
 723        ext_debug("binsearch for %u(idx):  ", block);
 724
 725        l = EXT_FIRST_INDEX(eh) + 1;
 726        r = EXT_LAST_INDEX(eh);
 727        while (l <= r) {
 728                m = l + (r - l) / 2;
 729                if (block < le32_to_cpu(m->ei_block))
 730                        r = m - 1;
 731                else
 732                        l = m + 1;
 733                ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ei_block),
 734                                m, le32_to_cpu(m->ei_block),
 735                                r, le32_to_cpu(r->ei_block));
 736        }
 737
 738        path->p_idx = l - 1;
 739        ext_debug("  -> %u->%lld ", le32_to_cpu(path->p_idx->ei_block),
 740                  ext4_idx_pblock(path->p_idx));
 741
 742#ifdef CHECK_BINSEARCH
 743        {
 744                struct ext4_extent_idx *chix, *ix;
 745                int k;
 746
 747                chix = ix = EXT_FIRST_INDEX(eh);
 748                for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) {
 749                  if (k != 0 &&
 750                      le32_to_cpu(ix->ei_block) <= le32_to_cpu(ix[-1].ei_block)) {
 751                                printk(KERN_DEBUG "k=%d, ix=0x%p, "
 752                                       "first=0x%p\n", k,
 753                                       ix, EXT_FIRST_INDEX(eh));
 754                                printk(KERN_DEBUG "%u <= %u\n",
 755                                       le32_to_cpu(ix->ei_block),
 756                                       le32_to_cpu(ix[-1].ei_block));
 757                        }
 758                        BUG_ON(k && le32_to_cpu(ix->ei_block)
 759                                           <= le32_to_cpu(ix[-1].ei_block));
 760                        if (block < le32_to_cpu(ix->ei_block))
 761                                break;
 762                        chix = ix;
 763                }
 764                BUG_ON(chix != path->p_idx);
 765        }
 766#endif
 767
 768}
 769
 770/*
 771 * ext4_ext_binsearch:
 772 * binary search for closest extent of the given block
 773 * the header must be checked before calling this
 774 */
 775static void
 776ext4_ext_binsearch(struct inode *inode,
 777                struct ext4_ext_path *path, ext4_lblk_t block)
 778{
 779        struct ext4_extent_header *eh = path->p_hdr;
 780        struct ext4_extent *r, *l, *m;
 781
 782        if (eh->eh_entries == 0) {
 783                /*
 784                 * this leaf is empty:
 785                 * we get such a leaf in split/add case
 786                 */
 787                return;
 788        }
 789
 790        ext_debug("binsearch for %u:  ", block);
 791
 792        l = EXT_FIRST_EXTENT(eh) + 1;
 793        r = EXT_LAST_EXTENT(eh);
 794
 795        while (l <= r) {
 796                m = l + (r - l) / 2;
 797                if (block < le32_to_cpu(m->ee_block))
 798                        r = m - 1;
 799                else
 800                        l = m + 1;
 801                ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ee_block),
 802                                m, le32_to_cpu(m->ee_block),
 803                                r, le32_to_cpu(r->ee_block));
 804        }
 805
 806        path->p_ext = l - 1;
 807        ext_debug("  -> %d:%llu:[%d]%d ",
 808                        le32_to_cpu(path->p_ext->ee_block),
 809                        ext4_ext_pblock(path->p_ext),
 810                        ext4_ext_is_unwritten(path->p_ext),
 811                        ext4_ext_get_actual_len(path->p_ext));
 812
 813#ifdef CHECK_BINSEARCH
 814        {
 815                struct ext4_extent *chex, *ex;
 816                int k;
 817
 818                chex = ex = EXT_FIRST_EXTENT(eh);
 819                for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ex++) {
 820                        BUG_ON(k && le32_to_cpu(ex->ee_block)
 821                                          <= le32_to_cpu(ex[-1].ee_block));
 822                        if (block < le32_to_cpu(ex->ee_block))
 823                                break;
 824                        chex = ex;
 825                }
 826                BUG_ON(chex != path->p_ext);
 827        }
 828#endif
 829
 830}
 831
 832int ext4_ext_tree_init(handle_t *handle, struct inode *inode)
 833{
 834        struct ext4_extent_header *eh;
 835
 836        eh = ext_inode_hdr(inode);
 837        eh->eh_depth = 0;
 838        eh->eh_entries = 0;
 839        eh->eh_magic = EXT4_EXT_MAGIC;
 840        eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0));
 841        ext4_mark_inode_dirty(handle, inode);
 842        return 0;
 843}
 844
 845struct ext4_ext_path *
 846ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
 847                     struct ext4_ext_path *path, int flags)
 848{
 849        struct ext4_extent_header *eh;
 850        struct buffer_head *bh;
 851        short int depth, i, ppos = 0, alloc = 0;
 852        int ret;
 853
 854        eh = ext_inode_hdr(inode);
 855        depth = ext_depth(inode);
 856
 857        /* account possible depth increase */
 858        if (!path) {
 859                path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 2),
 860                                GFP_NOFS);
 861                if (!path)
 862                        return ERR_PTR(-ENOMEM);
 863                alloc = 1;
 864        }
 865        path[0].p_hdr = eh;
 866        path[0].p_bh = NULL;
 867
 868        i = depth;
 869        /* walk through the tree */
 870        while (i) {
 871                ext_debug("depth %d: num %d, max %d\n",
 872                          ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
 873
 874                ext4_ext_binsearch_idx(inode, path + ppos, block);
 875                path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx);
 876                path[ppos].p_depth = i;
 877                path[ppos].p_ext = NULL;
 878
 879                bh = read_extent_tree_block(inode, path[ppos].p_block, --i,
 880                                            flags);
 881                if (IS_ERR(bh)) {
 882                        ret = PTR_ERR(bh);
 883                        goto err;
 884                }
 885
 886                eh = ext_block_hdr(bh);
 887                ppos++;
 888                if (unlikely(ppos > depth)) {
 889                        put_bh(bh);
 890                        EXT4_ERROR_INODE(inode,
 891                                         "ppos %d > depth %d", ppos, depth);
 892                        ret = -EIO;
 893                        goto err;
 894                }
 895                path[ppos].p_bh = bh;
 896                path[ppos].p_hdr = eh;
 897        }
 898
 899        path[ppos].p_depth = i;
 900        path[ppos].p_ext = NULL;
 901        path[ppos].p_idx = NULL;
 902
 903        /* find extent */
 904        ext4_ext_binsearch(inode, path + ppos, block);
 905        /* if not an empty leaf */
 906        if (path[ppos].p_ext)
 907                path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext);
 908
 909        ext4_ext_show_path(inode, path);
 910
 911        return path;
 912
 913err:
 914        ext4_ext_drop_refs(path);
 915        if (alloc)
 916                kfree(path);
 917        return ERR_PTR(ret);
 918}
 919
 920/*
 921 * ext4_ext_insert_index:
 922 * insert new index [@logical;@ptr] into the block at @curp;
 923 * check where to insert: before @curp or after @curp
 924 */
 925static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
 926                                 struct ext4_ext_path *curp,
 927                                 int logical, ext4_fsblk_t ptr)
 928{
 929        struct ext4_extent_idx *ix;
 930        int len, err;
 931
 932        err = ext4_ext_get_access(handle, inode, curp);
 933        if (err)
 934                return err;
 935
 936        if (unlikely(logical == le32_to_cpu(curp->p_idx->ei_block))) {
 937                EXT4_ERROR_INODE(inode,
 938                                 "logical %d == ei_block %d!",
 939                                 logical, le32_to_cpu(curp->p_idx->ei_block));
 940                return -EIO;
 941        }
 942
 943        if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries)
 944                             >= le16_to_cpu(curp->p_hdr->eh_max))) {
 945                EXT4_ERROR_INODE(inode,
 946                                 "eh_entries %d >= eh_max %d!",
 947                                 le16_to_cpu(curp->p_hdr->eh_entries),
 948                                 le16_to_cpu(curp->p_hdr->eh_max));
 949                return -EIO;
 950        }
 951
 952        if (logical > le32_to_cpu(curp->p_idx->ei_block)) {
 953                /* insert after */
 954                ext_debug("insert new index %d after: %llu\n", logical, ptr);
 955                ix = curp->p_idx + 1;
 956        } else {
 957                /* insert before */
 958                ext_debug("insert new index %d before: %llu\n", logical, ptr);
 959                ix = curp->p_idx;
 960        }
 961
 962        len = EXT_LAST_INDEX(curp->p_hdr) - ix + 1;
 963        BUG_ON(len < 0);
 964        if (len > 0) {
 965                ext_debug("insert new index %d: "
 966                                "move %d indices from 0x%p to 0x%p\n",
 967                                logical, len, ix, ix + 1);
 968                memmove(ix + 1, ix, len * sizeof(struct ext4_extent_idx));
 969        }
 970
 971        if (unlikely(ix > EXT_MAX_INDEX(curp->p_hdr))) {
 972                EXT4_ERROR_INODE(inode, "ix > EXT_MAX_INDEX!");
 973                return -EIO;
 974        }
 975
 976        ix->ei_block = cpu_to_le32(logical);
 977        ext4_idx_store_pblock(ix, ptr);
 978        le16_add_cpu(&curp->p_hdr->eh_entries, 1);
 979
 980        if (unlikely(ix > EXT_LAST_INDEX(curp->p_hdr))) {
 981                EXT4_ERROR_INODE(inode, "ix > EXT_LAST_INDEX!");
 982                return -EIO;
 983        }
 984
 985        err = ext4_ext_dirty(handle, inode, curp);
 986        ext4_std_error(inode->i_sb, err);
 987
 988        return err;
 989}
 990
 991/*
 992 * ext4_ext_split:
 993 * inserts new subtree into the path, using free index entry
 994 * at depth @at:
 995 * - allocates all needed blocks (new leaf and all intermediate index blocks)
 996 * - makes decision where to split
 997 * - moves remaining extents and index entries (right to the split point)
 998 *   into the newly allocated blocks
 999 * - initializes subtree
1000 */
1001static int ext4_ext_split(handle_t *handle, struct inode *inode,
1002                          unsigned int flags,
1003                          struct ext4_ext_path *path,
1004                          struct ext4_extent *newext, int at)
1005{
1006        struct buffer_head *bh = NULL;
1007        int depth = ext_depth(inode);
1008        struct ext4_extent_header *neh;
1009        struct ext4_extent_idx *fidx;
1010        int i = at, k, m, a;
1011        ext4_fsblk_t newblock, oldblock;
1012        __le32 border;
1013        ext4_fsblk_t *ablocks = NULL; /* array of allocated blocks */
1014        int err = 0;
1015
1016        /* make decision: where to split? */
1017        /* FIXME: now decision is simplest: at current extent */
1018
1019        /* if current leaf will be split, then we should use
1020         * border from split point */
1021        if (unlikely(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr))) {
1022                EXT4_ERROR_INODE(inode, "p_ext > EXT_MAX_EXTENT!");
1023                return -EIO;
1024        }
1025        if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) {
1026                border = path[depth].p_ext[1].ee_block;
1027                ext_debug("leaf will be split."
1028                                " next leaf starts at %d\n",
1029                                  le32_to_cpu(border));
1030        } else {
1031                border = newext->ee_block;
1032                ext_debug("leaf will be added."
1033                                " next leaf starts at %d\n",
1034                                le32_to_cpu(border));
1035        }
1036
1037        /*
1038         * If error occurs, then we break processing
1039         * and mark filesystem read-only. index won't
1040         * be inserted and tree will be in consistent
1041         * state. Next mount will repair buffers too.
1042         */
1043
1044        /*
1045         * Get array to track all allocated blocks.
1046         * We need this to handle errors and free blocks
1047         * upon them.
1048         */
1049        ablocks = kzalloc(sizeof(ext4_fsblk_t) * depth, GFP_NOFS);
1050        if (!ablocks)
1051                return -ENOMEM;
1052
1053        /* allocate all needed blocks */
1054        ext_debug("allocate %d blocks for indexes/leaf\n", depth - at);
1055        for (a = 0; a < depth - at; a++) {
1056                newblock = ext4_ext_new_meta_block(handle, inode, path,
1057                                                   newext, &err, flags);
1058                if (newblock == 0)
1059                        goto cleanup;
1060                ablocks[a] = newblock;
1061        }
1062
1063        /* initialize new leaf */
1064        newblock = ablocks[--a];
1065        if (unlikely(newblock == 0)) {
1066                EXT4_ERROR_INODE(inode, "newblock == 0!");
1067                err = -EIO;
1068                goto cleanup;
1069        }
1070        bh = sb_getblk(inode->i_sb, newblock);
1071        if (unlikely(!bh)) {
1072                err = -ENOMEM;
1073                goto cleanup;
1074        }
1075        lock_buffer(bh);
1076
1077        err = ext4_journal_get_create_access(handle, bh);
1078        if (err)
1079                goto cleanup;
1080
1081        neh = ext_block_hdr(bh);
1082        neh->eh_entries = 0;
1083        neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
1084        neh->eh_magic = EXT4_EXT_MAGIC;
1085        neh->eh_depth = 0;
1086
1087        /* move remainder of path[depth] to the new leaf */
1088        if (unlikely(path[depth].p_hdr->eh_entries !=
1089                     path[depth].p_hdr->eh_max)) {
1090                EXT4_ERROR_INODE(inode, "eh_entries %d != eh_max %d!",
1091                                 path[depth].p_hdr->eh_entries,
1092                                 path[depth].p_hdr->eh_max);
1093                err = -EIO;
1094                goto cleanup;
1095        }
1096        /* start copy from next extent */
1097        m = EXT_MAX_EXTENT(path[depth].p_hdr) - path[depth].p_ext++;
1098        ext4_ext_show_move(inode, path, newblock, depth);
1099        if (m) {
1100                struct ext4_extent *ex;
1101                ex = EXT_FIRST_EXTENT(neh);
1102                memmove(ex, path[depth].p_ext, sizeof(struct ext4_extent) * m);
1103                le16_add_cpu(&neh->eh_entries, m);
1104        }
1105
1106        ext4_extent_block_csum_set(inode, neh);
1107        set_buffer_uptodate(bh);
1108        unlock_buffer(bh);
1109
1110        err = ext4_handle_dirty_metadata(handle, inode, bh);
1111        if (err)
1112                goto cleanup;
1113        brelse(bh);
1114        bh = NULL;
1115
1116        /* correct old leaf */
1117        if (m) {
1118                err = ext4_ext_get_access(handle, inode, path + depth);
1119                if (err)
1120                        goto cleanup;
1121                le16_add_cpu(&path[depth].p_hdr->eh_entries, -m);
1122                err = ext4_ext_dirty(handle, inode, path + depth);
1123                if (err)
1124                        goto cleanup;
1125
1126        }
1127
1128        /* create intermediate indexes */
1129        k = depth - at - 1;
1130        if (unlikely(k < 0)) {
1131                EXT4_ERROR_INODE(inode, "k %d < 0!", k);
1132                err = -EIO;
1133                goto cleanup;
1134        }
1135        if (k)
1136                ext_debug("create %d intermediate indices\n", k);
1137        /* insert new index into current index block */
1138        /* current depth stored in i var */
1139        i = depth - 1;
1140        while (k--) {
1141                oldblock = newblock;
1142                newblock = ablocks[--a];
1143                bh = sb_getblk(inode->i_sb, newblock);
1144                if (unlikely(!bh)) {
1145                        err = -ENOMEM;
1146                        goto cleanup;
1147                }
1148                lock_buffer(bh);
1149
1150                err = ext4_journal_get_create_access(handle, bh);
1151                if (err)
1152                        goto cleanup;
1153
1154                neh = ext_block_hdr(bh);
1155                neh->eh_entries = cpu_to_le16(1);
1156                neh->eh_magic = EXT4_EXT_MAGIC;
1157                neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
1158                neh->eh_depth = cpu_to_le16(depth - i);
1159                fidx = EXT_FIRST_INDEX(neh);
1160                fidx->ei_block = border;
1161                ext4_idx_store_pblock(fidx, oldblock);
1162
1163                ext_debug("int.index at %d (block %llu): %u -> %llu\n",
1164                                i, newblock, le32_to_cpu(border), oldblock);
1165
1166                /* move remainder of path[i] to the new index block */
1167                if (unlikely(EXT_MAX_INDEX(path[i].p_hdr) !=
1168                                        EXT_LAST_INDEX(path[i].p_hdr))) {
1169                        EXT4_ERROR_INODE(inode,
1170                                         "EXT_MAX_INDEX != EXT_LAST_INDEX ee_block %d!",
1171                                         le32_to_cpu(path[i].p_ext->ee_block));
1172                        err = -EIO;
1173                        goto cleanup;
1174                }
1175                /* start copy indexes */
1176                m = EXT_MAX_INDEX(path[i].p_hdr) - path[i].p_idx++;
1177                ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx,
1178                                EXT_MAX_INDEX(path[i].p_hdr));
1179                ext4_ext_show_move(inode, path, newblock, i);
1180                if (m) {
1181                        memmove(++fidx, path[i].p_idx,
1182                                sizeof(struct ext4_extent_idx) * m);
1183                        le16_add_cpu(&neh->eh_entries, m);
1184                }
1185                ext4_extent_block_csum_set(inode, neh);
1186                set_buffer_uptodate(bh);
1187                unlock_buffer(bh);
1188
1189                err = ext4_handle_dirty_metadata(handle, inode, bh);
1190                if (err)
1191                        goto cleanup;
1192                brelse(bh);
1193                bh = NULL;
1194
1195                /* correct old index */
1196                if (m) {
1197                        err = ext4_ext_get_access(handle, inode, path + i);
1198                        if (err)
1199                                goto cleanup;
1200                        le16_add_cpu(&path[i].p_hdr->eh_entries, -m);
1201                        err = ext4_ext_dirty(handle, inode, path + i);
1202                        if (err)
1203                                goto cleanup;
1204                }
1205
1206                i--;
1207        }
1208
1209        /* insert new index */
1210        err = ext4_ext_insert_index(handle, inode, path + at,
1211                                    le32_to_cpu(border), newblock);
1212
1213cleanup:
1214        if (bh) {
1215                if (buffer_locked(bh))
1216                        unlock_buffer(bh);
1217                brelse(bh);
1218        }
1219
1220        if (err) {
1221                /* free all allocated blocks in error case */
1222                for (i = 0; i < depth; i++) {
1223                        if (!ablocks[i])
1224                                continue;
1225                        ext4_free_blocks(handle, inode, NULL, ablocks[i], 1,
1226                                         EXT4_FREE_BLOCKS_METADATA);
1227                }
1228        }
1229        kfree(ablocks);
1230
1231        return err;
1232}
1233
1234/*
1235 * ext4_ext_grow_indepth:
1236 * implements tree growing procedure:
1237 * - allocates new block
1238 * - moves top-level data (index block or leaf) into the new block
1239 * - initializes new top-level, creating index that points to the
1240 *   just created block
1241 */
1242static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
1243                                 unsigned int flags,
1244                                 struct ext4_extent *newext)
1245{
1246        struct ext4_extent_header *neh;
1247        struct buffer_head *bh;
1248        ext4_fsblk_t newblock;
1249        int err = 0;
1250
1251        newblock = ext4_ext_new_meta_block(handle, inode, NULL,
1252                newext, &err, flags);
1253        if (newblock == 0)
1254                return err;
1255
1256        bh = sb_getblk(inode->i_sb, newblock);
1257        if (unlikely(!bh))
1258                return -ENOMEM;
1259        lock_buffer(bh);
1260
1261        err = ext4_journal_get_create_access(handle, bh);
1262        if (err) {
1263                unlock_buffer(bh);
1264                goto out;
1265        }
1266
1267        /* move top-level index/leaf into new block */
1268        memmove(bh->b_data, EXT4_I(inode)->i_data,
1269                sizeof(EXT4_I(inode)->i_data));
1270
1271        /* set size of new block */
1272        neh = ext_block_hdr(bh);
1273        /* old root could have indexes or leaves
1274         * so calculate e_max right way */
1275        if (ext_depth(inode))
1276                neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
1277        else
1278                neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
1279        neh->eh_magic = EXT4_EXT_MAGIC;
1280        ext4_extent_block_csum_set(inode, neh);
1281        set_buffer_uptodate(bh);
1282        unlock_buffer(bh);
1283
1284        err = ext4_handle_dirty_metadata(handle, inode, bh);
1285        if (err)
1286                goto out;
1287
1288        /* Update top-level index: num,max,pointer */
1289        neh = ext_inode_hdr(inode);
1290        neh->eh_entries = cpu_to_le16(1);
1291        ext4_idx_store_pblock(EXT_FIRST_INDEX(neh), newblock);
1292        if (neh->eh_depth == 0) {
1293                /* Root extent block becomes index block */
1294                neh->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0));
1295                EXT_FIRST_INDEX(neh)->ei_block =
1296                        EXT_FIRST_EXTENT(neh)->ee_block;
1297        }
1298        ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n",
1299                  le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max),
1300                  le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block),
1301                  ext4_idx_pblock(EXT_FIRST_INDEX(neh)));
1302
1303        le16_add_cpu(&neh->eh_depth, 1);
1304        ext4_mark_inode_dirty(handle, inode);
1305out:
1306        brelse(bh);
1307
1308        return err;
1309}
1310
1311/*
1312 * ext4_ext_create_new_leaf:
1313 * finds empty index and adds new leaf.
1314 * if no free index is found, then it requests in-depth growing.
1315 */
1316static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode,
1317                                    unsigned int mb_flags,
1318                                    unsigned int gb_flags,
1319                                    struct ext4_ext_path *path,
1320                                    struct ext4_extent *newext)
1321{
1322        struct ext4_ext_path *curp;
1323        int depth, i, err = 0;
1324
1325repeat:
1326        i = depth = ext_depth(inode);
1327
1328        /* walk up to the tree and look for free index entry */
1329        curp = path + depth;
1330        while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) {
1331                i--;
1332                curp--;
1333        }
1334
1335        /* we use already allocated block for index block,
1336         * so subsequent data blocks should be contiguous */
1337        if (EXT_HAS_FREE_INDEX(curp)) {
1338                /* if we found index with free entry, then use that
1339                 * entry: create all needed subtree and add new leaf */
1340                err = ext4_ext_split(handle, inode, mb_flags, path, newext, i);
1341                if (err)
1342                        goto out;
1343
1344                /* refill path */
1345                ext4_ext_drop_refs(path);
1346                path = ext4_ext_find_extent(inode,
1347                                    (ext4_lblk_t)le32_to_cpu(newext->ee_block),
1348                                    path, gb_flags);
1349                if (IS_ERR(path))
1350                        err = PTR_ERR(path);
1351        } else {
1352                /* tree is full, time to grow in depth */
1353                err = ext4_ext_grow_indepth(handle, inode, mb_flags, newext);
1354                if (err)
1355                        goto out;
1356
1357                /* refill path */
1358                ext4_ext_drop_refs(path);
1359                path = ext4_ext_find_extent(inode,
1360                                   (ext4_lblk_t)le32_to_cpu(newext->ee_block),
1361                                    path, gb_flags);
1362                if (IS_ERR(path)) {
1363                        err = PTR_ERR(path);
1364                        goto out;
1365                }
1366
1367                /*
1368                 * only first (depth 0 -> 1) produces free space;
1369                 * in all other cases we have to split the grown tree
1370                 */
1371                depth = ext_depth(inode);
1372                if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) {
1373                        /* now we need to split */
1374                        goto repeat;
1375                }
1376        }
1377
1378out:
1379        return err;
1380}
1381
1382/*
1383 * search the closest allocated block to the left for *logical
1384 * and returns it at @logical + it's physical address at @phys
1385 * if *logical is the smallest allocated block, the function
1386 * returns 0 at @phys
1387 * return value contains 0 (success) or error code
1388 */
1389static int ext4_ext_search_left(struct inode *inode,
1390                                struct ext4_ext_path *path,
1391                                ext4_lblk_t *logical, ext4_fsblk_t *phys)
1392{
1393        struct ext4_extent_idx *ix;
1394        struct ext4_extent *ex;
1395        int depth, ee_len;
1396
1397        if (unlikely(path == NULL)) {
1398                EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
1399                return -EIO;
1400        }
1401        depth = path->p_depth;
1402        *phys = 0;
1403
1404        if (depth == 0 && path->p_ext == NULL)
1405                return 0;
1406
1407        /* usually extent in the path covers blocks smaller
1408         * then *logical, but it can be that extent is the
1409         * first one in the file */
1410
1411        ex = path[depth].p_ext;
1412        ee_len = ext4_ext_get_actual_len(ex);
1413        if (*logical < le32_to_cpu(ex->ee_block)) {
1414                if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
1415                        EXT4_ERROR_INODE(inode,
1416                                         "EXT_FIRST_EXTENT != ex *logical %d ee_block %d!",
1417                                         *logical, le32_to_cpu(ex->ee_block));
1418                        return -EIO;
1419                }
1420                while (--depth >= 0) {
1421                        ix = path[depth].p_idx;
1422                        if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
1423                                EXT4_ERROR_INODE(inode,
1424                                  "ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!",
1425                                  ix != NULL ? le32_to_cpu(ix->ei_block) : 0,
1426                                  EXT_FIRST_INDEX(path[depth].p_hdr) != NULL ?
1427                le32_to_cpu(EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block) : 0,
1428                                  depth);
1429                                return -EIO;
1430                        }
1431                }
1432                return 0;
1433        }
1434
1435        if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
1436                EXT4_ERROR_INODE(inode,
1437                                 "logical %d < ee_block %d + ee_len %d!",
1438                                 *logical, le32_to_cpu(ex->ee_block), ee_len);
1439                return -EIO;
1440        }
1441
1442        *logical = le32_to_cpu(ex->ee_block) + ee_len - 1;
1443        *phys = ext4_ext_pblock(ex) + ee_len - 1;
1444        return 0;
1445}
1446
1447/*
1448 * search the closest allocated block to the right for *logical
1449 * and returns it at @logical + it's physical address at @phys
1450 * if *logical is the largest allocated block, the function
1451 * returns 0 at @phys
1452 * return value contains 0 (success) or error code
1453 */
1454static int ext4_ext_search_right(struct inode *inode,
1455                                 struct ext4_ext_path *path,
1456                                 ext4_lblk_t *logical, ext4_fsblk_t *phys,
1457                                 struct ext4_extent **ret_ex)
1458{
1459        struct buffer_head *bh = NULL;
1460        struct ext4_extent_header *eh;
1461        struct ext4_extent_idx *ix;
1462        struct ext4_extent *ex;
1463        ext4_fsblk_t block;
1464        int depth;      /* Note, NOT eh_depth; depth from top of tree */
1465        int ee_len;
1466
1467        if (unlikely(path == NULL)) {
1468                EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
1469                return -EIO;
1470        }
1471        depth = path->p_depth;
1472        *phys = 0;
1473
1474        if (depth == 0 && path->p_ext == NULL)
1475                return 0;
1476
1477        /* usually extent in the path covers blocks smaller
1478         * then *logical, but it can be that extent is the
1479         * first one in the file */
1480
1481        ex = path[depth].p_ext;
1482        ee_len = ext4_ext_get_actual_len(ex);
1483        if (*logical < le32_to_cpu(ex->ee_block)) {
1484                if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
1485                        EXT4_ERROR_INODE(inode,
1486                                         "first_extent(path[%d].p_hdr) != ex",
1487                                         depth);
1488                        return -EIO;
1489                }
1490                while (--depth >= 0) {
1491                        ix = path[depth].p_idx;
1492                        if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
1493                                EXT4_ERROR_INODE(inode,
1494                                                 "ix != EXT_FIRST_INDEX *logical %d!",
1495                                                 *logical);
1496                                return -EIO;
1497                        }
1498                }
1499                goto found_extent;
1500        }
1501
1502        if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
1503                EXT4_ERROR_INODE(inode,
1504                                 "logical %d < ee_block %d + ee_len %d!",
1505                                 *logical, le32_to_cpu(ex->ee_block), ee_len);
1506                return -EIO;
1507        }
1508
1509        if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) {
1510                /* next allocated block in this leaf */
1511                ex++;
1512                goto found_extent;
1513        }
1514
1515        /* go up and search for index to the right */
1516        while (--depth >= 0) {
1517                ix = path[depth].p_idx;
1518                if (ix != EXT_LAST_INDEX(path[depth].p_hdr))
1519                        goto got_index;
1520        }
1521
1522        /* we've gone up to the root and found no index to the right */
1523        return 0;
1524
1525got_index:
1526        /* we've found index to the right, let's
1527         * follow it and find the closest allocated
1528         * block to the right */
1529        ix++;
1530        block = ext4_idx_pblock(ix);
1531        while (++depth < path->p_depth) {
1532                /* subtract from p_depth to get proper eh_depth */
1533                bh = read_extent_tree_block(inode, block,
1534                                            path->p_depth - depth, 0);
1535                if (IS_ERR(bh))
1536                        return PTR_ERR(bh);
1537                eh = ext_block_hdr(bh);
1538                ix = EXT_FIRST_INDEX(eh);
1539                block = ext4_idx_pblock(ix);
1540                put_bh(bh);
1541        }
1542
1543        bh = read_extent_tree_block(inode, block, path->p_depth - depth, 0);
1544        if (IS_ERR(bh))
1545                return PTR_ERR(bh);
1546        eh = ext_block_hdr(bh);
1547        ex = EXT_FIRST_EXTENT(eh);
1548found_extent:
1549        *logical = le32_to_cpu(ex->ee_block);
1550        *phys = ext4_ext_pblock(ex);
1551        *ret_ex = ex;
1552        if (bh)
1553                put_bh(bh);
1554        return 0;
1555}
1556
1557/*
1558 * ext4_ext_next_allocated_block:
1559 * returns allocated block in subsequent extent or EXT_MAX_BLOCKS.
1560 * NOTE: it considers block number from index entry as
1561 * allocated block. Thus, index entries have to be consistent
1562 * with leaves.
1563 */
1564static ext4_lblk_t
1565ext4_ext_next_allocated_block(struct ext4_ext_path *path)
1566{
1567        int depth;
1568
1569        BUG_ON(path == NULL);
1570        depth = path->p_depth;
1571
1572        if (depth == 0 && path->p_ext == NULL)
1573                return EXT_MAX_BLOCKS;
1574
1575        while (depth >= 0) {
1576                if (depth == path->p_depth) {
1577                        /* leaf */
1578                        if (path[depth].p_ext &&
1579                                path[depth].p_ext !=
1580                                        EXT_LAST_EXTENT(path[depth].p_hdr))
1581                          return le32_to_cpu(path[depth].p_ext[1].ee_block);
1582                } else {
1583                        /* index */
1584                        if (path[depth].p_idx !=
1585                                        EXT_LAST_INDEX(path[depth].p_hdr))
1586                          return le32_to_cpu(path[depth].p_idx[1].ei_block);
1587                }
1588                depth--;
1589        }
1590
1591        return EXT_MAX_BLOCKS;
1592}
1593
1594/*
1595 * ext4_ext_next_leaf_block:
1596 * returns first allocated block from next leaf or EXT_MAX_BLOCKS
1597 */
1598static ext4_lblk_t ext4_ext_next_leaf_block(struct ext4_ext_path *path)
1599{
1600        int depth;
1601
1602        BUG_ON(path == NULL);
1603        depth = path->p_depth;
1604
1605        /* zero-tree has no leaf blocks at all */
1606        if (depth == 0)
1607                return EXT_MAX_BLOCKS;
1608
1609        /* go to index block */
1610        depth--;
1611
1612        while (depth >= 0) {
1613                if (path[depth].p_idx !=
1614                                EXT_LAST_INDEX(path[depth].p_hdr))
1615                        return (ext4_lblk_t)
1616                                le32_to_cpu(path[depth].p_idx[1].ei_block);
1617                depth--;
1618        }
1619
1620        return EXT_MAX_BLOCKS;
1621}
1622
1623/*
1624 * ext4_ext_correct_indexes:
1625 * if leaf gets modified and modified extent is first in the leaf,
1626 * then we have to correct all indexes above.
1627 * TODO: do we need to correct tree in all cases?
1628 */
1629static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
1630                                struct ext4_ext_path *path)
1631{
1632        struct ext4_extent_header *eh;
1633        int depth = ext_depth(inode);
1634        struct ext4_extent *ex;
1635        __le32 border;
1636        int k, err = 0;
1637
1638        eh = path[depth].p_hdr;
1639        ex = path[depth].p_ext;
1640
1641        if (unlikely(ex == NULL || eh == NULL)) {
1642                EXT4_ERROR_INODE(inode,
1643                                 "ex %p == NULL or eh %p == NULL", ex, eh);
1644                return -EIO;
1645        }
1646
1647        if (depth == 0) {
1648                /* there is no tree at all */
1649                return 0;
1650        }
1651
1652        if (ex != EXT_FIRST_EXTENT(eh)) {
1653                /* we correct tree if first leaf got modified only */
1654                return 0;
1655        }
1656
1657        /*
1658         * TODO: we need correction if border is smaller than current one
1659         */
1660        k = depth - 1;
1661        border = path[depth].p_ext->ee_block;
1662        err = ext4_ext_get_access(handle, inode, path + k);
1663        if (err)
1664                return err;
1665        path[k].p_idx->ei_block = border;
1666        err = ext4_ext_dirty(handle, inode, path + k);
1667        if (err)
1668                return err;
1669
1670        while (k--) {
1671                /* change all left-side indexes */
1672                if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr))
1673                        break;
1674                err = ext4_ext_get_access(handle, inode, path + k);
1675                if (err)
1676                        break;
1677                path[k].p_idx->ei_block = border;
1678                err = ext4_ext_dirty(handle, inode, path + k);
1679                if (err)
1680                        break;
1681        }
1682
1683        return err;
1684}
1685
1686int
1687ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
1688                                struct ext4_extent *ex2)
1689{
1690        unsigned short ext1_ee_len, ext2_ee_len;
1691
1692        if (ext4_ext_is_unwritten(ex1) != ext4_ext_is_unwritten(ex2))
1693                return 0;
1694
1695        ext1_ee_len = ext4_ext_get_actual_len(ex1);
1696        ext2_ee_len = ext4_ext_get_actual_len(ex2);
1697
1698        if (le32_to_cpu(ex1->ee_block) + ext1_ee_len !=
1699                        le32_to_cpu(ex2->ee_block))
1700                return 0;
1701
1702        /*
1703         * To allow future support for preallocated extents to be added
1704         * as an RO_COMPAT feature, refuse to merge to extents if
1705         * this can result in the top bit of ee_len being set.
1706         */
1707        if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN)
1708                return 0;
1709        if (ext4_ext_is_unwritten(ex1) &&
1710            (ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN) ||
1711             atomic_read(&EXT4_I(inode)->i_unwritten) ||
1712             (ext1_ee_len + ext2_ee_len > EXT_UNWRITTEN_MAX_LEN)))
1713                return 0;
1714#ifdef AGGRESSIVE_TEST
1715        if (ext1_ee_len >= 4)
1716                return 0;
1717#endif
1718
1719        if (ext4_ext_pblock(ex1) + ext1_ee_len == ext4_ext_pblock(ex2))
1720                return 1;
1721        return 0;
1722}
1723
1724/*
1725 * This function tries to merge the "ex" extent to the next extent in the tree.
1726 * It always tries to merge towards right. If you want to merge towards
1727 * left, pass "ex - 1" as argument instead of "ex".
1728 * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns
1729 * 1 if they got merged.
1730 */
1731static int ext4_ext_try_to_merge_right(struct inode *inode,
1732                                 struct ext4_ext_path *path,
1733                                 struct ext4_extent *ex)
1734{
1735        struct ext4_extent_header *eh;
1736        unsigned int depth, len;
1737        int merge_done = 0, unwritten;
1738
1739        depth = ext_depth(inode);
1740        BUG_ON(path[depth].p_hdr == NULL);
1741        eh = path[depth].p_hdr;
1742
1743        while (ex < EXT_LAST_EXTENT(eh)) {
1744                if (!ext4_can_extents_be_merged(inode, ex, ex + 1))
1745                        break;
1746                /* merge with next extent! */
1747                unwritten = ext4_ext_is_unwritten(ex);
1748                ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
1749                                + ext4_ext_get_actual_len(ex + 1));
1750                if (unwritten)
1751                        ext4_ext_mark_unwritten(ex);
1752
1753                if (ex + 1 < EXT_LAST_EXTENT(eh)) {
1754                        len = (EXT_LAST_EXTENT(eh) - ex - 1)
1755                                * sizeof(struct ext4_extent);
1756                        memmove(ex + 1, ex + 2, len);
1757                }
1758                le16_add_cpu(&eh->eh_entries, -1);
1759                merge_done = 1;
1760                WARN_ON(eh->eh_entries == 0);
1761                if (!eh->eh_entries)
1762                        EXT4_ERROR_INODE(inode, "eh->eh_entries = 0!");
1763        }
1764
1765        return merge_done;
1766}
1767
1768/*
1769 * This function does a very simple check to see if we can collapse
1770 * an extent tree with a single extent tree leaf block into the inode.
1771 */
1772static void ext4_ext_try_to_merge_up(handle_t *handle,
1773                                     struct inode *inode,
1774                                     struct ext4_ext_path *path)
1775{
1776        size_t s;
1777        unsigned max_root = ext4_ext_space_root(inode, 0);
1778        ext4_fsblk_t blk;
1779
1780        if ((path[0].p_depth != 1) ||
1781            (le16_to_cpu(path[0].p_hdr->eh_entries) != 1) ||
1782            (le16_to_cpu(path[1].p_hdr->eh_entries) > max_root))
1783                return;
1784
1785        /*
1786         * We need to modify the block allocation bitmap and the block
1787         * group descriptor to release the extent tree block.  If we
1788         * can't get the journal credits, give up.
1789         */
1790        if (ext4_journal_extend(handle, 2))
1791                return;
1792
1793        /*
1794         * Copy the extent data up to the inode
1795         */
1796        blk = ext4_idx_pblock(path[0].p_idx);
1797        s = le16_to_cpu(path[1].p_hdr->eh_entries) *
1798                sizeof(struct ext4_extent_idx);
1799        s += sizeof(struct ext4_extent_header);
1800
1801        memcpy(path[0].p_hdr, path[1].p_hdr, s);
1802        path[0].p_depth = 0;
1803        path[0].p_ext = EXT_FIRST_EXTENT(path[0].p_hdr) +
1804                (path[1].p_ext - EXT_FIRST_EXTENT(path[1].p_hdr));
1805        path[0].p_hdr->eh_max = cpu_to_le16(max_root);
1806
1807        brelse(path[1].p_bh);
1808        ext4_free_blocks(handle, inode, NULL, blk, 1,
1809                         EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
1810}
1811
1812/*
1813 * This function tries to merge the @ex extent to neighbours in the tree.
1814 * return 1 if merge left else 0.
1815 */
1816static void ext4_ext_try_to_merge(handle_t *handle,
1817                                  struct inode *inode,
1818                                  struct ext4_ext_path *path,
1819                                  struct ext4_extent *ex) {
1820        struct ext4_extent_header *eh;
1821        unsigned int depth;
1822        int merge_done = 0;
1823
1824        depth = ext_depth(inode);
1825        BUG_ON(path[depth].p_hdr == NULL);
1826        eh = path[depth].p_hdr;
1827
1828        if (ex > EXT_FIRST_EXTENT(eh))
1829                merge_done = ext4_ext_try_to_merge_right(inode, path, ex - 1);
1830
1831        if (!merge_done)
1832                (void) ext4_ext_try_to_merge_right(inode, path, ex);
1833
1834        ext4_ext_try_to_merge_up(handle, inode, path);
1835}
1836
1837/*
1838 * check if a portion of the "newext" extent overlaps with an
1839 * existing extent.
1840 *
1841 * If there is an overlap discovered, it updates the length of the newext
1842 * such that there will be no overlap, and then returns 1.
1843 * If there is no overlap found, it returns 0.
1844 */
1845static unsigned int ext4_ext_check_overlap(struct ext4_sb_info *sbi,
1846                                           struct inode *inode,
1847                                           struct ext4_extent *newext,
1848                                           struct ext4_ext_path *path)
1849{
1850        ext4_lblk_t b1, b2;
1851        unsigned int depth, len1;
1852        unsigned int ret = 0;
1853
1854        b1 = le32_to_cpu(newext->ee_block);
1855        len1 = ext4_ext_get_actual_len(newext);
1856        depth = ext_depth(inode);
1857        if (!path[depth].p_ext)
1858                goto out;
1859        b2 = EXT4_LBLK_CMASK(sbi, le32_to_cpu(path[depth].p_ext->ee_block));
1860
1861        /*
1862         * get the next allocated block if the extent in the path
1863         * is before the requested block(s)
1864         */
1865        if (b2 < b1) {
1866                b2 = ext4_ext_next_allocated_block(path);
1867                if (b2 == EXT_MAX_BLOCKS)
1868                        goto out;
1869                b2 = EXT4_LBLK_CMASK(sbi, b2);
1870        }
1871
1872        /* check for wrap through zero on extent logical start block*/
1873        if (b1 + len1 < b1) {
1874                len1 = EXT_MAX_BLOCKS - b1;
1875                newext->ee_len = cpu_to_le16(len1);
1876                ret = 1;
1877        }
1878
1879        /* check for overlap */
1880        if (b1 + len1 > b2) {
1881                newext->ee_len = cpu_to_le16(b2 - b1);
1882                ret = 1;
1883        }
1884out:
1885        return ret;
1886}
1887
1888/*
1889 * ext4_ext_insert_extent:
1890 * tries to merge requsted extent into the existing extent or
1891 * inserts requested extent as new one into the tree,
1892 * creating new leaf in the no-space case.
1893 */
1894int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1895                                struct ext4_ext_path *path,
1896                                struct ext4_extent *newext, int gb_flags)
1897{
1898        struct ext4_extent_header *eh;
1899        struct ext4_extent *ex, *fex;
1900        struct ext4_extent *nearex; /* nearest extent */
1901        struct ext4_ext_path *npath = NULL;
1902        int depth, len, err;
1903        ext4_lblk_t next;
1904        int mb_flags = 0, unwritten;
1905
1906        if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {
1907                EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0");
1908                return -EIO;
1909        }
1910        depth = ext_depth(inode);
1911        ex = path[depth].p_ext;
1912        eh = path[depth].p_hdr;
1913        if (unlikely(path[depth].p_hdr == NULL)) {
1914                EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
1915                return -EIO;
1916        }
1917
1918        /* try to insert block into found extent and return */
1919        if (ex && !(gb_flags & EXT4_GET_BLOCKS_PRE_IO)) {
1920
1921                /*
1922                 * Try to see whether we should rather test the extent on
1923                 * right from ex, or from the left of ex. This is because
1924                 * ext4_ext_find_extent() can return either extent on the
1925                 * left, or on the right from the searched position. This
1926                 * will make merging more effective.
1927                 */
1928                if (ex < EXT_LAST_EXTENT(eh) &&
1929                    (le32_to_cpu(ex->ee_block) +
1930                    ext4_ext_get_actual_len(ex) <
1931                    le32_to_cpu(newext->ee_block))) {
1932                        ex += 1;
1933                        goto prepend;
1934                } else if ((ex > EXT_FIRST_EXTENT(eh)) &&
1935                           (le32_to_cpu(newext->ee_block) +
1936                           ext4_ext_get_actual_len(newext) <
1937                           le32_to_cpu(ex->ee_block)))
1938                        ex -= 1;
1939
1940                /* Try to append newex to the ex */
1941                if (ext4_can_extents_be_merged(inode, ex, newext)) {
1942                        ext_debug("append [%d]%d block to %u:[%d]%d"
1943                                  "(from %llu)\n",
1944                                  ext4_ext_is_unwritten(newext),
1945                                  ext4_ext_get_actual_len(newext),
1946                                  le32_to_cpu(ex->ee_block),
1947                                  ext4_ext_is_unwritten(ex),
1948                                  ext4_ext_get_actual_len(ex),
1949                                  ext4_ext_pblock(ex));
1950                        err = ext4_ext_get_access(handle, inode,
1951                                                  path + depth);
1952                        if (err)
1953                                return err;
1954                        unwritten = ext4_ext_is_unwritten(ex);
1955                        ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
1956                                        + ext4_ext_get_actual_len(newext));
1957                        if (unwritten)
1958                                ext4_ext_mark_unwritten(ex);
1959                        eh = path[depth].p_hdr;
1960                        nearex = ex;
1961                        goto merge;
1962                }
1963
1964prepend:
1965                /* Try to prepend newex to the ex */
1966                if (ext4_can_extents_be_merged(inode, newext, ex)) {
1967                        ext_debug("prepend %u[%d]%d block to %u:[%d]%d"
1968                                  "(from %llu)\n",
1969                                  le32_to_cpu(newext->ee_block),
1970                                  ext4_ext_is_unwritten(newext),
1971                                  ext4_ext_get_actual_len(newext),
1972                                  le32_to_cpu(ex->ee_block),
1973                                  ext4_ext_is_unwritten(ex),
1974                                  ext4_ext_get_actual_len(ex),
1975                                  ext4_ext_pblock(ex));
1976                        err = ext4_ext_get_access(handle, inode,
1977                                                  path + depth);
1978                        if (err)
1979                                return err;
1980
1981                        unwritten = ext4_ext_is_unwritten(ex);
1982                        ex->ee_block = newext->ee_block;
1983                        ext4_ext_store_pblock(ex, ext4_ext_pblock(newext));
1984                        ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
1985                                        + ext4_ext_get_actual_len(newext));
1986                        if (unwritten)
1987                                ext4_ext_mark_unwritten(ex);
1988                        eh = path[depth].p_hdr;
1989                        nearex = ex;
1990                        goto merge;
1991                }
1992        }
1993
1994        depth = ext_depth(inode);
1995        eh = path[depth].p_hdr;
1996        if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max))
1997                goto has_space;
1998
1999        /* probably next leaf has space for us? */
2000        fex = EXT_LAST_EXTENT(eh);
2001        next = EXT_MAX_BLOCKS;
2002        if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block))
2003                next = ext4_ext_next_leaf_block(path);
2004        if (next != EXT_MAX_BLOCKS) {
2005                ext_debug("next leaf block - %u\n", next);
2006                BUG_ON(npath != NULL);
2007                npath = ext4_ext_find_extent(inode, next, NULL, 0);
2008                if (IS_ERR(npath))
2009                        return PTR_ERR(npath);
2010                BUG_ON(npath->p_depth != path->p_depth);
2011                eh = npath[depth].p_hdr;
2012                if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) {
2013                        ext_debug("next leaf isn't full(%d)\n",
2014                                  le16_to_cpu(eh->eh_entries));
2015                        path = npath;
2016                        goto has_space;
2017                }
2018                ext_debug("next leaf has no free space(%d,%d)\n",
2019                          le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
2020        }
2021
2022        /*
2023         * There is no free space in the found leaf.
2024         * We're gonna add a new leaf in the tree.
2025         */
2026        if (gb_flags & EXT4_GET_BLOCKS_METADATA_NOFAIL)
2027                mb_flags = EXT4_MB_USE_RESERVED;
2028        err = ext4_ext_create_new_leaf(handle, inode, mb_flags, gb_flags,
2029                                       path, newext);
2030        if (err)
2031                goto cleanup;
2032        depth = ext_depth(inode);
2033        eh = path[depth].p_hdr;
2034
2035has_space:
2036        nearex = path[depth].p_ext;
2037
2038        err = ext4_ext_get_access(handle, inode, path + depth);
2039        if (err)
2040                goto cleanup;
2041
2042        if (!nearex) {
2043                /* there is no extent in this leaf, create first one */
2044                ext_debug("first extent in the leaf: %u:%llu:[%d]%d\n",
2045                                le32_to_cpu(newext->ee_block),
2046                                ext4_ext_pblock(newext),
2047                                ext4_ext_is_unwritten(newext),
2048                                ext4_ext_get_actual_len(newext));
2049                nearex = EXT_FIRST_EXTENT(eh);
2050        } else {
2051                if (le32_to_cpu(newext->ee_block)
2052                           > le32_to_cpu(nearex->ee_block)) {
2053                        /* Insert after */
2054                        ext_debug("insert %u:%llu:[%d]%d before: "
2055                                        "nearest %p\n",
2056                                        le32_to_cpu(newext->ee_block),
2057                                        ext4_ext_pblock(newext),
2058                                        ext4_ext_is_unwritten(newext),
2059                                        ext4_ext_get_actual_len(newext),
2060                                        nearex);
2061                        nearex++;
2062                } else {
2063                        /* Insert before */
2064                        BUG_ON(newext->ee_block == nearex->ee_block);
2065                        ext_debug("insert %u:%llu:[%d]%d after: "
2066                                        "nearest %p\n",
2067                                        le32_to_cpu(newext->ee_block),
2068                                        ext4_ext_pblock(newext),
2069                                        ext4_ext_is_unwritten(newext),
2070                                        ext4_ext_get_actual_len(newext),
2071                                        nearex);
2072                }
2073                len = EXT_LAST_EXTENT(eh) - nearex + 1;
2074                if (len > 0) {
2075                        ext_debug("insert %u:%llu:[%d]%d: "
2076                                        "move %d extents from 0x%p to 0x%p\n",
2077                                        le32_to_cpu(newext->ee_block),
2078                                        ext4_ext_pblock(newext),
2079                                        ext4_ext_is_unwritten(newext),
2080                                        ext4_ext_get_actual_len(newext),
2081                                        len, nearex, nearex + 1);
2082                        memmove(nearex + 1, nearex,
2083                                len * sizeof(struct ext4_extent));
2084                }
2085        }
2086
2087        le16_add_cpu(&eh->eh_entries, 1);
2088        path[depth].p_ext = nearex;
2089        nearex->ee_block = newext->ee_block;
2090        ext4_ext_store_pblock(nearex, ext4_ext_pblock(newext));
2091        nearex->ee_len = newext->ee_len;
2092
2093merge:
2094        /* try to merge extents */
2095        if (!(gb_flags & EXT4_GET_BLOCKS_PRE_IO))
2096                ext4_ext_try_to_merge(handle, inode, path, nearex);
2097
2098
2099        /* time to correct all indexes above */
2100        err = ext4_ext_correct_indexes(handle, inode, path);
2101        if (err)
2102                goto cleanup;
2103
2104        err = ext4_ext_dirty(handle, inode, path + path->p_depth);
2105
2106cleanup:
2107        if (npath) {
2108                ext4_ext_drop_refs(npath);
2109                kfree(npath);
2110        }
2111        return err;
2112}
2113
2114static int ext4_fill_fiemap_extents(struct inode *inode,
2115                                    ext4_lblk_t block, ext4_lblk_t num,
2116                                    struct fiemap_extent_info *fieinfo)
2117{
2118        struct ext4_ext_path *path = NULL;
2119        struct ext4_extent *ex;
2120        struct extent_status es;
2121        ext4_lblk_t next, next_del, start = 0, end = 0;
2122        ext4_lblk_t last = block + num;
2123        int exists, depth = 0, err = 0;
2124        unsigned int flags = 0;
2125        unsigned char blksize_bits = inode->i_sb->s_blocksize_bits;
2126
2127        while (block < last && block != EXT_MAX_BLOCKS) {
2128                num = last - block;
2129                /* find extent for this block */
2130                down_read(&EXT4_I(inode)->i_data_sem);
2131
2132                if (path && ext_depth(inode) != depth) {
2133                        /* depth was changed. we have to realloc path */
2134                        kfree(path);
2135                        path = NULL;
2136                }
2137
2138                path = ext4_ext_find_extent(inode, block, path, 0);
2139                if (IS_ERR(path)) {
2140                        up_read(&EXT4_I(inode)->i_data_sem);
2141                        err = PTR_ERR(path);
2142                        path = NULL;
2143                        break;
2144                }
2145
2146                depth = ext_depth(inode);
2147                if (unlikely(path[depth].p_hdr == NULL)) {
2148                        up_read(&EXT4_I(inode)->i_data_sem);
2149                        EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
2150                        err = -EIO;
2151                        break;
2152                }
2153                ex = path[depth].p_ext;
2154                next = ext4_ext_next_allocated_block(path);
2155                ext4_ext_drop_refs(path);
2156
2157                flags = 0;
2158                exists = 0;
2159                if (!ex) {
2160                        /* there is no extent yet, so try to allocate
2161                         * all requested space */
2162                        start = block;
2163                        end = block + num;
2164                } else if (le32_to_cpu(ex->ee_block) > block) {
2165                        /* need to allocate space before found extent */
2166                        start = block;
2167                        end = le32_to_cpu(ex->ee_block);
2168                        if (block + num < end)
2169                                end = block + num;
2170                } else if (block >= le32_to_cpu(ex->ee_block)
2171                                        + ext4_ext_get_actual_len(ex)) {
2172                        /* need to allocate space after found extent */
2173                        start = block;
2174                        end = block + num;
2175                        if (end >= next)
2176                                end = next;
2177                } else if (block >= le32_to_cpu(ex->ee_block)) {
2178                        /*
2179                         * some part of requested space is covered
2180                         * by found extent
2181                         */
2182                        start = block;
2183                        end = le32_to_cpu(ex->ee_block)
2184                                + ext4_ext_get_actual_len(ex);
2185                        if (block + num < end)
2186                                end = block + num;
2187                        exists = 1;
2188                } else {
2189                        BUG();
2190                }
2191                BUG_ON(end <= start);
2192
2193                if (!exists) {
2194                        es.es_lblk = start;
2195                        es.es_len = end - start;
2196                        es.es_pblk = 0;
2197                } else {
2198                        es.es_lblk = le32_to_cpu(ex->ee_block);
2199                        es.es_len = ext4_ext_get_actual_len(ex);
2200                        es.es_pblk = ext4_ext_pblock(ex);
2201                        if (ext4_ext_is_unwritten(ex))
2202                                flags |= FIEMAP_EXTENT_UNWRITTEN;
2203                }
2204
2205                /*
2206                 * Find delayed extent and update es accordingly. We call
2207                 * it even in !exists case to find out whether es is the
2208                 * last existing extent or not.
2209                 */
2210                next_del = ext4_find_delayed_extent(inode, &es);
2211                if (!exists && next_del) {
2212                        exists = 1;
2213                        flags |= (FIEMAP_EXTENT_DELALLOC |
2214                                  FIEMAP_EXTENT_UNKNOWN);
2215                }
2216                up_read(&EXT4_I(inode)->i_data_sem);
2217
2218                if (unlikely(es.es_len == 0)) {
2219                        EXT4_ERROR_INODE(inode, "es.es_len == 0");
2220                        err = -EIO;
2221                        break;
2222                }
2223
2224                /*
2225                 * This is possible iff next == next_del == EXT_MAX_BLOCKS.
2226                 * we need to check next == EXT_MAX_BLOCKS because it is
2227                 * possible that an extent is with unwritten and delayed
2228                 * status due to when an extent is delayed allocated and
2229                 * is allocated by fallocate status tree will track both of
2230                 * them in a extent.
2231                 *
2232                 * So we could return a unwritten and delayed extent, and
2233                 * its block is equal to 'next'.
2234                 */
2235                if (next == next_del && next == EXT_MAX_BLOCKS) {
2236                        flags |= FIEMAP_EXTENT_LAST;
2237                        if (unlikely(next_del != EXT_MAX_BLOCKS ||
2238                                     next != EXT_MAX_BLOCKS)) {
2239                                EXT4_ERROR_INODE(inode,
2240                                                 "next extent == %u, next "
2241                                                 "delalloc extent = %u",
2242                                                 next, next_del);
2243                                err = -EIO;
2244                                break;
2245                        }
2246                }
2247
2248                if (exists) {
2249                        err = fiemap_fill_next_extent(fieinfo,
2250                                (__u64)es.es_lblk << blksize_bits,
2251                                (__u64)es.es_pblk << blksize_bits,
2252                                (__u64)es.es_len << blksize_bits,
2253                                flags);
2254                        if (err < 0)
2255                                break;
2256                        if (err == 1) {
2257                                err = 0;
2258                                break;
2259                        }
2260                }
2261
2262                block = es.es_lblk + es.es_len;
2263        }
2264
2265        if (path) {
2266                ext4_ext_drop_refs(path);
2267                kfree(path);
2268        }
2269
2270        return err;
2271}
2272
2273/*
2274 * ext4_ext_determine_hole - determine hole around given block
2275 * @inode:      inode we lookup in
2276 * @path:       path in extent tree to @lblk
2277 * @lblk:       pointer to logical block around which we want to determine hole
2278 *
2279 * Determine hole length (and start if easily possible) around given logical
2280 * block. We don't try too hard to find the beginning of the hole but @path
2281 * actually points to extent before @lblk, we provide it.
2282 *
2283 * The function returns the length of a hole starting at @lblk. We update @lblk
2284 * to the beginning of the hole if we managed to find it.
2285 */
2286static ext4_lblk_t ext4_ext_determine_hole(struct inode *inode,
2287                                           struct ext4_ext_path *path,
2288                                           ext4_lblk_t *lblk)
2289{
2290        int depth = ext_depth(inode);
2291        struct ext4_extent *ex;
2292        ext4_lblk_t len;
2293
2294        ex = path[depth].p_ext;
2295        if (ex == NULL) {
2296                /* there is no extent yet, so gap is [0;-] */
2297                *lblk = 0;
2298                len = EXT_MAX_BLOCKS;
2299        } else if (*lblk < le32_to_cpu(ex->ee_block)) {
2300                len = le32_to_cpu(ex->ee_block) - *lblk;
2301        } else if (*lblk >= le32_to_cpu(ex->ee_block)
2302                        + ext4_ext_get_actual_len(ex)) {
2303                ext4_lblk_t next;
2304
2305                *lblk = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex);
2306                next = ext4_ext_next_allocated_block(path);
2307                BUG_ON(next == *lblk);
2308                len = next - *lblk;
2309        } else {
2310                BUG();
2311        }
2312        return len;
2313}
2314
2315/*
2316 * ext4_ext_put_gap_in_cache:
2317 * calculate boundaries of the gap that the requested block fits into
2318 * and cache this gap
2319 */
2320static void
2321ext4_ext_put_gap_in_cache(struct inode *inode, ext4_lblk_t hole_start,
2322                          ext4_lblk_t hole_len)
2323{
2324        struct extent_status es;
2325
2326        ext4_es_find_delayed_extent_range(inode, hole_start,
2327                                          hole_start + hole_len - 1, &es);
2328        if (es.es_len) {
2329                /* There's delayed extent containing lblock? */
2330                if (es.es_lblk <= hole_start)
2331                        return;
2332                hole_len = min(es.es_lblk - hole_start, hole_len);
2333        }
2334        ext_debug(" -> %u:%u\n", hole_start, hole_len);
2335        ext4_es_insert_extent(inode, hole_start, hole_len, ~0,
2336                              EXTENT_STATUS_HOLE);
2337}
2338
2339/*
2340 * ext4_ext_rm_idx:
2341 * removes index from the index block.
2342 */
2343static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
2344                        struct ext4_ext_path *path, int depth)
2345{
2346        int err;
2347        ext4_fsblk_t leaf;
2348
2349        /* free index block */
2350        depth--;
2351        path = path + depth;
2352        leaf = ext4_idx_pblock(path->p_idx);
2353        if (unlikely(path->p_hdr->eh_entries == 0)) {
2354                EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0");
2355                return -EIO;
2356        }
2357        err = ext4_ext_get_access(handle, inode, path);
2358        if (err)
2359                return err;
2360
2361        if (path->p_idx != EXT_LAST_INDEX(path->p_hdr)) {
2362                int len = EXT_LAST_INDEX(path->p_hdr) - path->p_idx;
2363                len *= sizeof(struct ext4_extent_idx);
2364                memmove(path->p_idx, path->p_idx + 1, len);
2365        }
2366
2367        le16_add_cpu(&path->p_hdr->eh_entries, -1);
2368        err = ext4_ext_dirty(handle, inode, path);
2369        if (err)
2370                return err;
2371        ext_debug("index is empty, remove it, free block %llu\n", leaf);
2372        trace_ext4_ext_rm_idx(inode, leaf);
2373
2374        ext4_free_blocks(handle, inode, NULL, leaf, 1,
2375                         EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
2376
2377        while (--depth >= 0) {
2378                if (path->p_idx != EXT_FIRST_INDEX(path->p_hdr))
2379                        break;
2380                path--;
2381                err = ext4_ext_get_access(handle, inode, path);
2382                if (err)
2383                        break;
2384                path->p_idx->ei_block = (path+1)->p_idx->ei_block;
2385                err = ext4_ext_dirty(handle, inode, path);
2386                if (err)
2387                        break;
2388        }
2389        return err;
2390}
2391
2392/*
2393 * ext4_ext_calc_credits_for_single_extent:
2394 * This routine returns max. credits that needed to insert an extent
2395 * to the extent tree.
2396 * When pass the actual path, the caller should calculate credits
2397 * under i_data_sem.
2398 */
2399int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks,
2400                                                struct ext4_ext_path *path)
2401{
2402        if (path) {
2403                int depth = ext_depth(inode);
2404                int ret = 0;
2405
2406                /* probably there is space in leaf? */
2407                if (le16_to_cpu(path[depth].p_hdr->eh_entries)
2408                                < le16_to_cpu(path[depth].p_hdr->eh_max)) {
2409
2410                        /*
2411                         *  There are some space in the leaf tree, no
2412                         *  need to account for leaf block credit
2413                         *
2414                         *  bitmaps and block group descriptor blocks
2415                         *  and other metadata blocks still need to be
2416                         *  accounted.
2417                         */
2418                        /* 1 bitmap, 1 block group descriptor */
2419                        ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb);
2420                        return ret;
2421                }
2422        }
2423
2424        return ext4_chunk_trans_blocks(inode, nrblocks);
2425}
2426
2427/*
2428 * How many index/leaf blocks need to change/allocate to add @extents extents?
2429 *
2430 * If we add a single extent, then in the worse case, each tree level
2431 * index/leaf need to be changed in case of the tree split.
2432 *
2433 * If more extents are inserted, they could cause the whole tree split more
2434 * than once, but this is really rare.
2435 */
2436int ext4_ext_index_trans_blocks(struct inode *inode, int extents)
2437{
2438        int index;
2439        int depth;
2440
2441        /* If we are converting the inline data, only one is needed here. */
2442        if (ext4_has_inline_data(inode))
2443                return 1;
2444
2445        depth = ext_depth(inode);
2446
2447        if (extents <= 1)
2448                index = depth * 2;
2449        else
2450                index = depth * 3;
2451
2452        return index;
2453}
2454
2455static inline int get_default_free_blocks_flags(struct inode *inode)
2456{
2457        if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
2458                return EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET;
2459        else if (ext4_should_journal_data(inode))
2460                return EXT4_FREE_BLOCKS_FORGET;
2461        return 0;
2462}
2463
2464static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
2465                              struct ext4_extent *ex,
2466                              long long *partial_cluster,
2467                              ext4_lblk_t from, ext4_lblk_t to)
2468{
2469        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2470        unsigned short ee_len = ext4_ext_get_actual_len(ex);
2471        ext4_fsblk_t pblk;
2472        int flags = get_default_free_blocks_flags(inode);
2473
2474        /*
2475         * For bigalloc file systems, we never free a partial cluster
2476         * at the beginning of the extent.  Instead, we make a note
2477         * that we tried freeing the cluster, and check to see if we
2478         * need to free it on a subsequent call to ext4_remove_blocks,
2479         * or at the end of ext4_ext_rm_leaf or ext4_ext_remove_space.
2480         */
2481        flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER;
2482
2483        trace_ext4_remove_blocks(inode, ex, from, to, *partial_cluster);
2484        /*
2485         * If we have a partial cluster, and it's different from the
2486         * cluster of the last block, we need to explicitly free the
2487         * partial cluster here.
2488         */
2489        pblk = ext4_ext_pblock(ex) + ee_len - 1;
2490        if (*partial_cluster > 0 &&
2491            *partial_cluster != (long long) EXT4_B2C(sbi, pblk)) {
2492                ext4_free_blocks(handle, inode, NULL,
2493                                 EXT4_C2B(sbi, *partial_cluster),
2494                                 sbi->s_cluster_ratio, flags);
2495                *partial_cluster = 0;
2496        }
2497
2498#ifdef EXTENTS_STATS
2499        {
2500                struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2501                spin_lock(&sbi->s_ext_stats_lock);
2502                sbi->s_ext_blocks += ee_len;
2503                sbi->s_ext_extents++;
2504                if (ee_len < sbi->s_ext_min)
2505                        sbi->s_ext_min = ee_len;
2506                if (ee_len > sbi->s_ext_max)
2507                        sbi->s_ext_max = ee_len;
2508                if (ext_depth(inode) > sbi->s_depth_max)
2509                        sbi->s_depth_max = ext_depth(inode);
2510                spin_unlock(&sbi->s_ext_stats_lock);
2511        }
2512#endif
2513        if (from >= le32_to_cpu(ex->ee_block)
2514            && to == le32_to_cpu(ex->ee_block) + ee_len - 1) {
2515                /* tail removal */
2516                ext4_lblk_t num;
2517                long long first_cluster;
2518
2519                num = le32_to_cpu(ex->ee_block) + ee_len - from;
2520                pblk = ext4_ext_pblock(ex) + ee_len - num;
2521                /*
2522                 * Usually we want to free partial cluster at the end of the
2523                 * extent, except for the situation when the cluster is still
2524                 * used by any other extent (partial_cluster is negative).
2525                 */
2526                if (*partial_cluster < 0 &&
2527                    *partial_cluster == -(long long) EXT4_B2C(sbi, pblk+num-1))
2528                        flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER;
2529
2530                ext_debug("free last %u blocks starting %llu partial %lld\n",
2531                          num, pblk, *partial_cluster);
2532                ext4_free_blocks(handle, inode, NULL, pblk, num, flags);
2533                /*
2534                 * If the block range to be freed didn't start at the
2535                 * beginning of a cluster, and we removed the entire
2536                 * extent and the cluster is not used by any other extent,
2537                 * save the partial cluster here, since we might need to
2538                 * delete if we determine that the truncate or punch hole
2539                 * operation has removed all of the blocks in the cluster.
2540                 * If that cluster is used by another extent, preserve its
2541                 * negative value so it isn't freed later on.
2542                 *
2543                 * If the whole extent wasn't freed, we've reached the
2544                 * start of the truncated/punched region and have finished
2545                 * removing blocks.  If there's a partial cluster here it's
2546                 * shared with the remainder of the extent and is no longer
2547                 * a candidate for removal.
2548                 */
2549                if (EXT4_PBLK_COFF(sbi, pblk) && ee_len == num) {
2550                        first_cluster = (long long) EXT4_B2C(sbi, pblk);
2551                        if (first_cluster != -*partial_cluster)
2552                                *partial_cluster = first_cluster;
2553                } else {
2554                        *partial_cluster = 0;
2555                }
2556        } else
2557                ext4_error(sbi->s_sb, "strange request: removal(2) "
2558                           "%u-%u from %u:%u\n",
2559                           from, to, le32_to_cpu(ex->ee_block), ee_len);
2560        return 0;
2561}
2562
2563
2564/*
2565 * ext4_ext_rm_leaf() Removes the extents associated with the
2566 * blocks appearing between "start" and "end".  Both "start"
2567 * and "end" must appear in the same extent or EIO is returned.
2568 *
2569 * @handle: The journal handle
2570 * @inode:  The files inode
2571 * @path:   The path to the leaf
2572 * @partial_cluster: The cluster which we'll have to free if all extents
2573 *                   has been released from it.  However, if this value is
2574 *                   negative, it's a cluster just to the right of the
2575 *                   punched region and it must not be freed.
2576 * @start:  The first block to remove
2577 * @end:   The last block to remove
2578 */
2579static int
2580ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2581                 struct ext4_ext_path *path,
2582                 long long *partial_cluster,
2583                 ext4_lblk_t start, ext4_lblk_t end)
2584{
2585        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2586        int err = 0, correct_index = 0;
2587        int depth = ext_depth(inode), credits;
2588        struct ext4_extent_header *eh;
2589        ext4_lblk_t a, b;
2590        unsigned num;
2591        ext4_lblk_t ex_ee_block;
2592        unsigned short ex_ee_len;
2593        unsigned unwritten = 0;
2594        struct ext4_extent *ex;
2595        ext4_fsblk_t pblk;
2596
2597        /* the header must be checked already in ext4_ext_remove_space() */
2598        ext_debug("truncate since %u in leaf to %u\n", start, end);
2599        if (!path[depth].p_hdr)
2600                path[depth].p_hdr = ext_block_hdr(path[depth].p_bh);
2601        eh = path[depth].p_hdr;
2602        if (unlikely(path[depth].p_hdr == NULL)) {
2603                EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
2604                return -EIO;
2605        }
2606        /* find where to start removing */
2607        ex = path[depth].p_ext;
2608        if (!ex)
2609                ex = EXT_LAST_EXTENT(eh);
2610
2611        ex_ee_block = le32_to_cpu(ex->ee_block);
2612        ex_ee_len = ext4_ext_get_actual_len(ex);
2613
2614        trace_ext4_ext_rm_leaf(inode, start, ex, *partial_cluster);
2615
2616        while (ex >= EXT_FIRST_EXTENT(eh) &&
2617                        ex_ee_block + ex_ee_len > start) {
2618
2619                if (ext4_ext_is_unwritten(ex))
2620                        unwritten = 1;
2621                else
2622                        unwritten = 0;
2623
2624                ext_debug("remove ext %u:[%d]%d\n", ex_ee_block,
2625                          unwritten, ex_ee_len);
2626                path[depth].p_ext = ex;
2627
2628                a = ex_ee_block > start ? ex_ee_block : start;
2629                b = ex_ee_block+ex_ee_len - 1 < end ?
2630                        ex_ee_block+ex_ee_len - 1 : end;
2631
2632                ext_debug("  border %u:%u\n", a, b);
2633
2634                /* If this extent is beyond the end of the hole, skip it */
2635                if (end < ex_ee_block) {
2636                        /*
2637                         * We're going to skip this extent and move to another,
2638                         * so note that its first cluster is in use to avoid
2639                         * freeing it when removing blocks.  Eventually, the
2640                         * right edge of the truncated/punched region will
2641                         * be just to the left.
2642                         */
2643                        if (sbi->s_cluster_ratio > 1) {
2644                                pblk = ext4_ext_pblock(ex);
2645                                *partial_cluster =
2646                                        -(long long) EXT4_B2C(sbi, pblk);
2647                        }
2648                        ex--;
2649                        ex_ee_block = le32_to_cpu(ex->ee_block);
2650                        ex_ee_len = ext4_ext_get_actual_len(ex);
2651                        continue;
2652                } else if (b != ex_ee_block + ex_ee_len - 1) {
2653                        EXT4_ERROR_INODE(inode,
2654                                         "can not handle truncate %u:%u "
2655                                         "on extent %u:%u",
2656                                         start, end, ex_ee_block,
2657                                         ex_ee_block + ex_ee_len - 1);
2658                        err = -EIO;
2659                        goto out;
2660                } else if (a != ex_ee_block) {
2661                        /* remove tail of the extent */
2662                        num = a - ex_ee_block;
2663                } else {
2664                        /* remove whole extent: excellent! */
2665                        num = 0;
2666                }
2667                /*
2668                 * 3 for leaf, sb, and inode plus 2 (bmap and group
2669                 * descriptor) for each block group; assume two block
2670                 * groups plus ex_ee_len/blocks_per_block_group for
2671                 * the worst case
2672                 */
2673                credits = 7 + 2*(ex_ee_len/EXT4_BLOCKS_PER_GROUP(inode->i_sb));
2674                if (ex == EXT_FIRST_EXTENT(eh)) {
2675                        correct_index = 1;
2676                        credits += (ext_depth(inode)) + 1;
2677                }
2678                credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
2679
2680                err = ext4_ext_truncate_extend_restart(handle, inode, credits);
2681                if (err)
2682                        goto out;
2683
2684                err = ext4_ext_get_access(handle, inode, path + depth);
2685                if (err)
2686                        goto out;
2687
2688                err = ext4_remove_blocks(handle, inode, ex, partial_cluster,
2689                                         a, b);
2690                if (err)
2691                        goto out;
2692
2693                if (num == 0)
2694                        /* this extent is removed; mark slot entirely unused */
2695                        ext4_ext_store_pblock(ex, 0);
2696
2697                ex->ee_len = cpu_to_le16(num);
2698                /*
2699                 * Do not mark unwritten if all the blocks in the
2700                 * extent have been removed.
2701                 */
2702                if (unwritten && num)
2703                        ext4_ext_mark_unwritten(ex);
2704                /*
2705                 * If the extent was completely released,
2706                 * we need to remove it from the leaf
2707                 */
2708                if (num == 0) {
2709                        if (end != EXT_MAX_BLOCKS - 1) {
2710                                /*
2711                                 * For hole punching, we need to scoot all the
2712                                 * extents up when an extent is removed so that
2713                                 * we dont have blank extents in the middle
2714                                 */
2715                                memmove(ex, ex+1, (EXT_LAST_EXTENT(eh) - ex) *
2716                                        sizeof(struct ext4_extent));
2717
2718                                /* Now get rid of the one at the end */
2719                                memset(EXT_LAST_EXTENT(eh), 0,
2720                                        sizeof(struct ext4_extent));
2721                        }
2722                        le16_add_cpu(&eh->eh_entries, -1);
2723                }
2724
2725                err = ext4_ext_dirty(handle, inode, path + depth);
2726                if (err)
2727                        goto out;
2728
2729                ext_debug("new extent: %u:%u:%llu\n", ex_ee_block, num,
2730                                ext4_ext_pblock(ex));
2731                ex--;
2732                ex_ee_block = le32_to_cpu(ex->ee_block);
2733                ex_ee_len = ext4_ext_get_actual_len(ex);
2734        }
2735
2736        if (correct_index && eh->eh_entries)
2737                err = ext4_ext_correct_indexes(handle, inode, path);
2738
2739        /*
2740         * If there's a partial cluster and at least one extent remains in
2741         * the leaf, free the partial cluster if it isn't shared with the
2742         * current extent.  If it is shared with the current extent
2743         * we zero partial_cluster because we've reached the start of the
2744         * truncated/punched region and we're done removing blocks.
2745         */
2746        if (*partial_cluster > 0 && ex >= EXT_FIRST_EXTENT(eh)) {
2747                pblk = ext4_ext_pblock(ex) + ex_ee_len - 1;
2748                if (*partial_cluster != (long long) EXT4_B2C(sbi, pblk)) {
2749                        ext4_free_blocks(handle, inode, NULL,
2750                                         EXT4_C2B(sbi, *partial_cluster),
2751                                         sbi->s_cluster_ratio,
2752                                         get_default_free_blocks_flags(inode));
2753                }
2754                *partial_cluster = 0;
2755        }
2756
2757        /* if this leaf is free, then we should
2758         * remove it from index block above */
2759        if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL)
2760                err = ext4_ext_rm_idx(handle, inode, path, depth);
2761
2762out:
2763        return err;
2764}
2765
2766/*
2767 * ext4_ext_more_to_rm:
2768 * returns 1 if current index has to be freed (even partial)
2769 */
2770static int
2771ext4_ext_more_to_rm(struct ext4_ext_path *path)
2772{
2773        BUG_ON(path->p_idx == NULL);
2774
2775        if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr))
2776                return 0;
2777
2778        /*
2779         * if truncate on deeper level happened, it wasn't partial,
2780         * so we have to consider current index for truncation
2781         */
2782        if (le16_to_cpu(path->p_hdr->eh_entries) == path->p_block)
2783                return 0;
2784        return 1;
2785}
2786
2787int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
2788                          ext4_lblk_t end)
2789{
2790        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2791        int depth = ext_depth(inode);
2792        struct ext4_ext_path *path = NULL;
2793        long long partial_cluster = 0;
2794        handle_t *handle;
2795        int i = 0, err = 0;
2796
2797        ext_debug("truncate since %u to %u\n", start, end);
2798
2799        /* probably first extent we're gonna free will be last in block */
2800        handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, depth + 1);
2801        if (IS_ERR(handle))
2802                return PTR_ERR(handle);
2803
2804again:
2805        trace_ext4_ext_remove_space(inode, start, end, depth);
2806
2807        /*
2808         * Check if we are removing extents inside the extent tree. If that
2809         * is the case, we are going to punch a hole inside the extent tree
2810         * so we have to check whether we need to split the extent covering
2811         * the last block to remove so we can easily remove the part of it
2812         * in ext4_ext_rm_leaf().
2813         */
2814        if (end < EXT_MAX_BLOCKS - 1) {
2815                struct ext4_extent *ex;
2816                ext4_lblk_t ee_block, ex_end, lblk;
2817                ext4_fsblk_t pblk;
2818
2819                /* find extent for or closest extent to this block */
2820                path = ext4_ext_find_extent(inode, end, NULL, EXT4_EX_NOCACHE);
2821                if (IS_ERR(path)) {
2822                        ext4_journal_stop(handle);
2823                        return PTR_ERR(path);
2824                }
2825                depth = ext_depth(inode);
2826                /* Leaf not may not exist only if inode has no blocks at all */
2827                ex = path[depth].p_ext;
2828                if (!ex) {
2829                        if (depth) {
2830                                EXT4_ERROR_INODE(inode,
2831                                                 "path[%d].p_hdr == NULL",
2832                                                 depth);
2833                                err = -EIO;
2834                        }
2835                        goto out;
2836                }
2837
2838                ee_block = le32_to_cpu(ex->ee_block);
2839                ex_end = ee_block + ext4_ext_get_actual_len(ex) - 1;
2840
2841                /*
2842                 * See if the last block is inside the extent, if so split
2843                 * the extent at 'end' block so we can easily remove the
2844                 * tail of the first part of the split extent in
2845                 * ext4_ext_rm_leaf().
2846                 */
2847                if (end >= ee_block && end < ex_end) {
2848                        int split_flag = 0;
2849
2850                        /*
2851                         * If we're going to split the extent, note that
2852                         * the cluster containing the block after 'end' is
2853                         * in use to avoid freeing it when removing blocks.
2854                         */
2855                        if (sbi->s_cluster_ratio > 1) {
2856                                pblk = ext4_ext_pblock(ex) + end - ee_block + 2;
2857                                partial_cluster =
2858                                        -(long long) EXT4_B2C(sbi, pblk);
2859                        }
2860
2861                        if (ext4_ext_is_unwritten(ex))
2862                                split_flag = EXT4_EXT_MARK_UNWRIT1 |
2863                                             EXT4_EXT_MARK_UNWRIT2;
2864
2865                        /*
2866                         * Split the extent in two so that 'end' is the last
2867                         * block in the first new extent. Also we should not
2868                         * fail removing space due to ENOSPC so try to use
2869                         * reserved block if that happens.
2870                         */
2871                        err = ext4_split_extent_at(handle, inode, path,
2872                                        end + 1, split_flag,
2873                                        EXT4_EX_NOCACHE |
2874                                        EXT4_GET_BLOCKS_PRE_IO |
2875                                        EXT4_GET_BLOCKS_METADATA_NOFAIL);
2876
2877                        if (err < 0)
2878                                goto out;
2879
2880                } else if (sbi->s_cluster_ratio > 1 && end >= ex_end) {
2881                        /*
2882                         * If there's an extent to the right its first cluster
2883                         * contains the immediate right boundary of the
2884                         * truncated/punched region.  Set partial_cluster to
2885                         * its negative value so it won't be freed if shared
2886                         * with the current extent.  The end < ee_block case
2887                         * is handled in ext4_ext_rm_leaf().
2888                         */
2889                        lblk = ex_end + 1;
2890                        err = ext4_ext_search_right(inode, path, &lblk, &pblk,
2891                                                    &ex);
2892                        if (err)
2893                                goto out;
2894                        if (pblk)
2895                                partial_cluster =
2896                                        -(long long) EXT4_B2C(sbi, pblk);
2897                }
2898        }
2899        /*
2900         * We start scanning from right side, freeing all the blocks
2901         * after i_size and walking into the tree depth-wise.
2902         */
2903        depth = ext_depth(inode);
2904        if (path) {
2905                int k = i = depth;
2906                while (--k > 0)
2907                        path[k].p_block =
2908                                le16_to_cpu(path[k].p_hdr->eh_entries)+1;
2909        } else {
2910                path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1),
2911                               GFP_NOFS);
2912                if (path == NULL) {
2913                        ext4_journal_stop(handle);
2914                        return -ENOMEM;
2915                }
2916                path[0].p_depth = depth;
2917                path[0].p_hdr = ext_inode_hdr(inode);
2918                i = 0;
2919
2920                if (ext4_ext_check(inode, path[0].p_hdr, depth, 0)) {
2921                        err = -EIO;
2922                        goto out;
2923                }
2924        }
2925        err = 0;
2926
2927        while (i >= 0 && err == 0) {
2928                if (i == depth) {
2929                        /* this is leaf block */
2930                        err = ext4_ext_rm_leaf(handle, inode, path,
2931                                               &partial_cluster, start,
2932                                               end);
2933                        /* root level has p_bh == NULL, brelse() eats this */
2934                        brelse(path[i].p_bh);
2935                        path[i].p_bh = NULL;
2936                        i--;
2937                        continue;
2938                }
2939
2940                /* this is index block */
2941                if (!path[i].p_hdr) {
2942                        ext_debug("initialize header\n");
2943                        path[i].p_hdr = ext_block_hdr(path[i].p_bh);
2944                }
2945
2946                if (!path[i].p_idx) {
2947                        /* this level hasn't been touched yet */
2948                        path[i].p_idx = EXT_LAST_INDEX(path[i].p_hdr);
2949                        path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries)+1;
2950                        ext_debug("init index ptr: hdr 0x%p, num %d\n",
2951                                  path[i].p_hdr,
2952                                  le16_to_cpu(path[i].p_hdr->eh_entries));
2953                } else {
2954                        /* we were already here, see at next index */
2955                        path[i].p_idx--;
2956                }
2957
2958                ext_debug("level %d - index, first 0x%p, cur 0x%p\n",
2959                                i, EXT_FIRST_INDEX(path[i].p_hdr),
2960                                path[i].p_idx);
2961                if (ext4_ext_more_to_rm(path + i)) {
2962                        struct buffer_head *bh;
2963                        /* go to the next level */
2964                        ext_debug("move to level %d (block %llu)\n",
2965                                  i + 1, ext4_idx_pblock(path[i].p_idx));
2966                        memset(path + i + 1, 0, sizeof(*path));
2967                        bh = read_extent_tree_block(inode,
2968                                ext4_idx_pblock(path[i].p_idx), depth - i - 1,
2969                                EXT4_EX_NOCACHE);
2970                        if (IS_ERR(bh)) {
2971                                /* should we reset i_size? */
2972                                err = PTR_ERR(bh);
2973                                break;
2974                        }
2975                        /* Yield here to deal with large extent trees.
2976                         * Should be a no-op if we did IO above. */
2977                        cond_resched();
2978                        if (WARN_ON(i + 1 > depth)) {
2979                                err = -EIO;
2980                                break;
2981                        }
2982                        path[i + 1].p_bh = bh;
2983
2984                        /* save actual number of indexes since this
2985                         * number is changed at the next iteration */
2986                        path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries);
2987                        i++;
2988                } else {
2989                        /* we finished processing this index, go up */
2990                        if (path[i].p_hdr->eh_entries == 0 && i > 0) {
2991                                /* index is empty, remove it;
2992                                 * handle must be already prepared by the
2993                                 * truncatei_leaf() */
2994                                err = ext4_ext_rm_idx(handle, inode, path, i);
2995                        }
2996                        /* root level has p_bh == NULL, brelse() eats this */
2997                        brelse(path[i].p_bh);
2998                        path[i].p_bh = NULL;
2999                        i--;
3000                        ext_debug("return to level %d\n", i);
3001                }
3002        }
3003
3004        trace_ext4_ext_remove_space_done(inode, start, end, depth,
3005                        partial_cluster, path->p_hdr->eh_entries);
3006
3007        /*
3008         * If we still have something in the partial cluster and we have removed
3009         * even the first extent, then we should free the blocks in the partial
3010         * cluster as well.  (This code will only run when there are no leaves
3011         * to the immediate left of the truncated/punched region.)
3012         */
3013        if (partial_cluster > 0 && err == 0) {
3014                /* don't zero partial_cluster since it's not used afterwards */
3015                ext4_free_blocks(handle, inode, NULL,
3016                                 EXT4_C2B(sbi, partial_cluster),
3017                                 sbi->s_cluster_ratio,
3018                                 get_default_free_blocks_flags(inode));
3019        }
3020
3021        /* TODO: flexible tree reduction should be here */
3022        if (path->p_hdr->eh_entries == 0) {
3023                /*
3024                 * truncate to zero freed all the tree,
3025                 * so we need to correct eh_depth
3026                 */
3027                err = ext4_ext_get_access(handle, inode, path);
3028                if (err == 0) {
3029                        ext_inode_hdr(inode)->eh_depth = 0;
3030                        ext_inode_hdr(inode)->eh_max =
3031                                cpu_to_le16(ext4_ext_space_root(inode, 0));
3032                        err = ext4_ext_dirty(handle, inode, path);
3033                }
3034        }
3035out:
3036        ext4_ext_drop_refs(path);
3037        kfree(path);
3038        if (err == -EAGAIN) {
3039                path = NULL;
3040                goto again;
3041        }
3042        ext4_journal_stop(handle);
3043
3044        return err;
3045}
3046
3047/*
3048 * called at mount time
3049 */
3050void ext4_ext_init(struct super_block *sb)
3051{
3052        /*
3053         * possible initialization would be here
3054         */
3055
3056        if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
3057#if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS)
3058                printk(KERN_INFO "EXT4-fs: file extents enabled"
3059#ifdef AGGRESSIVE_TEST
3060                       ", aggressive tests"
3061#endif
3062#ifdef CHECK_BINSEARCH
3063                       ", check binsearch"
3064#endif
3065#ifdef EXTENTS_STATS
3066                       ", stats"
3067#endif
3068                       "\n");
3069#endif
3070#ifdef EXTENTS_STATS
3071                spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock);
3072                EXT4_SB(sb)->s_ext_min = 1 << 30;
3073                EXT4_SB(sb)->s_ext_max = 0;
3074#endif
3075        }
3076}
3077
3078/*
3079 * called at umount time
3080 */
3081void ext4_ext_release(struct super_block *sb)
3082{
3083        if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS))
3084                return;
3085
3086#ifdef EXTENTS_STATS
3087        if (EXT4_SB(sb)->s_ext_blocks && EXT4_SB(sb)->s_ext_extents) {
3088                struct ext4_sb_info *sbi = EXT4_SB(sb);
3089                printk(KERN_ERR "EXT4-fs: %lu blocks in %lu extents (%lu ave)\n",
3090                        sbi->s_ext_blocks, sbi->s_ext_extents,
3091                        sbi->s_ext_blocks / sbi->s_ext_extents);
3092                printk(KERN_ERR "EXT4-fs: extents: %lu min, %lu max, max depth %lu\n",
3093                        sbi->s_ext_min, sbi->s_ext_max, sbi->s_depth_max);
3094        }
3095#endif
3096}
3097
3098static int ext4_zeroout_es(struct inode *inode, struct ext4_extent *ex)
3099{
3100        ext4_lblk_t  ee_block;
3101        ext4_fsblk_t ee_pblock;
3102        unsigned int ee_len;
3103
3104        ee_block  = le32_to_cpu(ex->ee_block);
3105        ee_len    = ext4_ext_get_actual_len(ex);
3106        ee_pblock = ext4_ext_pblock(ex);
3107
3108        if (ee_len == 0)
3109                return 0;
3110
3111        return ext4_es_insert_extent(inode, ee_block, ee_len, ee_pblock,
3112                                     EXTENT_STATUS_WRITTEN);
3113}
3114
3115/* FIXME!! we need to try to merge to left or right after zero-out  */
3116static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
3117{
3118        ext4_fsblk_t ee_pblock;
3119        unsigned int ee_len;
3120
3121        ee_len    = ext4_ext_get_actual_len(ex);
3122        ee_pblock = ext4_ext_pblock(ex);
3123
3124        return ext4_issue_zeroout(inode, le32_to_cpu(ex->ee_block), ee_pblock,
3125                                  ee_len);
3126}
3127
3128/*
3129 * ext4_split_extent_at() splits an extent at given block.
3130 *
3131 * @handle: the journal handle
3132 * @inode: the file inode
3133 * @path: the path to the extent
3134 * @split: the logical block where the extent is splitted.
3135 * @split_flags: indicates if the extent could be zeroout if split fails, and
3136 *               the states(init or unwritten) of new extents.
3137 * @flags: flags used to insert new extent to extent tree.
3138 *
3139 *
3140 * Splits extent [a, b] into two extents [a, @split) and [@split, b], states
3141 * of which are deterimined by split_flag.
3142 *
3143 * There are two cases:
3144 *  a> the extent are splitted into two extent.
3145 *  b> split is not needed, and just mark the extent.
3146 *
3147 * return 0 on success.
3148 */
3149static int ext4_split_extent_at(handle_t *handle,
3150                             struct inode *inode,
3151                             struct ext4_ext_path *path,
3152                             ext4_lblk_t split,
3153                             int split_flag,
3154                             int flags)
3155{
3156        ext4_fsblk_t newblock;
3157        ext4_lblk_t ee_block;
3158        struct ext4_extent *ex, newex, orig_ex, zero_ex;
3159        struct ext4_extent *ex2 = NULL;
3160        unsigned int ee_len, depth;
3161        int err = 0;
3162
3163        BUG_ON((split_flag & (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2)) ==
3164               (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2));
3165
3166        ext_debug("ext4_split_extents_at: inode %lu, logical"
3167                "block %llu\n", inode->i_ino, (unsigned long long)split);
3168
3169        ext4_ext_show_leaf(inode, path);
3170
3171        depth = ext_depth(inode);
3172        ex = path[depth].p_ext;
3173        ee_block = le32_to_cpu(ex->ee_block);
3174        ee_len = ext4_ext_get_actual_len(ex);
3175        newblock = split - ee_block + ext4_ext_pblock(ex);
3176
3177        BUG_ON(split < ee_block || split >= (ee_block + ee_len));
3178        BUG_ON(!ext4_ext_is_unwritten(ex) &&
3179               split_flag & (EXT4_EXT_MAY_ZEROOUT |
3180                             EXT4_EXT_MARK_UNWRIT1 |
3181                             EXT4_EXT_MARK_UNWRIT2));
3182
3183        err = ext4_ext_get_access(handle, inode, path + depth);
3184        if (err)
3185                goto out;
3186
3187        if (split == ee_block) {
3188                /*
3189                 * case b: block @split is the block that the extent begins with
3190                 * then we just change the state of the extent, and splitting
3191                 * is not needed.
3192                 */
3193                if (split_flag & EXT4_EXT_MARK_UNWRIT2)
3194                        ext4_ext_mark_unwritten(ex);
3195                else
3196                        ext4_ext_mark_initialized(ex);
3197
3198                if (!(flags & EXT4_GET_BLOCKS_PRE_IO))
3199                        ext4_ext_try_to_merge(handle, inode, path, ex);
3200
3201                err = ext4_ext_dirty(handle, inode, path + path->p_depth);
3202                goto out;
3203        }
3204
3205        /* case a */
3206        memcpy(&orig_ex, ex, sizeof(orig_ex));
3207        ex->ee_len = cpu_to_le16(split - ee_block);
3208        if (split_flag & EXT4_EXT_MARK_UNWRIT1)
3209                ext4_ext_mark_unwritten(ex);
3210
3211        /*
3212         * path may lead to new leaf, not to original leaf any more
3213         * after ext4_ext_insert_extent() returns,
3214         */
3215        err = ext4_ext_dirty(handle, inode, path + depth);
3216        if (err)
3217                goto fix_extent_len;
3218
3219        ex2 = &newex;
3220        ex2->ee_block = cpu_to_le32(split);
3221        ex2->ee_len   = cpu_to_le16(ee_len - (split - ee_block));
3222        ext4_ext_store_pblock(ex2, newblock);
3223        if (split_flag & EXT4_EXT_MARK_UNWRIT2)
3224                ext4_ext_mark_unwritten(ex2);
3225
3226        err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
3227        if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
3228                if (split_flag & (EXT4_EXT_DATA_VALID1|EXT4_EXT_DATA_VALID2)) {
3229                        if (split_flag & EXT4_EXT_DATA_VALID1) {
3230                                err = ext4_ext_zeroout(inode, ex2);
3231                                zero_ex.ee_block = ex2->ee_block;
3232                                zero_ex.ee_len = cpu_to_le16(
3233                                                ext4_ext_get_actual_len(ex2));
3234                                ext4_ext_store_pblock(&zero_ex,
3235                                                      ext4_ext_pblock(ex2));
3236                        } else {
3237                                err = ext4_ext_zeroout(inode, ex);
3238                                zero_ex.ee_block = ex->ee_block;
3239                                zero_ex.ee_len = cpu_to_le16(
3240                                                ext4_ext_get_actual_len(ex));
3241                                ext4_ext_store_pblock(&zero_ex,
3242                                                      ext4_ext_pblock(ex));
3243                        }
3244                } else {
3245                        err = ext4_ext_zeroout(inode, &orig_ex);
3246                        zero_ex.ee_block = orig_ex.ee_block;
3247                        zero_ex.ee_len = cpu_to_le16(
3248                                                ext4_ext_get_actual_len(&orig_ex));
3249                        ext4_ext_store_pblock(&zero_ex,
3250                                              ext4_ext_pblock(&orig_ex));
3251                }
3252
3253                if (err)
3254                        goto fix_extent_len;
3255                /* update the extent length and mark as initialized */
3256                ex->ee_len = cpu_to_le16(ee_len);
3257                ext4_ext_try_to_merge(handle, inode, path, ex);
3258                err = ext4_ext_dirty(handle, inode, path + path->p_depth);
3259                if (err)
3260                        goto fix_extent_len;
3261
3262                /* update extent status tree */
3263                err = ext4_zeroout_es(inode, &zero_ex);
3264
3265                goto out;
3266        } else if (err)
3267                goto fix_extent_len;
3268
3269out:
3270        ext4_ext_show_leaf(inode, path);
3271        return err;
3272
3273fix_extent_len:
3274        ex->ee_len = orig_ex.ee_len;
3275        ext4_ext_dirty(handle, inode, path + path->p_depth);
3276        return err;
3277}
3278
3279/*
3280 * ext4_split_extents() splits an extent and mark extent which is covered
3281 * by @map as split_flags indicates
3282 *
3283 * It may result in splitting the extent into multiple extents (up to three)
3284 * There are three possibilities:
3285 *   a> There is no split required
3286 *   b> Splits in two extents: Split is happening at either end of the extent
3287 *   c> Splits in three extents: Somone is splitting in middle of the extent
3288 *
3289 */
3290static int ext4_split_extent(handle_t *handle,
3291                              struct inode *inode,
3292                              struct ext4_ext_path *path,
3293                              struct ext4_map_blocks *map,
3294                              int split_flag,
3295                              int flags)
3296{
3297        ext4_lblk_t ee_block;
3298        struct ext4_extent *ex;
3299        unsigned int ee_len, depth;
3300        int err = 0;
3301        int unwritten;
3302        int split_flag1, flags1;
3303        int allocated = map->m_len;
3304
3305        depth = ext_depth(inode);
3306        ex = path[depth].p_ext;
3307        ee_block = le32_to_cpu(ex->ee_block);
3308        ee_len = ext4_ext_get_actual_len(ex);
3309        unwritten = ext4_ext_is_unwritten(ex);
3310
3311        if (map->m_lblk + map->m_len < ee_block + ee_len) {
3312                split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT;
3313                flags1 = flags | EXT4_GET_BLOCKS_PRE_IO;
3314                if (unwritten)
3315                        split_flag1 |= EXT4_EXT_MARK_UNWRIT1 |
3316                                       EXT4_EXT_MARK_UNWRIT2;
3317                if (split_flag & EXT4_EXT_DATA_VALID2)
3318                        split_flag1 |= EXT4_EXT_DATA_VALID1;
3319                err = ext4_split_extent_at(handle, inode, path,
3320                                map->m_lblk + map->m_len, split_flag1, flags1);
3321                if (err)
3322                        goto out;
3323        } else {
3324                allocated = ee_len - (map->m_lblk - ee_block);
3325        }
3326        /*
3327         * Update path is required because previous ext4_split_extent_at() may
3328         * result in split of original leaf or extent zeroout.
3329         */
3330        ext4_ext_drop_refs(path);
3331        path = ext4_ext_find_extent(inode, map->m_lblk, path, 0);
3332        if (IS_ERR(path))
3333                return PTR_ERR(path);
3334        depth = ext_depth(inode);
3335        ex = path[depth].p_ext;
3336        if (!ex) {
3337                EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
3338                                 (unsigned long) map->m_lblk);
3339                return -EIO;
3340        }
3341        unwritten = ext4_ext_is_unwritten(ex);
3342        split_flag1 = 0;
3343
3344        if (map->m_lblk >= ee_block) {
3345                split_flag1 = split_flag & EXT4_EXT_DATA_VALID2;
3346                if (unwritten) {
3347                        split_flag1 |= EXT4_EXT_MARK_UNWRIT1;
3348                        split_flag1 |= split_flag & (EXT4_EXT_MAY_ZEROOUT |
3349                                                     EXT4_EXT_MARK_UNWRIT2);
3350                }
3351                err = ext4_split_extent_at(handle, inode, path,
3352                                map->m_lblk, split_flag1, flags);
3353                if (err)
3354                        goto out;
3355        }
3356
3357        ext4_ext_show_leaf(inode, path);
3358out:
3359        return err ? err : allocated;
3360}
3361
3362/*
3363 * This function is called by ext4_ext_map_blocks() if someone tries to write
3364 * to an unwritten extent. It may result in splitting the unwritten
3365 * extent into multiple extents (up to three - one initialized and two
3366 * unwritten).
3367 * There are three possibilities:
3368 *   a> There is no split required: Entire extent should be initialized
3369 *   b> Splits in two extents: Write is happening at either end of the extent
3370 *   c> Splits in three extents: Somone is writing in middle of the extent
3371 *
3372 * Pre-conditions:
3373 *  - The extent pointed to by 'path' is unwritten.
3374 *  - The extent pointed to by 'path' contains a superset
3375 *    of the logical span [map->m_lblk, map->m_lblk + map->m_len).
3376 *
3377 * Post-conditions on success:
3378 *  - the returned value is the number of blocks beyond map->l_lblk
3379 *    that are allocated and initialized.
3380 *    It is guaranteed to be >= map->m_len.
3381 */
3382static int ext4_ext_convert_to_initialized(handle_t *handle,
3383                                           struct inode *inode,
3384                                           struct ext4_map_blocks *map,
3385                                           struct ext4_ext_path *path,
3386                                           int flags)
3387{
3388        struct ext4_sb_info *sbi;
3389        struct ext4_extent_header *eh;
3390        struct ext4_map_blocks split_map;
3391        struct ext4_extent zero_ex;
3392        struct ext4_extent *ex, *abut_ex;
3393        ext4_lblk_t ee_block, eof_block;
3394        unsigned int ee_len, depth, map_len = map->m_len;
3395        int allocated = 0, max_zeroout = 0;
3396        int err = 0;
3397        int split_flag = 0;
3398
3399        ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical"
3400                "block %llu, max_blocks %u\n", inode->i_ino,
3401                (unsigned long long)map->m_lblk, map_len);
3402
3403        sbi = EXT4_SB(inode->i_sb);
3404        eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
3405                inode->i_sb->s_blocksize_bits;
3406        if (eof_block < map->m_lblk + map_len)
3407                eof_block = map->m_lblk + map_len;
3408
3409        depth = ext_depth(inode);
3410        eh = path[depth].p_hdr;
3411        ex = path[depth].p_ext;
3412        ee_block = le32_to_cpu(ex->ee_block);
3413        ee_len = ext4_ext_get_actual_len(ex);
3414        zero_ex.ee_len = 0;
3415
3416        trace_ext4_ext_convert_to_initialized_enter(inode, map, ex);
3417
3418        /* Pre-conditions */
3419        BUG_ON(!ext4_ext_is_unwritten(ex));
3420        BUG_ON(!in_range(map->m_lblk, ee_block, ee_len));
3421
3422        /*
3423         * Attempt to transfer newly initialized blocks from the currently
3424         * unwritten extent to its neighbor. This is much cheaper
3425         * than an insertion followed by a merge as those involve costly
3426         * memmove() calls. Transferring to the left is the common case in
3427         * steady state for workloads doing fallocate(FALLOC_FL_KEEP_SIZE)
3428         * followed by append writes.
3429         *
3430         * Limitations of the current logic:
3431         *  - L1: we do not deal with writes covering the whole extent.
3432         *    This would require removing the extent if the transfer
3433         *    is possible.
3434         *  - L2: we only attempt to merge with an extent stored in the
3435         *    same extent tree node.
3436         */
3437        if ((map->m_lblk == ee_block) &&
3438                /* See if we can merge left */
3439                (map_len < ee_len) &&           /*L1*/
3440                (ex > EXT_FIRST_EXTENT(eh))) {  /*L2*/
3441                ext4_lblk_t prev_lblk;
3442                ext4_fsblk_t prev_pblk, ee_pblk;
3443                unsigned int prev_len;
3444
3445                abut_ex = ex - 1;
3446                prev_lblk = le32_to_cpu(abut_ex->ee_block);
3447                prev_len = ext4_ext_get_actual_len(abut_ex);
3448                prev_pblk = ext4_ext_pblock(abut_ex);
3449                ee_pblk = ext4_ext_pblock(ex);
3450
3451                /*
3452                 * A transfer of blocks from 'ex' to 'abut_ex' is allowed
3453                 * upon those conditions:
3454                 * - C1: abut_ex is initialized,
3455                 * - C2: abut_ex is logically abutting ex,
3456                 * - C3: abut_ex is physically abutting ex,
3457                 * - C4: abut_ex can receive the additional blocks without
3458                 *   overflowing the (initialized) length limit.
3459                 */
3460                if ((!ext4_ext_is_unwritten(abut_ex)) &&                /*C1*/
3461                        ((prev_lblk + prev_len) == ee_block) &&         /*C2*/
3462                        ((prev_pblk + prev_len) == ee_pblk) &&          /*C3*/
3463                        (prev_len < (EXT_INIT_MAX_LEN - map_len))) {    /*C4*/
3464                        err = ext4_ext_get_access(handle, inode, path + depth);
3465                        if (err)
3466                                goto out;
3467
3468                        trace_ext4_ext_convert_to_initialized_fastpath(inode,
3469                                map, ex, abut_ex);
3470
3471                        /* Shift the start of ex by 'map_len' blocks */
3472                        ex->ee_block = cpu_to_le32(ee_block + map_len);
3473                        ext4_ext_store_pblock(ex, ee_pblk + map_len);
3474                        ex->ee_len = cpu_to_le16(ee_len - map_len);
3475                        ext4_ext_mark_unwritten(ex); /* Restore the flag */
3476
3477                        /* Extend abut_ex by 'map_len' blocks */
3478                        abut_ex->ee_len = cpu_to_le16(prev_len + map_len);
3479
3480                        /* Result: number of initialized blocks past m_lblk */
3481                        allocated = map_len;
3482                }
3483        } else if (((map->m_lblk + map_len) == (ee_block + ee_len)) &&
3484                   (map_len < ee_len) &&        /*L1*/
3485                   ex < EXT_LAST_EXTENT(eh)) {  /*L2*/
3486                /* See if we can merge right */
3487                ext4_lblk_t next_lblk;
3488                ext4_fsblk_t next_pblk, ee_pblk;
3489                unsigned int next_len;
3490
3491                abut_ex = ex + 1;
3492                next_lblk = le32_to_cpu(abut_ex->ee_block);
3493                next_len = ext4_ext_get_actual_len(abut_ex);
3494                next_pblk = ext4_ext_pblock(abut_ex);
3495                ee_pblk = ext4_ext_pblock(ex);
3496
3497                /*
3498                 * A transfer of blocks from 'ex' to 'abut_ex' is allowed
3499                 * upon those conditions:
3500                 * - C1: abut_ex is initialized,
3501                 * - C2: abut_ex is logically abutting ex,
3502                 * - C3: abut_ex is physically abutting ex,
3503                 * - C4: abut_ex can receive the additional blocks without
3504                 *   overflowing the (initialized) length limit.
3505                 */
3506                if ((!ext4_ext_is_unwritten(abut_ex)) &&                /*C1*/
3507                    ((map->m_lblk + map_len) == next_lblk) &&           /*C2*/
3508                    ((ee_pblk + ee_len) == next_pblk) &&                /*C3*/
3509                    (next_len < (EXT_INIT_MAX_LEN - map_len))) {        /*C4*/
3510                        err = ext4_ext_get_access(handle, inode, path + depth);
3511                        if (err)
3512                                goto out;
3513
3514                        trace_ext4_ext_convert_to_initialized_fastpath(inode,
3515                                map, ex, abut_ex);
3516
3517                        /* Shift the start of abut_ex by 'map_len' blocks */
3518                        abut_ex->ee_block = cpu_to_le32(next_lblk - map_len);
3519                        ext4_ext_store_pblock(abut_ex, next_pblk - map_len);
3520                        ex->ee_len = cpu_to_le16(ee_len - map_len);
3521                        ext4_ext_mark_unwritten(ex); /* Restore the flag */
3522
3523                        /* Extend abut_ex by 'map_len' blocks */
3524                        abut_ex->ee_len = cpu_to_le16(next_len + map_len);
3525
3526                        /* Result: number of initialized blocks past m_lblk */
3527                        allocated = map_len;
3528                }
3529        }
3530        if (allocated) {
3531                /* Mark the block containing both extents as dirty */
3532                ext4_ext_dirty(handle, inode, path + depth);
3533
3534                /* Update path to point to the right extent */
3535                path[depth].p_ext = abut_ex;
3536                goto out;
3537        } else
3538                allocated = ee_len - (map->m_lblk - ee_block);
3539
3540        WARN_ON(map->m_lblk < ee_block);
3541        /*
3542         * It is safe to convert extent to initialized via explicit
3543         * zeroout only if extent is fully inside i_size or new_size.
3544         */
3545        split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
3546
3547        if (EXT4_EXT_MAY_ZEROOUT & split_flag)
3548                max_zeroout = sbi->s_extent_max_zeroout_kb >>
3549                        (inode->i_sb->s_blocksize_bits - 10);
3550
3551        /* If extent is less than s_max_zeroout_kb, zeroout directly */
3552        if (max_zeroout && (ee_len <= max_zeroout)) {
3553                err = ext4_ext_zeroout(inode, ex);
3554                if (err)
3555                        goto out;
3556                zero_ex.ee_block = ex->ee_block;
3557                zero_ex.ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex));
3558                ext4_ext_store_pblock(&zero_ex, ext4_ext_pblock(ex));
3559
3560                err = ext4_ext_get_access(handle, inode, path + depth);
3561                if (err)
3562                        goto out;
3563                ext4_ext_mark_initialized(ex);
3564                ext4_ext_try_to_merge(handle, inode, path, ex);
3565                err = ext4_ext_dirty(handle, inode, path + path->p_depth);
3566                goto out;
3567        }
3568
3569        /*
3570         * four cases:
3571         * 1. split the extent into three extents.
3572         * 2. split the extent into two extents, zeroout the first half.
3573         * 3. split the extent into two extents, zeroout the second half.
3574         * 4. split the extent into two extents with out zeroout.
3575         */
3576        split_map.m_lblk = map->m_lblk;
3577        split_map.m_len = map->m_len;
3578
3579        if (max_zeroout && (allocated > map->m_len)) {
3580                if (allocated <= max_zeroout) {
3581                        /* case 3 */
3582                        zero_ex.ee_block =
3583                                         cpu_to_le32(map->m_lblk);
3584                        zero_ex.ee_len = cpu_to_le16(allocated);
3585                        ext4_ext_store_pblock(&zero_ex,
3586                                ext4_ext_pblock(ex) + map->m_lblk - ee_block);
3587                        err = ext4_ext_zeroout(inode, &zero_ex);
3588                        if (err)
3589                                goto out;
3590                        split_map.m_lblk = map->m_lblk;
3591                        split_map.m_len = allocated;
3592                } else if (map->m_lblk - ee_block + map->m_len < max_zeroout) {
3593                        /* case 2 */
3594                        if (map->m_lblk != ee_block) {
3595                                zero_ex.ee_block = ex->ee_block;
3596                                zero_ex.ee_len = cpu_to_le16(map->m_lblk -
3597                                                        ee_block);
3598                                ext4_ext_store_pblock(&zero_ex,
3599                                                      ext4_ext_pblock(ex));
3600                                err = ext4_ext_zeroout(inode, &zero_ex);
3601                                if (err)
3602                                        goto out;
3603                        }
3604
3605                        split_map.m_lblk = ee_block;
3606                        split_map.m_len = map->m_lblk - ee_block + map->m_len;
3607                        allocated = map->m_len;
3608                }
3609        }
3610
3611        err = ext4_split_extent(handle, inode, path,
3612                                      &split_map, split_flag, flags);
3613        if (err > 0)
3614                err = 0;
3615
3616out:
3617        /* If we have gotten a failure, don't zero out status tree */
3618        if (!err)
3619                err = ext4_zeroout_es(inode, &zero_ex);
3620        return err ? err : allocated;
3621}
3622
3623/*
3624 * This function is called by ext4_ext_map_blocks() from
3625 * ext4_get_blocks_dio_write() when DIO to write
3626 * to an unwritten extent.
3627 *
3628 * Writing to an unwritten extent may result in splitting the unwritten
3629 * extent into multiple initialized/unwritten extents (up to three)
3630 * There are three possibilities:
3631 *   a> There is no split required: Entire extent should be unwritten
3632 *   b> Splits in two extents: Write is happening at either end of the extent
3633 *   c> Splits in three extents: Somone is writing in middle of the extent
3634 *
3635 * This works the same way in the case of initialized -> unwritten conversion.
3636 *
3637 * One of more index blocks maybe needed if the extent tree grow after
3638 * the unwritten extent split. To prevent ENOSPC occur at the IO
3639 * complete, we need to split the unwritten extent before DIO submit
3640 * the IO. The unwritten extent called at this time will be split
3641 * into three unwritten extent(at most). After IO complete, the part
3642 * being filled will be convert to initialized by the end_io callback function
3643 * via ext4_convert_unwritten_extents().
3644 *
3645 * Returns the size of unwritten extent to be written on success.
3646 */
3647static int ext4_split_convert_extents(handle_t *handle,
3648                                        struct inode *inode,
3649                                        struct ext4_map_blocks *map,
3650                                        struct ext4_ext_path *path,
3651                                        int flags)
3652{
3653        ext4_lblk_t eof_block;
3654        ext4_lblk_t ee_block;
3655        struct ext4_extent *ex;
3656        unsigned int ee_len;
3657        int split_flag = 0, depth;
3658
3659        ext_debug("%s: inode %lu, logical block %llu, max_blocks %u\n",
3660                  __func__, inode->i_ino,
3661                  (unsigned long long)map->m_lblk, map->m_len);
3662
3663        eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
3664                inode->i_sb->s_blocksize_bits;
3665        if (eof_block < map->m_lblk + map->m_len)
3666                eof_block = map->m_lblk + map->m_len;
3667        /*
3668         * It is safe to convert extent to initialized via explicit
3669         * zeroout only if extent is fully insde i_size or new_size.
3670         */
3671        depth = ext_depth(inode);
3672        ex = path[depth].p_ext;
3673        ee_block = le32_to_cpu(ex->ee_block);
3674        ee_len = ext4_ext_get_actual_len(ex);
3675
3676        /* Convert to unwritten */
3677        if (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN) {
3678                split_flag |= EXT4_EXT_DATA_VALID1;
3679        /* Convert to initialized */
3680        } else if (flags & EXT4_GET_BLOCKS_CONVERT) {
3681                split_flag |= ee_block + ee_len <= eof_block ?
3682                              EXT4_EXT_MAY_ZEROOUT : 0;
3683                split_flag |= (EXT4_EXT_MARK_UNWRIT2 | EXT4_EXT_DATA_VALID2);
3684        }
3685        flags |= EXT4_GET_BLOCKS_PRE_IO;
3686        return ext4_split_extent(handle, inode, path, map, split_flag, flags);
3687}
3688
3689static int ext4_convert_initialized_extents(handle_t *handle,
3690                                            struct inode *inode,
3691                                            struct ext4_map_blocks *map,
3692                                            struct ext4_ext_path *path)
3693{
3694        struct ext4_extent *ex;
3695        ext4_lblk_t ee_block;
3696        unsigned int ee_len;
3697        int depth;
3698        int err = 0;
3699
3700        depth = ext_depth(inode);
3701        ex = path[depth].p_ext;
3702        ee_block = le32_to_cpu(ex->ee_block);
3703        ee_len = ext4_ext_get_actual_len(ex);
3704
3705        ext_debug("%s: inode %lu, logical"
3706                "block %llu, max_blocks %u\n", __func__, inode->i_ino,
3707                  (unsigned long long)ee_block, ee_len);
3708
3709        if (ee_block != map->m_lblk || ee_len > map->m_len) {
3710                err = ext4_split_convert_extents(handle, inode, map, path,
3711                                EXT4_GET_BLOCKS_CONVERT_UNWRITTEN);
3712                if (err < 0)
3713                        goto out;
3714                ext4_ext_drop_refs(path);
3715                path = ext4_ext_find_extent(inode, map->m_lblk, path, 0);
3716                if (IS_ERR(path)) {
3717                        err = PTR_ERR(path);
3718                        goto out;
3719                }
3720                depth = ext_depth(inode);
3721                ex = path[depth].p_ext;
3722                if (!ex) {
3723                        EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
3724                                         (unsigned long) map->m_lblk);
3725                        err = -EIO;
3726                        goto out;
3727                }
3728        }
3729
3730        err = ext4_ext_get_access(handle, inode, path + depth);
3731        if (err)
3732                goto out;
3733        /* first mark the extent as unwritten */
3734        ext4_ext_mark_unwritten(ex);
3735
3736        /* note: ext4_ext_correct_indexes() isn't needed here because
3737         * borders are not changed
3738         */
3739        ext4_ext_try_to_merge(handle, inode, path, ex);
3740
3741        /* Mark modified extent as dirty */
3742        err = ext4_ext_dirty(handle, inode, path + path->p_depth);
3743out:
3744        ext4_ext_show_leaf(inode, path);
3745        return err;
3746}
3747
3748
3749static int ext4_convert_unwritten_extents_endio(handle_t *handle,
3750                                                struct inode *inode,
3751                                                struct ext4_map_blocks *map,
3752                                                struct ext4_ext_path *path)
3753{
3754        struct ext4_extent *ex;
3755        ext4_lblk_t ee_block;
3756        unsigned int ee_len;
3757        int depth;
3758        int err = 0;
3759
3760        depth = ext_depth(inode);
3761        ex = path[depth].p_ext;
3762        ee_block = le32_to_cpu(ex->ee_block);
3763        ee_len = ext4_ext_get_actual_len(ex);
3764
3765        ext_debug("ext4_convert_unwritten_extents_endio: inode %lu, logical"
3766                "block %llu, max_blocks %u\n", inode->i_ino,
3767                  (unsigned long long)ee_block, ee_len);
3768
3769        /* If extent is larger than requested it is a clear sign that we still
3770         * have some extent state machine issues left. So extent_split is still
3771         * required.
3772         * TODO: Once all related issues will be fixed this situation should be
3773         * illegal.
3774         */
3775        if (ee_block != map->m_lblk || ee_len > map->m_len) {
3776#ifdef EXT4_DEBUG
3777                ext4_warning("Inode (%ld) finished: extent logical block %llu,"
3778                             " len %u; IO logical block %llu, len %u\n",
3779                             inode->i_ino, (unsigned long long)ee_block, ee_len,
3780                             (unsigned long long)map->m_lblk, map->m_len);
3781#endif
3782                err = ext4_split_convert_extents(handle, inode, map, path,
3783                                                 EXT4_GET_BLOCKS_CONVERT);
3784                if (err < 0)
3785                        goto out;
3786                ext4_ext_drop_refs(path);
3787                path = ext4_ext_find_extent(inode, map->m_lblk, path, 0);
3788                if (IS_ERR(path)) {
3789                        err = PTR_ERR(path);
3790                        goto out;
3791                }
3792                depth = ext_depth(inode);
3793                ex = path[depth].p_ext;
3794        }
3795
3796        err = ext4_ext_get_access(handle, inode, path + depth);
3797        if (err)
3798                goto out;
3799        /* first mark the extent as initialized */
3800        ext4_ext_mark_initialized(ex);
3801
3802        /* note: ext4_ext_correct_indexes() isn't needed here because
3803         * borders are not changed
3804         */
3805        ext4_ext_try_to_merge(handle, inode, path, ex);
3806
3807        /* Mark modified extent as dirty */
3808        err = ext4_ext_dirty(handle, inode, path + path->p_depth);
3809out:
3810        ext4_ext_show_leaf(inode, path);
3811        return err;
3812}
3813
3814static void unmap_underlying_metadata_blocks(struct block_device *bdev,
3815                        sector_t block, int count)
3816{
3817        int i;
3818        for (i = 0; i < count; i++)
3819                unmap_underlying_metadata(bdev, block + i);
3820}
3821
3822/*
3823 * Handle EOFBLOCKS_FL flag, clearing it if necessary
3824 */
3825static int check_eofblocks_fl(handle_t *handle, struct inode *inode,
3826                              ext4_lblk_t lblk,
3827                              struct ext4_ext_path *path,
3828                              unsigned int len)
3829{
3830        int i, depth;
3831        struct ext4_extent_header *eh;
3832        struct ext4_extent *last_ex;
3833
3834        if (!ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))
3835                return 0;
3836
3837        depth = ext_depth(inode);
3838        eh = path[depth].p_hdr;
3839
3840        /*
3841         * We're going to remove EOFBLOCKS_FL entirely in future so we
3842         * do not care for this case anymore. Simply remove the flag
3843         * if there are no extents.
3844         */
3845        if (unlikely(!eh->eh_entries))
3846                goto out;
3847        last_ex = EXT_LAST_EXTENT(eh);
3848        /*
3849         * We should clear the EOFBLOCKS_FL flag if we are writing the
3850         * last block in the last extent in the file.  We test this by
3851         * first checking to see if the caller to
3852         * ext4_ext_get_blocks() was interested in the last block (or
3853         * a block beyond the last block) in the current extent.  If
3854         * this turns out to be false, we can bail out from this
3855         * function immediately.
3856         */
3857        if (lblk + len < le32_to_cpu(last_ex->ee_block) +
3858            ext4_ext_get_actual_len(last_ex))
3859                return 0;
3860        /*
3861         * If the caller does appear to be planning to write at or
3862         * beyond the end of the current extent, we then test to see
3863         * if the current extent is the last extent in the file, by
3864         * checking to make sure it was reached via the rightmost node
3865         * at each level of the tree.
3866         */
3867        for (i = depth-1; i >= 0; i--)
3868                if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr))
3869                        return 0;
3870out:
3871        ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
3872        return ext4_mark_inode_dirty(handle, inode);
3873}
3874
3875/**
3876 * ext4_find_delalloc_range: find delayed allocated block in the given range.
3877 *
3878 * Return 1 if there is a delalloc block in the range, otherwise 0.
3879 */
3880int ext4_find_delalloc_range(struct inode *inode,
3881                             ext4_lblk_t lblk_start,
3882                             ext4_lblk_t lblk_end)
3883{
3884        struct extent_status es;
3885
3886        ext4_es_find_delayed_extent_range(inode, lblk_start, lblk_end, &es);
3887        if (es.es_len == 0)
3888                return 0; /* there is no delay extent in this tree */
3889        else if (es.es_lblk <= lblk_start &&
3890                 lblk_start < es.es_lblk + es.es_len)
3891                return 1;
3892        else if (lblk_start <= es.es_lblk && es.es_lblk <= lblk_end)
3893                return 1;
3894        else
3895                return 0;
3896}
3897
3898int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk)
3899{
3900        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
3901        ext4_lblk_t lblk_start, lblk_end;
3902        lblk_start = EXT4_LBLK_CMASK(sbi, lblk);
3903        lblk_end = lblk_start + sbi->s_cluster_ratio - 1;
3904
3905        return ext4_find_delalloc_range(inode, lblk_start, lblk_end);
3906}
3907
3908/**
3909 * Determines how many complete clusters (out of those specified by the 'map')
3910 * are under delalloc and were reserved quota for.
3911 * This function is called when we are writing out the blocks that were
3912 * originally written with their allocation delayed, but then the space was
3913 * allocated using fallocate() before the delayed allocation could be resolved.
3914 * The cases to look for are:
3915 * ('=' indicated delayed allocated blocks
3916 *  '-' indicates non-delayed allocated blocks)
3917 * (a) partial clusters towards beginning and/or end outside of allocated range
3918 *     are not delalloc'ed.
3919 *      Ex:
3920 *      |----c---=|====c====|====c====|===-c----|
3921 *               |++++++ allocated ++++++|
3922 *      ==> 4 complete clusters in above example
3923 *
3924 * (b) partial cluster (outside of allocated range) towards either end is
3925 *     marked for delayed allocation. In this case, we will exclude that
3926 *     cluster.
3927 *      Ex:
3928 *      |----====c========|========c========|
3929 *           |++++++ allocated ++++++|
3930 *      ==> 1 complete clusters in above example
3931 *
3932 *      Ex:
3933 *      |================c================|
3934 *            |++++++ allocated ++++++|
3935 *      ==> 0 complete clusters in above example
3936 *
3937 * The ext4_da_update_reserve_space will be called only if we
3938 * determine here that there were some "entire" clusters that span
3939 * this 'allocated' range.
3940 * In the non-bigalloc case, this function will just end up returning num_blks
3941 * without ever calling ext4_find_delalloc_range.
3942 */
3943static unsigned int
3944get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start,
3945                           unsigned int num_blks)
3946{
3947        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
3948        ext4_lblk_t alloc_cluster_start, alloc_cluster_end;
3949        ext4_lblk_t lblk_from, lblk_to, c_offset;
3950        unsigned int allocated_clusters = 0;
3951
3952        alloc_cluster_start = EXT4_B2C(sbi, lblk_start);
3953        alloc_cluster_end = EXT4_B2C(sbi, lblk_start + num_blks - 1);
3954
3955        /* max possible clusters for this allocation */
3956        allocated_clusters = alloc_cluster_end - alloc_cluster_start + 1;
3957
3958        trace_ext4_get_reserved_cluster_alloc(inode, lblk_start, num_blks);
3959
3960        /* Check towards left side */
3961        c_offset = EXT4_LBLK_COFF(sbi, lblk_start);
3962        if (c_offset) {
3963                lblk_from = EXT4_LBLK_CMASK(sbi, lblk_start);
3964                lblk_to = lblk_from + c_offset - 1;
3965
3966                if (ext4_find_delalloc_range(inode, lblk_from, lblk_to))
3967                        allocated_clusters--;
3968        }
3969
3970        /* Now check towards right. */
3971        c_offset = EXT4_LBLK_COFF(sbi, lblk_start + num_blks);
3972        if (allocated_clusters && c_offset) {
3973                lblk_from = lblk_start + num_blks;
3974                lblk_to = lblk_from + (sbi->s_cluster_ratio - c_offset) - 1;
3975
3976                if (ext4_find_delalloc_range(inode, lblk_from, lblk_to))
3977                        allocated_clusters--;
3978        }
3979
3980        return allocated_clusters;
3981}
3982
3983static int
3984ext4_ext_convert_initialized_extent(handle_t *handle, struct inode *inode,
3985                        struct ext4_map_blocks *map,
3986                        struct ext4_ext_path *path, int flags,
3987                        unsigned int allocated, ext4_fsblk_t newblock)
3988{
3989        int ret = 0;
3990        int err = 0;
3991
3992        /*
3993         * Make sure that the extent is no bigger than we support with
3994         * unwritten extent
3995         */
3996        if (map->m_len > EXT_UNWRITTEN_MAX_LEN)
3997                map->m_len = EXT_UNWRITTEN_MAX_LEN / 2;
3998
3999        ret = ext4_convert_initialized_extents(handle, inode, map,
4000                                                path);
4001        if (ret >= 0) {
4002                ext4_update_inode_fsync_trans(handle, inode, 1);
4003                err = check_eofblocks_fl(handle, inode, map->m_lblk,
4004                                         path, map->m_len);
4005        } else
4006                err = ret;
4007        map->m_flags |= EXT4_MAP_UNWRITTEN;
4008        if (allocated > map->m_len)
4009                allocated = map->m_len;
4010        map->m_len = allocated;
4011
4012        return err ? err : allocated;
4013}
4014
4015static int
4016ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode,
4017                        struct ext4_map_blocks *map,
4018                        struct ext4_ext_path *path, int flags,
4019                        unsigned int allocated, ext4_fsblk_t newblock)
4020{
4021        int ret = 0;
4022        int err = 0;
4023        ext4_io_end_t *io = ext4_inode_aio(inode);
4024
4025        ext_debug("ext4_ext_handle_unwritten_extents: inode %lu, logical "
4026                  "block %llu, max_blocks %u, flags %x, allocated %u\n",
4027                  inode->i_ino, (unsigned long long)map->m_lblk, map->m_len,
4028                  flags, allocated);
4029        ext4_ext_show_leaf(inode, path);
4030
4031        /*
4032         * When writing into unwritten space, we should not fail to
4033         * allocate metadata blocks for the new extent block if needed.
4034         */
4035        flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL;
4036
4037        trace_ext4_ext_handle_unwritten_extents(inode, map, flags,
4038                                                    allocated, newblock);
4039
4040        /* get_block() before submit the IO, split the extent */
4041        if (flags & EXT4_GET_BLOCKS_PRE_IO) {
4042                ret = ext4_split_convert_extents(handle, inode, map,
4043                                         path, flags | EXT4_GET_BLOCKS_CONVERT);
4044                if (ret <= 0)
4045                        goto out;
4046                /*
4047                 * Flag the inode(non aio case) or end_io struct (aio case)
4048                 * that this IO needs to conversion to written when IO is
4049                 * completed
4050                 */
4051                if (io)
4052                        ext4_set_io_unwritten_flag(inode, io);
4053                else
4054                        ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
4055                map->m_flags |= EXT4_MAP_UNWRITTEN;
4056                goto out;
4057        }
4058        /* IO end_io complete, convert the filled extent to written */
4059        if (flags & EXT4_GET_BLOCKS_CONVERT) {
4060                if (flags & EXT4_GET_BLOCKS_ZERO) {
4061                        if (allocated > map->m_len)
4062                                allocated = map->m_len;
4063                        err = ext4_issue_zeroout(inode, map->m_lblk, newblock,
4064                                                 allocated);
4065                        if (err < 0)
4066                                goto out2;
4067                }
4068                ret = ext4_convert_unwritten_extents_endio(handle, inode, map,
4069                                                        path);
4070                if (ret >= 0) {
4071                        ext4_update_inode_fsync_trans(handle, inode, 1);
4072                        err = check_eofblocks_fl(handle, inode, map->m_lblk,
4073                                                 path, map->m_len);
4074                } else
4075                        err = ret;
4076                map->m_flags |= EXT4_MAP_MAPPED;
4077                map->m_pblk = newblock;
4078                if (allocated > map->m_len)
4079                        allocated = map->m_len;
4080                map->m_len = allocated;
4081                goto out2;
4082        }
4083        /* buffered IO case */
4084        /*
4085         * repeat fallocate creation request
4086         * we already have an unwritten extent
4087         */
4088        if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT) {
4089                map->m_flags |= EXT4_MAP_UNWRITTEN;
4090                goto map_out;
4091        }
4092
4093        /* buffered READ or buffered write_begin() lookup */
4094        if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
4095                /*
4096                 * We have blocks reserved already.  We
4097                 * return allocated blocks so that delalloc
4098                 * won't do block reservation for us.  But
4099                 * the buffer head will be unmapped so that
4100                 * a read from the block returns 0s.
4101                 */
4102                map->m_flags |= EXT4_MAP_UNWRITTEN;
4103                goto out1;
4104        }
4105
4106        /* buffered write, writepage time, convert*/
4107        ret = ext4_ext_convert_to_initialized(handle, inode, map, path, flags);
4108        if (ret >= 0)
4109                ext4_update_inode_fsync_trans(handle, inode, 1);
4110out:
4111        if (ret <= 0) {
4112                err = ret;
4113                goto out2;
4114        } else
4115                allocated = ret;
4116        map->m_flags |= EXT4_MAP_NEW;
4117        /*
4118         * if we allocated more blocks than requested
4119         * we need to make sure we unmap the extra block
4120         * allocated. The actual needed block will get
4121         * unmapped later when we find the buffer_head marked
4122         * new.
4123         */
4124        if (allocated > map->m_len) {
4125                unmap_underlying_metadata_blocks(inode->i_sb->s_bdev,
4126                                        newblock + map->m_len,
4127                                        allocated - map->m_len);
4128                allocated = map->m_len;
4129        }
4130        map->m_len = allocated;
4131
4132        /*
4133         * If we have done fallocate with the offset that is already
4134         * delayed allocated, we would have block reservation
4135         * and quota reservation done in the delayed write path.
4136         * But fallocate would have already updated quota and block
4137         * count for this offset. So cancel these reservation
4138         */
4139        if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
4140                unsigned int reserved_clusters;
4141                reserved_clusters = get_reserved_cluster_alloc(inode,
4142                                map->m_lblk, map->m_len);
4143                if (reserved_clusters)
4144                        ext4_da_update_reserve_space(inode,
4145                                                     reserved_clusters,
4146                                                     0);
4147        }
4148
4149map_out:
4150        map->m_flags |= EXT4_MAP_MAPPED;
4151        if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0) {
4152                err = check_eofblocks_fl(handle, inode, map->m_lblk, path,
4153                                         map->m_len);
4154                if (err < 0)
4155                        goto out2;
4156        }
4157out1:
4158        if (allocated > map->m_len)
4159                allocated = map->m_len;
4160        ext4_ext_show_leaf(inode, path);
4161        map->m_pblk = newblock;
4162        map->m_len = allocated;
4163out2:
4164        return err ? err : allocated;
4165}
4166
4167/*
4168 * get_implied_cluster_alloc - check to see if the requested
4169 * allocation (in the map structure) overlaps with a cluster already
4170 * allocated in an extent.
4171 *      @sb     The filesystem superblock structure
4172 *      @map    The requested lblk->pblk mapping
4173 *      @ex     The extent structure which might contain an implied
4174 *                      cluster allocation
4175 *
4176 * This function is called by ext4_ext_map_blocks() after we failed to
4177 * find blocks that were already in the inode's extent tree.  Hence,
4178 * we know that the beginning of the requested region cannot overlap
4179 * the extent from the inode's extent tree.  There are three cases we
4180 * want to catch.  The first is this case:
4181 *
4182 *               |--- cluster # N--|
4183 *    |--- extent ---|  |---- requested region ---|
4184 *                      |==========|
4185 *
4186 * The second case that we need to test for is this one:
4187 *
4188 *   |--------- cluster # N ----------------|
4189 *         |--- requested region --|   |------- extent ----|
4190 *         |=======================|
4191 *
4192 * The third case is when the requested region lies between two extents
4193 * within the same cluster:
4194 *          |------------- cluster # N-------------|
4195 * |----- ex -----|                  |---- ex_right ----|
4196 *                  |------ requested region ------|
4197 *                  |================|
4198 *
4199 * In each of the above cases, we need to set the map->m_pblk and
4200 * map->m_len so it corresponds to the return the extent labelled as
4201 * "|====|" from cluster #N, since it is already in use for data in
4202 * cluster EXT4_B2C(sbi, map->m_lblk).  We will then return 1 to
4203 * signal to ext4_ext_map_blocks() that map->m_pblk should be treated
4204 * as a new "allocated" block region.  Otherwise, we will return 0 and
4205 * ext4_ext_map_blocks() will then allocate one or more new clusters
4206 * by calling ext4_mb_new_blocks().
4207 */
4208static int get_implied_cluster_alloc(struct super_block *sb,
4209                                     struct ext4_map_blocks *map,
4210                                     struct ext4_extent *ex,
4211                                     struct ext4_ext_path *path)
4212{
4213        struct ext4_sb_info *sbi = EXT4_SB(sb);
4214        ext4_lblk_t c_offset = EXT4_LBLK_COFF(sbi, map->m_lblk);
4215        ext4_lblk_t ex_cluster_start, ex_cluster_end;
4216        ext4_lblk_t rr_cluster_start;
4217        ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
4218        ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
4219        unsigned short ee_len = ext4_ext_get_actual_len(ex);
4220
4221        /* The extent passed in that we are trying to match */
4222        ex_cluster_start = EXT4_B2C(sbi, ee_block);
4223        ex_cluster_end = EXT4_B2C(sbi, ee_block + ee_len - 1);
4224
4225        /* The requested region passed into ext4_map_blocks() */
4226        rr_cluster_start = EXT4_B2C(sbi, map->m_lblk);
4227
4228        if ((rr_cluster_start == ex_cluster_end) ||
4229            (rr_cluster_start == ex_cluster_start)) {
4230                if (rr_cluster_start == ex_cluster_end)
4231                        ee_start += ee_len - 1;
4232                map->m_pblk = EXT4_PBLK_CMASK(sbi, ee_start) + c_offset;
4233                map->m_len = min(map->m_len,
4234                                 (unsigned) sbi->s_cluster_ratio - c_offset);
4235                /*
4236                 * Check for and handle this case:
4237                 *
4238                 *   |--------- cluster # N-------------|
4239                 *                     |------- extent ----|
4240                 *         |--- requested region ---|
4241                 *         |===========|
4242                 */
4243
4244                if (map->m_lblk < ee_block)
4245                        map->m_len = min(map->m_len, ee_block - map->m_lblk);
4246
4247                /*
4248                 * Check for the case where there is already another allocated
4249                 * block to the right of 'ex' but before the end of the cluster.
4250                 *
4251                 *          |------------- cluster # N-------------|
4252                 * |----- ex -----|                  |---- ex_right ----|
4253                 *                  |------ requested region ------|
4254                 *                  |================|
4255                 */
4256                if (map->m_lblk > ee_block) {
4257                        ext4_lblk_t next = ext4_ext_next_allocated_block(path);
4258                        map->m_len = min(map->m_len, next - map->m_lblk);
4259                }
4260
4261                trace_ext4_get_implied_cluster_alloc_exit(sb, map, 1);
4262                return 1;
4263        }
4264
4265        trace_ext4_get_implied_cluster_alloc_exit(sb, map, 0);
4266        return 0;
4267}
4268
4269
4270/*
4271 * Block allocation/map/preallocation routine for extents based files
4272 *
4273 *
4274 * Need to be called with
4275 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block
4276 * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem)
4277 *
4278 * return > 0, number of of blocks already mapped/allocated
4279 *          if create == 0 and these are pre-allocated blocks
4280 *              buffer head is unmapped
4281 *          otherwise blocks are mapped
4282 *
4283 * return = 0, if plain look up failed (blocks have not been allocated)
4284 *          buffer head is unmapped
4285 *
4286 * return < 0, error case.
4287 */
4288int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
4289                        struct ext4_map_blocks *map, int flags)
4290{
4291        struct ext4_ext_path *path = NULL;
4292        struct ext4_extent newex, *ex, *ex2;
4293        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
4294        ext4_fsblk_t newblock = 0;
4295        int free_on_err = 0, err = 0, depth, ret;
4296        unsigned int allocated = 0, offset = 0;
4297        unsigned int allocated_clusters = 0;
4298        struct ext4_allocation_request ar;
4299        ext4_io_end_t *io = ext4_inode_aio(inode);
4300        ext4_lblk_t cluster_offset;
4301        int set_unwritten = 0;
4302        bool map_from_cluster = false;
4303
4304        ext_debug("blocks %u/%u requested for inode %lu\n",
4305                  map->m_lblk, map->m_len, inode->i_ino);
4306        trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
4307
4308        /* find extent for this block */
4309        path = ext4_ext_find_extent(inode, map->m_lblk, NULL, 0);
4310        if (IS_ERR(path)) {
4311                err = PTR_ERR(path);
4312                path = NULL;
4313                goto out2;
4314        }
4315
4316        depth = ext_depth(inode);
4317
4318        /*
4319         * consistent leaf must not be empty;
4320         * this situation is possible, though, _during_ tree modification;
4321         * this is why assert can't be put in ext4_ext_find_extent()
4322         */
4323        if (unlikely(path[depth].p_ext == NULL && depth != 0)) {
4324                EXT4_ERROR_INODE(inode, "bad extent address "
4325                                 "lblock: %lu, depth: %d pblock %lld",
4326                                 (unsigned long) map->m_lblk, depth,
4327                                 path[depth].p_block);
4328                err = -EIO;
4329                goto out2;
4330        }
4331
4332        ex = path[depth].p_ext;
4333        if (ex) {
4334                ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
4335                ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
4336                unsigned short ee_len;
4337
4338
4339                /*
4340                 * unwritten extents are treated as holes, except that
4341                 * we split out initialized portions during a write.
4342                 */
4343                ee_len = ext4_ext_get_actual_len(ex);
4344
4345                trace_ext4_ext_show_extent(inode, ee_block, ee_start, ee_len);
4346
4347                /* if found extent covers block, simply return it */
4348                if (in_range(map->m_lblk, ee_block, ee_len)) {
4349                        newblock = map->m_lblk - ee_block + ee_start;
4350                        /* number of remaining blocks in the extent */
4351                        allocated = ee_len - (map->m_lblk - ee_block);
4352                        ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk,
4353                                  ee_block, ee_len, newblock);
4354
4355                        /*
4356                         * If the extent is initialized check whether the
4357                         * caller wants to convert it to unwritten.
4358                         */
4359                        if ((!ext4_ext_is_unwritten(ex)) &&
4360                            (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) {
4361                                allocated = ext4_ext_convert_initialized_extent(
4362                                                handle, inode, map, path, flags,
4363                                                allocated, newblock);
4364                                goto out2;
4365                        } else if (!ext4_ext_is_unwritten(ex))
4366                                goto out;
4367
4368                        ret = ext4_ext_handle_unwritten_extents(
4369                                handle, inode, map, path, flags,
4370                                allocated, newblock);
4371                        if (ret < 0)
4372                                err = ret;
4373                        else
4374                                allocated = ret;
4375                        goto out2;
4376                }
4377        }
4378
4379        /*
4380         * requested block isn't allocated yet;
4381         * we couldn't try to create block if create flag is zero
4382         */
4383        if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
4384                ext4_lblk_t hole_start, hole_len;
4385
4386                hole_start = map->m_lblk;
4387                hole_len = ext4_ext_determine_hole(inode, path, &hole_start);
4388                /*
4389                 * put just found gap into cache to speed up
4390                 * subsequent requests
4391                 */
4392                ext4_ext_put_gap_in_cache(inode, hole_start, hole_len);
4393
4394                /* Update hole_len to reflect hole size after map->m_lblk */
4395                if (hole_start != map->m_lblk)
4396                        hole_len -= map->m_lblk - hole_start;
4397                map->m_pblk = 0;
4398                map->m_len = min_t(unsigned int, map->m_len, hole_len);
4399
4400                goto out2;
4401        }
4402
4403        /*
4404         * Okay, we need to do block allocation.
4405         */
4406        newex.ee_block = cpu_to_le32(map->m_lblk);
4407        cluster_offset = EXT4_LBLK_COFF(sbi, map->m_lblk);
4408
4409        /*
4410         * If we are doing bigalloc, check to see if the extent returned
4411         * by ext4_ext_find_extent() implies a cluster we can use.
4412         */
4413        if (cluster_offset && ex &&
4414            get_implied_cluster_alloc(inode->i_sb, map, ex, path)) {
4415                ar.len = allocated = map->m_len;
4416                newblock = map->m_pblk;
4417                map_from_cluster = true;
4418                goto got_allocated_blocks;
4419        }
4420
4421        /* find neighbour allocated blocks */
4422        ar.lleft = map->m_lblk;
4423        err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft);
4424        if (err)
4425                goto out2;
4426        ar.lright = map->m_lblk;
4427        ex2 = NULL;
4428        err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright, &ex2);
4429        if (err)
4430                goto out2;
4431
4432        /* Check if the extent after searching to the right implies a
4433         * cluster we can use. */
4434        if ((sbi->s_cluster_ratio > 1) && ex2 &&
4435            get_implied_cluster_alloc(inode->i_sb, map, ex2, path)) {
4436                ar.len = allocated = map->m_len;
4437                newblock = map->m_pblk;
4438                map_from_cluster = true;
4439                goto got_allocated_blocks;
4440        }
4441
4442        /*
4443         * See if request is beyond maximum number of blocks we can have in
4444         * a single extent. For an initialized extent this limit is
4445         * EXT_INIT_MAX_LEN and for an unwritten extent this limit is
4446         * EXT_UNWRITTEN_MAX_LEN.
4447         */
4448        if (map->m_len > EXT_INIT_MAX_LEN &&
4449            !(flags & EXT4_GET_BLOCKS_UNWRIT_EXT))
4450                map->m_len = EXT_INIT_MAX_LEN;
4451        else if (map->m_len > EXT_UNWRITTEN_MAX_LEN &&
4452                 (flags & EXT4_GET_BLOCKS_UNWRIT_EXT))
4453                map->m_len = EXT_UNWRITTEN_MAX_LEN;
4454
4455        /* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */
4456        newex.ee_len = cpu_to_le16(map->m_len);
4457        err = ext4_ext_check_overlap(sbi, inode, &newex, path);
4458        if (err)
4459                allocated = ext4_ext_get_actual_len(&newex);
4460        else
4461                allocated = map->m_len;
4462
4463        /* allocate new block */
4464        ar.inode = inode;
4465        ar.goal = ext4_ext_find_goal(inode, path, map->m_lblk);
4466        ar.logical = map->m_lblk;
4467        /*
4468         * We calculate the offset from the beginning of the cluster
4469         * for the logical block number, since when we allocate a
4470         * physical cluster, the physical block should start at the
4471         * same offset from the beginning of the cluster.  This is
4472         * needed so that future calls to get_implied_cluster_alloc()
4473         * work correctly.
4474         */
4475        offset = EXT4_LBLK_COFF(sbi, map->m_lblk);
4476        ar.len = EXT4_NUM_B2C(sbi, offset+allocated);
4477        ar.goal -= offset;
4478        ar.logical -= offset;
4479        if (S_ISREG(inode->i_mode))
4480                ar.flags = EXT4_MB_HINT_DATA;
4481        else
4482                /* disable in-core preallocation for non-regular files */
4483                ar.flags = 0;
4484        if (flags & EXT4_GET_BLOCKS_NO_NORMALIZE)
4485                ar.flags |= EXT4_MB_HINT_NOPREALLOC;
4486        newblock = ext4_mb_new_blocks(handle, &ar, &err);
4487        if (!newblock)
4488                goto out2;
4489        ext_debug("allocate new block: goal %llu, found %llu/%u\n",
4490                  ar.goal, newblock, allocated);
4491        free_on_err = 1;
4492        allocated_clusters = ar.len;
4493        ar.len = EXT4_C2B(sbi, ar.len) - offset;
4494        if (ar.len > allocated)
4495                ar.len = allocated;
4496
4497got_allocated_blocks:
4498        /* try to insert new extent into found leaf and return */
4499        ext4_ext_store_pblock(&newex, newblock + offset);
4500        newex.ee_len = cpu_to_le16(ar.len);
4501        /* Mark unwritten */
4502        if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT){
4503                ext4_ext_mark_unwritten(&newex);
4504                map->m_flags |= EXT4_MAP_UNWRITTEN;
4505                /*
4506                 * io_end structure was created for every IO write to an
4507                 * unwritten extent. To avoid unnecessary conversion,
4508                 * here we flag the IO that really needs the conversion.
4509                 * For non asycn direct IO case, flag the inode state
4510                 * that we need to perform conversion when IO is done.
4511                 */
4512                if (flags & EXT4_GET_BLOCKS_PRE_IO)
4513                        set_unwritten = 1;
4514        }
4515
4516        err = 0;
4517        if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0)
4518                err = check_eofblocks_fl(handle, inode, map->m_lblk,
4519                                         path, ar.len);
4520        if (!err)
4521                err = ext4_ext_insert_extent(handle, inode, path,
4522                                             &newex, flags);
4523
4524        if (!err && set_unwritten) {
4525                if (io)
4526                        ext4_set_io_unwritten_flag(inode, io);
4527                else
4528                        ext4_set_inode_state(inode,
4529                                             EXT4_STATE_DIO_UNWRITTEN);
4530        }
4531
4532        if (err && free_on_err) {
4533                int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ?
4534                        EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0;
4535                /* free data blocks we just allocated */
4536                /* not a good idea to call discard here directly,
4537                 * but otherwise we'd need to call it every free() */
4538                ext4_discard_preallocations(inode);
4539                ext4_free_blocks(handle, inode, NULL, newblock,
4540                                 EXT4_C2B(sbi, allocated_clusters), fb_flags);
4541                goto out2;
4542        }
4543
4544        /* previous routine could use block we allocated */
4545        newblock = ext4_ext_pblock(&newex);
4546        allocated = ext4_ext_get_actual_len(&newex);
4547        if (allocated > map->m_len)
4548                allocated = map->m_len;
4549        map->m_flags |= EXT4_MAP_NEW;
4550
4551        /*
4552         * Update reserved blocks/metadata blocks after successful
4553         * block allocation which had been deferred till now.
4554         */
4555        if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
4556                unsigned int reserved_clusters;
4557                /*
4558                 * Check how many clusters we had reserved this allocated range
4559                 */
4560                reserved_clusters = get_reserved_cluster_alloc(inode,
4561                                                map->m_lblk, allocated);
4562                if (!map_from_cluster) {
4563                        BUG_ON(allocated_clusters < reserved_clusters);
4564                        if (reserved_clusters < allocated_clusters) {
4565                                struct ext4_inode_info *ei = EXT4_I(inode);
4566                                int reservation = allocated_clusters -
4567                                                  reserved_clusters;
4568                                /*
4569                                 * It seems we claimed few clusters outside of
4570                                 * the range of this allocation. We should give
4571                                 * it back to the reservation pool. This can
4572                                 * happen in the following case:
4573                                 *
4574                                 * * Suppose s_cluster_ratio is 4 (i.e., each
4575                                 *   cluster has 4 blocks. Thus, the clusters
4576                                 *   are [0-3],[4-7],[8-11]...
4577                                 * * First comes delayed allocation write for
4578                                 *   logical blocks 10 & 11. Since there were no
4579                                 *   previous delayed allocated blocks in the
4580                                 *   range [8-11], we would reserve 1 cluster
4581                                 *   for this write.
4582                                 * * Next comes write for logical blocks 3 to 8.
4583                                 *   In this case, we will reserve 2 clusters
4584                                 *   (for [0-3] and [4-7]; and not for [8-11] as
4585                                 *   that range has a delayed allocated blocks.
4586                                 *   Thus total reserved clusters now becomes 3.
4587                                 * * Now, during the delayed allocation writeout
4588                                 *   time, we will first write blocks [3-8] and
4589                                 *   allocate 3 clusters for writing these
4590                                 *   blocks. Also, we would claim all these
4591                                 *   three clusters above.
4592                                 * * Now when we come here to writeout the
4593                                 *   blocks [10-11], we would expect to claim
4594                                 *   the reservation of 1 cluster we had made
4595                                 *   (and we would claim it since there are no
4596                                 *   more delayed allocated blocks in the range
4597                                 *   [8-11]. But our reserved cluster count had
4598                                 *   already gone to 0.
4599                                 *
4600                                 *   Thus, at the step 4 above when we determine
4601                                 *   that there are still some unwritten delayed
4602                                 *   allocated blocks outside of our current
4603                                 *   block range, we should increment the
4604                                 *   reserved clusters count so that when the
4605                                 *   remaining blocks finally gets written, we
4606                                 *   could claim them.
4607                                 */
4608                                dquot_reserve_block(inode,
4609                                                EXT4_C2B(sbi, reservation));
4610                                spin_lock(&ei->i_block_reservation_lock);
4611                                ei->i_reserved_data_blocks += reservation;
4612                                spin_unlock(&ei->i_block_reservation_lock);
4613                        }
4614                        /*
4615                         * We will claim quota for all newly allocated blocks.
4616                         * We're updating the reserved space *after* the
4617                         * correction above so we do not accidentally free
4618                         * all the metadata reservation because we might
4619                         * actually need it later on.
4620                         */
4621                        ext4_da_update_reserve_space(inode, allocated_clusters,
4622                                                        1);
4623                }
4624        }
4625
4626        /*
4627         * Cache the extent and update transaction to commit on fdatasync only
4628         * when it is _not_ an unwritten extent.
4629         */
4630        if ((flags & EXT4_GET_BLOCKS_UNWRIT_EXT) == 0)
4631                ext4_update_inode_fsync_trans(handle, inode, 1);
4632        else
4633                ext4_update_inode_fsync_trans(handle, inode, 0);
4634out:
4635        if (allocated > map->m_len)
4636                allocated = map->m_len;
4637        ext4_ext_show_leaf(inode, path);
4638        map->m_flags |= EXT4_MAP_MAPPED;
4639        map->m_pblk = newblock;
4640        map->m_len = allocated;
4641out2:
4642        if (path) {
4643                ext4_ext_drop_refs(path);
4644                kfree(path);
4645        }
4646
4647        trace_ext4_ext_map_blocks_exit(inode, flags, map,
4648                                       err ? err : allocated);
4649        ext4_es_lru_add(inode);
4650        return err ? err : allocated;
4651}
4652
4653void ext4_ext_truncate(handle_t *handle, struct inode *inode)
4654{
4655        struct super_block *sb = inode->i_sb;
4656        ext4_lblk_t last_block;
4657        int err = 0;
4658
4659        /*
4660         * TODO: optimization is possible here.
4661         * Probably we need not scan at all,
4662         * because page truncation is enough.
4663         */
4664
4665        /* we have to know where to truncate from in crash case */
4666        EXT4_I(inode)->i_disksize = inode->i_size;
4667        ext4_mark_inode_dirty(handle, inode);
4668
4669        last_block = (inode->i_size + sb->s_blocksize - 1)
4670                        >> EXT4_BLOCK_SIZE_BITS(sb);
4671retry:
4672        err = ext4_es_remove_extent(inode, last_block,
4673                                    EXT_MAX_BLOCKS - last_block);
4674        if (err == -ENOMEM) {
4675                cond_resched();
4676                congestion_wait(BLK_RW_ASYNC, HZ/50);
4677                goto retry;
4678        }
4679        if (err) {
4680                ext4_std_error(inode->i_sb, err);
4681                return;
4682        }
4683        err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1);
4684        ext4_std_error(inode->i_sb, err);
4685}
4686
4687static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
4688                                  ext4_lblk_t len, loff_t new_size,
4689                                  int flags, int mode)
4690{
4691        struct inode *inode = file_inode(file);
4692        handle_t *handle;
4693        int ret = 0;
4694        int ret2 = 0;
4695        int retries = 0;
4696        struct ext4_map_blocks map;
4697        unsigned int credits;
4698        loff_t epos;
4699
4700        map.m_lblk = offset;
4701        map.m_len = len;
4702        /*
4703         * Don't normalize the request if it can fit in one extent so
4704         * that it doesn't get unnecessarily split into multiple
4705         * extents.
4706         */
4707        if (len <= EXT_UNWRITTEN_MAX_LEN)
4708                flags |= EXT4_GET_BLOCKS_NO_NORMALIZE;
4709
4710        /*
4711         * credits to insert 1 extent into extent tree
4712         */
4713        credits = ext4_chunk_trans_blocks(inode, len);
4714
4715retry:
4716        while (ret >= 0 && len) {
4717                handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
4718                                            credits);
4719                if (IS_ERR(handle)) {
4720                        ret = PTR_ERR(handle);
4721                        break;
4722                }
4723                ret = ext4_map_blocks(handle, inode, &map, flags);
4724                if (ret <= 0) {
4725                        ext4_debug("inode #%lu: block %u: len %u: "
4726                                   "ext4_ext_map_blocks returned %d",
4727                                   inode->i_ino, map.m_lblk,
4728                                   map.m_len, ret);
4729                        ext4_mark_inode_dirty(handle, inode);
4730                        ret2 = ext4_journal_stop(handle);
4731                        break;
4732                }
4733                map.m_lblk += ret;
4734                map.m_len = len = len - ret;
4735                epos = (loff_t)map.m_lblk << inode->i_blkbits;
4736                inode->i_ctime = ext4_current_time(inode);
4737                if (new_size) {
4738                        if (epos > new_size)
4739                                epos = new_size;
4740                        if (ext4_update_inode_size(inode, epos) & 0x1)
4741                                inode->i_mtime = inode->i_ctime;
4742                } else {
4743                        if (epos > inode->i_size)
4744                                ext4_set_inode_flag(inode,
4745                                                    EXT4_INODE_EOFBLOCKS);
4746                }
4747                ext4_mark_inode_dirty(handle, inode);
4748                ret2 = ext4_journal_stop(handle);
4749                if (ret2)
4750                        break;
4751        }
4752        if (ret == -ENOSPC &&
4753                        ext4_should_retry_alloc(inode->i_sb, &retries)) {
4754                ret = 0;
4755                goto retry;
4756        }
4757
4758        return ret > 0 ? ret2 : ret;
4759}
4760
4761static long ext4_zero_range(struct file *file, loff_t offset,
4762                            loff_t len, int mode)
4763{
4764        struct inode *inode = file_inode(file);
4765        handle_t *handle = NULL;
4766        unsigned int max_blocks;
4767        loff_t new_size = 0;
4768        int ret = 0;
4769        int flags;
4770        int credits;
4771        int partial_begin, partial_end;
4772        loff_t start, end;
4773        ext4_lblk_t lblk;
4774        unsigned int blkbits = inode->i_blkbits;
4775
4776        trace_ext4_zero_range(inode, offset, len, mode);
4777
4778        if (!S_ISREG(inode->i_mode))
4779                return -EINVAL;
4780
4781        /* Call ext4_force_commit to flush all data in case of data=journal. */
4782        if (ext4_should_journal_data(inode)) {
4783                ret = ext4_force_commit(inode->i_sb);
4784                if (ret)
4785                        return ret;
4786        }
4787
4788        /*
4789         * Round up offset. This is not fallocate, we neet to zero out
4790         * blocks, so convert interior block aligned part of the range to
4791         * unwritten and possibly manually zero out unaligned parts of the
4792         * range.
4793         */
4794        start = round_up(offset, 1 << blkbits);
4795        end = round_down((offset + len), 1 << blkbits);
4796
4797        if (start < offset || end > offset + len)
4798                return -EINVAL;
4799        partial_begin = offset & ((1 << blkbits) - 1);
4800        partial_end = (offset + len) & ((1 << blkbits) - 1);
4801
4802        lblk = start >> blkbits;
4803        max_blocks = (end >> blkbits);
4804        if (max_blocks < lblk)
4805                max_blocks = 0;
4806        else
4807                max_blocks -= lblk;
4808
4809        mutex_lock(&inode->i_mutex);
4810
4811        /*
4812         * Indirect files do not support unwritten extnets
4813         */
4814        if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
4815                ret = -EOPNOTSUPP;
4816                goto out_mutex;
4817        }
4818
4819        if (!(mode & FALLOC_FL_KEEP_SIZE) &&
4820             offset + len > i_size_read(inode)) {
4821                new_size = offset + len;
4822                ret = inode_newsize_ok(inode, new_size);
4823                if (ret)
4824                        goto out_mutex;
4825        }
4826
4827        flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT;
4828        if (mode & FALLOC_FL_KEEP_SIZE)
4829                flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
4830
4831        /* Wait all existing dio workers, newcomers will block on i_mutex */
4832        ext4_inode_block_unlocked_dio(inode);
4833        inode_dio_wait(inode);
4834
4835        /* Preallocate the range including the unaligned edges */
4836        if (partial_begin || partial_end) {
4837                ret = ext4_alloc_file_blocks(file,
4838                                round_down(offset, 1 << blkbits) >> blkbits,
4839                                (round_up((offset + len), 1 << blkbits) -
4840                                 round_down(offset, 1 << blkbits)) >> blkbits,
4841                                new_size, flags, mode);
4842                if (ret)
4843                        goto out_dio;
4844
4845        }
4846
4847        /* Zero range excluding the unaligned edges */
4848        if (max_blocks > 0) {
4849                flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN |
4850                          EXT4_EX_NOCACHE);
4851
4852                /*
4853                 * Prevent page faults from reinstantiating pages we have
4854                 * released from page cache.
4855                 */
4856                down_write(&EXT4_I(inode)->i_mmap_sem);
4857                ret = ext4_update_disksize_before_punch(inode, offset, len);
4858                if (ret) {
4859                        up_write(&EXT4_I(inode)->i_mmap_sem);
4860                        goto out_dio;
4861                }
4862                /* Now release the pages and zero block aligned part of pages */
4863                truncate_pagecache_range(inode, start, end - 1);
4864                inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
4865
4866                ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
4867                                             flags, mode);
4868                up_write(&EXT4_I(inode)->i_mmap_sem);
4869                if (ret)
4870                        goto out_dio;
4871        }
4872        if (!partial_begin && !partial_end)
4873                goto out_dio;
4874
4875        /*
4876         * In worst case we have to writeout two nonadjacent unwritten
4877         * blocks and update the inode
4878         */
4879        credits = (2 * ext4_ext_index_trans_blocks(inode, 2)) + 1;
4880        if (ext4_should_journal_data(inode))
4881                credits += 2;
4882        handle = ext4_journal_start(inode, EXT4_HT_MISC, credits);
4883        if (IS_ERR(handle)) {
4884                ret = PTR_ERR(handle);
4885                ext4_std_error(inode->i_sb, ret);
4886                goto out_dio;
4887        }
4888
4889        inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
4890        if (new_size) {
4891                ext4_update_inode_size(inode, new_size);
4892        } else {
4893                /*
4894                * Mark that we allocate beyond EOF so the subsequent truncate
4895                * can proceed even if the new size is the same as i_size.
4896                */
4897                if ((offset + len) > i_size_read(inode))
4898                        ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
4899        }
4900        ext4_mark_inode_dirty(handle, inode);
4901
4902        /* Zero out partial block at the edges of the range */
4903        ret = ext4_zero_partial_blocks(handle, inode, offset, len);
4904
4905        if (file->f_flags & O_SYNC)
4906                ext4_handle_sync(handle);
4907
4908        ext4_journal_stop(handle);
4909out_dio:
4910        ext4_inode_resume_unlocked_dio(inode);
4911out_mutex:
4912        mutex_unlock(&inode->i_mutex);
4913        return ret;
4914}
4915
4916/*
4917 * preallocate space for a file. This implements ext4's fallocate file
4918 * operation, which gets called from sys_fallocate system call.
4919 * For block-mapped files, posix_fallocate should fall back to the method
4920 * of writing zeroes to the required new blocks (the same behavior which is
4921 * expected for file systems which do not support fallocate() system call).
4922 */
4923long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
4924{
4925        struct inode *inode = file_inode(file);
4926        loff_t new_size = 0;
4927        unsigned int max_blocks;
4928        int ret = 0;
4929        int flags;
4930        ext4_lblk_t lblk;
4931        unsigned int blkbits = inode->i_blkbits;
4932
4933        /* Return error if mode is not supported */
4934        if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
4935                     FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE))
4936                return -EOPNOTSUPP;
4937
4938        if (mode & FALLOC_FL_PUNCH_HOLE)
4939                return ext4_punch_hole(inode, offset, len);
4940
4941        ret = ext4_convert_inline_data(inode);
4942        if (ret)
4943                return ret;
4944
4945        if (mode & FALLOC_FL_COLLAPSE_RANGE)
4946                return ext4_collapse_range(inode, offset, len);
4947
4948        if (mode & FALLOC_FL_ZERO_RANGE)
4949                return ext4_zero_range(file, offset, len, mode);
4950
4951        trace_ext4_fallocate_enter(inode, offset, len, mode);
4952        lblk = offset >> blkbits;
4953        /*
4954         * We can't just convert len to max_blocks because
4955         * If blocksize = 4096 offset = 3072 and len = 2048
4956         */
4957        max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
4958                - lblk;
4959
4960        flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT;
4961        if (mode & FALLOC_FL_KEEP_SIZE)
4962                flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
4963
4964        mutex_lock(&inode->i_mutex);
4965
4966        /*
4967         * We only support preallocation for extent-based files only
4968         */
4969        if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
4970                ret = -EOPNOTSUPP;
4971                goto out;
4972        }
4973
4974        if (!(mode & FALLOC_FL_KEEP_SIZE) &&
4975             offset + len > i_size_read(inode)) {
4976                new_size = offset + len;
4977                ret = inode_newsize_ok(inode, new_size);
4978                if (ret)
4979                        goto out;
4980        }
4981
4982        /* Wait all existing dio workers, newcomers will block on i_mutex */
4983        ext4_inode_block_unlocked_dio(inode);
4984        inode_dio_wait(inode);
4985
4986        ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
4987                                     flags, mode);
4988        ext4_inode_resume_unlocked_dio(inode);
4989        if (ret)
4990                goto out;
4991
4992        if (file->f_flags & O_SYNC && EXT4_SB(inode->i_sb)->s_journal) {
4993                ret = jbd2_complete_transaction(EXT4_SB(inode->i_sb)->s_journal,
4994                                                EXT4_I(inode)->i_sync_tid);
4995        }
4996out:
4997        mutex_unlock(&inode->i_mutex);
4998        trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
4999        return ret;
5000}
5001
5002/*
5003 * This function convert a range of blocks to written extents
5004 * The caller of this function will pass the start offset and the size.
5005 * all unwritten extents within this range will be converted to
5006 * written extents.
5007 *
5008 * This function is called from the direct IO end io call back
5009 * function, to convert the fallocated extents after IO is completed.
5010 * Returns 0 on success.
5011 */
5012int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
5013                                   loff_t offset, ssize_t len)
5014{
5015        unsigned int max_blocks;
5016        int ret = 0;
5017        int ret2 = 0;
5018        struct ext4_map_blocks map;
5019        unsigned int credits, blkbits = inode->i_blkbits;
5020
5021        map.m_lblk = offset >> blkbits;
5022        /*
5023         * We can't just convert len to max_blocks because
5024         * If blocksize = 4096 offset = 3072 and len = 2048
5025         */
5026        max_blocks = ((EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) -
5027                      map.m_lblk);
5028        /*
5029         * This is somewhat ugly but the idea is clear: When transaction is
5030         * reserved, everything goes into it. Otherwise we rather start several
5031         * smaller transactions for conversion of each extent separately.
5032         */
5033        if (handle) {
5034                handle = ext4_journal_start_reserved(handle,
5035                                                     EXT4_HT_EXT_CONVERT);
5036                if (IS_ERR(handle))
5037                        return PTR_ERR(handle);
5038                credits = 0;
5039        } else {
5040                /*
5041                 * credits to insert 1 extent into extent tree
5042                 */
5043                credits = ext4_chunk_trans_blocks(inode, max_blocks);
5044        }
5045        while (ret >= 0 && ret < max_blocks) {
5046                map.m_lblk += ret;
5047                map.m_len = (max_blocks -= ret);
5048                if (credits) {
5049                        handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
5050                                                    credits);
5051                        if (IS_ERR(handle)) {
5052                                ret = PTR_ERR(handle);
5053                                break;
5054                        }
5055                }
5056                ret = ext4_map_blocks(handle, inode, &map,
5057                                      EXT4_GET_BLOCKS_IO_CONVERT_EXT);
5058                if (ret <= 0)
5059                        ext4_warning(inode->i_sb,
5060                                     "inode #%lu: block %u: len %u: "
5061                                     "ext4_ext_map_blocks returned %d",
5062                                     inode->i_ino, map.m_lblk,
5063                                     map.m_len, ret);
5064                ext4_mark_inode_dirty(handle, inode);
5065                if (credits)
5066                        ret2 = ext4_journal_stop(handle);
5067                if (ret <= 0 || ret2)
5068                        break;
5069        }
5070        if (!credits)
5071                ret2 = ext4_journal_stop(handle);
5072        return ret > 0 ? ret2 : ret;
5073}
5074
5075/*
5076 * If newes is not existing extent (newes->ec_pblk equals zero) find
5077 * delayed extent at start of newes and update newes accordingly and
5078 * return start of the next delayed extent.
5079 *
5080 * If newes is existing extent (newes->ec_pblk is not equal zero)
5081 * return start of next delayed extent or EXT_MAX_BLOCKS if no delayed
5082 * extent found. Leave newes unmodified.
5083 */
5084static int ext4_find_delayed_extent(struct inode *inode,
5085                                    struct extent_status *newes)
5086{
5087        struct extent_status es;
5088        ext4_lblk_t block, next_del;
5089
5090        if (newes->es_pblk == 0) {
5091                ext4_es_find_delayed_extent_range(inode, newes->es_lblk,
5092                                newes->es_lblk + newes->es_len - 1, &es);
5093
5094                /*
5095                 * No extent in extent-tree contains block @newes->es_pblk,
5096                 * then the block may stay in 1)a hole or 2)delayed-extent.
5097                 */
5098                if (es.es_len == 0)
5099                        /* A hole found. */
5100                        return 0;
5101
5102                if (es.es_lblk > newes->es_lblk) {
5103                        /* A hole found. */
5104                        newes->es_len = min(es.es_lblk - newes->es_lblk,
5105                                            newes->es_len);
5106                        return 0;
5107                }
5108
5109                newes->es_len = es.es_lblk + es.es_len - newes->es_lblk;
5110        }
5111
5112        block = newes->es_lblk + newes->es_len;
5113        ext4_es_find_delayed_extent_range(inode, block, EXT_MAX_BLOCKS, &es);
5114        if (es.es_len == 0)
5115                next_del = EXT_MAX_BLOCKS;
5116        else
5117                next_del = es.es_lblk;
5118
5119        return next_del;
5120}
5121/* fiemap flags we can handle specified here */
5122#define EXT4_FIEMAP_FLAGS       (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
5123
5124static int ext4_xattr_fiemap(struct inode *inode,
5125                                struct fiemap_extent_info *fieinfo)
5126{
5127        __u64 physical = 0;
5128        __u64 length;
5129        __u32 flags = FIEMAP_EXTENT_LAST;
5130        int blockbits = inode->i_sb->s_blocksize_bits;
5131        int error = 0;
5132
5133        /* in-inode? */
5134        if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
5135                struct ext4_iloc iloc;
5136                int offset;     /* offset of xattr in inode */
5137
5138                error = ext4_get_inode_loc(inode, &iloc);
5139                if (error)
5140                        return error;
5141                physical = (__u64)iloc.bh->b_blocknr << blockbits;
5142                offset = EXT4_GOOD_OLD_INODE_SIZE +
5143                                EXT4_I(inode)->i_extra_isize;
5144                physical += offset;
5145                length = EXT4_SB(inode->i_sb)->s_inode_size - offset;
5146                flags |= FIEMAP_EXTENT_DATA_INLINE;
5147                brelse(iloc.bh);
5148        } else { /* external block */
5149                physical = (__u64)EXT4_I(inode)->i_file_acl << blockbits;
5150                length = inode->i_sb->s_blocksize;
5151        }
5152
5153        if (physical)
5154                error = fiemap_fill_next_extent(fieinfo, 0, physical,
5155                                                length, flags);
5156        return (error < 0 ? error : 0);
5157}
5158
5159int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
5160                __u64 start, __u64 len)
5161{
5162        ext4_lblk_t start_blk;
5163        int error = 0;
5164
5165        if (ext4_has_inline_data(inode)) {
5166                int has_inline = 1;
5167
5168                error = ext4_inline_data_fiemap(inode, fieinfo, &has_inline,
5169                                                start, len);
5170
5171                if (has_inline)
5172                        return error;
5173        }
5174
5175        if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) {
5176                error = ext4_ext_precache(inode);
5177                if (error)
5178                        return error;
5179        }
5180
5181        /* fallback to generic here if not in extents fmt */
5182        if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
5183                return generic_block_fiemap(inode, fieinfo, start, len,
5184                        ext4_get_block);
5185
5186        if (fiemap_check_flags(fieinfo, EXT4_FIEMAP_FLAGS))
5187                return -EBADR;
5188
5189        if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) {
5190                error = ext4_xattr_fiemap(inode, fieinfo);
5191        } else {
5192                ext4_lblk_t len_blks;
5193                __u64 last_blk;
5194
5195                start_blk = start >> inode->i_sb->s_blocksize_bits;
5196                last_blk = (start + len - 1) >> inode->i_sb->s_blocksize_bits;
5197                if (last_blk >= EXT_MAX_BLOCKS)
5198                        last_blk = EXT_MAX_BLOCKS-1;
5199                len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1;
5200
5201                /*
5202                 * Walk the extent tree gathering extent information
5203                 * and pushing extents back to the user.
5204                 */
5205                error = ext4_fill_fiemap_extents(inode, start_blk,
5206                                                 len_blks, fieinfo);
5207        }
5208        ext4_es_lru_add(inode);
5209        return error;
5210}
5211
5212/*
5213 * ext4_access_path:
5214 * Function to access the path buffer for marking it dirty.
5215 * It also checks if there are sufficient credits left in the journal handle
5216 * to update path.
5217 */
5218static int
5219ext4_access_path(handle_t *handle, struct inode *inode,
5220                struct ext4_ext_path *path)
5221{
5222        int credits, err;
5223
5224        if (!ext4_handle_valid(handle))
5225                return 0;
5226
5227        /*
5228         * Check if need to extend journal credits
5229         * 3 for leaf, sb, and inode plus 2 (bmap and group
5230         * descriptor) for each block group; assume two block
5231         * groups
5232         */
5233        if (handle->h_buffer_credits < 7) {
5234                credits = ext4_writepage_trans_blocks(inode);
5235                err = ext4_ext_truncate_extend_restart(handle, inode, credits);
5236                /* EAGAIN is success */
5237                if (err && err != -EAGAIN)
5238                        return err;
5239        }
5240
5241        err = ext4_ext_get_access(handle, inode, path);
5242        return err;
5243}
5244
5245/*
5246 * ext4_ext_shift_path_extents:
5247 * Shift the extents of a path structure lying between path[depth].p_ext
5248 * and EXT_LAST_EXTENT(path[depth].p_hdr) downwards, by subtracting shift
5249 * from starting block for each extent.
5250 */
5251static int
5252ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift,
5253                            struct inode *inode, handle_t *handle,
5254                            ext4_lblk_t *start)
5255{
5256        int depth, err = 0;
5257        struct ext4_extent *ex_start, *ex_last;
5258        bool update = 0;
5259        depth = path->p_depth;
5260
5261        while (depth >= 0) {
5262                if (depth == path->p_depth) {
5263                        ex_start = path[depth].p_ext;
5264                        if (!ex_start)
5265                                return -EIO;
5266
5267                        ex_last = EXT_LAST_EXTENT(path[depth].p_hdr);
5268
5269                        err = ext4_access_path(handle, inode, path + depth);
5270                        if (err)
5271                                goto out;
5272
5273                        if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr))
5274                                update = 1;
5275
5276                        *start = le32_to_cpu(ex_last->ee_block) +
5277                                ext4_ext_get_actual_len(ex_last);
5278
5279                        while (ex_start <= ex_last) {
5280                                le32_add_cpu(&ex_start->ee_block, -shift);
5281                                /* Try to merge to the left. */
5282                                if ((ex_start >
5283                                     EXT_FIRST_EXTENT(path[depth].p_hdr)) &&
5284                                    ext4_ext_try_to_merge_right(inode,
5285                                                        path, ex_start - 1))
5286                                        ex_last--;
5287                                else
5288                                        ex_start++;
5289                        }
5290                        err = ext4_ext_dirty(handle, inode, path + depth);
5291                        if (err)
5292                                goto out;
5293
5294                        if (--depth < 0 || !update)
5295                                break;
5296                }
5297
5298                /* Update index too */
5299                err = ext4_access_path(handle, inode, path + depth);
5300                if (err)
5301                        goto out;
5302
5303                le32_add_cpu(&path[depth].p_idx->ei_block, -shift);
5304                err = ext4_ext_dirty(handle, inode, path + depth);
5305                if (err)
5306                        goto out;
5307
5308                /* we are done if current index is not a starting index */
5309                if (path[depth].p_idx != EXT_FIRST_INDEX(path[depth].p_hdr))
5310                        break;
5311
5312                depth--;
5313        }
5314
5315out:
5316        return err;
5317}
5318
5319/*
5320 * ext4_ext_shift_extents:
5321 * All the extents which lies in the range from start to the last allocated
5322 * block for the file are shifted downwards by shift blocks.
5323 * On success, 0 is returned, error otherwise.
5324 */
5325static int
5326ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
5327                       ext4_lblk_t start, ext4_lblk_t shift)
5328{
5329        struct ext4_ext_path *path;
5330        int ret = 0, depth;
5331        struct ext4_extent *extent;
5332        ext4_lblk_t stop_block, current_block;
5333        ext4_lblk_t ex_start, ex_end;
5334
5335        /* Let path point to the last extent */
5336        path = ext4_ext_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, 0);
5337        if (IS_ERR(path))
5338                return PTR_ERR(path);
5339
5340        depth = path->p_depth;
5341        extent = path[depth].p_ext;
5342        if (!extent) {
5343                ext4_ext_drop_refs(path);
5344                kfree(path);
5345                return ret;
5346        }
5347
5348        stop_block = le32_to_cpu(extent->ee_block) +
5349                        ext4_ext_get_actual_len(extent);
5350        ext4_ext_drop_refs(path);
5351        kfree(path);
5352
5353        /* Nothing to shift, if hole is at the end of file */
5354        if (start >= stop_block)
5355                return ret;
5356
5357        /*
5358         * Don't start shifting extents until we make sure the hole is big
5359         * enough to accomodate the shift.
5360         */
5361        path = ext4_ext_find_extent(inode, start - 1, NULL, 0);
5362        depth = path->p_depth;
5363        extent =  path[depth].p_ext;
5364        ex_start = le32_to_cpu(extent->ee_block);
5365        ex_end = le32_to_cpu(extent->ee_block) +
5366                        ext4_ext_get_actual_len(extent);
5367        ext4_ext_drop_refs(path);
5368        kfree(path);
5369
5370        if ((start == ex_start && shift > ex_start) ||
5371            (shift > start - ex_end))
5372                return -EINVAL;
5373
5374        /* Its safe to start updating extents */
5375        while (start < stop_block) {
5376                path = ext4_ext_find_extent(inode, start, NULL, 0);
5377                if (IS_ERR(path))
5378                        return PTR_ERR(path);
5379                depth = path->p_depth;
5380                extent = path[depth].p_ext;
5381                if (!extent) {
5382                        EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
5383                                         (unsigned long) start);
5384                        return -EIO;
5385                }
5386
5387                current_block = le32_to_cpu(extent->ee_block);
5388                if (start > current_block) {
5389                        /* Hole, move to the next extent */
5390                        ret = mext_next_extent(inode, path, &extent);
5391                        if (ret != 0) {
5392                                ext4_ext_drop_refs(path);
5393                                kfree(path);
5394                                if (ret == 1)
5395                                        ret = 0;
5396                                break;
5397                        }
5398                }
5399                ret = ext4_ext_shift_path_extents(path, shift, inode,
5400                                handle, &start);
5401                ext4_ext_drop_refs(path);
5402                kfree(path);
5403                if (ret)
5404                        break;
5405        }
5406
5407        return ret;
5408}
5409
5410/*
5411 * ext4_collapse_range:
5412 * This implements the fallocate's collapse range functionality for ext4
5413 * Returns: 0 and non-zero on error.
5414 */
5415int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
5416{
5417        struct super_block *sb = inode->i_sb;
5418        ext4_lblk_t punch_start, punch_stop;
5419        handle_t *handle;
5420        unsigned int credits;
5421        loff_t new_size, ioffset;
5422        int ret;
5423
5424        /*
5425         * We need to test this early because xfstests assumes that a
5426         * collapse range of (0, 1) will return EOPNOTSUPP if the file
5427         * system does not support collapse range.
5428         */
5429        if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
5430                return -EOPNOTSUPP;
5431
5432        /* Collapse range works only on fs block size aligned offsets. */
5433        if (offset & (EXT4_CLUSTER_SIZE(sb) - 1) ||
5434            len & (EXT4_CLUSTER_SIZE(sb) - 1))
5435                return -EINVAL;
5436
5437        if (!S_ISREG(inode->i_mode))
5438                return -EINVAL;
5439
5440        trace_ext4_collapse_range(inode, offset, len);
5441
5442        punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb);
5443        punch_stop = (offset + len) >> EXT4_BLOCK_SIZE_BITS(sb);
5444
5445        /* Call ext4_force_commit to flush all data in case of data=journal. */
5446        if (ext4_should_journal_data(inode)) {
5447                ret = ext4_force_commit(inode->i_sb);
5448                if (ret)
5449                        return ret;
5450        }
5451
5452        mutex_lock(&inode->i_mutex);
5453        /* It's not possible punch hole on append only file */
5454        if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) {
5455                ret = -EPERM;
5456                goto out_mutex;
5457        }
5458
5459        /*
5460         * There is no need to overlap collapse range with EOF, in which case
5461         * it is effectively a truncate operation
5462         */
5463        if (offset + len >= i_size_read(inode)) {
5464                ret = -EINVAL;
5465                goto out_mutex;
5466        }
5467
5468        /* Currently just for extent based files */
5469        if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
5470                ret = -EOPNOTSUPP;
5471                goto out_mutex;
5472        }
5473
5474        /* Wait for existing dio to complete */
5475        ext4_inode_block_unlocked_dio(inode);
5476        inode_dio_wait(inode);
5477
5478        /*
5479         * Prevent page faults from reinstantiating pages we have released from
5480         * page cache.
5481         */
5482        down_write(&EXT4_I(inode)->i_mmap_sem);
5483        /*
5484         * Need to round down offset to be aligned with page size boundary
5485         * for page size > block size.
5486         */
5487        ioffset = round_down(offset, PAGE_SIZE);
5488        /*
5489         * Write tail of the last page before removed range since it will get
5490         * removed from the page cache below.
5491         */
5492        ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, offset);
5493        if (ret)
5494                goto out_mmap;
5495        /*
5496         * Write data that will be shifted to preserve them when discarding
5497         * page cache below. We are also protected from pages becoming dirty
5498         * by i_mmap_sem.
5499         */
5500        ret = filemap_write_and_wait_range(inode->i_mapping, offset + len,
5501                                           LLONG_MAX);
5502        if (ret)
5503                goto out_mmap;
5504        truncate_pagecache(inode, ioffset);
5505
5506        credits = ext4_writepage_trans_blocks(inode);
5507        handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
5508        if (IS_ERR(handle)) {
5509                ret = PTR_ERR(handle);
5510                goto out_mmap;
5511        }
5512
5513        down_write(&EXT4_I(inode)->i_data_sem);
5514        ext4_discard_preallocations(inode);
5515
5516        ret = ext4_es_remove_extent(inode, punch_start,
5517                                    EXT_MAX_BLOCKS - punch_start);
5518        if (ret) {
5519                up_write(&EXT4_I(inode)->i_data_sem);
5520                goto out_stop;
5521        }
5522
5523        ret = ext4_ext_remove_space(inode, punch_start, punch_stop - 1);
5524        if (ret) {
5525                up_write(&EXT4_I(inode)->i_data_sem);
5526                goto out_stop;
5527        }
5528        ext4_discard_preallocations(inode);
5529
5530        ret = ext4_ext_shift_extents(inode, handle, punch_stop,
5531                                     punch_stop - punch_start);
5532        if (ret) {
5533                up_write(&EXT4_I(inode)->i_data_sem);
5534                goto out_stop;
5535        }
5536
5537        new_size = i_size_read(inode) - len;
5538        i_size_write(inode, new_size);
5539        EXT4_I(inode)->i_disksize = new_size;
5540
5541        up_write(&EXT4_I(inode)->i_data_sem);
5542        if (IS_SYNC(inode))
5543                ext4_handle_sync(handle);
5544        inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
5545        ext4_mark_inode_dirty(handle, inode);
5546
5547out_stop:
5548        ext4_journal_stop(handle);
5549out_mmap:
5550        up_write(&EXT4_I(inode)->i_mmap_sem);
5551        ext4_inode_resume_unlocked_dio(inode);
5552out_mutex:
5553        mutex_unlock(&inode->i_mutex);
5554        return ret;
5555}
5556