linux/fs/ext4/extents.c
   1/*
   2 * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
   3 * Written by Alex Tomas <alex@clusterfs.com>
   4 *
   5 * Architecture independence:
   6 *   Copyright (c) 2005, Bull S.A.
   7 *   Written by Pierre Peiffer <pierre.peiffer@bull.net>
   8 *
   9 * This program is free software; you can redistribute it and/or modify
  10 * it under the terms of the GNU General Public License version 2 as
  11 * published by the Free Software Foundation.
  12 *
  13 * This program is distributed in the hope that it will be useful,
  14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 * GNU General Public License for more details.
  17 *
   18 * You should have received a copy of the GNU General Public License
  19 * along with this program; if not, write to the Free Software
   20 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307, USA.
  21 */
  22
  23/*
  24 * Extents support for EXT4
  25 *
  26 * TODO:
  27 *   - ext4*_error() should be used in some situations
  28 *   - analyze all BUG()/BUG_ON(), use -EIO where appropriate
  29 *   - smart tree reduction
  30 */
  31
  32#include <linux/fs.h>
  33#include <linux/time.h>
  34#include <linux/jbd2.h>
  35#include <linux/highuid.h>
  36#include <linux/pagemap.h>
  37#include <linux/quotaops.h>
  38#include <linux/string.h>
  39#include <linux/slab.h>
  40#include <linux/falloc.h>
  41#include <asm/uaccess.h>
  42#include <linux/fiemap.h>
  43#include "ext4_jbd2.h"
  44#include "ext4_extents.h"
  45#include "xattr.h"
  46
  47#include <trace/events/ext4.h>
  48
  49/*
  50 * used by extent splitting.
  51 */
  52#define EXT4_EXT_MAY_ZEROOUT    0x1  /* safe to zeroout if split fails \
  53                                        due to ENOSPC */
  54#define EXT4_EXT_MARK_UNINIT1   0x2  /* mark first half uninitialized */
  55#define EXT4_EXT_MARK_UNINIT2   0x4  /* mark second half uninitialized */
  56
  57#define EXT4_EXT_DATA_VALID1    0x8  /* first half contains valid data */
  58#define EXT4_EXT_DATA_VALID2    0x10 /* second half contains valid data */
  59
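/*
 * Extent block checksum helpers (used when the metadata_csum feature is
 * enabled): compute the checksum of an extent tree block, verify it
 * against the stored extent tail, and stamp it into the tail before the
 * block is written out.  The checksum is seeded with the inode's
 * checksum seed and covers the header plus all entries.
 */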
  60static __le32 ext4_extent_block_csum(struct inode *inode,
  61                                     struct ext4_extent_header *eh)
  62{
  63        struct ext4_inode_info *ei = EXT4_I(inode);
  64        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
  65        __u32 csum;
  66
  67        csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)eh,
  68                           EXT4_EXTENT_TAIL_OFFSET(eh));
  69        return cpu_to_le32(csum);
  70}
  71
  72static int ext4_extent_block_csum_verify(struct inode *inode,
  73                                         struct ext4_extent_header *eh)
  74{
  75        struct ext4_extent_tail *et;
  76
  77        if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
  78                EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
  79                return 1;
  80
  81        et = find_ext4_extent_tail(eh);
  82        if (et->et_checksum != ext4_extent_block_csum(inode, eh))
  83                return 0;
  84        return 1;
  85}
  86
  87static void ext4_extent_block_csum_set(struct inode *inode,
  88                                       struct ext4_extent_header *eh)
  89{
  90        struct ext4_extent_tail *et;
  91
  92        if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
  93                EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
  94                return;
  95
  96        et = find_ext4_extent_tail(eh);
  97        et->et_checksum = ext4_extent_block_csum(inode, eh);
  98}
  99
 100static int ext4_split_extent(handle_t *handle,
 101                                struct inode *inode,
 102                                struct ext4_ext_path *path,
 103                                struct ext4_map_blocks *map,
 104                                int split_flag,
 105                                int flags);
 106
 107static int ext4_split_extent_at(handle_t *handle,
 108                             struct inode *inode,
 109                             struct ext4_ext_path *path,
 110                             ext4_lblk_t split,
 111                             int split_flag,
 112                             int flags);
 113
 114static int ext4_find_delayed_extent(struct inode *inode,
 115                                    struct extent_status *newes);
 116
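/*
 * Make sure @handle has enough journal credits for the truncate path.
 * If the handle is short on credits, try to extend it; if the journal
 * cannot extend it, restart the transaction.  Returns 0 if the handle
 * already had (or gained) enough credits, -EAGAIN if the transaction
 * was restarted, or a negative error code.
 */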
 117static int ext4_ext_truncate_extend_restart(handle_t *handle,
 118                                            struct inode *inode,
 119                                            int needed)
 120{
 121        int err;
 122
 123        if (!ext4_handle_valid(handle))
 124                return 0;
 125        if (handle->h_buffer_credits > needed)
 126                return 0;
 127        err = ext4_journal_extend(handle, needed);
 128        if (err <= 0)
 129                return err;
 130        err = ext4_truncate_restart_trans(handle, inode, needed);
 131        if (err == 0)
 132                err = -EAGAIN;
 133
 134        return err;
 135}
 136
 137/*
 138 * could return:
 139 *  - EROFS
 140 *  - ENOMEM
 141 */
 142static int ext4_ext_get_access(handle_t *handle, struct inode *inode,
 143                                struct ext4_ext_path *path)
 144{
 145        if (path->p_bh) {
 146                /* path points to block */
 147                return ext4_journal_get_write_access(handle, path->p_bh);
 148        }
 149        /* path points to leaf/index in inode body */
 150        /* we use in-core data, no need to protect them */
 151        return 0;
 152}
 153
 154/*
 155 * could return:
 156 *  - EROFS
 157 *  - ENOMEM
 158 *  - EIO
 159 */
 160int __ext4_ext_dirty(const char *where, unsigned int line, handle_t *handle,
 161                     struct inode *inode, struct ext4_ext_path *path)
 162{
 163        int err;
 164        if (path->p_bh) {
 165                ext4_extent_block_csum_set(inode, ext_block_hdr(path->p_bh));
 166                /* path points to block */
 167                err = __ext4_handle_dirty_metadata(where, line, handle,
 168                                                   inode, path->p_bh);
 169        } else {
 170                /* path points to leaf/index in inode body */
 171                err = ext4_mark_inode_dirty(handle, inode);
 172        }
 173        return err;
 174}
 175
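/*
 * ext4_ext_find_goal:
 * pick a preferred physical block ("goal") for an allocation at logical
 * block @block, based on the extent found in @path; fall back to the
 * index block, and finally to the inode's block group.
 */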
 176static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
 177                              struct ext4_ext_path *path,
 178                              ext4_lblk_t block)
 179{
 180        if (path) {
 181                int depth = path->p_depth;
 182                struct ext4_extent *ex;
 183
 184                /*
 185                 * Try to predict block placement assuming that we are
 186                 * filling in a file which will eventually be
 187                 * non-sparse --- i.e., in the case of libbfd writing
  188                 * an ELF object's sections out-of-order but in a way
  189                 * that eventually results in a contiguous object or
 190                 * executable file, or some database extending a table
 191                 * space file.  However, this is actually somewhat
 192                 * non-ideal if we are writing a sparse file such as
 193                 * qemu or KVM writing a raw image file that is going
 194                 * to stay fairly sparse, since it will end up
 195                 * fragmenting the file system's free space.  Maybe we
  196                 * should have some heuristics or some way to allow
  197                 * userspace to pass a hint to the file system,
 198                 * especially if the latter case turns out to be
 199                 * common.
 200                 */
 201                ex = path[depth].p_ext;
 202                if (ex) {
 203                        ext4_fsblk_t ext_pblk = ext4_ext_pblock(ex);
 204                        ext4_lblk_t ext_block = le32_to_cpu(ex->ee_block);
 205
 206                        if (block > ext_block)
 207                                return ext_pblk + (block - ext_block);
 208                        else
 209                                return ext_pblk - (ext_block - block);
 210                }
 211
 212                /* it looks like index is empty;
 213                 * try to find starting block from index itself */
 214                if (path[depth].p_bh)
 215                        return path[depth].p_bh->b_blocknr;
 216        }
 217
 218        /* OK. use inode's group */
 219        return ext4_inode_to_goal_block(inode);
 220}
 221
 222/*
  223 * Allocation for a metadata block
 224 */
 225static ext4_fsblk_t
 226ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
 227                        struct ext4_ext_path *path,
 228                        struct ext4_extent *ex, int *err, unsigned int flags)
 229{
 230        ext4_fsblk_t goal, newblock;
 231
 232        goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
 233        newblock = ext4_new_meta_blocks(handle, inode, goal, flags,
 234                                        NULL, err);
 235        return newblock;
 236}
 237
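/*
 * The ext4_ext_space_*() helpers return how many extent or index entries
 * fit into an on-disk tree block or into the in-inode root, after the
 * extent header.  With AGGRESSIVE_TEST the capacity is artificially
 * capped so that deep trees can be exercised with small files.
 */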
 238static inline int ext4_ext_space_block(struct inode *inode, int check)
 239{
 240        int size;
 241
 242        size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
 243                        / sizeof(struct ext4_extent);
 244#ifdef AGGRESSIVE_TEST
 245        if (!check && size > 6)
 246                size = 6;
 247#endif
 248        return size;
 249}
 250
 251static inline int ext4_ext_space_block_idx(struct inode *inode, int check)
 252{
 253        int size;
 254
 255        size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
 256                        / sizeof(struct ext4_extent_idx);
 257#ifdef AGGRESSIVE_TEST
 258        if (!check && size > 5)
 259                size = 5;
 260#endif
 261        return size;
 262}
 263
 264static inline int ext4_ext_space_root(struct inode *inode, int check)
 265{
 266        int size;
 267
 268        size = sizeof(EXT4_I(inode)->i_data);
 269        size -= sizeof(struct ext4_extent_header);
 270        size /= sizeof(struct ext4_extent);
 271#ifdef AGGRESSIVE_TEST
 272        if (!check && size > 3)
 273                size = 3;
 274#endif
 275        return size;
 276}
 277
 278static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
 279{
 280        int size;
 281
 282        size = sizeof(EXT4_I(inode)->i_data);
 283        size -= sizeof(struct ext4_extent_header);
 284        size /= sizeof(struct ext4_extent_idx);
 285#ifdef AGGRESSIVE_TEST
 286        if (!check && size > 4)
 287                size = 4;
 288#endif
 289        return size;
 290}
 291
 292/*
 293 * Calculate the number of metadata blocks needed
 294 * to allocate @blocks
 295 * Worse case is one block per extent
 296 */
 297int ext4_ext_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)
 298{
 299        struct ext4_inode_info *ei = EXT4_I(inode);
 300        int idxs;
 301
 302        idxs = ((inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
 303                / sizeof(struct ext4_extent_idx));
 304
 305        /*
 306         * If the new delayed allocation block is contiguous with the
 307         * previous da block, it can share index blocks with the
 308         * previous block, so we only need to allocate a new index
  309         * block every idxs leaf blocks.  At idxs**2 blocks, we need
  310         * an additional index block, and at idxs**3 blocks, yet
  311         * another index block.
 312         */
 313        if (ei->i_da_metadata_calc_len &&
 314            ei->i_da_metadata_calc_last_lblock+1 == lblock) {
 315                int num = 0;
 316
 317                if ((ei->i_da_metadata_calc_len % idxs) == 0)
 318                        num++;
 319                if ((ei->i_da_metadata_calc_len % (idxs*idxs)) == 0)
 320                        num++;
 321                if ((ei->i_da_metadata_calc_len % (idxs*idxs*idxs)) == 0) {
 322                        num++;
 323                        ei->i_da_metadata_calc_len = 0;
 324                } else
 325                        ei->i_da_metadata_calc_len++;
 326                ei->i_da_metadata_calc_last_lblock++;
 327                return num;
 328        }
 329
 330        /*
 331         * In the worst case we need a new set of index blocks at
 332         * every level of the inode's extent tree.
 333         */
 334        ei->i_da_metadata_calc_len = 1;
 335        ei->i_da_metadata_calc_last_lblock = lblock;
 336        return ext_depth(inode) + 1;
 337}
 338
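/*
 * Return the maximum number of entries allowed in a node at @depth:
 * the in-inode root if @depth equals the tree depth, otherwise an
 * on-disk block.
 */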
 339static int
 340ext4_ext_max_entries(struct inode *inode, int depth)
 341{
 342        int max;
 343
 344        if (depth == ext_depth(inode)) {
 345                if (depth == 0)
 346                        max = ext4_ext_space_root(inode, 1);
 347                else
 348                        max = ext4_ext_space_root_idx(inode, 1);
 349        } else {
 350                if (depth == 0)
 351                        max = ext4_ext_space_block(inode, 1);
 352                else
 353                        max = ext4_ext_space_block_idx(inode, 1);
 354        }
 355
 356        return max;
 357}
 358
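/*
 * Sanity checks for the entries of a tree node: a leaf extent must have
 * a non-zero length and lie entirely within the filesystem's data area;
 * an index entry must point at a valid block.
 */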
 359static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
 360{
 361        ext4_fsblk_t block = ext4_ext_pblock(ext);
 362        int len = ext4_ext_get_actual_len(ext);
 363
 364        if (len == 0)
 365                return 0;
 366        return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len);
 367}
 368
 369static int ext4_valid_extent_idx(struct inode *inode,
 370                                struct ext4_extent_idx *ext_idx)
 371{
 372        ext4_fsblk_t block = ext4_idx_pblock(ext_idx);
 373
 374        return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, 1);
 375}
 376
 377static int ext4_valid_extent_entries(struct inode *inode,
 378                                struct ext4_extent_header *eh,
 379                                int depth)
 380{
 381        unsigned short entries;
 382        if (eh->eh_entries == 0)
 383                return 1;
 384
 385        entries = le16_to_cpu(eh->eh_entries);
 386
 387        if (depth == 0) {
 388                /* leaf entries */
 389                struct ext4_extent *ext = EXT_FIRST_EXTENT(eh);
 390                while (entries) {
 391                        if (!ext4_valid_extent(inode, ext))
 392                                return 0;
 393                        ext++;
 394                        entries--;
 395                }
 396        } else {
 397                struct ext4_extent_idx *ext_idx = EXT_FIRST_INDEX(eh);
 398                while (entries) {
 399                        if (!ext4_valid_extent_idx(inode, ext_idx))
 400                                return 0;
 401                        ext_idx++;
 402                        entries--;
 403                }
 404        }
 405        return 1;
 406}
 407
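/*
 * Validate an extent tree node: magic, depth, eh_max/eh_entries limits,
 * the individual entries and, for non-root nodes, the block checksum.
 * On corruption the error is reported against the inode and -EIO is
 * returned.
 */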
 408static int __ext4_ext_check(const char *function, unsigned int line,
 409                            struct inode *inode, struct ext4_extent_header *eh,
 410                            int depth)
 411{
 412        const char *error_msg;
 413        int max = 0;
 414
 415        if (unlikely(eh->eh_magic != EXT4_EXT_MAGIC)) {
 416                error_msg = "invalid magic";
 417                goto corrupted;
 418        }
 419        if (unlikely(le16_to_cpu(eh->eh_depth) != depth)) {
 420                error_msg = "unexpected eh_depth";
 421                goto corrupted;
 422        }
 423        if (unlikely(eh->eh_max == 0)) {
 424                error_msg = "invalid eh_max";
 425                goto corrupted;
 426        }
 427        max = ext4_ext_max_entries(inode, depth);
 428        if (unlikely(le16_to_cpu(eh->eh_max) > max)) {
 429                error_msg = "too large eh_max";
 430                goto corrupted;
 431        }
 432        if (unlikely(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max))) {
 433                error_msg = "invalid eh_entries";
 434                goto corrupted;
 435        }
 436        if (!ext4_valid_extent_entries(inode, eh, depth)) {
 437                error_msg = "invalid extent entries";
 438                goto corrupted;
 439        }
 440        /* Verify checksum on non-root extent tree nodes */
 441        if (ext_depth(inode) != depth &&
 442            !ext4_extent_block_csum_verify(inode, eh)) {
 443                error_msg = "extent tree corrupted";
 444                goto corrupted;
 445        }
 446        return 0;
 447
 448corrupted:
 449        ext4_error_inode(inode, function, line, 0,
 450                        "bad header/extent: %s - magic %x, "
 451                        "entries %u, max %u(%u), depth %u(%u)",
 452                        error_msg, le16_to_cpu(eh->eh_magic),
 453                        le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
 454                        max, le16_to_cpu(eh->eh_depth), depth);
 455
 456        return -EIO;
 457}
 458
 459#define ext4_ext_check(inode, eh, depth)        \
 460        __ext4_ext_check(__func__, __LINE__, inode, eh, depth)
 461
 462int ext4_ext_check_inode(struct inode *inode)
 463{
 464        return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode));
 465}
 466
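/*
 * Like ext4_ext_check(), but skip buffers that have already been
 * verified and mark the buffer verified on success.
 */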
 467static int __ext4_ext_check_block(const char *function, unsigned int line,
 468                                  struct inode *inode,
 469                                  struct ext4_extent_header *eh,
 470                                  int depth,
 471                                  struct buffer_head *bh)
 472{
 473        int ret;
 474
 475        if (buffer_verified(bh))
 476                return 0;
 477        ret = ext4_ext_check(inode, eh, depth);
 478        if (ret)
 479                return ret;
 480        set_buffer_verified(bh);
 481        return ret;
 482}
 483
 484#define ext4_ext_check_block(inode, eh, depth, bh)      \
 485        __ext4_ext_check_block(__func__, __LINE__, inode, eh, depth, bh)
 486
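/*
 * Debugging helpers (compiled in only with EXT_DEBUG): dump the lookup
 * path, the extents of a leaf, and the entries being moved during a
 * split.
 */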
 487#ifdef EXT_DEBUG
 488static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
 489{
 490        int k, l = path->p_depth;
 491
 492        ext_debug("path:");
 493        for (k = 0; k <= l; k++, path++) {
 494                if (path->p_idx) {
 495                  ext_debug("  %d->%llu", le32_to_cpu(path->p_idx->ei_block),
 496                            ext4_idx_pblock(path->p_idx));
 497                } else if (path->p_ext) {
 498                        ext_debug("  %d:[%d]%d:%llu ",
 499                                  le32_to_cpu(path->p_ext->ee_block),
 500                                  ext4_ext_is_uninitialized(path->p_ext),
 501                                  ext4_ext_get_actual_len(path->p_ext),
 502                                  ext4_ext_pblock(path->p_ext));
 503                } else
 504                        ext_debug("  []");
 505        }
 506        ext_debug("\n");
 507}
 508
 509static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
 510{
 511        int depth = ext_depth(inode);
 512        struct ext4_extent_header *eh;
 513        struct ext4_extent *ex;
 514        int i;
 515
 516        if (!path)
 517                return;
 518
 519        eh = path[depth].p_hdr;
 520        ex = EXT_FIRST_EXTENT(eh);
 521
 522        ext_debug("Displaying leaf extents for inode %lu\n", inode->i_ino);
 523
 524        for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) {
 525                ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block),
 526                          ext4_ext_is_uninitialized(ex),
 527                          ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex));
 528        }
 529        ext_debug("\n");
 530}
 531
 532static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path,
 533                        ext4_fsblk_t newblock, int level)
 534{
 535        int depth = ext_depth(inode);
 536        struct ext4_extent *ex;
 537
 538        if (depth != level) {
 539                struct ext4_extent_idx *idx;
 540                idx = path[level].p_idx;
 541                while (idx <= EXT_MAX_INDEX(path[level].p_hdr)) {
 542                        ext_debug("%d: move %d:%llu in new index %llu\n", level,
 543                                        le32_to_cpu(idx->ei_block),
 544                                        ext4_idx_pblock(idx),
 545                                        newblock);
 546                        idx++;
 547                }
 548
 549                return;
 550        }
 551
 552        ex = path[depth].p_ext;
 553        while (ex <= EXT_MAX_EXTENT(path[depth].p_hdr)) {
 554                ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n",
 555                                le32_to_cpu(ex->ee_block),
 556                                ext4_ext_pblock(ex),
 557                                ext4_ext_is_uninitialized(ex),
 558                                ext4_ext_get_actual_len(ex),
 559                                newblock);
 560                ex++;
 561        }
 562}
 563
 564#else
 565#define ext4_ext_show_path(inode, path)
 566#define ext4_ext_show_leaf(inode, path)
 567#define ext4_ext_show_move(inode, path, newblock, level)
 568#endif
 569
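/*
 * ext4_ext_drop_refs:
 * release the buffer_head references taken while walking the tree in
 * ext4_ext_find_extent().
 */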
 570void ext4_ext_drop_refs(struct ext4_ext_path *path)
 571{
 572        int depth = path->p_depth;
 573        int i;
 574
 575        for (i = 0; i <= depth; i++, path++)
 576                if (path->p_bh) {
 577                        brelse(path->p_bh);
 578                        path->p_bh = NULL;
 579                }
 580}
 581
 582/*
 583 * ext4_ext_binsearch_idx:
 584 * binary search for the closest index of the given block
 585 * the header must be checked before calling this
 586 */
 587static void
 588ext4_ext_binsearch_idx(struct inode *inode,
 589                        struct ext4_ext_path *path, ext4_lblk_t block)
 590{
 591        struct ext4_extent_header *eh = path->p_hdr;
 592        struct ext4_extent_idx *r, *l, *m;
 593
 594
 595        ext_debug("binsearch for %u(idx):  ", block);
 596
 597        l = EXT_FIRST_INDEX(eh) + 1;
 598        r = EXT_LAST_INDEX(eh);
 599        while (l <= r) {
 600                m = l + (r - l) / 2;
 601                if (block < le32_to_cpu(m->ei_block))
 602                        r = m - 1;
 603                else
 604                        l = m + 1;
 605                ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ei_block),
 606                                m, le32_to_cpu(m->ei_block),
 607                                r, le32_to_cpu(r->ei_block));
 608        }
 609
 610        path->p_idx = l - 1;
 611        ext_debug("  -> %u->%lld ", le32_to_cpu(path->p_idx->ei_block),
 612                  ext4_idx_pblock(path->p_idx));
 613
 614#ifdef CHECK_BINSEARCH
 615        {
 616                struct ext4_extent_idx *chix, *ix;
 617                int k;
 618
 619                chix = ix = EXT_FIRST_INDEX(eh);
 620                for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) {
 621                  if (k != 0 &&
 622                      le32_to_cpu(ix->ei_block) <= le32_to_cpu(ix[-1].ei_block)) {
 623                                printk(KERN_DEBUG "k=%d, ix=0x%p, "
 624                                       "first=0x%p\n", k,
 625                                       ix, EXT_FIRST_INDEX(eh));
 626                                printk(KERN_DEBUG "%u <= %u\n",
 627                                       le32_to_cpu(ix->ei_block),
 628                                       le32_to_cpu(ix[-1].ei_block));
 629                        }
 630                        BUG_ON(k && le32_to_cpu(ix->ei_block)
 631                                           <= le32_to_cpu(ix[-1].ei_block));
 632                        if (block < le32_to_cpu(ix->ei_block))
 633                                break;
 634                        chix = ix;
 635                }
 636                BUG_ON(chix != path->p_idx);
 637        }
 638#endif
 639
 640}
 641
 642/*
 643 * ext4_ext_binsearch:
 644 * binary search for closest extent of the given block
 645 * the header must be checked before calling this
 646 */
 647static void
 648ext4_ext_binsearch(struct inode *inode,
 649                struct ext4_ext_path *path, ext4_lblk_t block)
 650{
 651        struct ext4_extent_header *eh = path->p_hdr;
 652        struct ext4_extent *r, *l, *m;
 653
 654        if (eh->eh_entries == 0) {
 655                /*
 656                 * this leaf is empty:
 657                 * we get such a leaf in split/add case
 658                 */
 659                return;
 660        }
 661
 662        ext_debug("binsearch for %u:  ", block);
 663
 664        l = EXT_FIRST_EXTENT(eh) + 1;
 665        r = EXT_LAST_EXTENT(eh);
 666
 667        while (l <= r) {
 668                m = l + (r - l) / 2;
 669                if (block < le32_to_cpu(m->ee_block))
 670                        r = m - 1;
 671                else
 672                        l = m + 1;
 673                ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ee_block),
 674                                m, le32_to_cpu(m->ee_block),
 675                                r, le32_to_cpu(r->ee_block));
 676        }
 677
 678        path->p_ext = l - 1;
 679        ext_debug("  -> %d:%llu:[%d]%d ",
 680                        le32_to_cpu(path->p_ext->ee_block),
 681                        ext4_ext_pblock(path->p_ext),
 682                        ext4_ext_is_uninitialized(path->p_ext),
 683                        ext4_ext_get_actual_len(path->p_ext));
 684
 685#ifdef CHECK_BINSEARCH
 686        {
 687                struct ext4_extent *chex, *ex;
 688                int k;
 689
 690                chex = ex = EXT_FIRST_EXTENT(eh);
 691                for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ex++) {
 692                        BUG_ON(k && le32_to_cpu(ex->ee_block)
 693                                          <= le32_to_cpu(ex[-1].ee_block));
 694                        if (block < le32_to_cpu(ex->ee_block))
 695                                break;
 696                        chex = ex;
 697                }
 698                BUG_ON(chex != path->p_ext);
 699        }
 700#endif
 701
 702}
 703
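/*
 * ext4_ext_tree_init:
 * initialize an empty extent tree rooted in the inode body and mark the
 * inode dirty.
 */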
 704int ext4_ext_tree_init(handle_t *handle, struct inode *inode)
 705{
 706        struct ext4_extent_header *eh;
 707
 708        eh = ext_inode_hdr(inode);
 709        eh->eh_depth = 0;
 710        eh->eh_entries = 0;
 711        eh->eh_magic = EXT4_EXT_MAGIC;
 712        eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0));
 713        ext4_mark_inode_dirty(handle, inode);
 714        return 0;
 715}
 716
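/*
 * ext4_ext_find_extent:
 * walk the tree from the root down to the leaf that should contain
 * logical block @block, filling in one ext4_ext_path element per level.
 * A previously allocated @path may be passed in for reuse; if it is
 * NULL, a new array is allocated and the caller is responsible for
 * releasing it with ext4_ext_drop_refs() and kfree().  Returns the
 * path or an ERR_PTR() on failure.
 */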
 717struct ext4_ext_path *
 718ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
 719                                        struct ext4_ext_path *path)
 720{
 721        struct ext4_extent_header *eh;
 722        struct buffer_head *bh;
 723        short int depth, i, ppos = 0, alloc = 0;
 724        int ret;
 725
 726        eh = ext_inode_hdr(inode);
 727        depth = ext_depth(inode);
 728
 729        /* account possible depth increase */
 730        if (!path) {
 731                path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 2),
 732                                GFP_NOFS);
 733                if (!path)
 734                        return ERR_PTR(-ENOMEM);
 735                alloc = 1;
 736        }
 737        path[0].p_hdr = eh;
 738        path[0].p_bh = NULL;
 739
 740        i = depth;
 741        /* walk through the tree */
 742        while (i) {
 743                ext_debug("depth %d: num %d, max %d\n",
 744                          ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
 745
 746                ext4_ext_binsearch_idx(inode, path + ppos, block);
 747                path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx);
 748                path[ppos].p_depth = i;
 749                path[ppos].p_ext = NULL;
 750
 751                bh = sb_getblk(inode->i_sb, path[ppos].p_block);
 752                if (unlikely(!bh)) {
 753                        ret = -ENOMEM;
 754                        goto err;
 755                }
 756                if (!bh_uptodate_or_lock(bh)) {
 757                        trace_ext4_ext_load_extent(inode, block,
 758                                                path[ppos].p_block);
 759                        ret = bh_submit_read(bh);
 760                        if (ret < 0) {
 761                                put_bh(bh);
 762                                goto err;
 763                        }
 764                }
 765                eh = ext_block_hdr(bh);
 766                ppos++;
 767                if (unlikely(ppos > depth)) {
 768                        put_bh(bh);
 769                        EXT4_ERROR_INODE(inode,
 770                                         "ppos %d > depth %d", ppos, depth);
 771                        ret = -EIO;
 772                        goto err;
 773                }
 774                path[ppos].p_bh = bh;
 775                path[ppos].p_hdr = eh;
 776                i--;
 777
 778                ret = ext4_ext_check_block(inode, eh, i, bh);
 779                if (ret < 0)
 780                        goto err;
 781        }
 782
 783        path[ppos].p_depth = i;
 784        path[ppos].p_ext = NULL;
 785        path[ppos].p_idx = NULL;
 786
 787        /* find extent */
 788        ext4_ext_binsearch(inode, path + ppos, block);
 789        /* if not an empty leaf */
 790        if (path[ppos].p_ext)
 791                path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext);
 792
 793        ext4_ext_show_path(inode, path);
 794
 795        return path;
 796
 797err:
 798        ext4_ext_drop_refs(path);
 799        if (alloc)
 800                kfree(path);
 801        return ERR_PTR(ret);
 802}
 803
 804/*
 805 * ext4_ext_insert_index:
 806 * insert new index [@logical;@ptr] into the block at @curp;
 807 * check where to insert: before @curp or after @curp
 808 */
 809static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
 810                                 struct ext4_ext_path *curp,
 811                                 int logical, ext4_fsblk_t ptr)
 812{
 813        struct ext4_extent_idx *ix;
 814        int len, err;
 815
 816        err = ext4_ext_get_access(handle, inode, curp);
 817        if (err)
 818                return err;
 819
 820        if (unlikely(logical == le32_to_cpu(curp->p_idx->ei_block))) {
 821                EXT4_ERROR_INODE(inode,
 822                                 "logical %d == ei_block %d!",
 823                                 logical, le32_to_cpu(curp->p_idx->ei_block));
 824                return -EIO;
 825        }
 826
 827        if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries)
 828                             >= le16_to_cpu(curp->p_hdr->eh_max))) {
 829                EXT4_ERROR_INODE(inode,
 830                                 "eh_entries %d >= eh_max %d!",
 831                                 le16_to_cpu(curp->p_hdr->eh_entries),
 832                                 le16_to_cpu(curp->p_hdr->eh_max));
 833                return -EIO;
 834        }
 835
 836        if (logical > le32_to_cpu(curp->p_idx->ei_block)) {
 837                /* insert after */
 838                ext_debug("insert new index %d after: %llu\n", logical, ptr);
 839                ix = curp->p_idx + 1;
 840        } else {
 841                /* insert before */
 842                ext_debug("insert new index %d before: %llu\n", logical, ptr);
 843                ix = curp->p_idx;
 844        }
 845
 846        len = EXT_LAST_INDEX(curp->p_hdr) - ix + 1;
 847        BUG_ON(len < 0);
 848        if (len > 0) {
 849                ext_debug("insert new index %d: "
 850                                "move %d indices from 0x%p to 0x%p\n",
 851                                logical, len, ix, ix + 1);
 852                memmove(ix + 1, ix, len * sizeof(struct ext4_extent_idx));
 853        }
 854
 855        if (unlikely(ix > EXT_MAX_INDEX(curp->p_hdr))) {
 856                EXT4_ERROR_INODE(inode, "ix > EXT_MAX_INDEX!");
 857                return -EIO;
 858        }
 859
 860        ix->ei_block = cpu_to_le32(logical);
 861        ext4_idx_store_pblock(ix, ptr);
 862        le16_add_cpu(&curp->p_hdr->eh_entries, 1);
 863
 864        if (unlikely(ix > EXT_LAST_INDEX(curp->p_hdr))) {
 865                EXT4_ERROR_INODE(inode, "ix > EXT_LAST_INDEX!");
 866                return -EIO;
 867        }
 868
 869        err = ext4_ext_dirty(handle, inode, curp);
 870        ext4_std_error(inode->i_sb, err);
 871
 872        return err;
 873}
 874
 875/*
 876 * ext4_ext_split:
 877 * inserts new subtree into the path, using free index entry
 878 * at depth @at:
 879 * - allocates all needed blocks (new leaf and all intermediate index blocks)
 880 * - makes decision where to split
 881 * - moves remaining extents and index entries (right to the split point)
 882 *   into the newly allocated blocks
 883 * - initializes subtree
 884 */
 885static int ext4_ext_split(handle_t *handle, struct inode *inode,
 886                          unsigned int flags,
 887                          struct ext4_ext_path *path,
 888                          struct ext4_extent *newext, int at)
 889{
 890        struct buffer_head *bh = NULL;
 891        int depth = ext_depth(inode);
 892        struct ext4_extent_header *neh;
 893        struct ext4_extent_idx *fidx;
 894        int i = at, k, m, a;
 895        ext4_fsblk_t newblock, oldblock;
 896        __le32 border;
 897        ext4_fsblk_t *ablocks = NULL; /* array of allocated blocks */
 898        int err = 0;
 899
 900        /* make decision: where to split? */
 901        /* FIXME: now decision is simplest: at current extent */
 902
 903        /* if current leaf will be split, then we should use
 904         * border from split point */
 905        if (unlikely(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr))) {
 906                EXT4_ERROR_INODE(inode, "p_ext > EXT_MAX_EXTENT!");
 907                return -EIO;
 908        }
 909        if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) {
 910                border = path[depth].p_ext[1].ee_block;
 911                ext_debug("leaf will be split."
 912                                " next leaf starts at %d\n",
 913                                  le32_to_cpu(border));
 914        } else {
 915                border = newext->ee_block;
 916                ext_debug("leaf will be added."
 917                                " next leaf starts at %d\n",
 918                                le32_to_cpu(border));
 919        }
 920
 921        /*
 922         * If error occurs, then we break processing
 923         * and mark filesystem read-only. index won't
 924         * be inserted and tree will be in consistent
 925         * state. Next mount will repair buffers too.
 926         */
 927
 928        /*
 929         * Get array to track all allocated blocks.
  930         * We need this to handle errors and to free the
  931         * allocated blocks on failure.
 932         */
 933        ablocks = kzalloc(sizeof(ext4_fsblk_t) * depth, GFP_NOFS);
 934        if (!ablocks)
 935                return -ENOMEM;
 936
 937        /* allocate all needed blocks */
 938        ext_debug("allocate %d blocks for indexes/leaf\n", depth - at);
 939        for (a = 0; a < depth - at; a++) {
 940                newblock = ext4_ext_new_meta_block(handle, inode, path,
 941                                                   newext, &err, flags);
 942                if (newblock == 0)
 943                        goto cleanup;
 944                ablocks[a] = newblock;
 945        }
 946
 947        /* initialize new leaf */
 948        newblock = ablocks[--a];
 949        if (unlikely(newblock == 0)) {
 950                EXT4_ERROR_INODE(inode, "newblock == 0!");
 951                err = -EIO;
 952                goto cleanup;
 953        }
 954        bh = sb_getblk(inode->i_sb, newblock);
 955        if (unlikely(!bh)) {
 956                err = -ENOMEM;
 957                goto cleanup;
 958        }
 959        lock_buffer(bh);
 960
 961        err = ext4_journal_get_create_access(handle, bh);
 962        if (err)
 963                goto cleanup;
 964
 965        neh = ext_block_hdr(bh);
 966        neh->eh_entries = 0;
 967        neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
 968        neh->eh_magic = EXT4_EXT_MAGIC;
 969        neh->eh_depth = 0;
 970
 971        /* move remainder of path[depth] to the new leaf */
 972        if (unlikely(path[depth].p_hdr->eh_entries !=
 973                     path[depth].p_hdr->eh_max)) {
 974                EXT4_ERROR_INODE(inode, "eh_entries %d != eh_max %d!",
 975                                 path[depth].p_hdr->eh_entries,
 976                                 path[depth].p_hdr->eh_max);
 977                err = -EIO;
 978                goto cleanup;
 979        }
 980        /* start copy from next extent */
 981        m = EXT_MAX_EXTENT(path[depth].p_hdr) - path[depth].p_ext++;
 982        ext4_ext_show_move(inode, path, newblock, depth);
 983        if (m) {
 984                struct ext4_extent *ex;
 985                ex = EXT_FIRST_EXTENT(neh);
 986                memmove(ex, path[depth].p_ext, sizeof(struct ext4_extent) * m);
 987                le16_add_cpu(&neh->eh_entries, m);
 988        }
 989
 990        ext4_extent_block_csum_set(inode, neh);
 991        set_buffer_uptodate(bh);
 992        unlock_buffer(bh);
 993
 994        err = ext4_handle_dirty_metadata(handle, inode, bh);
 995        if (err)
 996                goto cleanup;
 997        brelse(bh);
 998        bh = NULL;
 999
1000        /* correct old leaf */
1001        if (m) {
1002                err = ext4_ext_get_access(handle, inode, path + depth);
1003                if (err)
1004                        goto cleanup;
1005                le16_add_cpu(&path[depth].p_hdr->eh_entries, -m);
1006                err = ext4_ext_dirty(handle, inode, path + depth);
1007                if (err)
1008                        goto cleanup;
1009
1010        }
1011
1012        /* create intermediate indexes */
1013        k = depth - at - 1;
1014        if (unlikely(k < 0)) {
1015                EXT4_ERROR_INODE(inode, "k %d < 0!", k);
1016                err = -EIO;
1017                goto cleanup;
1018        }
1019        if (k)
1020                ext_debug("create %d intermediate indices\n", k);
1021        /* insert new index into current index block */
1022        /* current depth stored in i var */
1023        i = depth - 1;
1024        while (k--) {
1025                oldblock = newblock;
1026                newblock = ablocks[--a];
1027                bh = sb_getblk(inode->i_sb, newblock);
1028                if (unlikely(!bh)) {
1029                        err = -ENOMEM;
1030                        goto cleanup;
1031                }
1032                lock_buffer(bh);
1033
1034                err = ext4_journal_get_create_access(handle, bh);
1035                if (err)
1036                        goto cleanup;
1037
1038                neh = ext_block_hdr(bh);
1039                neh->eh_entries = cpu_to_le16(1);
1040                neh->eh_magic = EXT4_EXT_MAGIC;
1041                neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
1042                neh->eh_depth = cpu_to_le16(depth - i);
1043                fidx = EXT_FIRST_INDEX(neh);
1044                fidx->ei_block = border;
1045                ext4_idx_store_pblock(fidx, oldblock);
1046
1047                ext_debug("int.index at %d (block %llu): %u -> %llu\n",
1048                                i, newblock, le32_to_cpu(border), oldblock);
1049
1050                /* move remainder of path[i] to the new index block */
1051                if (unlikely(EXT_MAX_INDEX(path[i].p_hdr) !=
1052                                        EXT_LAST_INDEX(path[i].p_hdr))) {
1053                        EXT4_ERROR_INODE(inode,
1054                                         "EXT_MAX_INDEX != EXT_LAST_INDEX ee_block %d!",
1055                                         le32_to_cpu(path[i].p_ext->ee_block));
1056                        err = -EIO;
1057                        goto cleanup;
1058                }
1059                /* start copy indexes */
1060                m = EXT_MAX_INDEX(path[i].p_hdr) - path[i].p_idx++;
1061                ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx,
1062                                EXT_MAX_INDEX(path[i].p_hdr));
1063                ext4_ext_show_move(inode, path, newblock, i);
1064                if (m) {
1065                        memmove(++fidx, path[i].p_idx,
1066                                sizeof(struct ext4_extent_idx) * m);
1067                        le16_add_cpu(&neh->eh_entries, m);
1068                }
1069                ext4_extent_block_csum_set(inode, neh);
1070                set_buffer_uptodate(bh);
1071                unlock_buffer(bh);
1072
1073                err = ext4_handle_dirty_metadata(handle, inode, bh);
1074                if (err)
1075                        goto cleanup;
1076                brelse(bh);
1077                bh = NULL;
1078
1079                /* correct old index */
1080                if (m) {
1081                        err = ext4_ext_get_access(handle, inode, path + i);
1082                        if (err)
1083                                goto cleanup;
1084                        le16_add_cpu(&path[i].p_hdr->eh_entries, -m);
1085                        err = ext4_ext_dirty(handle, inode, path + i);
1086                        if (err)
1087                                goto cleanup;
1088                }
1089
1090                i--;
1091        }
1092
1093        /* insert new index */
1094        err = ext4_ext_insert_index(handle, inode, path + at,
1095                                    le32_to_cpu(border), newblock);
1096
1097cleanup:
1098        if (bh) {
1099                if (buffer_locked(bh))
1100                        unlock_buffer(bh);
1101                brelse(bh);
1102        }
1103
1104        if (err) {
1105                /* free all allocated blocks in error case */
1106                for (i = 0; i < depth; i++) {
1107                        if (!ablocks[i])
1108                                continue;
1109                        ext4_free_blocks(handle, inode, NULL, ablocks[i], 1,
1110                                         EXT4_FREE_BLOCKS_METADATA);
1111                }
1112        }
1113        kfree(ablocks);
1114
1115        return err;
1116}
1117
1118/*
1119 * ext4_ext_grow_indepth:
1120 * implements tree growing procedure:
1121 * - allocates new block
1122 * - moves top-level data (index block or leaf) into the new block
1123 * - initializes new top-level, creating index that points to the
1124 *   just created block
1125 */
1126static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
1127                                 unsigned int flags,
1128                                 struct ext4_extent *newext)
1129{
1130        struct ext4_extent_header *neh;
1131        struct buffer_head *bh;
1132        ext4_fsblk_t newblock;
1133        int err = 0;
1134
1135        newblock = ext4_ext_new_meta_block(handle, inode, NULL,
1136                newext, &err, flags);
1137        if (newblock == 0)
1138                return err;
1139
1140        bh = sb_getblk(inode->i_sb, newblock);
1141        if (unlikely(!bh))
1142                return -ENOMEM;
1143        lock_buffer(bh);
1144
1145        err = ext4_journal_get_create_access(handle, bh);
1146        if (err) {
1147                unlock_buffer(bh);
1148                goto out;
1149        }
1150
1151        /* move top-level index/leaf into new block */
1152        memmove(bh->b_data, EXT4_I(inode)->i_data,
1153                sizeof(EXT4_I(inode)->i_data));
1154
1155        /* set size of new block */
1156        neh = ext_block_hdr(bh);
1157        /* old root could have indexes or leaves
 1158         * so calculate eh_max the right way */
1159        if (ext_depth(inode))
1160                neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
1161        else
1162                neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
1163        neh->eh_magic = EXT4_EXT_MAGIC;
1164        ext4_extent_block_csum_set(inode, neh);
1165        set_buffer_uptodate(bh);
1166        unlock_buffer(bh);
1167
1168        err = ext4_handle_dirty_metadata(handle, inode, bh);
1169        if (err)
1170                goto out;
1171
1172        /* Update top-level index: num,max,pointer */
1173        neh = ext_inode_hdr(inode);
1174        neh->eh_entries = cpu_to_le16(1);
1175        ext4_idx_store_pblock(EXT_FIRST_INDEX(neh), newblock);
1176        if (neh->eh_depth == 0) {
1177                /* Root extent block becomes index block */
1178                neh->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0));
1179                EXT_FIRST_INDEX(neh)->ei_block =
1180                        EXT_FIRST_EXTENT(neh)->ee_block;
1181        }
1182        ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n",
1183                  le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max),
1184                  le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block),
1185                  ext4_idx_pblock(EXT_FIRST_INDEX(neh)));
1186
1187        le16_add_cpu(&neh->eh_depth, 1);
1188        ext4_mark_inode_dirty(handle, inode);
1189out:
1190        brelse(bh);
1191
1192        return err;
1193}
1194
1195/*
1196 * ext4_ext_create_new_leaf:
 1196 * finds an empty index and adds a new leaf.
 1197 * if no free index is found, then it requests growing the tree in depth.
1199 */
1200static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode,
1201                                    unsigned int flags,
1202                                    struct ext4_ext_path *path,
1203                                    struct ext4_extent *newext)
1204{
1205        struct ext4_ext_path *curp;
1206        int depth, i, err = 0;
1207
1208repeat:
1209        i = depth = ext_depth(inode);
1210
 1211        /* walk up the tree and look for a free index entry */
1212        curp = path + depth;
1213        while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) {
1214                i--;
1215                curp--;
1216        }
1217
1218        /* we use already allocated block for index block,
1219         * so subsequent data blocks should be contiguous */
1220        if (EXT_HAS_FREE_INDEX(curp)) {
1221                /* if we found index with free entry, then use that
1222                 * entry: create all needed subtree and add new leaf */
1223                err = ext4_ext_split(handle, inode, flags, path, newext, i);
1224                if (err)
1225                        goto out;
1226
1227                /* refill path */
1228                ext4_ext_drop_refs(path);
1229                path = ext4_ext_find_extent(inode,
1230                                    (ext4_lblk_t)le32_to_cpu(newext->ee_block),
1231                                    path);
1232                if (IS_ERR(path))
1233                        err = PTR_ERR(path);
1234        } else {
1235                /* tree is full, time to grow in depth */
1236                err = ext4_ext_grow_indepth(handle, inode, flags, newext);
1237                if (err)
1238                        goto out;
1239
1240                /* refill path */
1241                ext4_ext_drop_refs(path);
1242                path = ext4_ext_find_extent(inode,
1243                                   (ext4_lblk_t)le32_to_cpu(newext->ee_block),
1244                                    path);
1245                if (IS_ERR(path)) {
1246                        err = PTR_ERR(path);
1247                        goto out;
1248                }
1249
1250                /*
1251                 * only first (depth 0 -> 1) produces free space;
1252                 * in all other cases we have to split the grown tree
1253                 */
1254                depth = ext_depth(inode);
1255                if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) {
1256                        /* now we need to split */
1257                        goto repeat;
1258                }
1259        }
1260
1261out:
1262        return err;
1263}
1264
1265/*
 1266 * search for the closest allocated block to the left of *logical
 1267 * and return it at @logical, with its physical address at @phys;
 1268 * if *logical is the smallest allocated block, the function
 1269 * returns 0 at @phys.
 1270 * the return value is 0 (success) or an error code.
1271 */
1272static int ext4_ext_search_left(struct inode *inode,
1273                                struct ext4_ext_path *path,
1274                                ext4_lblk_t *logical, ext4_fsblk_t *phys)
1275{
1276        struct ext4_extent_idx *ix;
1277        struct ext4_extent *ex;
1278        int depth, ee_len;
1279
1280        if (unlikely(path == NULL)) {
1281                EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
1282                return -EIO;
1283        }
1284        depth = path->p_depth;
1285        *phys = 0;
1286
1287        if (depth == 0 && path->p_ext == NULL)
1288                return 0;
1289
 1290        /* usually the extent in the path covers blocks smaller
 1291         * than *logical, but it can be that the extent is the
 1292         * first one in the file */
1293
1294        ex = path[depth].p_ext;
1295        ee_len = ext4_ext_get_actual_len(ex);
1296        if (*logical < le32_to_cpu(ex->ee_block)) {
1297                if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
1298                        EXT4_ERROR_INODE(inode,
1299                                         "EXT_FIRST_EXTENT != ex *logical %d ee_block %d!",
1300                                         *logical, le32_to_cpu(ex->ee_block));
1301                        return -EIO;
1302                }
1303                while (--depth >= 0) {
1304                        ix = path[depth].p_idx;
1305                        if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
1306                                EXT4_ERROR_INODE(inode,
1307                                  "ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!",
1308                                  ix != NULL ? le32_to_cpu(ix->ei_block) : 0,
1309                                  EXT_FIRST_INDEX(path[depth].p_hdr) != NULL ?
1310                le32_to_cpu(EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block) : 0,
1311                                  depth);
1312                                return -EIO;
1313                        }
1314                }
1315                return 0;
1316        }
1317
1318        if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
1319                EXT4_ERROR_INODE(inode,
1320                                 "logical %d < ee_block %d + ee_len %d!",
1321                                 *logical, le32_to_cpu(ex->ee_block), ee_len);
1322                return -EIO;
1323        }
1324
1325        *logical = le32_to_cpu(ex->ee_block) + ee_len - 1;
1326        *phys = ext4_ext_pblock(ex) + ee_len - 1;
1327        return 0;
1328}
1329
1330/*
 1331 * search for the closest allocated block to the right of *logical
 1332 * and return it at @logical, with its physical address at @phys;
 1333 * if *logical is the largest allocated block, the function
 1334 * returns 0 at @phys.
 1335 * the return value is 0 (success) or an error code.
1336 */
1337static int ext4_ext_search_right(struct inode *inode,
1338                                 struct ext4_ext_path *path,
1339                                 ext4_lblk_t *logical, ext4_fsblk_t *phys,
1340                                 struct ext4_extent **ret_ex)
1341{
1342        struct buffer_head *bh = NULL;
1343        struct ext4_extent_header *eh;
1344        struct ext4_extent_idx *ix;
1345        struct ext4_extent *ex;
1346        ext4_fsblk_t block;
1347        int depth;      /* Note, NOT eh_depth; depth from top of tree */
1348        int ee_len;
1349
1350        if (unlikely(path == NULL)) {
1351                EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
1352                return -EIO;
1353        }
1354        depth = path->p_depth;
1355        *phys = 0;
1356
1357        if (depth == 0 && path->p_ext == NULL)
1358                return 0;
1359
 1360        /* usually the extent in the path covers blocks smaller
 1361         * than *logical, but it can be that the extent is the
 1362         * first one in the file */
1363
1364        ex = path[depth].p_ext;
1365        ee_len = ext4_ext_get_actual_len(ex);
1366        if (*logical < le32_to_cpu(ex->ee_block)) {
1367                if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
1368                        EXT4_ERROR_INODE(inode,
1369                                         "first_extent(path[%d].p_hdr) != ex",
1370                                         depth);
1371                        return -EIO;
1372                }
1373                while (--depth >= 0) {
1374                        ix = path[depth].p_idx;
1375                        if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
1376                                EXT4_ERROR_INODE(inode,
1377                                                 "ix != EXT_FIRST_INDEX *logical %d!",
1378                                                 *logical);
1379                                return -EIO;
1380                        }
1381                }
1382                goto found_extent;
1383        }
1384
1385        if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
1386                EXT4_ERROR_INODE(inode,
1387                                 "logical %d < ee_block %d + ee_len %d!",
1388                                 *logical, le32_to_cpu(ex->ee_block), ee_len);
1389                return -EIO;
1390        }
1391
1392        if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) {
1393                /* next allocated block in this leaf */
1394                ex++;
1395                goto found_extent;
1396        }
1397
1398        /* go up and search for index to the right */
1399        while (--depth >= 0) {
1400                ix = path[depth].p_idx;
1401                if (ix != EXT_LAST_INDEX(path[depth].p_hdr))
1402                        goto got_index;
1403        }
1404
1405        /* we've gone up to the root and found no index to the right */
1406        return 0;
1407
1408got_index:
1409        /* we've found index to the right, let's
1410         * follow it and find the closest allocated
1411         * block to the right */
1412        ix++;
1413        block = ext4_idx_pblock(ix);
1414        while (++depth < path->p_depth) {
1415                bh = sb_bread(inode->i_sb, block);
1416                if (bh == NULL)
1417                        return -EIO;
1418                eh = ext_block_hdr(bh);
1419                /* subtract from p_depth to get proper eh_depth */
1420                if (ext4_ext_check_block(inode, eh,
1421                                         path->p_depth - depth, bh)) {
1422                        put_bh(bh);
1423                        return -EIO;
1424                }
1425                ix = EXT_FIRST_INDEX(eh);
1426                block = ext4_idx_pblock(ix);
1427                put_bh(bh);
1428        }
1429
1430        bh = sb_bread(inode->i_sb, block);
1431        if (bh == NULL)
1432                return -EIO;
1433        eh = ext_block_hdr(bh);
1434        if (ext4_ext_check_block(inode, eh, path->p_depth - depth, bh)) {
1435                put_bh(bh);
1436                return -EIO;
1437        }
1438        ex = EXT_FIRST_EXTENT(eh);
1439found_extent:
1440        *logical = le32_to_cpu(ex->ee_block);
1441        *phys = ext4_ext_pblock(ex);
1442        *ret_ex = ex;
1443        if (bh)
1444                put_bh(bh);
1445        return 0;
1446}
1447
1448/*
1449 * ext4_ext_next_allocated_block:
1450 * returns allocated block in subsequent extent or EXT_MAX_BLOCKS.
1451 * NOTE: it considers block number from index entry as
1452 * allocated block. Thus, index entries have to be consistent
1453 * with leaves.
1454 */
1455static ext4_lblk_t
1456ext4_ext_next_allocated_block(struct ext4_ext_path *path)
1457{
1458        int depth;
1459
1460        BUG_ON(path == NULL);
1461        depth = path->p_depth;
1462
1463        if (depth == 0 && path->p_ext == NULL)
1464                return EXT_MAX_BLOCKS;
1465
1466        while (depth >= 0) {
1467                if (depth == path->p_depth) {
1468                        /* leaf */
1469                        if (path[depth].p_ext &&
1470                                path[depth].p_ext !=
1471                                        EXT_LAST_EXTENT(path[depth].p_hdr))
1472                          return le32_to_cpu(path[depth].p_ext[1].ee_block);
1473                } else {
1474                        /* index */
1475                        if (path[depth].p_idx !=
1476                                        EXT_LAST_INDEX(path[depth].p_hdr))
1477                          return le32_to_cpu(path[depth].p_idx[1].ei_block);
1478                }
1479                depth--;
1480        }
1481
1482        return EXT_MAX_BLOCKS;
1483}
1484
1485/*
1486 * ext4_ext_next_leaf_block:
1487 * returns first allocated block from next leaf or EXT_MAX_BLOCKS
1488 */
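/*
 * Note the difference from ext4_ext_next_allocated_block() above: that
 * helper first looks for a following extent within the same leaf and only
 * then falls back to the index levels, while this one skips the leaf level
 * entirely and consults only the index entries, i.e. it returns the first
 * logical block covered by the next leaf.
 */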
1489static ext4_lblk_t ext4_ext_next_leaf_block(struct ext4_ext_path *path)
1490{
1491        int depth;
1492
1493        BUG_ON(path == NULL);
1494        depth = path->p_depth;
1495
1496        /* a tree of depth zero has no leaf blocks at all */
1497        if (depth == 0)
1498                return EXT_MAX_BLOCKS;
1499
1500        /* go to index block */
1501        depth--;
1502
1503        while (depth >= 0) {
1504                if (path[depth].p_idx !=
1505                                EXT_LAST_INDEX(path[depth].p_hdr))
1506                        return (ext4_lblk_t)
1507                                le32_to_cpu(path[depth].p_idx[1].ei_block);
1508                depth--;
1509        }
1510
1511        return EXT_MAX_BLOCKS;
1512}
1513
1514/*
1515 * ext4_ext_correct_indexes:
1516 * if the leaf gets modified and the modified extent is the first in the leaf,
1517 * then we have to correct all indexes above.
1518 * TODO: do we need to correct tree in all cases?
1519 */
1520static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
1521                                struct ext4_ext_path *path)
1522{
1523        struct ext4_extent_header *eh;
1524        int depth = ext_depth(inode);
1525        struct ext4_extent *ex;
1526        __le32 border;
1527        int k, err = 0;
1528
1529        eh = path[depth].p_hdr;
1530        ex = path[depth].p_ext;
1531
1532        if (unlikely(ex == NULL || eh == NULL)) {
1533                EXT4_ERROR_INODE(inode,
1534                                 "ex %p == NULL or eh %p == NULL", ex, eh);
1535                return -EIO;
1536        }
1537
1538        if (depth == 0) {
1539                /* there is no tree at all */
1540                return 0;
1541        }
1542
1543        if (ex != EXT_FIRST_EXTENT(eh)) {
1544                /* we only correct the tree if the first extent in the leaf was modified */
1545                return 0;
1546        }
1547
1548        /*
1549         * TODO: we need correction if border is smaller than current one
1550         */
1551        k = depth - 1;
1552        border = path[depth].p_ext->ee_block;
1553        err = ext4_ext_get_access(handle, inode, path + k);
1554        if (err)
1555                return err;
1556        path[k].p_idx->ei_block = border;
1557        err = ext4_ext_dirty(handle, inode, path + k);
1558        if (err)
1559                return err;
1560
1561        while (k--) {
1562                /* change all left-side indexes */
1563                if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr))
1564                        break;
1565                err = ext4_ext_get_access(handle, inode, path + k);
1566                if (err)
1567                        break;
1568                path[k].p_idx->ei_block = border;
1569                err = ext4_ext_dirty(handle, inode, path + k);
1570                if (err)
1571                        break;
1572        }
1573
1574        return err;
1575}
1576
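/*
 * For example (assuming a build without AGGRESSIVE_TEST): an initialized
 * extent covering logical blocks 100..149 at physical block 5000 can be
 * merged with one covering 150..169 at physical block 5050, because the
 * two are contiguous both logically and physically and the combined
 * length (70) stays within EXT_INIT_MAX_LEN (32768), so the top bit of
 * ee_len is never set by the merge.
 */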
1577int
1578ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
1579                                struct ext4_extent *ex2)
1580{
1581        unsigned short ext1_ee_len, ext2_ee_len, max_len;
1582
1583        /*
1584         * Make sure that both extents are initialized. We don't merge
1585         * uninitialized extents so that we can be sure that end_io code has
1586         * the extent that was written properly split out and conversion to
1587         * initialized is trivial.
1588         */
1589        if (ext4_ext_is_uninitialized(ex1) || ext4_ext_is_uninitialized(ex2))
1590                return 0;
1591
1592        if (ext4_ext_is_uninitialized(ex1))
1593                max_len = EXT_UNINIT_MAX_LEN;
1594        else
1595                max_len = EXT_INIT_MAX_LEN;
1596
1597        ext1_ee_len = ext4_ext_get_actual_len(ex1);
1598        ext2_ee_len = ext4_ext_get_actual_len(ex2);
1599
1600        if (le32_to_cpu(ex1->ee_block) + ext1_ee_len !=
1601                        le32_to_cpu(ex2->ee_block))
1602                return 0;
1603
1604        /*
1605         * To allow future support for preallocated extents to be added
1606 * as an RO_COMPAT feature, refuse to merge two extents if
1607         * this can result in the top bit of ee_len being set.
1608         */
1609        if (ext1_ee_len + ext2_ee_len > max_len)
1610                return 0;
1611#ifdef AGGRESSIVE_TEST
1612        if (ext1_ee_len >= 4)
1613                return 0;
1614#endif
1615
1616        if (ext4_ext_pblock(ex1) + ext1_ee_len == ext4_ext_pblock(ex2))
1617                return 1;
1618        return 0;
1619}
1620
1621/*
1622 * This function tries to merge the "ex" extent to the next extent in the tree.
1623 * It always tries to merge towards right. If you want to merge towards
1624 * left, pass "ex - 1" as argument instead of "ex".
1625 * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns
1626 * 1 if they got merged.
1627 */
1628static int ext4_ext_try_to_merge_right(struct inode *inode,
1629                                 struct ext4_ext_path *path,
1630                                 struct ext4_extent *ex)
1631{
1632        struct ext4_extent_header *eh;
1633        unsigned int depth, len;
1634        int merge_done = 0;
1635        int uninitialized = 0;
1636
1637        depth = ext_depth(inode);
1638        BUG_ON(path[depth].p_hdr == NULL);
1639        eh = path[depth].p_hdr;
1640
1641        while (ex < EXT_LAST_EXTENT(eh)) {
1642                if (!ext4_can_extents_be_merged(inode, ex, ex + 1))
1643                        break;
1644                /* merge with next extent! */
1645                if (ext4_ext_is_uninitialized(ex))
1646                        uninitialized = 1;
1647                ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
1648                                + ext4_ext_get_actual_len(ex + 1));
1649                if (uninitialized)
1650                        ext4_ext_mark_uninitialized(ex);
1651
1652                if (ex + 1 < EXT_LAST_EXTENT(eh)) {
1653                        len = (EXT_LAST_EXTENT(eh) - ex - 1)
1654                                * sizeof(struct ext4_extent);
1655                        memmove(ex + 1, ex + 2, len);
1656                }
1657                le16_add_cpu(&eh->eh_entries, -1);
1658                merge_done = 1;
1659                WARN_ON(eh->eh_entries == 0);
1660                if (!eh->eh_entries)
1661                        EXT4_ERROR_INODE(inode, "eh->eh_entries = 0!");
1662        }
1663
1664        return merge_done;
1665}
1666
1667/*
1668 * This function does a very simple check to see if we can collapse
1669 * an extent tree with a single extent tree leaf block into the inode.
1670 */
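/*
 * Illustration: with the standard 60-byte i_data area the root header
 * plus at most four struct ext4_extent entries fit in the inode, so
 * ext4_ext_space_root() is typically 4.  A depth-1 tree whose single
 * leaf holds no more than that many extents can therefore be copied
 * back into the inode, the depth dropped to 0, and the now unused leaf
 * block freed (hence the two extra journal credits, for the block
 * bitmap and the group descriptor).
 */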
1671static void ext4_ext_try_to_merge_up(handle_t *handle,
1672                                     struct inode *inode,
1673                                     struct ext4_ext_path *path)
1674{
1675        size_t s;
1676        unsigned max_root = ext4_ext_space_root(inode, 0);
1677        ext4_fsblk_t blk;
1678
1679        if ((path[0].p_depth != 1) ||
1680            (le16_to_cpu(path[0].p_hdr->eh_entries) != 1) ||
1681            (le16_to_cpu(path[1].p_hdr->eh_entries) > max_root))
1682                return;
1683
1684        /*
1685         * We need to modify the block allocation bitmap and the block
1686         * group descriptor to release the extent tree block.  If we
1687         * can't get the journal credits, give up.
1688         */
1689        if (ext4_journal_extend(handle, 2))
1690                return;
1691
1692        /*
1693         * Copy the extent data up to the inode
1694         */
1695        blk = ext4_idx_pblock(path[0].p_idx);
1696        s = le16_to_cpu(path[1].p_hdr->eh_entries) *
1697                sizeof(struct ext4_extent_idx);
1698        s += sizeof(struct ext4_extent_header);
1699
1700        memcpy(path[0].p_hdr, path[1].p_hdr, s);
1701        path[0].p_depth = 0;
1702        path[0].p_ext = EXT_FIRST_EXTENT(path[0].p_hdr) +
1703                (path[1].p_ext - EXT_FIRST_EXTENT(path[1].p_hdr));
1704        path[0].p_hdr->eh_max = cpu_to_le16(max_root);
1705
1706        brelse(path[1].p_bh);
1707        ext4_free_blocks(handle, inode, NULL, blk, 1,
1708                         EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
1709}
1710
1711/*
1712 * This function tries to merge the @ex extent with its neighbours in the
1713 * tree (left first, then right), and then tries to collapse the tree up.
1714 */
1715static void ext4_ext_try_to_merge(handle_t *handle,
1716                                  struct inode *inode,
1717                                  struct ext4_ext_path *path,
1718                                  struct ext4_extent *ex) {
1719        struct ext4_extent_header *eh;
1720        unsigned int depth;
1721        int merge_done = 0;
1722
1723        depth = ext_depth(inode);
1724        BUG_ON(path[depth].p_hdr == NULL);
1725        eh = path[depth].p_hdr;
1726
1727        if (ex > EXT_FIRST_EXTENT(eh))
1728                merge_done = ext4_ext_try_to_merge_right(inode, path, ex - 1);
1729
1730        if (!merge_done)
1731                (void) ext4_ext_try_to_merge_right(inode, path, ex);
1732
1733        ext4_ext_try_to_merge_up(handle, inode, path);
1734}
1735
1736/*
1737 * check if a portion of the "newext" extent overlaps with an
1738 * existing extent.
1739 *
1740 * If there is an overlap discovered, it updates the length of the newext
1741 * such that there will be no overlap, and then returns 1.
1742 * If there is no overlap found, it returns 0.
1743 */
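/*
 * For instance (ignoring bigalloc cluster rounding): if newext starts at
 * logical block 100 with a length of 20, but an existing extent already
 * starts at block 110, newext->ee_len is trimmed to 10 so that the caller
 * only allocates the non-overlapping part, and 1 is returned.
 */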
1744static unsigned int ext4_ext_check_overlap(struct ext4_sb_info *sbi,
1745                                           struct inode *inode,
1746                                           struct ext4_extent *newext,
1747                                           struct ext4_ext_path *path)
1748{
1749        ext4_lblk_t b1, b2;
1750        unsigned int depth, len1;
1751        unsigned int ret = 0;
1752
1753        b1 = le32_to_cpu(newext->ee_block);
1754        len1 = ext4_ext_get_actual_len(newext);
1755        depth = ext_depth(inode);
1756        if (!path[depth].p_ext)
1757                goto out;
1758        b2 = le32_to_cpu(path[depth].p_ext->ee_block);
1759        b2 &= ~(sbi->s_cluster_ratio - 1);
1760
1761        /*
1762         * get the next allocated block if the extent in the path
1763         * is before the requested block(s)
1764         */
1765        if (b2 < b1) {
1766                b2 = ext4_ext_next_allocated_block(path);
1767                if (b2 == EXT_MAX_BLOCKS)
1768                        goto out;
1769                b2 &= ~(sbi->s_cluster_ratio - 1);
1770        }
1771
1772        /* check for wrap through zero on extent logical start block*/
1773        if (b1 + len1 < b1) {
1774                len1 = EXT_MAX_BLOCKS - b1;
1775                newext->ee_len = cpu_to_le16(len1);
1776                ret = 1;
1777        }
1778
1779        /* check for overlap */
1780        if (b1 + len1 > b2) {
1781                newext->ee_len = cpu_to_le16(b2 - b1);
1782                ret = 1;
1783        }
1784out:
1785        return ret;
1786}
1787
1788/*
1789 * ext4_ext_insert_extent:
1790 * tries to merge the requested extent into an existing extent or
1791 * inserts the requested extent as a new one into the tree,
1792 * creating a new leaf in the no-space case.
1793 */
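/*
 * The overall strategy is: (1) if possible, grow an adjacent compatible
 * extent in place (append or prepend); (2) otherwise insert into the
 * current leaf if it has a free slot; (3) otherwise, if the new extent
 * lies beyond this leaf's last extent, try the next leaf; (4) as a last
 * resort create a new leaf (possibly growing the tree), then merge
 * neighbours and fix up the index entries above.
 */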
1794int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1795                                struct ext4_ext_path *path,
1796                                struct ext4_extent *newext, int flag)
1797{
1798        struct ext4_extent_header *eh;
1799        struct ext4_extent *ex, *fex;
1800        struct ext4_extent *nearex; /* nearest extent */
1801        struct ext4_ext_path *npath = NULL;
1802        int depth, len, err;
1803        ext4_lblk_t next;
1804        unsigned uninitialized = 0;
1805        int flags = 0;
1806
1807        if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {
1808                EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0");
1809                return -EIO;
1810        }
1811        depth = ext_depth(inode);
1812        ex = path[depth].p_ext;
1813        eh = path[depth].p_hdr;
1814        if (unlikely(path[depth].p_hdr == NULL)) {
1815                EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
1816                return -EIO;
1817        }
1818
1819        /* try to insert block into found extent and return */
1820        if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO)) {
1821
1822                /*
1823                 * Try to see whether we should rather test the extent to the
1824                 * right of ex, or the one to the left of ex. This is because
1825                 * ext4_ext_find_extent() can return either the extent on the
1826                 * left or the one on the right of the searched position. This
1827                 * will make merging more effective.
1828                 */
1829                if (ex < EXT_LAST_EXTENT(eh) &&
1830                    (le32_to_cpu(ex->ee_block) +
1831                    ext4_ext_get_actual_len(ex) <
1832                    le32_to_cpu(newext->ee_block))) {
1833                        ex += 1;
1834                        goto prepend;
1835                } else if ((ex > EXT_FIRST_EXTENT(eh)) &&
1836                           (le32_to_cpu(newext->ee_block) +
1837                           ext4_ext_get_actual_len(newext) <
1838                           le32_to_cpu(ex->ee_block)))
1839                        ex -= 1;
1840
1841                /* Try to append newex to the ex */
1842                if (ext4_can_extents_be_merged(inode, ex, newext)) {
1843                        ext_debug("append [%d]%d block to %u:[%d]%d"
1844                                  "(from %llu)\n",
1845                                  ext4_ext_is_uninitialized(newext),
1846                                  ext4_ext_get_actual_len(newext),
1847                                  le32_to_cpu(ex->ee_block),
1848                                  ext4_ext_is_uninitialized(ex),
1849                                  ext4_ext_get_actual_len(ex),
1850                                  ext4_ext_pblock(ex));
1851                        err = ext4_ext_get_access(handle, inode,
1852                                                  path + depth);
1853                        if (err)
1854                                return err;
1855
1856                        /*
1857                         * ext4_can_extents_be_merged should have checked
1858                         * that either both extents are uninitialized, or
1859                         * both aren't. Thus we need to check only one of
1860                         * them here.
1861                         */
1862                        if (ext4_ext_is_uninitialized(ex))
1863                                uninitialized = 1;
1864                        ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
1865                                        + ext4_ext_get_actual_len(newext));
1866                        if (uninitialized)
1867                                ext4_ext_mark_uninitialized(ex);
1868                        eh = path[depth].p_hdr;
1869                        nearex = ex;
1870                        goto merge;
1871                }
1872
1873prepend:
1874                /* Try to prepend newex to the ex */
1875                if (ext4_can_extents_be_merged(inode, newext, ex)) {
1876                        ext_debug("prepend %u[%d]%d block to %u:[%d]%d"
1877                                  "(from %llu)\n",
1878                                  le32_to_cpu(newext->ee_block),
1879                                  ext4_ext_is_uninitialized(newext),
1880                                  ext4_ext_get_actual_len(newext),
1881                                  le32_to_cpu(ex->ee_block),
1882                                  ext4_ext_is_uninitialized(ex),
1883                                  ext4_ext_get_actual_len(ex),
1884                                  ext4_ext_pblock(ex));
1885                        err = ext4_ext_get_access(handle, inode,
1886                                                  path + depth);
1887                        if (err)
1888                                return err;
1889
1890                        /*
1891                         * ext4_can_extents_be_merged should have checked
1892                         * that either both extents are uninitialized, or
1893                         * both aren't. Thus we need to check only one of
1894                         * them here.
1895                         */
1896                        if (ext4_ext_is_uninitialized(ex))
1897                                uninitialized = 1;
1898                        ex->ee_block = newext->ee_block;
1899                        ext4_ext_store_pblock(ex, ext4_ext_pblock(newext));
1900                        ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
1901                                        + ext4_ext_get_actual_len(newext));
1902                        if (uninitialized)
1903                                ext4_ext_mark_uninitialized(ex);
1904                        eh = path[depth].p_hdr;
1905                        nearex = ex;
1906                        goto merge;
1907                }
1908        }
1909
1910        depth = ext_depth(inode);
1911        eh = path[depth].p_hdr;
1912        if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max))
1913                goto has_space;
1914
1915        /* probably next leaf has space for us? */
1916        fex = EXT_LAST_EXTENT(eh);
1917        next = EXT_MAX_BLOCKS;
1918        if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block))
1919                next = ext4_ext_next_leaf_block(path);
1920        if (next != EXT_MAX_BLOCKS) {
1921                ext_debug("next leaf block - %u\n", next);
1922                BUG_ON(npath != NULL);
1923                npath = ext4_ext_find_extent(inode, next, NULL);
1924                if (IS_ERR(npath))
1925                        return PTR_ERR(npath);
1926                BUG_ON(npath->p_depth != path->p_depth);
1927                eh = npath[depth].p_hdr;
1928                if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) {
1929                        ext_debug("next leaf isn't full(%d)\n",
1930                                  le16_to_cpu(eh->eh_entries));
1931                        path = npath;
1932                        goto has_space;
1933                }
1934                ext_debug("next leaf has no free space(%d,%d)\n",
1935                          le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
1936        }
1937
1938        /*
1939         * There is no free space in the found leaf.
1940         * We're gonna add a new leaf in the tree.
1941         */
1942        if (flag & EXT4_GET_BLOCKS_METADATA_NOFAIL)
1943                flags = EXT4_MB_USE_RESERVED;
1944        err = ext4_ext_create_new_leaf(handle, inode, flags, path, newext);
1945        if (err)
1946                goto cleanup;
1947        depth = ext_depth(inode);
1948        eh = path[depth].p_hdr;
1949
1950has_space:
1951        nearex = path[depth].p_ext;
1952
1953        err = ext4_ext_get_access(handle, inode, path + depth);
1954        if (err)
1955                goto cleanup;
1956
1957        if (!nearex) {
1958                /* there is no extent in this leaf, create first one */
1959                ext_debug("first extent in the leaf: %u:%llu:[%d]%d\n",
1960                                le32_to_cpu(newext->ee_block),
1961                                ext4_ext_pblock(newext),
1962                                ext4_ext_is_uninitialized(newext),
1963                                ext4_ext_get_actual_len(newext));
1964                nearex = EXT_FIRST_EXTENT(eh);
1965        } else {
1966                if (le32_to_cpu(newext->ee_block)
1967                           > le32_to_cpu(nearex->ee_block)) {
1968                        /* Insert after */
1969                        ext_debug("insert %u:%llu:[%d]%d before: "
1970                                        "nearest %p\n",
1971                                        le32_to_cpu(newext->ee_block),
1972                                        ext4_ext_pblock(newext),
1973                                        ext4_ext_is_uninitialized(newext),
1974                                        ext4_ext_get_actual_len(newext),
1975                                        nearex);
1976                        nearex++;
1977                } else {
1978                        /* Insert before */
1979                        BUG_ON(newext->ee_block == nearex->ee_block);
1980                        ext_debug("insert %u:%llu:[%d]%d after: "
1981                                        "nearest %p\n",
1982                                        le32_to_cpu(newext->ee_block),
1983                                        ext4_ext_pblock(newext),
1984                                        ext4_ext_is_uninitialized(newext),
1985                                        ext4_ext_get_actual_len(newext),
1986                                        nearex);
1987                }
1988                len = EXT_LAST_EXTENT(eh) - nearex + 1;
1989                if (len > 0) {
1990                        ext_debug("insert %u:%llu:[%d]%d: "
1991                                        "move %d extents from 0x%p to 0x%p\n",
1992                                        le32_to_cpu(newext->ee_block),
1993                                        ext4_ext_pblock(newext),
1994                                        ext4_ext_is_uninitialized(newext),
1995                                        ext4_ext_get_actual_len(newext),
1996                                        len, nearex, nearex + 1);
1997                        memmove(nearex + 1, nearex,
1998                                len * sizeof(struct ext4_extent));
1999                }
2000        }
2001
2002        le16_add_cpu(&eh->eh_entries, 1);
2003        path[depth].p_ext = nearex;
2004        nearex->ee_block = newext->ee_block;
2005        ext4_ext_store_pblock(nearex, ext4_ext_pblock(newext));
2006        nearex->ee_len = newext->ee_len;
2007
2008merge:
2009        /* try to merge extents */
2010        if (!(flag & EXT4_GET_BLOCKS_PRE_IO))
2011                ext4_ext_try_to_merge(handle, inode, path, nearex);
2012
2013
2014        /* time to correct all indexes above */
2015        err = ext4_ext_correct_indexes(handle, inode, path);
2016        if (err)
2017                goto cleanup;
2018
2019        err = ext4_ext_dirty(handle, inode, path + path->p_depth);
2020
2021cleanup:
2022        if (npath) {
2023                ext4_ext_drop_refs(npath);
2024                kfree(npath);
2025        }
2026        return err;
2027}
2028
2029static int ext4_fill_fiemap_extents(struct inode *inode,
2030                                    ext4_lblk_t block, ext4_lblk_t num,
2031                                    struct fiemap_extent_info *fieinfo)
2032{
2033        struct ext4_ext_path *path = NULL;
2034        struct ext4_extent *ex;
2035        struct extent_status es;
2036        ext4_lblk_t next, next_del, start = 0, end = 0;
2037        ext4_lblk_t last = block + num;
2038        int exists, depth = 0, err = 0;
2039        unsigned int flags = 0;
2040        unsigned char blksize_bits = inode->i_sb->s_blocksize_bits;
2041
2042        while (block < last && block != EXT_MAX_BLOCKS) {
2043                num = last - block;
2044                /* find extent for this block */
2045                down_read(&EXT4_I(inode)->i_data_sem);
2046
2047                if (path && ext_depth(inode) != depth) {
2048                        /* depth was changed. we have to realloc path */
2049                        kfree(path);
2050                        path = NULL;
2051                }
2052
2053                path = ext4_ext_find_extent(inode, block, path);
2054                if (IS_ERR(path)) {
2055                        up_read(&EXT4_I(inode)->i_data_sem);
2056                        err = PTR_ERR(path);
2057                        path = NULL;
2058                        break;
2059                }
2060
2061                depth = ext_depth(inode);
2062                if (unlikely(path[depth].p_hdr == NULL)) {
2063                        up_read(&EXT4_I(inode)->i_data_sem);
2064                        EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
2065                        err = -EIO;
2066                        break;
2067                }
2068                ex = path[depth].p_ext;
2069                next = ext4_ext_next_allocated_block(path);
2070                ext4_ext_drop_refs(path);
2071
2072                flags = 0;
2073                exists = 0;
2074                if (!ex) {
2075                        /* there is no extent yet, so try to allocate
2076                         * all requested space */
2077                        start = block;
2078                        end = block + num;
2079                } else if (le32_to_cpu(ex->ee_block) > block) {
2080                        /* need to allocate space before found extent */
2081                        start = block;
2082                        end = le32_to_cpu(ex->ee_block);
2083                        if (block + num < end)
2084                                end = block + num;
2085                } else if (block >= le32_to_cpu(ex->ee_block)
2086                                        + ext4_ext_get_actual_len(ex)) {
2087                        /* need to allocate space after found extent */
2088                        start = block;
2089                        end = block + num;
2090                        if (end >= next)
2091                                end = next;
2092                } else if (block >= le32_to_cpu(ex->ee_block)) {
2093                        /*
2094                         * some part of requested space is covered
2095                         * by found extent
2096                         */
2097                        start = block;
2098                        end = le32_to_cpu(ex->ee_block)
2099                                + ext4_ext_get_actual_len(ex);
2100                        if (block + num < end)
2101                                end = block + num;
2102                        exists = 1;
2103                } else {
2104                        BUG();
2105                }
2106                BUG_ON(end <= start);
2107
2108                if (!exists) {
2109                        es.es_lblk = start;
2110                        es.es_len = end - start;
2111                        es.es_pblk = 0;
2112                } else {
2113                        es.es_lblk = le32_to_cpu(ex->ee_block);
2114                        es.es_len = ext4_ext_get_actual_len(ex);
2115                        es.es_pblk = ext4_ext_pblock(ex);
2116                        if (ext4_ext_is_uninitialized(ex))
2117                                flags |= FIEMAP_EXTENT_UNWRITTEN;
2118                }
2119
2120                /*
2121                 * Find delayed extent and update es accordingly. We call
2122                 * it even in !exists case to find out whether es is the
2123                 * last existing extent or not.
2124                 */
2125                next_del = ext4_find_delayed_extent(inode, &es);
2126                if (!exists && next_del) {
2127                        exists = 1;
2128                        flags |= (FIEMAP_EXTENT_DELALLOC |
2129                                  FIEMAP_EXTENT_UNKNOWN);
2130                }
2131                up_read(&EXT4_I(inode)->i_data_sem);
2132
2133                if (unlikely(es.es_len == 0)) {
2134                        EXT4_ERROR_INODE(inode, "es.es_len == 0");
2135                        err = -EIO;
2136                        break;
2137                }
2138
2139                /*
2140                 * This is possible iff next == next_del == EXT_MAX_BLOCKS.
2141                 * We need to check next == EXT_MAX_BLOCKS because an extent
2142                 * can carry both unwritten and delayed status: when a block
2143                 * range is first delayed allocated and later allocated via
2144                 * fallocate, the extent status tree tracks both states in a
2145                 * single extent.
2146                 *
2147                 * So we could return an unwritten and delayed extent whose
2148                 * start block is equal to 'next'.
2149                 */
2150                if (next == next_del && next == EXT_MAX_BLOCKS) {
2151                        flags |= FIEMAP_EXTENT_LAST;
2152                        if (unlikely(next_del != EXT_MAX_BLOCKS ||
2153                                     next != EXT_MAX_BLOCKS)) {
2154                                EXT4_ERROR_INODE(inode,
2155                                                 "next extent == %u, next "
2156                                                 "delalloc extent = %u",
2157                                                 next, next_del);
2158                                err = -EIO;
2159                                break;
2160                        }
2161                }
2162
2163                if (exists) {
2164                        err = fiemap_fill_next_extent(fieinfo,
2165                                (__u64)es.es_lblk << blksize_bits,
2166                                (__u64)es.es_pblk << blksize_bits,
2167                                (__u64)es.es_len << blksize_bits,
2168                                flags);
2169                        if (err < 0)
2170                                break;
2171                        if (err == 1) {
2172                                err = 0;
2173                                break;
2174                        }
2175                }
2176
2177                block = es.es_lblk + es.es_len;
2178        }
2179
2180        if (path) {
2181                ext4_ext_drop_refs(path);
2182                kfree(path);
2183        }
2184
2185        return err;
2186}
2187
2188/*
2189 * ext4_ext_put_gap_in_cache:
2190 * calculate boundaries of the gap that the requested block fits into
2191 * and cache this gap
2192 */
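/*
 * For example, if the leaf maps logical blocks 100..109 and 200..219 and
 * block 150 is requested, the gap 110..199 (lblock 110, len 90) is
 * inserted into the extent status tree as a hole, provided no delayed
 * allocation exists in that range, recording that the whole range is
 * currently unmapped.
 */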
2193static void
2194ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
2195                                ext4_lblk_t block)
2196{
2197        int depth = ext_depth(inode);
2198        unsigned long len;
2199        ext4_lblk_t lblock;
2200        struct ext4_extent *ex;
2201
2202        ex = path[depth].p_ext;
2203        if (ex == NULL) {
2204                /*
2205                 * there is no extent yet, so gap is [0;-] and we
2206                 * don't cache it
2207                 */
2208                ext_debug("cache gap(whole file):");
2209        } else if (block < le32_to_cpu(ex->ee_block)) {
2210                lblock = block;
2211                len = le32_to_cpu(ex->ee_block) - block;
2212                ext_debug("cache gap(before): %u [%u:%u]",
2213                                block,
2214                                le32_to_cpu(ex->ee_block),
2215                                 ext4_ext_get_actual_len(ex));
2216                if (!ext4_find_delalloc_range(inode, lblock, lblock + len - 1))
2217                        ext4_es_insert_extent(inode, lblock, len, ~0,
2218                                              EXTENT_STATUS_HOLE);
2219        } else if (block >= le32_to_cpu(ex->ee_block)
2220                        + ext4_ext_get_actual_len(ex)) {
2221                ext4_lblk_t next;
2222                lblock = le32_to_cpu(ex->ee_block)
2223                        + ext4_ext_get_actual_len(ex);
2224
2225                next = ext4_ext_next_allocated_block(path);
2226                ext_debug("cache gap(after): [%u:%u] %u",
2227                                le32_to_cpu(ex->ee_block),
2228                                ext4_ext_get_actual_len(ex),
2229                                block);
2230                BUG_ON(next == lblock);
2231                len = next - lblock;
2232                if (!ext4_find_delalloc_range(inode, lblock, lblock + len - 1))
2233                        ext4_es_insert_extent(inode, lblock, len, ~0,
2234                                              EXTENT_STATUS_HOLE);
2235        } else {
2236                lblock = len = 0;
2237                BUG();
2238        }
2239
2240        ext_debug(" -> %u:%lu\n", lblock, len);
2241}
2242
2243/*
2244 * ext4_ext_rm_idx:
2245 * removes index from the index block.
2246 */
2247static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
2248                        struct ext4_ext_path *path, int depth)
2249{
2250        int err;
2251        ext4_fsblk_t leaf;
2252
2253        /* free index block */
2254        depth--;
2255        path = path + depth;
2256        leaf = ext4_idx_pblock(path->p_idx);
2257        if (unlikely(path->p_hdr->eh_entries == 0)) {
2258                EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0");
2259                return -EIO;
2260        }
2261        err = ext4_ext_get_access(handle, inode, path);
2262        if (err)
2263                return err;
2264
2265        if (path->p_idx != EXT_LAST_INDEX(path->p_hdr)) {
2266                int len = EXT_LAST_INDEX(path->p_hdr) - path->p_idx;
2267                len *= sizeof(struct ext4_extent_idx);
2268                memmove(path->p_idx, path->p_idx + 1, len);
2269        }
2270
2271        le16_add_cpu(&path->p_hdr->eh_entries, -1);
2272        err = ext4_ext_dirty(handle, inode, path);
2273        if (err)
2274                return err;
2275        ext_debug("index is empty, remove it, free block %llu\n", leaf);
2276        trace_ext4_ext_rm_idx(inode, leaf);
2277
2278        ext4_free_blocks(handle, inode, NULL, leaf, 1,
2279                         EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
2280
2281        while (--depth >= 0) {
2282                if (path->p_idx != EXT_FIRST_INDEX(path->p_hdr))
2283                        break;
2284                path--;
2285                err = ext4_ext_get_access(handle, inode, path);
2286                if (err)
2287                        break;
2288                path->p_idx->ei_block = (path+1)->p_idx->ei_block;
2289                err = ext4_ext_dirty(handle, inode, path);
2290                if (err)
2291                        break;
2292        }
2293        return err;
2294}
2295
2296/*
2297 * ext4_ext_calc_credits_for_single_extent:
2298 * This routine returns the maximum number of credits needed to insert
2299 * an extent into the extent tree.
2300 * When passing the actual path, the caller should calculate credits
2301 * under i_data_sem.
2302 */
2303int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks,
2304                                                struct ext4_ext_path *path)
2305{
2306        if (path) {
2307                int depth = ext_depth(inode);
2308                int ret = 0;
2309
2310                /* probably there is space in leaf? */
2311                if (le16_to_cpu(path[depth].p_hdr->eh_entries)
2312                                < le16_to_cpu(path[depth].p_hdr->eh_max)) {
2313
2314                        /*
2315                         *  There is some space left in the leaf, so there
2316                         *  is no need to account for the leaf block credit
2317                         *
2318                         *  bitmaps and block group descriptor blocks
2319                         *  and other metadata blocks still need to be
2320                         *  accounted.
2321                         */
2322                        /* 1 bitmap, 1 block group descriptor */
2323                        ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb);
2324                        return ret;
2325                }
2326        }
2327
2328        return ext4_chunk_trans_blocks(inode, nrblocks);
2329}
2330
2331/*
2332 * How many index/leaf blocks need to be changed/allocated to add @extents extents?
2333 *
2334 * If we add a single extent, then in the worst case each tree level's
2335 * index/leaf needs to be changed in case the tree splits.
2336 *
2337 * If more extents are inserted, they could cause the whole tree to split
2338 * more than once, but this is really rare.
2339 */
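/*
 * E.g. for a tree of depth 2, a single-extent insert is budgeted at
 * 2 * 2 = 4 modified/allocated index or leaf blocks, and a multi-extent
 * insert at 3 * 2 = 6; converting inline data needs just one block.
 */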
2340int ext4_ext_index_trans_blocks(struct inode *inode, int extents)
2341{
2342        int index;
2343        int depth;
2344
2345        /* If we are converting the inline data, only one is needed here. */
2346        if (ext4_has_inline_data(inode))
2347                return 1;
2348
2349        depth = ext_depth(inode);
2350
2351        if (extents <= 1)
2352                index = depth * 2;
2353        else
2354                index = depth * 3;
2355
2356        return index;
2357}
2358
2359static inline int get_default_free_blocks_flags(struct inode *inode)
2360{
2361        if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
2362                return EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET;
2363        else if (ext4_should_journal_data(inode))
2364                return EXT4_FREE_BLOCKS_FORGET;
2365        return 0;
2366}
2367
2368static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
2369                              struct ext4_extent *ex,
2370                              long long *partial_cluster,
2371                              ext4_lblk_t from, ext4_lblk_t to)
2372{
2373        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2374        unsigned short ee_len =  ext4_ext_get_actual_len(ex);
2375        ext4_fsblk_t pblk;
2376        int flags = get_default_free_blocks_flags(inode);
2377
2378        /*
2379         * For bigalloc file systems, we never free a partial cluster
2380         * at the beginning of the extent.  Instead, we make a note
2381         * that we tried freeing the cluster, and check to see if we
2382         * need to free it on a subsequent call to ext4_remove_blocks,
2383         * or at the end of the ext4_truncate() operation.
2384         */
2385        flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER;
2386
2387        trace_ext4_remove_blocks(inode, ex, from, to, *partial_cluster);
2388        /*
2389         * If we have a partial cluster, and it's different from the
2390         * cluster of the last block, we need to explicitly free the
2391         * partial cluster here.
2392         */
2393        pblk = ext4_ext_pblock(ex) + ee_len - 1;
2394        if ((*partial_cluster > 0) &&
2395            (EXT4_B2C(sbi, pblk) != *partial_cluster)) {
2396                ext4_free_blocks(handle, inode, NULL,
2397                                 EXT4_C2B(sbi, *partial_cluster),
2398                                 sbi->s_cluster_ratio, flags);
2399                *partial_cluster = 0;
2400        }
2401
2402#ifdef EXTENTS_STATS
2403        {
2404                struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2405                spin_lock(&sbi->s_ext_stats_lock);
2406                sbi->s_ext_blocks += ee_len;
2407                sbi->s_ext_extents++;
2408                if (ee_len < sbi->s_ext_min)
2409                        sbi->s_ext_min = ee_len;
2410                if (ee_len > sbi->s_ext_max)
2411                        sbi->s_ext_max = ee_len;
2412                if (ext_depth(inode) > sbi->s_depth_max)
2413                        sbi->s_depth_max = ext_depth(inode);
2414                spin_unlock(&sbi->s_ext_stats_lock);
2415        }
2416#endif
2417        if (from >= le32_to_cpu(ex->ee_block)
2418            && to == le32_to_cpu(ex->ee_block) + ee_len - 1) {
2419                /* tail removal */
2420                ext4_lblk_t num;
2421                unsigned int unaligned;
2422
2423                num = le32_to_cpu(ex->ee_block) + ee_len - from;
2424                pblk = ext4_ext_pblock(ex) + ee_len - num;
2425                /*
2426                 * Usually we want to free the partial cluster at the end of
2427                 * the extent, except when the cluster is still used by
2428                 * another extent (partial_cluster is negative).
2429                 */
2430                if (*partial_cluster < 0 &&
2431                    -(*partial_cluster) == EXT4_B2C(sbi, pblk + num - 1))
2432                        flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER;
2433
2434                ext_debug("free last %u blocks starting %llu partial %lld\n",
2435                          num, pblk, *partial_cluster);
2436                ext4_free_blocks(handle, inode, NULL, pblk, num, flags);
2437                /*
2438                 * If the block range to be freed didn't start at the
2439                 * beginning of a cluster, and we removed the entire
2440                 * extent and the cluster is not used by any other extent,
2441                 * save the partial cluster here, since we might need to
2442                 * delete it if we determine that the truncate operation has
2443                 * removed all of the blocks in the cluster.
2444                 *
2445                 * On the other hand, if we did not manage to free the whole
2446                 * extent, we have to mark the cluster as used (store negative
2447                 * cluster number in partial_cluster).
2448                 */
2449                unaligned = pblk & (sbi->s_cluster_ratio - 1);
2450                if (unaligned && (ee_len == num) &&
2451                    (*partial_cluster != -((long long)EXT4_B2C(sbi, pblk))))
2452                        *partial_cluster = EXT4_B2C(sbi, pblk);
2453                else if (unaligned)
2454                        *partial_cluster = -((long long)EXT4_B2C(sbi, pblk));
2455                else if (*partial_cluster > 0)
2456                        *partial_cluster = 0;
2457        } else
2458                ext4_error(sbi->s_sb, "strange request: removal(2) "
2459                           "%u-%u from %u:%u\n",
2460                           from, to, le32_to_cpu(ex->ee_block), ee_len);
2461        return 0;
2462}
2463
2464
2465/*
2466 * ext4_ext_rm_leaf() removes the extents associated with the
2467 * blocks appearing between "start" and "end", and splits the extents
2468 * if "start" and "end" appear in the same extent.
2469 *
2470 * @handle: The journal handle
2471 * @inode:  The files inode
2472 * @path:   The path to the leaf
2473 * @partial_cluster: The cluster which we'll have to free if all extents
2474 *                   have been released from it. It is set negative if
2475 *                   the cluster is still in use.
2476 * @start:  The first block to remove
2477 * @end:   The last block to remove
2478 */
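/*
 * On bigalloc file systems @partial_cluster works as follows: a positive
 * value names a cluster whose blocks have been removed but whose cluster
 * has not been freed yet; a negative value marks a cluster that is still
 * referenced by a remaining extent and so must not be freed.  On
 * non-bigalloc file systems (cluster ratio 1) it simply remains 0.
 */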
2479static int
2480ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2481                 struct ext4_ext_path *path,
2482                 long long *partial_cluster,
2483                 ext4_lblk_t start, ext4_lblk_t end)
2484{
2485        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2486        int err = 0, correct_index = 0;
2487        int depth = ext_depth(inode), credits;
2488        struct ext4_extent_header *eh;
2489        ext4_lblk_t a, b;
2490        unsigned num;
2491        ext4_lblk_t ex_ee_block;
2492        unsigned short ex_ee_len;
2493        unsigned uninitialized = 0;
2494        struct ext4_extent *ex;
2495        ext4_fsblk_t pblk;
2496
2497        /* the header must be checked already in ext4_ext_remove_space() */
2498        ext_debug("truncate since %u in leaf to %u\n", start, end);
2499        if (!path[depth].p_hdr)
2500                path[depth].p_hdr = ext_block_hdr(path[depth].p_bh);
2501        eh = path[depth].p_hdr;
2502        if (unlikely(path[depth].p_hdr == NULL)) {
2503                EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
2504                return -EIO;
2505        }
2506        /* find where to start removing */
2507        ex = path[depth].p_ext;
2508        if (!ex)
2509                ex = EXT_LAST_EXTENT(eh);
2510
2511        ex_ee_block = le32_to_cpu(ex->ee_block);
2512        ex_ee_len = ext4_ext_get_actual_len(ex);
2513
2514        trace_ext4_ext_rm_leaf(inode, start, ex, *partial_cluster);
2515
2516        while (ex >= EXT_FIRST_EXTENT(eh) &&
2517                        ex_ee_block + ex_ee_len > start) {
2518
2519                if (ext4_ext_is_uninitialized(ex))
2520                        uninitialized = 1;
2521                else
2522                        uninitialized = 0;
2523
2524                ext_debug("remove ext %u:[%d]%d\n", ex_ee_block,
2525                         uninitialized, ex_ee_len);
2526                path[depth].p_ext = ex;
2527
2528                a = ex_ee_block > start ? ex_ee_block : start;
2529                b = ex_ee_block+ex_ee_len - 1 < end ?
2530                        ex_ee_block+ex_ee_len - 1 : end;
2531
2532                ext_debug("  border %u:%u\n", a, b);
2533
2534                /* If this extent is beyond the end of the hole, skip it */
2535                if (end < ex_ee_block) {
2536                        /*
2537                         * We're going to skip this extent and move to another,
2538                         * so if this extent is not cluster aligned we have
2539                         * to mark the current cluster as used to avoid
2540                         * accidentally freeing it later on
2541                         */
2542                        pblk = ext4_ext_pblock(ex);
2543                        if (pblk & (sbi->s_cluster_ratio - 1))
2544                                *partial_cluster =
2545                                        -((long long)EXT4_B2C(sbi, pblk));
2546                        ex--;
2547                        ex_ee_block = le32_to_cpu(ex->ee_block);
2548                        ex_ee_len = ext4_ext_get_actual_len(ex);
2549                        continue;
2550                } else if (b != ex_ee_block + ex_ee_len - 1) {
2551                        EXT4_ERROR_INODE(inode,
2552                                         "can not handle truncate %u:%u "
2553                                         "on extent %u:%u",
2554                                         start, end, ex_ee_block,
2555                                         ex_ee_block + ex_ee_len - 1);
2556                        err = -EIO;
2557                        goto out;
2558                } else if (a != ex_ee_block) {
2559                        /* remove tail of the extent */
2560                        num = a - ex_ee_block;
2561                } else {
2562                        /* remove whole extent: excellent! */
2563                        num = 0;
2564                }
2565                /*
2566                 * 3 for leaf, sb, and inode plus 2 (bmap and group
2567                 * descriptor) for each block group; assume two block
2568                 * groups plus ex_ee_len/blocks_per_block_group for
2569                 * the worst case
2570                 */
2571                credits = 7 + 2*(ex_ee_len/EXT4_BLOCKS_PER_GROUP(inode->i_sb));
2572                if (ex == EXT_FIRST_EXTENT(eh)) {
2573                        correct_index = 1;
2574                        credits += (ext_depth(inode)) + 1;
2575                }
2576                credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
2577
2578                err = ext4_ext_truncate_extend_restart(handle, inode, credits);
2579                if (err)
2580                        goto out;
2581
2582                err = ext4_ext_get_access(handle, inode, path + depth);
2583                if (err)
2584                        goto out;
2585
2586                err = ext4_remove_blocks(handle, inode, ex, partial_cluster,
2587                                         a, b);
2588                if (err)
2589                        goto out;
2590
2591                if (num == 0)
2592                        /* this extent is removed; mark slot entirely unused */
2593                        ext4_ext_store_pblock(ex, 0);
2594
2595                ex->ee_len = cpu_to_le16(num);
2596                /*
2597                 * Do not mark uninitialized if all the blocks in the
2598                 * extent have been removed.
2599                 */
2600                if (uninitialized && num)
2601                        ext4_ext_mark_uninitialized(ex);
2602                /*
2603                 * If the extent was completely released,
2604                 * we need to remove it from the leaf
2605                 */
2606                if (num == 0) {
2607                        if (end != EXT_MAX_BLOCKS - 1) {
2608                                /*
2609                                 * For hole punching, we need to scoot all the
2610                                 * extents up when an extent is removed so that
2611                                 * we don't have blank extents in the middle
2612                                 */
2613                                memmove(ex, ex+1, (EXT_LAST_EXTENT(eh) - ex) *
2614                                        sizeof(struct ext4_extent));
2615
2616                                /* Now get rid of the one at the end */
2617                                memset(EXT_LAST_EXTENT(eh), 0,
2618                                        sizeof(struct ext4_extent));
2619                        }
2620                        le16_add_cpu(&eh->eh_entries, -1);
2621                } else if (*partial_cluster > 0)
2622                        *partial_cluster = 0;
2623
2624                err = ext4_ext_dirty(handle, inode, path + depth);
2625                if (err)
2626                        goto out;
2627
2628                ext_debug("new extent: %u:%u:%llu\n", ex_ee_block, num,
2629                                ext4_ext_pblock(ex));
2630                ex--;
2631                ex_ee_block = le32_to_cpu(ex->ee_block);
2632                ex_ee_len = ext4_ext_get_actual_len(ex);
2633        }
2634
2635        if (correct_index && eh->eh_entries)
2636                err = ext4_ext_correct_indexes(handle, inode, path);
2637
2638        /*
2639         * Free the partial cluster only if the current extent does not
2640         * reference it. Otherwise we might free used cluster.
2641         */
2642        if (*partial_cluster > 0 &&
2643            (EXT4_B2C(sbi, ext4_ext_pblock(ex) + ex_ee_len - 1) !=
2644             *partial_cluster)) {
2645                int flags = get_default_free_blocks_flags(inode);
2646
2647                ext4_free_blocks(handle, inode, NULL,
2648                                 EXT4_C2B(sbi, *partial_cluster),
2649                                 sbi->s_cluster_ratio, flags);
2650                *partial_cluster = 0;
2651        }
2652
2653        /* if this leaf is free, then we should
2654         * remove it from index block above */
2655        if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL)
2656                err = ext4_ext_rm_idx(handle, inode, path, depth);
2657
2658out:
2659        return err;
2660}
2661
2662/*
2663 * ext4_ext_more_to_rm:
2664 * returns 1 if current index has to be freed (even partial)
2665 */
2666static int
2667ext4_ext_more_to_rm(struct ext4_ext_path *path)
2668{
2669        BUG_ON(path->p_idx == NULL);
2670
2671        if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr))
2672                return 0;
2673
2674        /*
2675         * if truncate on deeper level happened, it wasn't partial,
2676         * so we have to consider current index for truncation
2677         */
2678        if (le16_to_cpu(path->p_hdr->eh_entries) == path->p_block)
2679                return 0;
2680        return 1;
2681}
2682
2683int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
2684                          ext4_lblk_t end)
2685{
2686        struct super_block *sb = inode->i_sb;
2687        int depth = ext_depth(inode);
2688        struct ext4_ext_path *path = NULL;
2689        long long partial_cluster = 0;
2690        handle_t *handle;
2691        int i = 0, err = 0;
2692
2693        ext_debug("truncate since %u to %u\n", start, end);
2694
2695        /* probably first extent we're gonna free will be last in block */
2696        handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, depth + 1);
2697        if (IS_ERR(handle))
2698                return PTR_ERR(handle);
2699
2700again:
2701        trace_ext4_ext_remove_space(inode, start, end, depth);
2702
2703        /*
2704         * Check if we are removing extents inside the extent tree. If that
2705         * is the case, we are going to punch a hole inside the extent tree
2706         * so we have to check whether we need to split the extent covering
2707         * the last block to remove so we can easily remove the part of it
2708         * in ext4_ext_rm_leaf().
2709         */
2710        if (end < EXT_MAX_BLOCKS - 1) {
2711                struct ext4_extent *ex;
2712                ext4_lblk_t ee_block;
2713
2714                /* find extent for this block */
2715                path = ext4_ext_find_extent(inode, end, NULL);
2716                if (IS_ERR(path)) {
2717                        ext4_journal_stop(handle);
2718                        return PTR_ERR(path);
2719                }
2720                depth = ext_depth(inode);
2721                /* Leaf may not exist only if inode has no blocks at all */
2722                ex = path[depth].p_ext;
2723                if (!ex) {
2724                        if (depth) {
2725                                EXT4_ERROR_INODE(inode,
2726                                                 "path[%d].p_hdr == NULL",
2727                                                 depth);
2728                                err = -EIO;
2729                        }
2730                        goto out;
2731                }
2732
2733                ee_block = le32_to_cpu(ex->ee_block);
2734
2735                /*
2736                 * See if the last block is inside the extent, if so split
2737                 * the extent at 'end' block so we can easily remove the
2738                 * tail of the first part of the split extent in
2739                 * ext4_ext_rm_leaf().
2740                 */
2741                if (end >= ee_block &&
2742                    end < ee_block + ext4_ext_get_actual_len(ex) - 1) {
2743                        int split_flag = 0;
2744
2745                        if (ext4_ext_is_uninitialized(ex))
2746                                split_flag = EXT4_EXT_MARK_UNINIT1 |
2747                                             EXT4_EXT_MARK_UNINIT2;
2748
2749                        /*
2750                         * Split the extent in two so that 'end' is the last
2751                         * block in the first new extent. Also we should not
2752                         * fail removing space due to ENOSPC so try to use
2753                         * reserved block if that happens.
2754                         */
2755                        err = ext4_split_extent_at(handle, inode, path,
2756                                        end + 1, split_flag,
2757                                        EXT4_GET_BLOCKS_PRE_IO |
2758                                        EXT4_GET_BLOCKS_METADATA_NOFAIL);
2759
2760                        if (err < 0)
2761                                goto out;
2762                }
2763        }
2764        /*
2765         * We start scanning from right side, freeing all the blocks
2766         * after i_size and walking into the tree depth-wise.
2767         */
2768        depth = ext_depth(inode);
2769        if (path) {
2770                int k = i = depth;
2771                while (--k > 0)
2772                        path[k].p_block =
2773                                le16_to_cpu(path[k].p_hdr->eh_entries)+1;
2774        } else {
2775                path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1),
2776                               GFP_NOFS);
2777                if (path == NULL) {
2778                        ext4_journal_stop(handle);
2779                        return -ENOMEM;
2780                }
2781                path[0].p_depth = depth;
2782                path[0].p_hdr = ext_inode_hdr(inode);
2783                i = 0;
2784
2785                if (ext4_ext_check(inode, path[0].p_hdr, depth)) {
2786                        err = -EIO;
2787                        goto out;
2788                }
2789        }
2790        err = 0;
2791
2792        while (i >= 0 && err == 0) {
2793                if (i == depth) {
2794                        /* this is leaf block */
2795                        err = ext4_ext_rm_leaf(handle, inode, path,
2796                                               &partial_cluster, start,
2797                                               end);
2798                        /* root level has p_bh == NULL, brelse() eats this */
2799                        brelse(path[i].p_bh);
2800                        path[i].p_bh = NULL;
2801                        i--;
2802                        continue;
2803                }
2804
2805                /* this is index block */
2806                if (!path[i].p_hdr) {
2807                        ext_debug("initialize header\n");
2808                        path[i].p_hdr = ext_block_hdr(path[i].p_bh);
2809                }
2810
2811                if (!path[i].p_idx) {
2812                        /* this level hasn't been touched yet */
2813                        path[i].p_idx = EXT_LAST_INDEX(path[i].p_hdr);
2814                        path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries)+1;
2815                        ext_debug("init index ptr: hdr 0x%p, num %d\n",
2816                                  path[i].p_hdr,
2817                                  le16_to_cpu(path[i].p_hdr->eh_entries));
2818                } else {
2819                        /* we were already here, see at next index */
2820                        /* we were already here, move on to the next index */
2821                }
2822
2823                ext_debug("level %d - index, first 0x%p, cur 0x%p\n",
2824                                i, EXT_FIRST_INDEX(path[i].p_hdr),
2825                                path[i].p_idx);
2826                if (ext4_ext_more_to_rm(path + i)) {
2827                        struct buffer_head *bh;
2828                        /* go to the next level */
2829                        ext_debug("move to level %d (block %llu)\n",
2830                                  i + 1, ext4_idx_pblock(path[i].p_idx));
2831                        memset(path + i + 1, 0, sizeof(*path));
2832                        bh = sb_bread(sb, ext4_idx_pblock(path[i].p_idx));
2833                        if (!bh) {
2834                                /* should we reset i_size? */
2835                                err = -EIO;
2836                                break;
2837                        }
2838                        /* Yield here to deal with large extent trees.
2839                         * Should be a no-op if we did IO above. */
2840                        cond_resched();
2841                        if (WARN_ON(i + 1 > depth)) {
2842                                err = -EIO;
2843                                break;
2844                        }
2845                        if (ext4_ext_check_block(inode, ext_block_hdr(bh),
2846                                                        depth - i - 1, bh)) {
2847                                err = -EIO;
2848                                break;
2849                        }
2850                        path[i + 1].p_bh = bh;
2851
2852                        /* save actual number of indexes since this
2853                         * number is changed at the next iteration */
2854                        path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries);
2855                        i++;
2856                } else {
2857                        /* we finished processing this index, go up */
2858                        if (path[i].p_hdr->eh_entries == 0 && i > 0) {
2859                                /* index is empty, remove it;
2860                                 * handle must already be prepared by the
2861                                 * truncatei_leaf() */
2862                                err = ext4_ext_rm_idx(handle, inode, path, i);
2863                        }
2864                        /* root level has p_bh == NULL, brelse() eats this */
2865                        brelse(path[i].p_bh);
2866                        path[i].p_bh = NULL;
2867                        i--;
2868                        ext_debug("return to level %d\n", i);
2869                }
2870        }
2871
2872        trace_ext4_ext_remove_space_done(inode, start, end, depth,
2873                        partial_cluster, path->p_hdr->eh_entries);
2874
2875        /* If we still have something in the partial cluster and we have removed
2876         * even the first extent, then we should free the blocks in the partial
2877         * cluster as well. */
2878        if (partial_cluster > 0 && path->p_hdr->eh_entries == 0) {
2879                int flags = get_default_free_blocks_flags(inode);
2880
2881                ext4_free_blocks(handle, inode, NULL,
2882                                 EXT4_C2B(EXT4_SB(sb), partial_cluster),
2883                                 EXT4_SB(sb)->s_cluster_ratio, flags);
2884                partial_cluster = 0;
2885        }
2886
2887        /* TODO: flexible tree reduction should be here */
2888        if (path->p_hdr->eh_entries == 0) {
2889                /*
2890                 * truncate to zero freed all the tree,
2891                 * so we need to correct eh_depth
2892                 */
2893                err = ext4_ext_get_access(handle, inode, path);
2894                if (err == 0) {
2895                        ext_inode_hdr(inode)->eh_depth = 0;
2896                        ext_inode_hdr(inode)->eh_max =
2897                                cpu_to_le16(ext4_ext_space_root(inode, 0));
2898                        err = ext4_ext_dirty(handle, inode, path);
2899                }
2900        }
2901out:
2902        ext4_ext_drop_refs(path);
2903        kfree(path);
2904        if (err == -EAGAIN) {
2905                path = NULL;
2906                goto again;
2907        }
2908        ext4_journal_stop(handle);
2909
2910        return err;
2911}
2912
2913/*
2914 * called at mount time
2915 */
2916void ext4_ext_init(struct super_block *sb)
2917{
2918        /*
2919         * possible initialization would be here
2920         */
2921
2922        if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
2923#if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS)
2924                printk(KERN_INFO "EXT4-fs: file extents enabled"
2925#ifdef AGGRESSIVE_TEST
2926                       ", aggressive tests"
2927#endif
2928#ifdef CHECK_BINSEARCH
2929                       ", check binsearch"
2930#endif
2931#ifdef EXTENTS_STATS
2932                       ", stats"
2933#endif
2934                       "\n");
2935#endif
2936#ifdef EXTENTS_STATS
2937                spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock);
2938                EXT4_SB(sb)->s_ext_min = 1 << 30;
2939                EXT4_SB(sb)->s_ext_max = 0;
2940#endif
2941        }
2942}
2943
2944/*
2945 * called at umount time
2946 */
2947void ext4_ext_release(struct super_block *sb)
2948{
2949        if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS))
2950                return;
2951
2952#ifdef EXTENTS_STATS
2953        if (EXT4_SB(sb)->s_ext_blocks && EXT4_SB(sb)->s_ext_extents) {
2954                struct ext4_sb_info *sbi = EXT4_SB(sb);
2955                printk(KERN_ERR "EXT4-fs: %lu blocks in %lu extents (%lu ave)\n",
2956                        sbi->s_ext_blocks, sbi->s_ext_extents,
2957                        sbi->s_ext_blocks / sbi->s_ext_extents);
2958                printk(KERN_ERR "EXT4-fs: extents: %lu min, %lu max, max depth %lu\n",
2959                        sbi->s_ext_min, sbi->s_ext_max, sbi->s_depth_max);
2960        }
2961#endif
2962}
2963
2964/* FIXME!! we need to try to merge to left or right after zero-out  */
2965static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
2966{
2967        ext4_fsblk_t ee_pblock;
2968        unsigned int ee_len;
2969        int ret;
2970
2971        ee_len    = ext4_ext_get_actual_len(ex);
2972        ee_pblock = ext4_ext_pblock(ex);
2973
2974        ret = sb_issue_zeroout(inode->i_sb, ee_pblock, ee_len, GFP_NOFS);
2975        if (ret > 0)
2976                ret = 0;
2977
2978        return ret;
2979}
2980
2981/*
2982 * ext4_split_extent_at() splits an extent at given block.
2983 *
2984 * @handle: the journal handle
2985 * @inode: the file inode
2986 * @path: the path to the extent
2987 * @split: the logical block where the extent is split.
2988 * @split_flag: indicates if the extent could be zeroed out if the split
2989 *              fails, and the states (init or uninit) of the new extents.
2990 * @flags: flags used to insert the new extent into the extent tree.
2991 *
2992 *
2993 * Splits extent [a, b] into two extents [a, @split) and [@split, b], whose
2994 * states are determined by split_flag.
2995 *
2996 * There are two cases:
2997 *  a> the extent is split into two extents.
2998 *  b> no split is needed, and the extent is just marked.
2999 *
3000 * return 0 on success.
3001 */
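    /*
     * Illustrative example (added, not from the original source): splitting
     * an extent that starts at ee_block == 100 with ee_len == 50 at
     * split == 120 leaves 'ex' covering [100..119] (20 blocks) and inserts a
     * new extent covering [120..149] (30 blocks) whose physical start is
     * newblock = split - ee_block + ext4_ext_pblock(ex), i.e. the old
     * physical start shifted by 20 blocks.
     */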
3002static int ext4_split_extent_at(handle_t *handle,
3003                             struct inode *inode,
3004                             struct ext4_ext_path *path,
3005                             ext4_lblk_t split,
3006                             int split_flag,
3007                             int flags)
3008{
3009        ext4_fsblk_t newblock;
3010        ext4_lblk_t ee_block;
3011        struct ext4_extent *ex, newex, orig_ex, zero_ex;
3012        struct ext4_extent *ex2 = NULL;
3013        unsigned int ee_len, depth;
3014        int err = 0;
3015
3016        BUG_ON((split_flag & (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2)) ==
3017               (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2));
3018
3019        ext_debug("ext4_split_extent_at: inode %lu, logical "
3020                "block %llu\n", inode->i_ino, (unsigned long long)split);
3021
3022        ext4_ext_show_leaf(inode, path);
3023
3024        depth = ext_depth(inode);
3025        ex = path[depth].p_ext;
3026        ee_block = le32_to_cpu(ex->ee_block);
3027        ee_len = ext4_ext_get_actual_len(ex);
3028        newblock = split - ee_block + ext4_ext_pblock(ex);
3029
3030        BUG_ON(split < ee_block || split >= (ee_block + ee_len));
3031        BUG_ON(!ext4_ext_is_uninitialized(ex) &&
3032               split_flag & (EXT4_EXT_MAY_ZEROOUT |
3033                             EXT4_EXT_MARK_UNINIT1 |
3034                             EXT4_EXT_MARK_UNINIT2));
3035
3036        err = ext4_ext_get_access(handle, inode, path + depth);
3037        if (err)
3038                goto out;
3039
3040        if (split == ee_block) {
3041                /*
3042                 * case b: block @split is the block that the extent begins with,
3043                 * so we just change the state of the extent, and splitting
3044                 * is not needed.
3045                 */
3046                if (split_flag & EXT4_EXT_MARK_UNINIT2)
3047                        ext4_ext_mark_uninitialized(ex);
3048                else
3049                        ext4_ext_mark_initialized(ex);
3050
3051                if (!(flags & EXT4_GET_BLOCKS_PRE_IO))
3052                        ext4_ext_try_to_merge(handle, inode, path, ex);
3053
3054                err = ext4_ext_dirty(handle, inode, path + path->p_depth);
3055                goto out;
3056        }
3057
3058        /* case a */
3059        memcpy(&orig_ex, ex, sizeof(orig_ex));
3060        ex->ee_len = cpu_to_le16(split - ee_block);
3061        if (split_flag & EXT4_EXT_MARK_UNINIT1)
3062                ext4_ext_mark_uninitialized(ex);
3063
3064        /*
3065         * path may lead to a new leaf, not to the original leaf any more,
3066         * after ext4_ext_insert_extent() returns.
3067         */
3068        err = ext4_ext_dirty(handle, inode, path + depth);
3069        if (err)
3070                goto fix_extent_len;
3071
3072        ex2 = &newex;
3073        ex2->ee_block = cpu_to_le32(split);
3074        ex2->ee_len   = cpu_to_le16(ee_len - (split - ee_block));
3075        ext4_ext_store_pblock(ex2, newblock);
3076        if (split_flag & EXT4_EXT_MARK_UNINIT2)
3077                ext4_ext_mark_uninitialized(ex2);
3078
3079        err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
3080        if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
3081                if (split_flag & (EXT4_EXT_DATA_VALID1|EXT4_EXT_DATA_VALID2)) {
3082                        if (split_flag & EXT4_EXT_DATA_VALID1) {
3083                                err = ext4_ext_zeroout(inode, ex2);
3084                                zero_ex.ee_block = ex2->ee_block;
3085                                zero_ex.ee_len = cpu_to_le16(
3086                                                ext4_ext_get_actual_len(ex2));
3087                                ext4_ext_store_pblock(&zero_ex,
3088                                                      ext4_ext_pblock(ex2));
3089                        } else {
3090                                err = ext4_ext_zeroout(inode, ex);
3091                                zero_ex.ee_block = ex->ee_block;
3092                                zero_ex.ee_len = cpu_to_le16(
3093                                                ext4_ext_get_actual_len(ex));
3094                                ext4_ext_store_pblock(&zero_ex,
3095                                                      ext4_ext_pblock(ex));
3096                        }
3097                } else {
3098                        err = ext4_ext_zeroout(inode, &orig_ex);
3099                        zero_ex.ee_block = orig_ex.ee_block;
3100                        zero_ex.ee_len = cpu_to_le16(
3101                                                ext4_ext_get_actual_len(&orig_ex));
3102                        ext4_ext_store_pblock(&zero_ex,
3103                                              ext4_ext_pblock(&orig_ex));
3104                }
3105
3106                if (err)
3107                        goto fix_extent_len;
3108                /* update the extent length and mark as initialized */
3109                ex->ee_len = cpu_to_le16(ee_len);
3110                ext4_ext_try_to_merge(handle, inode, path, ex);
3111                err = ext4_ext_dirty(handle, inode, path + path->p_depth);
3112                if (err)
3113                        goto fix_extent_len;
3114
3115                /* update extent status tree */
3116                err = ext4_es_zeroout(inode, &zero_ex);
3117
3118                goto out;
3119        } else if (err)
3120                goto fix_extent_len;
3121
3122out:
3123        ext4_ext_show_leaf(inode, path);
3124        return err;
3125
3126fix_extent_len:
3127        ex->ee_len = orig_ex.ee_len;
3128        ext4_ext_dirty(handle, inode, path + depth);
3129        return err;
3130}
3131
3132/*
3133 * ext4_split_extent() splits an extent and marks the extent which is covered
3134 * by @map as @split_flag indicates.
3135 *
3136 * It may result in splitting the extent into multiple extents (up to three).
3137 * There are three possibilities:
3138 *   a> There is no split required
3139 *   b> Splits in two extents: Split is happening at either end of the extent
3140 *   c> Splits in three extents: Someone is splitting in the middle of the extent
3141 *
3142 */
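    /*
     * Illustrative example (added, not from the original source): for an
     * extent covering blocks [100..149] and a map with m_lblk == 110 and
     * m_len == 20, the first call to ext4_split_extent_at() below splits at
     * m_lblk + m_len == 130, and the second splits at m_lblk == 110, giving
     * the three extents [100..109], [110..129] and [130..149] (case c above).
     */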
3143static int ext4_split_extent(handle_t *handle,
3144                              struct inode *inode,
3145                              struct ext4_ext_path *path,
3146                              struct ext4_map_blocks *map,
3147                              int split_flag,
3148                              int flags)
3149{
3150        ext4_lblk_t ee_block;
3151        struct ext4_extent *ex;
3152        unsigned int ee_len, depth;
3153        int err = 0;
3154        int uninitialized;
3155        int split_flag1, flags1;
3156        int allocated = map->m_len;
3157
3158        depth = ext_depth(inode);
3159        ex = path[depth].p_ext;
3160        ee_block = le32_to_cpu(ex->ee_block);
3161        ee_len = ext4_ext_get_actual_len(ex);
3162        uninitialized = ext4_ext_is_uninitialized(ex);
3163
3164        if (map->m_lblk + map->m_len < ee_block + ee_len) {
3165                split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT;
3166                flags1 = flags | EXT4_GET_BLOCKS_PRE_IO;
3167                if (uninitialized)
3168                        split_flag1 |= EXT4_EXT_MARK_UNINIT1 |
3169                                       EXT4_EXT_MARK_UNINIT2;
3170                if (split_flag & EXT4_EXT_DATA_VALID2)
3171                        split_flag1 |= EXT4_EXT_DATA_VALID1;
3172                err = ext4_split_extent_at(handle, inode, path,
3173                                map->m_lblk + map->m_len, split_flag1, flags1);
3174                if (err)
3175                        goto out;
3176        } else {
3177                allocated = ee_len - (map->m_lblk - ee_block);
3178        }
3179        /*
3180         * Update path is required because previous ext4_split_extent_at() may
3181         * result in split of original leaf or extent zeroout.
3182         */
3183        ext4_ext_drop_refs(path);
3184        path = ext4_ext_find_extent(inode, map->m_lblk, path);
3185        if (IS_ERR(path))
3186                return PTR_ERR(path);
3187        depth = ext_depth(inode);
3188        ex = path[depth].p_ext;
3189        uninitialized = ext4_ext_is_uninitialized(ex);
3190        split_flag1 = 0;
3191
3192        if (map->m_lblk >= ee_block) {
3193                split_flag1 = split_flag & EXT4_EXT_DATA_VALID2;
3194                if (uninitialized) {
3195                        split_flag1 |= EXT4_EXT_MARK_UNINIT1;
3196                        split_flag1 |= split_flag & (EXT4_EXT_MAY_ZEROOUT |
3197                                                     EXT4_EXT_MARK_UNINIT2);
3198                }
3199                err = ext4_split_extent_at(handle, inode, path,
3200                                map->m_lblk, split_flag1, flags);
3201                if (err)
3202                        goto out;
3203        }
3204
3205        ext4_ext_show_leaf(inode, path);
3206out:
3207        return err ? err : allocated;
3208}
3209
3210/*
3211 * This function is called by ext4_ext_map_blocks() if someone tries to write
3212 * to an uninitialized extent. It may result in splitting the uninitialized
3213 * extent into multiple extents (up to three - one initialized and two
3214 * uninitialized).
3215 * There are three possibilities:
3216 *   a> There is no split required: Entire extent should be initialized
3217 *   b> Splits in two extents: Write is happening at either end of the extent
3218 *   c> Splits in three extents: Someone is writing in the middle of the extent
3219 *
3220 * Pre-conditions:
3221 *  - The extent pointed to by 'path' is uninitialized.
3222 *  - The extent pointed to by 'path' contains a superset
3223 *    of the logical span [map->m_lblk, map->m_lblk + map->m_len).
3224 *
3225 * Post-conditions on success:
3226 *  - the returned value is the number of blocks beyond map->m_lblk
3227 *    that are allocated and initialized.
3228 *    It is guaranteed to be >= map->m_len.
3229 */
3230static int ext4_ext_convert_to_initialized(handle_t *handle,
3231                                           struct inode *inode,
3232                                           struct ext4_map_blocks *map,
3233                                           struct ext4_ext_path *path,
3234                                           int flags)
3235{
3236        struct ext4_sb_info *sbi;
3237        struct ext4_extent_header *eh;
3238        struct ext4_map_blocks split_map;
3239        struct ext4_extent zero_ex;
3240        struct ext4_extent *ex, *abut_ex;
3241        ext4_lblk_t ee_block, eof_block;
3242        unsigned int ee_len, depth, map_len = map->m_len;
3243        int allocated = 0, max_zeroout = 0;
3244        int err = 0;
3245        int split_flag = 0;
3246
3247        ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical "
3248                "block %llu, max_blocks %u\n", inode->i_ino,
3249                (unsigned long long)map->m_lblk, map_len);
3250
3251        sbi = EXT4_SB(inode->i_sb);
3252        eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
3253                inode->i_sb->s_blocksize_bits;
3254        if (eof_block < map->m_lblk + map_len)
3255                eof_block = map->m_lblk + map_len;
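            /*
             * Worked example (added): with 4k blocks (s_blocksize_bits == 12)
             * and i_size == 10000 bytes, eof_block == (10000 + 4095) >> 12
             * == 3; it is then raised to m_lblk + map_len when the write
             * extends past the current end of file.
             */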
3256
3257        depth = ext_depth(inode);
3258        eh = path[depth].p_hdr;
3259        ex = path[depth].p_ext;
3260        ee_block = le32_to_cpu(ex->ee_block);
3261        ee_len = ext4_ext_get_actual_len(ex);
3262        zero_ex.ee_len = 0;
3263
3264        trace_ext4_ext_convert_to_initialized_enter(inode, map, ex);
3265
3266        /* Pre-conditions */
3267        BUG_ON(!ext4_ext_is_uninitialized(ex));
3268        BUG_ON(!in_range(map->m_lblk, ee_block, ee_len));
3269
3270        /*
3271         * Attempt to transfer newly initialized blocks from the currently
3272         * uninitialized extent to its neighbor. This is much cheaper
3273         * than an insertion followed by a merge as those involve costly
3274         * memmove() calls. Transferring to the left is the common case in
3275         * steady state for workloads doing fallocate(FALLOC_FL_KEEP_SIZE)
3276         * followed by append writes.
3277         *
3278         * Limitations of the current logic:
3279         *  - L1: we do not deal with writes covering the whole extent.
3280         *    This would require removing the extent if the transfer
3281         *    is possible.
3282         *  - L2: we only attempt to merge with an extent stored in the
3283         *    same extent tree node.
3284         */
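            /*
             * Illustrative example (added, not from the original source): with
             * abut_ex covering [90..99] (initialized) and ex covering
             * [100..149] (uninitialized), a write of map_len == 20 blocks
             * starting at block 100 satisfies C1-C4 below if the two extents
             * are also physically contiguous; the fast path then grows
             * abut_ex to [90..119] and shrinks ex to the uninitialized
             * remainder [120..149], with no extent insertion needed.
             */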
3285        if ((map->m_lblk == ee_block) &&
3286                /* See if we can merge left */
3287                (map_len < ee_len) &&           /*L1*/
3288                (ex > EXT_FIRST_EXTENT(eh))) {  /*L2*/
3289                ext4_lblk_t prev_lblk;
3290                ext4_fsblk_t prev_pblk, ee_pblk;
3291                unsigned int prev_len;
3292
3293                abut_ex = ex - 1;
3294                prev_lblk = le32_to_cpu(abut_ex->ee_block);
3295                prev_len = ext4_ext_get_actual_len(abut_ex);
3296                prev_pblk = ext4_ext_pblock(abut_ex);
3297                ee_pblk = ext4_ext_pblock(ex);
3298
3299                /*
3300                 * A transfer of blocks from 'ex' to 'abut_ex' is allowed
3301                 * under these conditions:
3302                 * - C1: abut_ex is initialized,
3303                 * - C2: abut_ex is logically abutting ex,
3304                 * - C3: abut_ex is physically abutting ex,
3305                 * - C4: abut_ex can receive the additional blocks without
3306                 *   overflowing the (initialized) length limit.
3307                 */
3308                if ((!ext4_ext_is_uninitialized(abut_ex)) &&            /*C1*/
3309                        ((prev_lblk + prev_len) == ee_block) &&         /*C2*/
3310                        ((prev_pblk + prev_len) == ee_pblk) &&          /*C3*/
3311                        (prev_len < (EXT_INIT_MAX_LEN - map_len))) {    /*C4*/
3312                        err = ext4_ext_get_access(handle, inode, path + depth);
3313                        if (err)
3314                                goto out;
3315
3316                        trace_ext4_ext_convert_to_initialized_fastpath(inode,
3317                                map, ex, abut_ex);
3318
3319                        /* Shift the start of ex by 'map_len' blocks */
3320                        ex->ee_block = cpu_to_le32(ee_block + map_len);
3321                        ext4_ext_store_pblock(ex, ee_pblk + map_len);
3322                        ex->ee_len = cpu_to_le16(ee_len - map_len);
3323                        ext4_ext_mark_uninitialized(ex); /* Restore the flag */
3324
3325                        /* Extend abut_ex by 'map_len' blocks */
3326                        abut_ex->ee_len = cpu_to_le16(prev_len + map_len);
3327
3328                        /* Result: number of initialized blocks past m_lblk */
3329                        allocated = map_len;
3330                }
3331        } else if (((map->m_lblk + map_len) == (ee_block + ee_len)) &&
3332                   (map_len < ee_len) &&        /*L1*/
3333                   ex < EXT_LAST_EXTENT(eh)) {  /*L2*/
3334                /* See if we can merge right */
3335                ext4_lblk_t next_lblk;
3336                ext4_fsblk_t next_pblk, ee_pblk;
3337                unsigned int next_len;
3338
3339                abut_ex = ex + 1;
3340                next_lblk = le32_to_cpu(abut_ex->ee_block);
3341                next_len = ext4_ext_get_actual_len(abut_ex);
3342                next_pblk = ext4_ext_pblock(abut_ex);
3343                ee_pblk = ext4_ext_pblock(ex);
3344
3345                /*
3346                 * A transfer of blocks from 'ex' to 'abut_ex' is allowed
3347                 * under these conditions:
3348                 * - C1: abut_ex is initialized,
3349                 * - C2: abut_ex is logically abutting ex,
3350                 * - C3: abut_ex is physically abutting ex,
3351                 * - C4: abut_ex can receive the additional blocks without
3352                 *   overflowing the (initialized) length limit.
3353                 */
3354                if ((!ext4_ext_is_uninitialized(abut_ex)) &&            /*C1*/
3355                    ((map->m_lblk + map_len) == next_lblk) &&           /*C2*/
3356                    ((ee_pblk + ee_len) == next_pblk) &&                /*C3*/
3357                    (next_len < (EXT_INIT_MAX_LEN - map_len))) {        /*C4*/
3358                        err = ext4_ext_get_access(handle, inode, path + depth);
3359                        if (err)
3360                                goto out;
3361
3362                        trace_ext4_ext_convert_to_initialized_fastpath(inode,
3363                                map, ex, abut_ex);
3364
3365                        /* Shift the start of abut_ex by 'map_len' blocks */
3366                        abut_ex->ee_block = cpu_to_le32(next_lblk - map_len);
3367                        ext4_ext_store_pblock(abut_ex, next_pblk - map_len);
3368                        ex->ee_len = cpu_to_le16(ee_len - map_len);
3369                        ext4_ext_mark_uninitialized(ex); /* Restore the flag */
3370
3371                        /* Extend abut_ex by 'map_len' blocks */
3372                        abut_ex->ee_len = cpu_to_le16(next_len + map_len);
3373
3374                        /* Result: number of initialized blocks past m_lblk */
3375                        allocated = map_len;
3376                }
3377        }
3378        if (allocated) {
3379                /* Mark the block containing both extents as dirty */
3380                ext4_ext_dirty(handle, inode, path + depth);
3381
3382                /* Update path to point to the right extent */
3383                path[depth].p_ext = abut_ex;
3384                goto out;
3385        } else
3386                allocated = ee_len - (map->m_lblk - ee_block);
3387
3388        WARN_ON(map->m_lblk < ee_block);
3389        /*
3390         * It is safe to convert extent to initialized via explicit
3391         * zeroout only if the extent is fully inside i_size or new_size.
3392         */
3393        split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
3394
3395        if (EXT4_EXT_MAY_ZEROOUT & split_flag)
3396                max_zeroout = sbi->s_extent_max_zeroout_kb >>
3397                        (inode->i_sb->s_blocksize_bits - 10);
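            /*
             * Worked example (added): with s_extent_max_zeroout_kb == 32 and
             * 4k blocks (s_blocksize_bits == 12), max_zeroout == 32 >> 2 == 8,
             * i.e. extents of up to 8 blocks are zeroed out rather than split.
             */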
3398
3399        /* If extent is less than s_extent_max_zeroout_kb, zeroout directly */
3400        if (max_zeroout && (ee_len <= max_zeroout)) {
3401                err = ext4_ext_zeroout(inode, ex);
3402                if (err)
3403                        goto out;
3404                zero_ex.ee_block = ex->ee_block;
3405                zero_ex.ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex));
3406                ext4_ext_store_pblock(&zero_ex, ext4_ext_pblock(ex));
3407
3408                err = ext4_ext_get_access(handle, inode, path + depth);
3409                if (err)
3410                        goto out;
3411                ext4_ext_mark_initialized(ex);
3412                ext4_ext_try_to_merge(handle, inode, path, ex);
3413                err = ext4_ext_dirty(handle, inode, path + path->p_depth);
3414                goto out;
3415        }
3416
3417        /*
3418         * four cases:
3419         * 1. split the extent into three extents.
3420         * 2. split the extent into two extents, zeroout the first half.
3421         * 3. split the extent into two extents, zeroout the second half.
3422         * 4. split the extent into two extents without zeroout.
3423         */
3424        split_map.m_lblk = map->m_lblk;
3425        split_map.m_len = map->m_len;
3426
3427        if (max_zeroout && (allocated > map->m_len)) {
3428                if (allocated <= max_zeroout) {
3429                        /* case 3 */
3430                        zero_ex.ee_block =
3431                                         cpu_to_le32(map->m_lblk);
3432                        zero_ex.ee_len = cpu_to_le16(allocated);
3433                        ext4_ext_store_pblock(&zero_ex,
3434                                ext4_ext_pblock(ex) + map->m_lblk - ee_block);
3435                        err = ext4_ext_zeroout(inode, &zero_ex);
3436                        if (err)
3437                                goto out;
3438                        split_map.m_lblk = map->m_lblk;
3439                        split_map.m_len = allocated;
3440                } else if (map->m_lblk - ee_block + map->m_len < max_zeroout) {
3441                        /* case 2 */
3442                        if (map->m_lblk != ee_block) {
3443                                zero_ex.ee_block = ex->ee_block;
3444                                zero_ex.ee_len = cpu_to_le16(map->m_lblk -
3445                                                        ee_block);
3446                                ext4_ext_store_pblock(&zero_ex,
3447                                                      ext4_ext_pblock(ex));
3448                                err = ext4_ext_zeroout(inode, &zero_ex);
3449                                if (err)
3450                                        goto out;
3451                        }
3452
3453                        split_map.m_lblk = ee_block;
3454                        split_map.m_len = map->m_lblk - ee_block + map->m_len;
3455                        allocated = map->m_len;
3456                }
3457        }
3458
3459        allocated = ext4_split_extent(handle, inode, path,
3460                                      &split_map, split_flag, flags);
3461        if (allocated < 0)
3462                err = allocated;
3463
3464out:
3465        /* If we have gotten a failure, don't zero out status tree */
3466        if (!err)
3467                err = ext4_es_zeroout(inode, &zero_ex);
3468        return err ? err : allocated;
3469}
3470
3471/*
3472 * This function is called by ext4_ext_map_blocks() from
3473 * ext4_get_blocks_dio_write() when DIO is used to write
3474 * to an uninitialized extent.
3475 *
3476 * Writing to an uninitialized extent may result in splitting the uninitialized
3477 * extent into multiple initialized/uninitialized extents (up to three)
3478 * There are three possibilities:
3479 *   a> There is no split required: Entire extent should be uninitialized
3480 *   b> Splits in two extents: Write is happening at either end of the extent
3481 *   c> Splits in three extents: Someone is writing in the middle of the extent
3482 *
3483 * One or more index blocks may be needed if the extent tree grows after
3484 * the uninitialized extent is split. To prevent ENOSPC from occurring when
3485 * the IO completes, we need to split the uninitialized extent before the
3486 * IO is submitted. The uninitialized extent passed in will be split into
3487 * at most three uninitialized extents. After the IO completes, the part
3488 * being filled will be converted to initialized by the end_io callback
3489 * via ext4_convert_unwritten_extents().
3490 *
3491 * Returns the size of the uninitialized extent to be written on success.
3492 */
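    /*
     * Illustrative example (added, not from the original source): for a DIO
     * write into the middle of an uninitialized extent covering blocks
     * [100..149], say at blocks 110..129, the extent is pre-split into the
     * three uninitialized extents [100..109], [110..129] and [130..149];
     * only the middle one is later converted to initialized by the end_io
     * callback once the IO completes.
     */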
3493static int ext4_split_unwritten_extents(handle_t *handle,
3494                                        struct inode *inode,
3495                                        struct ext4_map_blocks *map,
3496                                        struct ext4_ext_path *path,
3497                                        int flags)
3498{
3499        ext4_lblk_t eof_block;
3500        ext4_lblk_t ee_block;
3501        struct ext4_extent *ex;
3502        unsigned int ee_len;
3503        int split_flag = 0, depth;
3504
3505        ext_debug("ext4_split_unwritten_extents: inode %lu, logical "
3506                "block %llu, max_blocks %u\n", inode->i_ino,
3507                (unsigned long long)map->m_lblk, map->m_len);
3508
3509        eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
3510                inode->i_sb->s_blocksize_bits;
3511        if (eof_block < map->m_lblk + map->m_len)
3512                eof_block = map->m_lblk + map->m_len;
3513        /*
3514         * It is safe to convert extent to initialized via explicit
3515         * zeroout only if the extent is fully inside i_size or new_size.
3516         */
3517        depth = ext_depth(inode);
3518        ex = path[depth].p_ext;
3519        ee_block = le32_to_cpu(ex->ee_block);
3520        ee_len = ext4_ext_get_actual_len(ex);
3521
3522        split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
3523        split_flag |= EXT4_EXT_MARK_UNINIT2;
3524        if (flags & EXT4_GET_BLOCKS_CONVERT)
3525                split_flag |= EXT4_EXT_DATA_VALID2;
3526        flags |= EXT4_GET_BLOCKS_PRE_IO;
3527        return ext4_split_extent(handle, inode, path, map, split_flag, flags);
3528}
3529
3530static int ext4_convert_unwritten_extents_endio(handle_t *handle,
3531                                                struct inode *inode,
3532                                                struct ext4_map_blocks *map,
3533                                                struct ext4_ext_path *path)
3534{
3535        struct ext4_extent *ex;
3536        ext4_lblk_t ee_block;
3537        unsigned int ee_len;
3538        int depth;
3539        int err = 0;
3540
3541        depth = ext_depth(inode);
3542        ex = path[depth].p_ext;
3543        ee_block = le32_to_cpu(ex->ee_block);
3544        ee_len = ext4_ext_get_actual_len(ex);
3545
3546        ext_debug("ext4_convert_unwritten_extents_endio: inode %lu, logical "
3547                "block %llu, max_blocks %u\n", inode->i_ino,
3548                  (unsigned long long)ee_block, ee_len);
3549
3550        /* If the extent is larger than requested, it is a clear sign that we
3551         * still have some extent state machine issues left. So an extent split
3552         * is still required.
3553         * TODO: Once all related issues are fixed, this situation should be
3554         * illegal.
3555         */
3556        if (ee_block != map->m_lblk || ee_len > map->m_len) {
3557#ifdef EXT4_DEBUG
3558                ext4_warning("Inode (%ld) finished: extent logical block %llu,"
3559                             " len %u; IO logical block %llu, len %u\n",
3560                             inode->i_ino, (unsigned long long)ee_block, ee_len,
3561                             (unsigned long long)map->m_lblk, map->m_len);
3562#endif
3563                err = ext4_split_unwritten_extents(handle, inode, map, path,
3564                                                   EXT4_GET_BLOCKS_CONVERT);
3565                if (err < 0)
3566                        goto out;
3567                ext4_ext_drop_refs(path);
3568                path = ext4_ext_find_extent(inode, map->m_lblk, path);
3569                if (IS_ERR(path)) {
3570                        err = PTR_ERR(path);
3571                        goto out;
3572                }
3573                depth = ext_depth(inode);
3574                ex = path[depth].p_ext;
3575        }
3576
3577        err = ext4_ext_get_access(handle, inode, path + depth);
3578        if (err)
3579                goto out;
3580        /* first mark the extent as initialized */
3581        ext4_ext_mark_initialized(ex);
3582
3583        /* note: ext4_ext_correct_indexes() isn't needed here because
3584         * borders are not changed
3585         */
3586        ext4_ext_try_to_merge(handle, inode, path, ex);
3587
3588        /* Mark modified extent as dirty */
3589        err = ext4_ext_dirty(handle, inode, path + path->p_depth);
3590out:
3591        ext4_ext_show_leaf(inode, path);
3592        return err;
3593}
3594
3595static void unmap_underlying_metadata_blocks(struct block_device *bdev,
3596                        sector_t block, int count)
3597{
3598        int i;
3599        for (i = 0; i < count; i++)
3600                unmap_underlying_metadata(bdev, block + i);
3601}
3602
3603/*
3604 * Handle EOFBLOCKS_FL flag, clearing it if necessary
3605 */
3606static int check_eofblocks_fl(handle_t *handle, struct inode *inode,
3607                              ext4_lblk_t lblk,
3608                              struct ext4_ext_path *path,
3609                              unsigned int len)
3610{
3611        int i, depth;
3612        struct ext4_extent_header *eh;
3613        struct ext4_extent *last_ex;
3614
3615        if (!ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))
3616                return 0;
3617
3618        depth = ext_depth(inode);
3619        eh = path[depth].p_hdr;
3620
3621        /*
3622         * We're going to remove EOFBLOCKS_FL entirely in future so we
3623         * do not care for this case anymore. Simply remove the flag
3624         * if there are no extents.
3625         */
3626        if (unlikely(!eh->eh_entries))
3627                goto out;
3628        last_ex = EXT_LAST_EXTENT(eh);
3629        /*
3630         * We should clear the EOFBLOCKS_FL flag if we are writing the
3631         * last block in the last extent in the file.  We test this by
3632         * first checking to see if the caller to
3633         * ext4_ext_get_blocks() was interested in the last block (or
3634         * a block beyond the last block) in the current extent.  If
3635         * this turns out to be false, we can bail out from this
3636         * function immediately.
3637         */
3638        if (lblk + len < le32_to_cpu(last_ex->ee_block) +
3639            ext4_ext_get_actual_len(last_ex))
3640                return 0;
3641        /*
3642         * If the caller does appear to be planning to write at or
3643         * beyond the end of the current extent, we then test to see
3644         * if the current extent is the last extent in the file, by
3645         * checking to make sure it was reached via the rightmost node
3646         * at each level of the tree.
3647         */
3648        for (i = depth-1; i >= 0; i--)
3649                if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr))
3650                        return 0;
3651out:
3652        ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
3653        return ext4_mark_inode_dirty(handle, inode);
3654}
3655
3656/**
3657 * ext4_find_delalloc_range: find delayed allocated block in the given range.
3658 *
3659 * Return 1 if there is a delalloc block in the range, otherwise 0.
3660 */
3661int ext4_find_delalloc_range(struct inode *inode,
3662                             ext4_lblk_t lblk_start,
3663                             ext4_lblk_t lblk_end)
3664{
3665        struct extent_status es;
3666
3667        ext4_es_find_delayed_extent_range(inode, lblk_start, lblk_end, &es);
3668        if (es.es_len == 0)
3669                return 0; /* there is no delayed extent in this tree */
3670        else if (es.es_lblk <= lblk_start &&
3671                 lblk_start < es.es_lblk + es.es_len)
3672                return 1;
3673        else if (lblk_start <= es.es_lblk && es.es_lblk <= lblk_end)
3674                return 1;
3675        else
3676                return 0;
3677}
3678
3679int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk)
3680{
3681        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
3682        ext4_lblk_t lblk_start, lblk_end;
3683        lblk_start = lblk & (~(sbi->s_cluster_ratio - 1));
3684        lblk_end = lblk_start + sbi->s_cluster_ratio - 1;
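            /*
             * Worked example (added): with s_cluster_ratio == 16 and
             * lblk == 35, lblk_start == (35 & ~15) == 32 and lblk_end == 47,
             * so the whole cluster containing block 35 is checked for
             * delalloc blocks.
             */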
3685
3686        return ext4_find_delalloc_range(inode, lblk_start, lblk_end);
3687}
3688
3689/**
3690 * Determines how many complete clusters (out of those specified by the 'map')
3691 * are under delalloc and had quota reserved for them.
3692 * This function is called when we are writing out the blocks that were
3693 * originally written with their allocation delayed, but then the space was
3694 * allocated using fallocate() before the delayed allocation could be resolved.
3695 * The cases to look for are:
3696 * ('=' indicates delayed allocated blocks
3697 *  '-' indicates non-delayed allocated blocks)
3698 * (a) partial clusters towards beginning and/or end outside of allocated range
3699 *     are not delalloc'ed.
3700 *      Ex:
3701 *      |----c---=|====c====|====c====|===-c----|
3702 *               |++++++ allocated ++++++|
3703 *      ==> 4 complete clusters in above example
3704 *
3705 * (b) partial cluster (outside of allocated range) towards either end is
3706 *     marked for delayed allocation. In this case, we will exclude that
3707 *     cluster.
3708 *      Ex:
3709 *      |----====c========|========c========|
3710 *           |++++++ allocated ++++++|
3711 *      ==> 1 complete cluster in above example
3712 *
3713 *      Ex:
3714 *      |================c================|
3715 *            |++++++ allocated ++++++|
3716 *      ==> 0 complete clusters in above example
3717 *
3718 * The ext4_da_update_reserve_space will be called only if we
3719 * determine here that there were some "entire" clusters that span
3720 * this 'allocated' range.
3721 * In the non-bigalloc case, this function will just end up returning num_blks
3722 * without ever calling ext4_find_delalloc_range.
3723 */
3724static unsigned int
3725get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start,
3726                           unsigned int num_blks)
3727{
3728        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
3729        ext4_lblk_t alloc_cluster_start, alloc_cluster_end;
3730        ext4_lblk_t lblk_from, lblk_to, c_offset;
3731        unsigned int allocated_clusters = 0;
3732
3733        alloc_cluster_start = EXT4_B2C(sbi, lblk_start);
3734        alloc_cluster_end = EXT4_B2C(sbi, lblk_start + num_blks - 1);
3735
3736        /* max possible clusters for this allocation */
3737        allocated_clusters = alloc_cluster_end - alloc_cluster_start + 1;
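            /*
             * Worked example (added): with s_cluster_ratio == 16,
             * lblk_start == 35 and num_blks == 30, alloc_cluster_start ==
             * EXT4_B2C(sbi, 35) == 2 and alloc_cluster_end ==
             * EXT4_B2C(sbi, 64) == 4, so at most 3 clusters can be counted
             * before the partial-cluster checks below.
             */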
3738
3739        trace_ext4_get_reserved_cluster_alloc(inode, lblk_start, num_blks);
3740
3741        /* Check towards left side */
3742        c_offset = lblk_start & (sbi->s_cluster_ratio - 1);
3743        if (c_offset) {
3744                lblk_from = lblk_start & (~(sbi->s_cluster_ratio - 1));
3745                lblk_to = lblk_from + c_offset - 1;
3746
3747                if (ext4_find_delalloc_range(inode, lblk_from, lblk_to))
3748                        allocated_clusters--;
3749        }
3750
3751        /* Now check towards right. */
3752        c_offset = (lblk_start + num_blks) & (sbi->s_cluster_ratio - 1);
3753        if (allocated_clusters && c_offset) {
3754                lblk_from = lblk_start + num_blks;
3755                lblk_to = lblk_from + (sbi->s_cluster_ratio - c_offset) - 1;
3756
3757                if (ext4_find_delalloc_range(inode, lblk_from, lblk_to))
3758                        allocated_clusters--;
3759        }
3760
3761        return allocated_clusters;
3762}
3763
3764static int
3765ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3766                        struct ext4_map_blocks *map,
3767                        struct ext4_ext_path *path, int flags,
3768                        unsigned int allocated, ext4_fsblk_t newblock)
3769{
3770        int ret = 0;
3771        int err = 0;
3772        ext4_io_end_t *io = ext4_inode_aio(inode);
3773
3774        ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical "
3775                  "block %llu, max_blocks %u, flags %x, allocated %u\n",
3776                  inode->i_ino, (unsigned long long)map->m_lblk, map->m_len,
3777                  flags, allocated);
3778        ext4_ext_show_leaf(inode, path);
3779
3780        /*
3781         * When writing into uninitialized space, we should not fail to
3782         * allocate metadata blocks for the new extent block if needed.
3783         */
3784        flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL;
3785
3786        trace_ext4_ext_handle_uninitialized_extents(inode, map, flags,
3787                                                    allocated, newblock);
3788
3789        /* get_block() before submitting the IO: split the extent */
3790        if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
3791                ret = ext4_split_unwritten_extents(handle, inode, map,
3792                                                   path, flags);
3793                if (ret <= 0)
3794                        goto out;
3795                /*
3796                 * Flag the inode (non-aio case) or end_io struct (aio case)
3797                 * so that this IO gets converted to written when the IO is
3798                 * completed
3799                 */
3800                if (io)
3801                        ext4_set_io_unwritten_flag(inode, io);
3802                else
3803                        ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
3804                map->m_flags |= EXT4_MAP_UNWRITTEN;
3805                if (ext4_should_dioread_nolock(inode))
3806                        map->m_flags |= EXT4_MAP_UNINIT;
3807                goto out;
3808        }
3809        /* IO end_io complete, convert the filled extent to written */
3810        if ((flags & EXT4_GET_BLOCKS_CONVERT)) {
3811                ret = ext4_convert_unwritten_extents_endio(handle, inode, map,
3812                                                        path);
3813                if (ret >= 0) {
3814                        ext4_update_inode_fsync_trans(handle, inode, 1);
3815                        err = check_eofblocks_fl(handle, inode, map->m_lblk,
3816                                                 path, map->m_len);
3817                } else
3818                        err = ret;
3819                map->m_flags |= EXT4_MAP_MAPPED;
3820                if (allocated > map->m_len)
3821                        allocated = map->m_len;
3822                map->m_len = allocated;
3823                goto out2;
3824        }
3825        /* buffered IO case */
3826        /*
3827         * repeated fallocate creation request:
3828         * we already have an unwritten extent
3829         */
3830        if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) {
3831                map->m_flags |= EXT4_MAP_UNWRITTEN;
3832                goto map_out;
3833        }
3834
3835        /* buffered READ or buffered write_begin() lookup */
3836        if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
3837                /*
3838                 * We have blocks reserved already.  We
3839                 * return allocated blocks so that delalloc
3840                 * won't do block reservation for us.  But
3841                 * the buffer head will be unmapped so that
3842                 * a read from the block returns 0s.
3843                 */
3844                map->m_flags |= EXT4_MAP_UNWRITTEN;
3845                goto out1;
3846        }
3847
3848        /* buffered write, writepage time, convert */
3849        ret = ext4_ext_convert_to_initialized(handle, inode, map, path, flags);
3850        if (ret >= 0)
3851                ext4_update_inode_fsync_trans(handle, inode, 1);
3852out:
3853        if (ret <= 0) {
3854                err = ret;
3855                goto out2;
3856        } else
3857                allocated = ret;
3858        map->m_flags |= EXT4_MAP_NEW;
3859        /*
3860         * if we allocated more blocks than requested
3861         * we need to make sure we unmap the extra blocks
3862         * allocated. The blocks actually needed will get
3863         * unmapped later when we find the buffer_head marked
3864         * new.
3865         */
3866        if (allocated > map->m_len) {
3867                unmap_underlying_metadata_blocks(inode->i_sb->s_bdev,
3868                                        newblock + map->m_len,
3869                                        allocated - map->m_len);
3870                allocated = map->m_len;
3871        }
3872        map->m_len = allocated;
3873
3874        /*
3875         * If we have done fallocate with the offset that is already
3876         * delayed allocated, we would have block reservation
3877         * and quota reservation done in the delayed write path.
3878         * But fallocate would have already updated quota and block
3879         * count for this offset. So cancel these reservations.
3880         */
3881        if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
3882                unsigned int reserved_clusters;
3883                reserved_clusters = get_reserved_cluster_alloc(inode,
3884                                map->m_lblk, map->m_len);
3885                if (reserved_clusters)
3886                        ext4_da_update_reserve_space(inode,
3887                                                     reserved_clusters,
3888                                                     0);
3889        }
3890
3891map_out:
3892        map->m_flags |= EXT4_MAP_MAPPED;
3893        if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0) {
3894                err = check_eofblocks_fl(handle, inode, map->m_lblk, path,
3895                                         map->m_len);
3896                if (err < 0)
3897                        goto out2;
3898        }
3899out1:
3900        if (allocated > map->m_len)
3901                allocated = map->m_len;
3902        ext4_ext_show_leaf(inode, path);
3903        map->m_pblk = newblock;
3904        map->m_len = allocated;
3905out2:
3906        if (path) {
3907                ext4_ext_drop_refs(path);
3908                kfree(path);
3909        }
3910        return err ? err : allocated;
3911}
3912
3913/*
3914 * get_implied_cluster_alloc - check to see if the requested
3915 * allocation (in the map structure) overlaps with a cluster already
3916 * allocated in an extent.
3917 *      @sb     The filesystem superblock structure
3918 *      @map    The requested lblk->pblk mapping
3919 *      @ex     The extent structure which might contain an implied
3920 *                      cluster allocation
3921 *
3922 * This function is called by ext4_ext_map_blocks() after we failed to
3923 * find blocks that were already in the inode's extent tree.  Hence,
3924 * we know that the beginning of the requested region cannot overlap
3925 * the extent from the inode's extent tree.  There are three cases we
3926 * want to catch.  The first is this case:
3927 *
3928 *               |--- cluster # N--|
3929 *    |--- extent ---|  |---- requested region ---|
3930 *                      |==========|
3931 *
3932 * The second case that we need to test for is this one:
3933 *
3934 *   |--------- cluster # N ----------------|
3935 *         |--- requested region --|   |------- extent ----|
3936 *         |=======================|
3937 *
3938 * The third case is when the requested region lies between two extents
3939 * within the same cluster:
3940 *          |------------- cluster # N-------------|
3941 * |----- ex -----|                  |---- ex_right ----|
3942 *                  |------ requested region ------|
3943 *                  |================|
3944 *
3945 * In each of the above cases, we need to set map->m_pblk and
3946 * map->m_len so that they correspond to the region labelled as
3947 * "|====|" from cluster #N, since it is already in use for data in
3948 * cluster EXT4_B2C(sbi, map->m_lblk).  We will then return 1 to
3949 * signal to ext4_ext_map_blocks() that map->m_pblk should be treated
3950 * as a new "allocated" block region.  Otherwise, we will return 0 and
3951 * ext4_ext_map_blocks() will then allocate one or more new clusters
3952 * by calling ext4_mb_new_blocks().
3953 */
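    /*
     * Illustrative example (added, not from the original source): with a
     * 16-block cluster, an extent covering blocks 32..39 and a request
     * starting at block 44, both live in cluster #2 (the first case above),
     * so the function maps block 44 into the already-allocated cluster:
     * m_pblk is the cluster's physical start plus c_offset == 12, and m_len
     * is clamped to at most the 4 blocks remaining in the cluster.
     */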
3954static int get_implied_cluster_alloc(struct super_block *sb,
3955                                     struct ext4_map_blocks *map,
3956                                     struct ext4_extent *ex,
3957                                     struct ext4_ext_path *path)
3958{
3959        struct ext4_sb_info *sbi = EXT4_SB(sb);
3960        ext4_lblk_t c_offset = map->m_lblk & (sbi->s_cluster_ratio-1);
3961        ext4_lblk_t ex_cluster_start, ex_cluster_end;
3962        ext4_lblk_t rr_cluster_start;
3963        ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
3964        ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
3965        unsigned short ee_len = ext4_ext_get_actual_len(ex);
3966
3967        /* The extent passed in that we are trying to match */
3968        ex_cluster_start = EXT4_B2C(sbi, ee_block);
3969        ex_cluster_end = EXT4_B2C(sbi, ee_block + ee_len - 1);
3970
3971        /* The requested region passed into ext4_map_blocks() */
3972        rr_cluster_start = EXT4_B2C(sbi, map->m_lblk);
3973
3974        if ((rr_cluster_start == ex_cluster_end) ||
3975            (rr_cluster_start == ex_cluster_start)) {
3976                if (rr_cluster_start == ex_cluster_end)
3977                        ee_start += ee_len - 1;
3978                map->m_pblk = (ee_start & ~(sbi->s_cluster_ratio - 1)) +
3979                        c_offset;
3980                map->m_len = min(map->m_len,
3981                                 (unsigned) sbi->s_cluster_ratio - c_offset);
3982                /*
3983                 * Check for and handle this case:
3984                 *
3985                 *   |--------- cluster # N-------------|
3986                 *                     |------- extent ----|
3987                 *         |--- requested region ---|
3988                 *         |===========|
3989                 */
3990
3991                if (map->m_lblk < ee_block)
3992                        map->m_len = min(map->m_len, ee_block - map->m_lblk);
3993
3994                /*
3995                 * Check for the case where there is already another allocated
3996                 * block to the right of 'ex' but before the end of the cluster.
3997                 *
3998                 *          |------------- cluster # N-------------|
3999                 * |----- ex -----|                  |---- ex_right ----|
4000                 *                  |------ requested region ------|
4001                 *                  |================|
4002                 */
4003                if (map->m_lblk > ee_block) {
4004                        ext4_lblk_t next = ext4_ext_next_allocated_block(path);
4005                        map->m_len = min(map->m_len, next - map->m_lblk);
4006                }
4007
4008                trace_ext4_get_implied_cluster_alloc_exit(sb, map, 1);
4009                return 1;
4010        }
4011
4012        trace_ext4_get_implied_cluster_alloc_exit(sb, map, 0);
4013        return 0;
4014}
4015
4016
4017/*
4018 * Block allocation/map/preallocation routine for extents based files
4019 *
4020 *
4021 * Need to be called with
4022 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system blocks
4023 * (i.e., create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem)
4024 *
4025 * return > 0, number of blocks already mapped/allocated
4026 *          if create == 0 and these are pre-allocated blocks
4027 *              buffer head is unmapped
4028 *          otherwise blocks are mapped
4029 *
4030 * return = 0, if plain lookup failed (blocks have not been allocated)
4031 *          buffer head is unmapped
4032 *
4033 * return < 0, error case.
4034 */
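    /*
     * Illustrative usage sketch (added, not from the original source);
     * 'handle', 'inode', 'lblk' and 'len' stand for the caller's journal
     * handle, inode and requested range, and the locking described above is
     * assumed to be held. A lookup-only call could look like:
     *
     *	struct ext4_map_blocks map = { .m_lblk = lblk, .m_len = len };
     *	int ret = ext4_ext_map_blocks(handle, inode, &map, 0);
     *	if (ret > 0)
     *		use map.m_pblk, map.m_len and map.m_flags;
     */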
4035int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
4036                        struct ext4_map_blocks *map, int flags)
4037{
4038        struct ext4_ext_path *path = NULL;
4039        struct ext4_extent newex, *ex, *ex2;
4040        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
4041        ext4_fsblk_t newblock = 0;
4042        int free_on_err = 0, err = 0, depth;
4043        unsigned int allocated = 0, offset = 0;
4044        unsigned int allocated_clusters = 0;
4045        struct ext4_allocation_request ar;
4046        ext4_io_end_t *io = ext4_inode_aio(inode);
4047        ext4_lblk_t cluster_offset;
4048        int set_unwritten = 0;
4049
4050        ext_debug("blocks %u/%u requested for inode %lu\n",
4051                  map->m_lblk, map->m_len, inode->i_ino);
4052        trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
4053
4054        /* find extent for this block */
4055        path = ext4_ext_find_extent(inode, map->m_lblk, NULL);
4056        if (IS_ERR(path)) {
4057                err = PTR_ERR(path);
4058                path = NULL;
4059                goto out2;
4060        }
4061
4062        depth = ext_depth(inode);
4063
4064        /*
4065         * consistent leaf must not be empty;
4066         * this situation is possible, though, _during_ tree modification;
4067         * this is why assert can't be put in ext4_ext_find_extent()
4068         */
4069        if (unlikely(path[depth].p_ext == NULL && depth != 0)) {
4070                EXT4_ERROR_INODE(inode, "bad extent address "
4071                                 "lblock: %lu, depth: %d pblock %lld",
4072                                 (unsigned long) map->m_lblk, depth,
4073                                 path[depth].p_block);
4074                err = -EIO;
4075                goto out2;
4076        }
4077
4078        ex = path[depth].p_ext;
4079        if (ex) {
4080                ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
4081                ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
4082                unsigned short ee_len;
4083
4084                /*
4085                 * Uninitialized extents are treated as holes, except that
4086                 * we split out initialized portions during a write.
4087                 */
4088                ee_len = ext4_ext_get_actual_len(ex);
4089
4090                trace_ext4_ext_show_extent(inode, ee_block, ee_start, ee_len);
4091
4092                /* if found extent covers block, simply return it */
4093                if (in_range(map->m_lblk, ee_block, ee_len)) {
4094                        newblock = map->m_lblk - ee_block + ee_start;
4095                        /* number of remaining blocks in the extent */
4096                        allocated = ee_len - (map->m_lblk - ee_block);
4097                        ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk,
4098                                  ee_block, ee_len, newblock);
4099
4100                        if (!ext4_ext_is_uninitialized(ex))
4101                                goto out;
4102
4103                        allocated = ext4_ext_handle_uninitialized_extents(
4104                                handle, inode, map, path, flags,
4105                                allocated, newblock);
4106                        goto out3;
4107                }
4108        }
4109
4110        if ((sbi->s_cluster_ratio > 1) &&
4111            ext4_find_delalloc_cluster(inode, map->m_lblk))
4112                map->m_flags |= EXT4_MAP_FROM_CLUSTER;
4113
4114        /*
4115         * requested block isn't allocated yet;
4116         * we can't create blocks because the create flag is zero
4117         */
4118        if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
4119                /*
4120                 * put the just-found gap into the cache to speed up
4121                 * subsequent requests
4122                 */
4123                if ((flags & EXT4_GET_BLOCKS_NO_PUT_HOLE) == 0)
4124                        ext4_ext_put_gap_in_cache(inode, path, map->m_lblk);
4125                goto out2;
4126        }
4127
4128        /*
4129         * Okay, we need to do block allocation.
4130         */
4131        map->m_flags &= ~EXT4_MAP_FROM_CLUSTER;
4132        newex.ee_block = cpu_to_le32(map->m_lblk);
4133        cluster_offset = map->m_lblk & (sbi->s_cluster_ratio-1);
4134
4135        /*
4136         * If we are doing bigalloc, check to see if the extent returned
4137         * by ext4_ext_find_extent() implies a cluster we can use.
4138         */
4139        if (cluster_offset && ex &&
4140            get_implied_cluster_alloc(inode->i_sb, map, ex, path)) {
4141                ar.len = allocated = map->m_len;
4142                newblock = map->m_pblk;
4143                map->m_flags |= EXT4_MAP_FROM_CLUSTER;
4144                goto got_allocated_blocks;
4145        }
4146
4147        /* find neighbour allocated blocks */
4148        ar.lleft = map->m_lblk;
4149        err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft);
4150        if (err)
4151                goto out2;
4152        ar.lright = map->m_lblk;
4153        ex2 = NULL;
4154        err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright, &ex2);
4155        if (err)
4156                goto out2;
4157
4158        /* Check if the extent after searching to the right implies a
4159         * cluster we can use. */
4160        if ((sbi->s_cluster_ratio > 1) && ex2 &&
4161            get_implied_cluster_alloc(inode->i_sb, map, ex2, path)) {
4162                ar.len = allocated = map->m_len;
4163                newblock = map->m_pblk;
4164                map->m_flags |= EXT4_MAP_FROM_CLUSTER;
4165                goto got_allocated_blocks;
4166        }
4167
4168        /*
4169         * See if request is beyond maximum number of blocks we can have in
4170         * a single extent. For an initialized extent this limit is
4171         * EXT_INIT_MAX_LEN and for an uninitialized extent this limit is
4172         * EXT_UNINIT_MAX_LEN.
4173         */
4174        if (map->m_len > EXT_INIT_MAX_LEN &&
4175            !(flags & EXT4_GET_BLOCKS_UNINIT_EXT))
4176                map->m_len = EXT_INIT_MAX_LEN;
4177        else if (map->m_len > EXT_UNINIT_MAX_LEN &&
4178                 (flags & EXT4_GET_BLOCKS_UNINIT_EXT))
4179                map->m_len = EXT_UNINIT_MAX_LEN;
4180
4181        /* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */
4182        newex.ee_len = cpu_to_le16(map->m_len);
4183        err = ext4_ext_check_overlap(sbi, inode, &newex, path);
4184        if (err)
4185                allocated = ext4_ext_get_actual_len(&newex);
4186        else
4187                allocated = map->m_len;
4188
4189        /* allocate new block */
4190        ar.inode = inode;
4191        ar.goal = ext4_ext_find_goal(inode, path, map->m_lblk);
4192        ar.logical = map->m_lblk;
4193        /*
4194         * We calculate the offset from the beginning of the cluster
4195         * for the logical block number, since when we allocate a
4196         * physical cluster, the physical block should start at the
4197         * same offset from the beginning of the cluster.  This is
4198         * needed so that future calls to get_implied_cluster_alloc()
4199         * work correctly.
4200         */
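        /*
         * Worked example (hypothetical numbers): with s_cluster_ratio = 16
         * and map->m_lblk = 35, offset = 35 & 15 = 3, so the goal and the
         * logical start below are pulled back by 3 blocks to the cluster
         * boundary at block 32; with allocated = 10, ar.len becomes
         * EXT4_NUM_B2C(sbi, 3 + 10) = 1 cluster.
         */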
4201        offset = map->m_lblk & (sbi->s_cluster_ratio - 1);
4202        ar.len = EXT4_NUM_B2C(sbi, offset+allocated);
4203        ar.goal -= offset;
4204        ar.logical -= offset;
4205        if (S_ISREG(inode->i_mode))
4206                ar.flags = EXT4_MB_HINT_DATA;
4207        else
4208                /* disable in-core preallocation for non-regular files */
4209                ar.flags = 0;
4210        if (flags & EXT4_GET_BLOCKS_NO_NORMALIZE)
4211                ar.flags |= EXT4_MB_HINT_NOPREALLOC;
4212        newblock = ext4_mb_new_blocks(handle, &ar, &err);
4213        if (!newblock)
4214                goto out2;
4215        ext_debug("allocate new block: goal %llu, found %llu/%u\n",
4216                  ar.goal, newblock, allocated);
4217        free_on_err = 1;
4218        allocated_clusters = ar.len;
4219        ar.len = EXT4_C2B(sbi, ar.len) - offset;
4220        if (ar.len > allocated)
4221                ar.len = allocated;
4222
4223got_allocated_blocks:
4224        /* try to insert new extent into found leaf and return */
4225        ext4_ext_store_pblock(&newex, newblock + offset);
4226        newex.ee_len = cpu_to_le16(ar.len);
4227        /* Mark uninitialized */
4228        if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){
4229                ext4_ext_mark_uninitialized(&newex);
4230                map->m_flags |= EXT4_MAP_UNWRITTEN;
4231                /*
4232                 * An io_end structure is created for every IO write to an
4233                 * uninitialized extent. To avoid unnecessary conversions,
4234                 * here we flag only the IO that really needs the conversion.
4235                 * For the non-async direct IO case, flag the inode state
4236                 * instead, so that the conversion is performed when IO is done.
4237                 */
4238                if ((flags & EXT4_GET_BLOCKS_PRE_IO))
4239                        set_unwritten = 1;
4240                if (ext4_should_dioread_nolock(inode))
4241                        map->m_flags |= EXT4_MAP_UNINIT;
4242        }
4243
4244        err = 0;
4245        if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0)
4246                err = check_eofblocks_fl(handle, inode, map->m_lblk,
4247                                         path, ar.len);
4248        if (!err)
4249                err = ext4_ext_insert_extent(handle, inode, path,
4250                                             &newex, flags);
4251
4252        if (!err && set_unwritten) {
4253                if (io)
4254                        ext4_set_io_unwritten_flag(inode, io);
4255                else
4256                        ext4_set_inode_state(inode,
4257                                             EXT4_STATE_DIO_UNWRITTEN);
4258        }
4259
4260        if (err && free_on_err) {
4261                int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ?
4262                        EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0;
4263                /* free data blocks we just allocated */
4264                /* not a good idea to call discard here directly,
4265                 * but otherwise we'd need to call it on every free() */
4266                ext4_discard_preallocations(inode);
4267                ext4_free_blocks(handle, inode, NULL, newblock,
4268                                 EXT4_C2B(sbi, allocated_clusters), fb_flags);
4269                goto out2;
4270        }
4271
4272        /* previous routine could use block we allocated */
4273        newblock = ext4_ext_pblock(&newex);
4274        allocated = ext4_ext_get_actual_len(&newex);
4275        if (allocated > map->m_len)
4276                allocated = map->m_len;
4277        map->m_flags |= EXT4_MAP_NEW;
4278
4279        /*
4280         * Update reserved blocks/metadata blocks after successful
4281         * block allocation which had been deferred till now.
4282         */
4283        if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
4284                unsigned int reserved_clusters;
4285                /*
4286                 * Check how many clusters we had reserved for this allocated range
4287                 */
4288                reserved_clusters = get_reserved_cluster_alloc(inode,
4289                                                map->m_lblk, allocated);
4290                if (map->m_flags & EXT4_MAP_FROM_CLUSTER) {
4291                        if (reserved_clusters) {
4292                                /*
4293                                 * We have clusters reserved for this range.
4294                                 * But since we are not doing actual allocation
4295                                 * and are simply using blocks from previously
4296                                 * allocated cluster, we should release the
4297                                 * reservation and not claim quota.
4298                                 */
4299                                ext4_da_update_reserve_space(inode,
4300                                                reserved_clusters, 0);
4301                        }
4302                } else {
4303                        BUG_ON(allocated_clusters < reserved_clusters);
4304                        if (reserved_clusters < allocated_clusters) {
4305                                struct ext4_inode_info *ei = EXT4_I(inode);
4306                                int reservation = allocated_clusters -
4307                                                  reserved_clusters;
4308                                /*
4309                                 * It seems we claimed a few clusters outside of
4310                                 * the range of this allocation. We should give
4311                                 * it back to the reservation pool. This can
4312                                 * happen in the following case:
4313                                 *
4314                                 * * Suppose s_cluster_ratio is 4 (i.e., each
4315                                 *   cluster has 4 blocks). Thus, the clusters
4316                                 *   are [0-3],[4-7],[8-11]...
4317                                 * * First comes delayed allocation write for
4318                                 *   logical blocks 10 & 11. Since there were no
4319                                 *   previous delayed allocated blocks in the
4320                                 *   range [8-11], we would reserve 1 cluster
4321                                 *   for this write.
4322                                 * * Next comes write for logical blocks 3 to 8.
4323                                 *   In this case, we will reserve 2 clusters
4324                                 *   (for [0-3] and [4-7], and not for [8-11], as
4325                                 *   that range already has delayed allocated blocks).
4326                                 *   Thus total reserved clusters now becomes 3.
4327                                 * * Now, during the delayed allocation writeout
4328                                 *   time, we will first write blocks [3-8] and
4329                                 *   allocate 3 clusters for writing these
4330                                 *   blocks. Also, we would claim all these
4331                                 *   three clusters above.
4332                                 * * Now when we come here to writeout the
4333                                 *   blocks [10-11], we would expect to claim
4334                                 *   the reservation of 1 cluster we had made
4335                                 *   (and we would claim it since there are no
4336                                 *   more delayed allocated blocks in the range
4337                                 *   [8-11]). But our reserved cluster count had
4338                                 *   already gone to 0.
4339                                 *
4340                                 *   Thus, at the last step above, when we determine
4341                                 *   that there are still some unwritten delayed
4342                                 *   allocated blocks outside of our current
4343                                 *   block range, we should increment the
4344                                 *   reserved clusters count so that when the
4345                                 *   remaining blocks finally get written, we
4346                                 *   could claim them.
4347                                 */
4348                                dquot_reserve_block(inode,
4349                                                EXT4_C2B(sbi, reservation));
4350                                spin_lock(&ei->i_block_reservation_lock);
4351                                ei->i_reserved_data_blocks += reservation;
4352                                spin_unlock(&ei->i_block_reservation_lock);
4353                        }
4354                        /*
4355                         * We will claim quota for all newly allocated blocks.
4356                         * We're updating the reserved space *after* the
4357                         * correction above so we do not accidentally free
4358                         * all the metadata reservation because we might
4359                         * actually need it later on.
4360                         */
4361                        ext4_da_update_reserve_space(inode, allocated_clusters,
4362                                                        1);
4363                }
4364        }
4365
4366        /*
4367         * Cache the extent and update transaction to commit on fdatasync only
4368         * when it is _not_ an uninitialized extent.
4369         */
4370        if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0)
4371                ext4_update_inode_fsync_trans(handle, inode, 1);
4372        else
4373                ext4_update_inode_fsync_trans(handle, inode, 0);
4374out:
4375        if (allocated > map->m_len)
4376                allocated = map->m_len;
4377        ext4_ext_show_leaf(inode, path);
4378        map->m_flags |= EXT4_MAP_MAPPED;
4379        map->m_pblk = newblock;
4380        map->m_len = allocated;
4381out2:
4382        if (path) {
4383                ext4_ext_drop_refs(path);
4384                kfree(path);
4385        }
4386
4387out3:
4388        trace_ext4_ext_map_blocks_exit(inode, flags, map,
4389                                       err ? err : allocated);
4390        ext4_es_lru_add(inode);
4391        return err ? err : allocated;
4392}
4393
4394void ext4_ext_truncate(handle_t *handle, struct inode *inode)
4395{
4396        struct super_block *sb = inode->i_sb;
4397        ext4_lblk_t last_block;
4398        int err = 0;
4399
4400        /*
4401         * TODO: optimization is possible here.
4402         * Probably we need not scan at all,
4403         * because page truncation is enough.
4404         */
4405
4406        /* we have to know where to truncate from in crash case */
4407        EXT4_I(inode)->i_disksize = inode->i_size;
4408        ext4_mark_inode_dirty(handle, inode);
4409
4410        last_block = (inode->i_size + sb->s_blocksize - 1)
4411                        >> EXT4_BLOCK_SIZE_BITS(sb);
4412retry:
4413        err = ext4_es_remove_extent(inode, last_block,
4414                                    EXT_MAX_BLOCKS - last_block);
4415        if (err == -ENOMEM) {
4416                cond_resched();
4417                congestion_wait(BLK_RW_ASYNC, HZ/50);
4418                goto retry;
4419        }
4420        if (err) {
4421                ext4_std_error(inode->i_sb, err);
4422                return;
4423        }
4424        err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1);
4425        ext4_std_error(inode->i_sb, err);
4426}
4427
4428static void ext4_falloc_update_inode(struct inode *inode,
4429                                int mode, loff_t new_size, int update_ctime)
4430{
4431        struct timespec now;
4432
4433        if (update_ctime) {
4434                now = current_fs_time(inode->i_sb);
4435                if (!timespec_equal(&inode->i_ctime, &now))
4436                        inode->i_ctime = now;
4437        }
4438        /*
4439         * Update only when preallocation was requested beyond
4440         * the file size.
4441         */
4442        if (!(mode & FALLOC_FL_KEEP_SIZE)) {
4443                if (new_size > i_size_read(inode))
4444                        i_size_write(inode, new_size);
4445                if (new_size > EXT4_I(inode)->i_disksize)
4446                        ext4_update_i_disksize(inode, new_size);
4447        } else {
4448                /*
4449                 * Mark that we allocate beyond EOF so the subsequent truncate
4450                 * can proceed even if the new size is the same as i_size.
4451                 */
4452                if (new_size > i_size_read(inode))
4453                        ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
4454        }
4455
4456}
4457
4458/*
4459 * Preallocate space for a file. This implements ext4's fallocate file
4460 * operation, which gets called from the sys_fallocate system call.
4461 * For block-mapped files, posix_fallocate should fall back to the method
4462 * of writing zeroes to the required new blocks (the same behavior that is
4463 * expected of file systems which do not support the fallocate() system call).
4464 */
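/*
 * Illustrative userspace sketch (not part of this file): preallocating
 * 16 MiB past EOF without changing the visible file size reaches this
 * entry point roughly via ("fd" is a hypothetical open file descriptor):
 *
 *	#define _GNU_SOURCE
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *
 *	if (fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 16 * 1024 * 1024) != 0)
 *		perror("fallocate");
 */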
4465long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
4466{
4467        struct inode *inode = file_inode(file);
4468        handle_t *handle;
4469        loff_t new_size;
4470        unsigned int max_blocks;
4471        int ret = 0;
4472        int ret2 = 0;
4473        int retries = 0;
4474        int flags;
4475        struct ext4_map_blocks map;
4476        unsigned int credits, blkbits = inode->i_blkbits;
4477
4478        /* Return error if mode is not supported */
4479        if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
4480                return -EOPNOTSUPP;
4481
4482        if (mode & FALLOC_FL_PUNCH_HOLE)
4483                return ext4_punch_hole(inode, offset, len);
4484
4485        ret = ext4_convert_inline_data(inode);
4486        if (ret)
4487                return ret;
4488
4489        /*
4490         * currently supporting (pre)allocate mode for extent-based
4491         * files _only_
4492         */
4493        if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
4494                return -EOPNOTSUPP;
4495
4496        trace_ext4_fallocate_enter(inode, offset, len, mode);
4497        map.m_lblk = offset >> blkbits;
4498        /*
4499         * We can't just convert len to max_blocks because the range may
4500         * not be block aligned; e.g. blocksize = 4096, offset = 3072, len = 2048
4501         */
4502        max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
4503                - map.m_lblk;
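        /*
         * Worked example (hypothetical numbers): with blkbits = 12 (4096-byte
         * blocks), offset = 3072 and len = 2048, map.m_lblk = 3072 >> 12 = 0
         * and EXT4_BLOCK_ALIGN(5120, 12) >> 12 = 2, so max_blocks = 2 even
         * though len is smaller than one block.
         */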
4504        /*
4505         * credits to insert 1 extent into extent tree
4506         */
4507        credits = ext4_chunk_trans_blocks(inode, max_blocks);
4508        mutex_lock(&inode->i_mutex);
4509        ret = inode_newsize_ok(inode, (len + offset));
4510        if (ret) {
4511                mutex_unlock(&inode->i_mutex);
4512                trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
4513                return ret;
4514        }
4515        flags = EXT4_GET_BLOCKS_CREATE_UNINIT_EXT;
4516        if (mode & FALLOC_FL_KEEP_SIZE)
4517                flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
4518        /*
4519         * Don't normalize the request if it can fit in one extent so
4520         * that it doesn't get unnecessarily split into multiple
4521         * extents.
4522         */
4523        if (len <= EXT_UNINIT_MAX_LEN << blkbits)
4524                flags |= EXT4_GET_BLOCKS_NO_NORMALIZE;
4525
4526retry:
4527        while (ret >= 0 && ret < max_blocks) {
4528                map.m_lblk = map.m_lblk + ret;
4529                map.m_len = max_blocks = max_blocks - ret;
4530                handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
4531                                            credits);
4532                if (IS_ERR(handle)) {
4533                        ret = PTR_ERR(handle);
4534                        break;
4535                }
4536                ret = ext4_map_blocks(handle, inode, &map, flags);
4537                if (ret <= 0) {
4538#ifdef EXT4FS_DEBUG
4539                        ext4_warning(inode->i_sb,
4540                                     "inode #%lu: block %u: len %u: "
4541                                     "ext4_ext_map_blocks returned %d",
4542                                     inode->i_ino, map.m_lblk,
4543                                     map.m_len, ret);
4544#endif
4545                        ext4_mark_inode_dirty(handle, inode);
4546                        ret2 = ext4_journal_stop(handle);
4547                        break;
4548                }
4549                if ((map.m_lblk + ret) >= (EXT4_BLOCK_ALIGN(offset + len,
4550                                                blkbits) >> blkbits))
4551                        new_size = offset + len;
4552                else
4553                        new_size = ((loff_t) map.m_lblk + ret) << blkbits;
4554
4555                ext4_falloc_update_inode(inode, mode, new_size,
4556                                         (map.m_flags & EXT4_MAP_NEW));
4557                ext4_mark_inode_dirty(handle, inode);
4558                if ((file->f_flags & O_SYNC) && ret >= max_blocks)
4559                        ext4_handle_sync(handle);
4560                ret2 = ext4_journal_stop(handle);
4561                if (ret2)
4562                        break;
4563        }
4564        if (ret == -ENOSPC &&
4565                        ext4_should_retry_alloc(inode->i_sb, &retries)) {
4566                ret = 0;
4567                goto retry;
4568        }
4569        mutex_unlock(&inode->i_mutex);
4570        trace_ext4_fallocate_exit(inode, offset, max_blocks,
4571                                ret > 0 ? ret2 : ret);
4572        return ret > 0 ? ret2 : ret;
4573}
4574
4575/*
4576 * This function converts a range of blocks to written extents.
4577 * The caller of this function will pass the start offset and the size;
4578 * all unwritten extents within this range will be converted to
4579 * written extents.
4580 *
4581 * This function is called from the direct IO end-io callback
4582 * function, to convert the fallocated extents after IO is completed.
4583 * Returns 0 on success.
4584 */
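/*
 * Illustrative sketch (not in the original source): the DIO completion
 * path ends up here roughly as
 *
 *	err = ext4_convert_unwritten_extents(handle, inode,
 *					     io_end->offset, io_end->size);
 *
 * where io_end->offset and io_end->size describe the just-completed
 * write (io_end standing for the ext4_io_end_t of that I/O, and handle
 * being either a reserved handle or NULL).
 */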
4585int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
4586                                   loff_t offset, ssize_t len)
4587{
4588        unsigned int max_blocks;
4589        int ret = 0;
4590        int ret2 = 0;
4591        struct ext4_map_blocks map;
4592        unsigned int credits, blkbits = inode->i_blkbits;
4593
4594        map.m_lblk = offset >> blkbits;
4595        /*
4596         * We can't just convert len to max_blocks because the range may
4597         * not be block aligned; e.g. blocksize = 4096, offset = 3072, len = 2048
4598         */
4599        max_blocks = ((EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) -
4600                      map.m_lblk);
4601        /*
4602         * This is somewhat ugly but the idea is clear: when a transaction is
4603         * reserved, everything goes into it. Otherwise we prefer to start
4604         * several smaller transactions, converting each extent separately.
4605         */
4606        if (handle) {
4607                handle = ext4_journal_start_reserved(handle,
4608                                                     EXT4_HT_EXT_CONVERT);
4609                if (IS_ERR(handle))
4610                        return PTR_ERR(handle);
4611                credits = 0;
4612        } else {
4613                /*
4614                 * credits to insert 1 extent into extent tree
4615                 */
4616                credits = ext4_chunk_trans_blocks(inode, max_blocks);
4617        }
4618        while (ret >= 0 && ret < max_blocks) {
4619                map.m_lblk += ret;
4620                map.m_len = (max_blocks -= ret);
4621                if (credits) {
4622                        handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
4623                                                    credits);
4624                        if (IS_ERR(handle)) {
4625                                ret = PTR_ERR(handle);
4626                                break;
4627                        }
4628                }
4629                ret = ext4_map_blocks(handle, inode, &map,
4630                                      EXT4_GET_BLOCKS_IO_CONVERT_EXT);
4631                if (ret <= 0)
4632                        ext4_warning(inode->i_sb,
4633                                     "inode #%lu: block %u: len %u: "
4634                                     "ext4_ext_map_blocks returned %d",
4635                                     inode->i_ino, map.m_lblk,
4636                                     map.m_len, ret);
4637                ext4_mark_inode_dirty(handle, inode);
4638                if (credits)
4639                        ret2 = ext4_journal_stop(handle);
4640                if (ret <= 0 || ret2)
4641                        break;
4642        }
4643        if (!credits)
4644                ret2 = ext4_journal_stop(handle);
4645        return ret > 0 ? ret2 : ret;
4646}
4647
4648/*
4649 * If newes is not an existing extent (newes->es_pblk equals zero), find
4650 * the delayed extent at the start of newes, update newes accordingly, and
4651 * return the start of the next delayed extent.
4652 *
4653 * If newes is an existing extent (newes->es_pblk is not equal to zero),
4654 * return the start of the next delayed extent, or EXT_MAX_BLOCKS if no
4655 * delayed extent is found. Leave newes unmodified.
4656 */
4657static int ext4_find_delayed_extent(struct inode *inode,
4658                                    struct extent_status *newes)
4659{
4660        struct extent_status es;
4661        ext4_lblk_t block, next_del;
4662
4663        if (newes->es_pblk == 0) {
4664                ext4_es_find_delayed_extent_range(inode, newes->es_lblk,
4665                                newes->es_lblk + newes->es_len - 1, &es);
4666
4667                /*
4668                 * No extent in the extent tree contains block @newes->es_lblk,
4669                 * so the block may lie either in 1) a hole or 2) a delayed extent.
4670                 */
4671                if (es.es_len == 0)
4672                        /* A hole found. */
4673                        return 0;
4674
4675                if (es.es_lblk > newes->es_lblk) {
4676                        /* A hole found. */
4677                        newes->es_len = min(es.es_lblk - newes->es_lblk,
4678                                            newes->es_len);
4679                        return 0;
4680                }
4681
4682                newes->es_len = es.es_lblk + es.es_len - newes->es_lblk;
4683        }
4684
4685        block = newes->es_lblk + newes->es_len;
4686        ext4_es_find_delayed_extent_range(inode, block, EXT_MAX_BLOCKS, &es);
4687        if (es.es_len == 0)
4688                next_del = EXT_MAX_BLOCKS;
4689        else
4690                next_del = es.es_lblk;
4691
4692        return next_del;
4693}
4694/* fiemap flags we can handle are specified here */
4695#define EXT4_FIEMAP_FLAGS       (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
4696
4697static int ext4_xattr_fiemap(struct inode *inode,
4698                                struct fiemap_extent_info *fieinfo)
4699{
4700        __u64 physical = 0;
4701        __u64 length;
4702        __u32 flags = FIEMAP_EXTENT_LAST;
4703        int blockbits = inode->i_sb->s_blocksize_bits;
4704        int error = 0;
4705
4706        /* in-inode? */
4707        if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
4708                struct ext4_iloc iloc;
4709                int offset;     /* offset of xattr in inode */
4710
4711                error = ext4_get_inode_loc(inode, &iloc);
4712                if (error)
4713                        return error;
4714                physical = (__u64)iloc.bh->b_blocknr << blockbits;
4715                offset = EXT4_GOOD_OLD_INODE_SIZE +
4716                                EXT4_I(inode)->i_extra_isize;
4717                physical += offset;
4718                length = EXT4_SB(inode->i_sb)->s_inode_size - offset;
4719                flags |= FIEMAP_EXTENT_DATA_INLINE;
4720                brelse(iloc.bh);
4721        } else { /* external block */
4722                physical = (__u64)EXT4_I(inode)->i_file_acl << blockbits;
4723                length = inode->i_sb->s_blocksize;
4724        }
4725
4726        if (physical)
4727                error = fiemap_fill_next_extent(fieinfo, 0, physical,
4728                                                length, flags);
4729        return (error < 0 ? error : 0);
4730}
4731
4732int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4733                __u64 start, __u64 len)
4734{
4735        ext4_lblk_t start_blk;
4736        int error = 0;
4737
4738        if (ext4_has_inline_data(inode)) {
4739                int has_inline = 1;
4740
4741                error = ext4_inline_data_fiemap(inode, fieinfo, &has_inline);
4742
4743                if (has_inline)
4744                        return error;
4745        }
4746
4747        /* fall back to generic here if not in extents format */
4748        if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
4749                return generic_block_fiemap(inode, fieinfo, start, len,
4750                        ext4_get_block);
4751
4752        if (fiemap_check_flags(fieinfo, EXT4_FIEMAP_FLAGS))
4753                return -EBADR;
4754
4755        if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) {
4756                error = ext4_xattr_fiemap(inode, fieinfo);
4757        } else {
4758                ext4_lblk_t len_blks;
4759                __u64 last_blk;
4760
4761                start_blk = start >> inode->i_sb->s_blocksize_bits;
4762                last_blk = (start + len - 1) >> inode->i_sb->s_blocksize_bits;
4763                if (last_blk >= EXT_MAX_BLOCKS)
4764                        last_blk = EXT_MAX_BLOCKS-1;
4765                len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1;
4766
4767                /*
4768                 * Walk the extent tree gathering extent information
4769                 * and pushing extents back to the user.
4770                 */
4771                error = ext4_fill_fiemap_extents(inode, start_blk,
4772                                                 len_blks, fieinfo);
4773        }
4774
4775        return error;
4776}
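/*
 * Illustrative userspace sketch (not part of this file): ext4_fiemap()
 * above is reached through the FIEMAP ioctl, roughly as follows ("fd" is
 * a hypothetical open file descriptor; fm_extent_count == 0 just asks
 * how many extents the range maps to):
 *
 *	#include <stdio.h>
 *	#include <sys/ioctl.h>
 *	#include <linux/fs.h>
 *	#include <linux/fiemap.h>
 *
 *	struct fiemap fm = { 0 };
 *
 *	fm.fm_start = 0;
 *	fm.fm_length = FIEMAP_MAX_OFFSET;
 *	fm.fm_flags = FIEMAP_FLAG_SYNC;
 *	fm.fm_extent_count = 0;
 *	if (ioctl(fd, FS_IOC_FIEMAP, &fm) == 0)
 *		printf("%u extents\n", fm.fm_mapped_extents);
 */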
4777