/* linux/fs/ext4/extents.c */
   1/*
   2 * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
   3 * Written by Alex Tomas <alex@clusterfs.com>
   4 *
   5 * Architecture independence:
   6 *   Copyright (c) 2005, Bull S.A.
   7 *   Written by Pierre Peiffer <pierre.peiffer@bull.net>
   8 *
   9 * This program is free software; you can redistribute it and/or modify
  10 * it under the terms of the GNU General Public License version 2 as
  11 * published by the Free Software Foundation.
  12 *
  13 * This program is distributed in the hope that it will be useful,
  14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 * GNU General Public License for more details.
  17 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307, USA
  21 */
  22
  23/*
  24 * Extents support for EXT4
  25 *
  26 * TODO:
  27 *   - ext4*_error() should be used in some situations
  28 *   - analyze all BUG()/BUG_ON(), use -EIO where appropriate
  29 *   - smart tree reduction
  30 */
  31
  32#include <linux/module.h>
  33#include <linux/fs.h>
  34#include <linux/time.h>
  35#include <linux/jbd2.h>
  36#include <linux/highuid.h>
  37#include <linux/pagemap.h>
  38#include <linux/quotaops.h>
  39#include <linux/string.h>
  40#include <linux/slab.h>
  41#include <linux/falloc.h>
  42#include <asm/uaccess.h>
  43#include <linux/fiemap.h>
  44#include "ext4_jbd2.h"
  45#include "ext4_extents.h"
  46
  47
  48/*
  49 * ext_pblock:
  50 * combine low and high parts of physical block number into ext4_fsblk_t
  51 */
  52ext4_fsblk_t ext_pblock(struct ext4_extent *ex)
  53{
  54        ext4_fsblk_t block;
  55
  56        block = le32_to_cpu(ex->ee_start_lo);
  57        block |= ((ext4_fsblk_t) le16_to_cpu(ex->ee_start_hi) << 31) << 1;
  58        return block;
  59}
  60
  61/*
  62 * idx_pblock:
  63 * combine low and high parts of a leaf physical block number into ext4_fsblk_t
  64 */
  65ext4_fsblk_t idx_pblock(struct ext4_extent_idx *ix)
  66{
  67        ext4_fsblk_t block;
  68
  69        block = le32_to_cpu(ix->ei_leaf_lo);
  70        block |= ((ext4_fsblk_t) le16_to_cpu(ix->ei_leaf_hi) << 31) << 1;
  71        return block;
  72}
  73
  74/*
  75 * ext4_ext_store_pblock:
  76 * stores a large physical block number into an extent struct,
  77 * breaking it into parts
  78 */
  79void ext4_ext_store_pblock(struct ext4_extent *ex, ext4_fsblk_t pb)
  80{
  81        ex->ee_start_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff));
  82        ex->ee_start_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
  83}
  84
  85/*
  86 * ext4_idx_store_pblock:
  87 * stores a large physical block number into an index struct,
  88 * breaking it into parts
  89 */
  90static void ext4_idx_store_pblock(struct ext4_extent_idx *ix, ext4_fsblk_t pb)
  91{
  92        ix->ei_leaf_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff));
  93        ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
  94}
  95
/*
 * Ensure @handle has at least @needed buffer credits before the extent
 * tree is modified during truncate.  Tries to extend the running
 * transaction in place; if that fails, restarts it.
 *
 * Returns 0 when no action was required (journaling disabled or enough
 * credits already), a negative error from the extend attempt, or the
 * result of the transaction restart.
 */
static int ext4_ext_truncate_extend_restart(handle_t *handle,
					    struct inode *inode,
					    int needed)
{
	int err;

	if (!ext4_handle_valid(handle))
		return 0;
	if (handle->h_buffer_credits > needed)
		return 0;
	/* err <= 0 covers both "extended OK" (0) and hard failure (<0) */
	err = ext4_journal_extend(handle, needed);
	if (err <= 0)
		return err;
	err = ext4_truncate_restart_trans(handle, inode, needed);
	/*
	 * We have dropped i_data_sem so someone might have cached again
	 * an extent we are going to truncate.
	 */
	ext4_ext_invalidate_cache(inode);

	return err;
}
 118
 119/*
 120 * could return:
 121 *  - EROFS
 122 *  - ENOMEM
 123 */
 124static int ext4_ext_get_access(handle_t *handle, struct inode *inode,
 125                                struct ext4_ext_path *path)
 126{
 127        if (path->p_bh) {
 128                /* path points to block */
 129                return ext4_journal_get_write_access(handle, path->p_bh);
 130        }
 131        /* path points to leaf/index in inode body */
 132        /* we use in-core data, no need to protect them */
 133        return 0;
 134}
 135
 136/*
 137 * could return:
 138 *  - EROFS
 139 *  - ENOMEM
 140 *  - EIO
 141 */
 142static int ext4_ext_dirty(handle_t *handle, struct inode *inode,
 143                                struct ext4_ext_path *path)
 144{
 145        int err;
 146        if (path->p_bh) {
 147                /* path points to block */
 148                err = ext4_handle_dirty_metadata(handle, inode, path->p_bh);
 149        } else {
 150                /* path points to leaf/index in inode body */
 151                err = ext4_mark_inode_dirty(handle, inode);
 152        }
 153        return err;
 154}
 155
/*
 * ext4_ext_find_goal:
 * pick a preferred physical block ("goal") for allocating logical
 * @block: next to a neighbouring extent when @path provides one,
 * otherwise derived from the inode's block group.
 */
static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
			      struct ext4_ext_path *path,
			      ext4_lblk_t block)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	ext4_fsblk_t bg_start;
	ext4_fsblk_t last_block;
	ext4_grpblk_t colour;
	ext4_group_t block_group;
	int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb));
	int depth;

	if (path) {
		struct ext4_extent *ex;
		depth = path->p_depth;

		/* try to predict block placement */
		ex = path[depth].p_ext;
		if (ex)
			return ext_pblock(ex)+(block-le32_to_cpu(ex->ee_block));

		/* it looks like index is empty;
		 * try to find starting block from index itself */
		if (path[depth].p_bh)
			return path[depth].p_bh->b_blocknr;
	}

	/* OK. use inode's group */
	block_group = ei->i_block_group;
	if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
		/*
		 * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME
		 * block groups per flexgroup, reserve the first block
		 * group for directories and special files.  Regular
		 * files will start at the second block group.  This
		 * tends to speed up directory access and improves
		 * fsck times.
		 */
		block_group &= ~(flex_size-1);
		if (S_ISREG(inode->i_mode))
			block_group++;
	}
	bg_start = (block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) +
		le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_first_data_block);
	last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;

	/*
	 * If we are doing delayed allocation, we don't need take
	 * colour into account.
	 */
	if (test_opt(inode->i_sb, DELALLOC))
		return bg_start;

	/* spread concurrent allocators across the group by pid ("colour");
	 * the last group may be short, hence the second formula */
	if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
		colour = (current->pid % 16) *
			(EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
	else
		colour = (current->pid % 16) * ((last_block - bg_start) / 16);
	return bg_start + colour + block;
}
 216
 217/*
 218 * Allocation for a meta data block
 219 */
 220static ext4_fsblk_t
 221ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
 222                        struct ext4_ext_path *path,
 223                        struct ext4_extent *ex, int *err)
 224{
 225        ext4_fsblk_t goal, newblock;
 226
 227        goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
 228        newblock = ext4_new_meta_blocks(handle, inode, goal, NULL, err);
 229        return newblock;
 230}
 231
 232static inline int ext4_ext_space_block(struct inode *inode, int check)
 233{
 234        int size;
 235
 236        size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
 237                        / sizeof(struct ext4_extent);
 238        if (!check) {
 239#ifdef AGGRESSIVE_TEST
 240                if (size > 6)
 241                        size = 6;
 242#endif
 243        }
 244        return size;
 245}
 246
 247static inline int ext4_ext_space_block_idx(struct inode *inode, int check)
 248{
 249        int size;
 250
 251        size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
 252                        / sizeof(struct ext4_extent_idx);
 253        if (!check) {
 254#ifdef AGGRESSIVE_TEST
 255                if (size > 5)
 256                        size = 5;
 257#endif
 258        }
 259        return size;
 260}
 261
 262static inline int ext4_ext_space_root(struct inode *inode, int check)
 263{
 264        int size;
 265
 266        size = sizeof(EXT4_I(inode)->i_data);
 267        size -= sizeof(struct ext4_extent_header);
 268        size /= sizeof(struct ext4_extent);
 269        if (!check) {
 270#ifdef AGGRESSIVE_TEST
 271                if (size > 3)
 272                        size = 3;
 273#endif
 274        }
 275        return size;
 276}
 277
 278static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
 279{
 280        int size;
 281
 282        size = sizeof(EXT4_I(inode)->i_data);
 283        size -= sizeof(struct ext4_extent_header);
 284        size /= sizeof(struct ext4_extent_idx);
 285        if (!check) {
 286#ifdef AGGRESSIVE_TEST
 287                if (size > 4)
 288                        size = 4;
 289#endif
 290        }
 291        return size;
 292}
 293
/*
 * Calculate the number of metadata blocks needed
 * to allocate @blocks
 * Worse case is one block per extent
 */
int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks)
{
	int lcap, icap, rcap, leafs, idxs, num;
	int newextents = blocks;

	/* capacities: root index slots, extents per leaf block,
	 * indexes per index block (check=0: AGGRESSIVE_TEST clamp applies) */
	rcap = ext4_ext_space_root_idx(inode, 0);
	lcap = ext4_ext_space_block(inode, 0);
	icap = ext4_ext_space_block_idx(inode, 0);

	/* number of new leaf blocks needed */
	num = leafs = (newextents + lcap - 1) / lcap;

	/*
	 * Worse case, we need separate index block(s)
	 * to link all new leaf blocks
	 */
	idxs = (leafs + icap - 1) / icap;
	do {
		/* add one tree level at a time until it fits in the root */
		num += idxs;
		idxs = (idxs + icap - 1) / icap;
	} while (idxs > rcap);

	return num;
}
 323
 324static int
 325ext4_ext_max_entries(struct inode *inode, int depth)
 326{
 327        int max;
 328
 329        if (depth == ext_depth(inode)) {
 330                if (depth == 0)
 331                        max = ext4_ext_space_root(inode, 1);
 332                else
 333                        max = ext4_ext_space_root_idx(inode, 1);
 334        } else {
 335                if (depth == 0)
 336                        max = ext4_ext_space_block(inode, 1);
 337                else
 338                        max = ext4_ext_space_block_idx(inode, 1);
 339        }
 340
 341        return max;
 342}
 343
 344static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
 345{
 346        ext4_fsblk_t block = ext_pblock(ext);
 347        int len = ext4_ext_get_actual_len(ext);
 348
 349        return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len);
 350}
 351
 352static int ext4_valid_extent_idx(struct inode *inode,
 353                                struct ext4_extent_idx *ext_idx)
 354{
 355        ext4_fsblk_t block = idx_pblock(ext_idx);
 356
 357        return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, 1);
 358}
 359
 360static int ext4_valid_extent_entries(struct inode *inode,
 361                                struct ext4_extent_header *eh,
 362                                int depth)
 363{
 364        struct ext4_extent *ext;
 365        struct ext4_extent_idx *ext_idx;
 366        unsigned short entries;
 367        if (eh->eh_entries == 0)
 368                return 1;
 369
 370        entries = le16_to_cpu(eh->eh_entries);
 371
 372        if (depth == 0) {
 373                /* leaf entries */
 374                ext = EXT_FIRST_EXTENT(eh);
 375                while (entries) {
 376                        if (!ext4_valid_extent(inode, ext))
 377                                return 0;
 378                        ext++;
 379                        entries--;
 380                }
 381        } else {
 382                ext_idx = EXT_FIRST_INDEX(eh);
 383                while (entries) {
 384                        if (!ext4_valid_extent_idx(inode, ext_idx))
 385                                return 0;
 386                        ext_idx++;
 387                        entries--;
 388                }
 389        }
 390        return 1;
 391}
 392
/*
 * Sanity-check an extent tree node header and its entries before use.
 * @depth is the tree level the caller expects this node to be at;
 * @function is the caller's name, used in the error report.
 * Returns 0 when the node is valid, -EIO (after ext4_error) otherwise.
 */
static int __ext4_ext_check(const char *function, struct inode *inode,
					struct ext4_extent_header *eh,
					int depth)
{
	const char *error_msg;
	int max = 0;

	if (unlikely(eh->eh_magic != EXT4_EXT_MAGIC)) {
		error_msg = "invalid magic";
		goto corrupted;
	}
	if (unlikely(le16_to_cpu(eh->eh_depth) != depth)) {
		error_msg = "unexpected eh_depth";
		goto corrupted;
	}
	if (unlikely(eh->eh_max == 0)) {
		error_msg = "invalid eh_max";
		goto corrupted;
	}
	/* eh_max may not claim more slots than physically fit in the node */
	max = ext4_ext_max_entries(inode, depth);
	if (unlikely(le16_to_cpu(eh->eh_max) > max)) {
		error_msg = "too large eh_max";
		goto corrupted;
	}
	if (unlikely(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max))) {
		error_msg = "invalid eh_entries";
		goto corrupted;
	}
	/* finally verify each entry points inside the filesystem */
	if (!ext4_valid_extent_entries(inode, eh, depth)) {
		error_msg = "invalid extent entries";
		goto corrupted;
	}
	return 0;

corrupted:
	ext4_error(inode->i_sb, function,
			"bad header/extent in inode #%lu: %s - magic %x, "
			"entries %u, max %u(%u), depth %u(%u)",
			inode->i_ino, error_msg, le16_to_cpu(eh->eh_magic),
			le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
			max, le16_to_cpu(eh->eh_depth), depth);

	return -EIO;
}
 437
/* Wrapper that records the calling function's name for error reports. */
#define ext4_ext_check(inode, eh, depth)	\
	__ext4_ext_check(__func__, inode, eh, depth)

/* Validate the root extent header stored in the inode body. */
int ext4_ext_check_inode(struct inode *inode)
{
	return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode));
}
 445
 446#ifdef EXT_DEBUG
 447static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
 448{
 449        int k, l = path->p_depth;
 450
 451        ext_debug("path:");
 452        for (k = 0; k <= l; k++, path++) {
 453                if (path->p_idx) {
 454                  ext_debug("  %d->%llu", le32_to_cpu(path->p_idx->ei_block),
 455                            idx_pblock(path->p_idx));
 456                } else if (path->p_ext) {
 457                        ext_debug("  %d:[%d]%d:%llu ",
 458                                  le32_to_cpu(path->p_ext->ee_block),
 459                                  ext4_ext_is_uninitialized(path->p_ext),
 460                                  ext4_ext_get_actual_len(path->p_ext),
 461                                  ext_pblock(path->p_ext));
 462                } else
 463                        ext_debug("  []");
 464        }
 465        ext_debug("\n");
 466}
 467
 468static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
 469{
 470        int depth = ext_depth(inode);
 471        struct ext4_extent_header *eh;
 472        struct ext4_extent *ex;
 473        int i;
 474
 475        if (!path)
 476                return;
 477
 478        eh = path[depth].p_hdr;
 479        ex = EXT_FIRST_EXTENT(eh);
 480
 481        ext_debug("Displaying leaf extents for inode %lu\n", inode->i_ino);
 482
 483        for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) {
 484                ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block),
 485                          ext4_ext_is_uninitialized(ex),
 486                          ext4_ext_get_actual_len(ex), ext_pblock(ex));
 487        }
 488        ext_debug("\n");
 489}
 490#else
 491#define ext4_ext_show_path(inode, path)
 492#define ext4_ext_show_leaf(inode, path)
 493#endif
 494
 495void ext4_ext_drop_refs(struct ext4_ext_path *path)
 496{
 497        int depth = path->p_depth;
 498        int i;
 499
 500        for (i = 0; i <= depth; i++, path++)
 501                if (path->p_bh) {
 502                        brelse(path->p_bh);
 503                        path->p_bh = NULL;
 504                }
 505}
 506
/*
 * ext4_ext_binsearch_idx:
 * binary search for the closest index of the given block
 * the header must be checked before calling this
 */
static void
ext4_ext_binsearch_idx(struct inode *inode,
			struct ext4_ext_path *path, ext4_lblk_t block)
{
	struct ext4_extent_header *eh = path->p_hdr;
	struct ext4_extent_idx *r, *l, *m;


	ext_debug("binsearch for %u(idx):  ", block);

	/* Start at the second index: the answer is l - 1, so the first
	 * index is the fallback when every other one starts too high. */
	l = EXT_FIRST_INDEX(eh) + 1;
	r = EXT_LAST_INDEX(eh);
	while (l <= r) {
		m = l + (r - l) / 2;
		if (block < le32_to_cpu(m->ei_block))
			r = m - 1;
		else
			l = m + 1;
		ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ei_block),
				m, le32_to_cpu(m->ei_block),
				r, le32_to_cpu(r->ei_block));
	}

	/* l is the first index starting beyond @block; step back to the
	 * last one starting at or before it */
	path->p_idx = l - 1;
	ext_debug("  -> %d->%lld ", le32_to_cpu(path->p_idx->ei_block),
		  idx_pblock(path->p_idx));

#ifdef CHECK_BINSEARCH
	{
		/* cross-check the binary search against a linear scan */
		struct ext4_extent_idx *chix, *ix;
		int k;

		chix = ix = EXT_FIRST_INDEX(eh);
		for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) {
			if (k != 0 &&
			    le32_to_cpu(ix->ei_block) <= le32_to_cpu(ix[-1].ei_block)) {
				printk(KERN_DEBUG "k=%d, ix=0x%p, "
				       "first=0x%p\n", k,
				       ix, EXT_FIRST_INDEX(eh));
				printk(KERN_DEBUG "%u <= %u\n",
				       le32_to_cpu(ix->ei_block),
				       le32_to_cpu(ix[-1].ei_block));
			}
			BUG_ON(k && le32_to_cpu(ix->ei_block)
					   <= le32_to_cpu(ix[-1].ei_block));
			if (block < le32_to_cpu(ix->ei_block))
				break;
			chix = ix;
		}
		BUG_ON(chix != path->p_idx);
	}
#endif

}
 566
/*
 * ext4_ext_binsearch:
 * binary search for closest extent of the given block
 * the header must be checked before calling this
 */
static void
ext4_ext_binsearch(struct inode *inode,
		struct ext4_ext_path *path, ext4_lblk_t block)
{
	struct ext4_extent_header *eh = path->p_hdr;
	struct ext4_extent *r, *l, *m;

	if (eh->eh_entries == 0) {
		/*
		 * this leaf is empty:
		 * we get such a leaf in split/add case
		 */
		return;
	}

	ext_debug("binsearch for %u:  ", block);

	/* Start at the second extent: the answer is l - 1, so the first
	 * extent is the fallback when every other one starts too high. */
	l = EXT_FIRST_EXTENT(eh) + 1;
	r = EXT_LAST_EXTENT(eh);

	while (l <= r) {
		m = l + (r - l) / 2;
		if (block < le32_to_cpu(m->ee_block))
			r = m - 1;
		else
			l = m + 1;
		ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ee_block),
				m, le32_to_cpu(m->ee_block),
				r, le32_to_cpu(r->ee_block));
	}

	/* the last extent whose logical start is <= @block */
	path->p_ext = l - 1;
	ext_debug("  -> %d:%llu:[%d]%d ",
			le32_to_cpu(path->p_ext->ee_block),
			ext_pblock(path->p_ext),
			ext4_ext_is_uninitialized(path->p_ext),
			ext4_ext_get_actual_len(path->p_ext));

#ifdef CHECK_BINSEARCH
	{
		/* cross-check the binary search against a linear scan */
		struct ext4_extent *chex, *ex;
		int k;

		chex = ex = EXT_FIRST_EXTENT(eh);
		for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ex++) {
			BUG_ON(k && le32_to_cpu(ex->ee_block)
					  <= le32_to_cpu(ex[-1].ee_block));
			if (block < le32_to_cpu(ex->ee_block))
				break;
			chex = ex;
		}
		BUG_ON(chex != path->p_ext);
	}
#endif

}
 628
 629int ext4_ext_tree_init(handle_t *handle, struct inode *inode)
 630{
 631        struct ext4_extent_header *eh;
 632
 633        eh = ext_inode_hdr(inode);
 634        eh->eh_depth = 0;
 635        eh->eh_entries = 0;
 636        eh->eh_magic = EXT4_EXT_MAGIC;
 637        eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0));
 638        ext4_mark_inode_dirty(handle, inode);
 639        ext4_ext_invalidate_cache(inode);
 640        return 0;
 641}
 642
/*
 * ext4_ext_find_extent:
 * walk the extent tree from the root down to the leaf covering @block,
 * filling one ext4_ext_path element per level.
 *
 * @path may be a caller-supplied array to reuse, or NULL, in which case
 * one is allocated here (sized for a possible depth increase) and must
 * be freed by the caller.  Returns the path, or ERR_PTR(-ENOMEM/-EIO).
 */
struct ext4_ext_path *
ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
					struct ext4_ext_path *path)
{
	struct ext4_extent_header *eh;
	struct buffer_head *bh;
	short int depth, i, ppos = 0, alloc = 0;

	eh = ext_inode_hdr(inode);
	depth = ext_depth(inode);

	/* account possible depth increase */
	if (!path) {
		path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 2),
				GFP_NOFS);
		if (!path)
			return ERR_PTR(-ENOMEM);
		alloc = 1;
	}
	path[0].p_hdr = eh;
	path[0].p_bh = NULL;

	i = depth;
	/* walk through the tree */
	while (i) {
		int need_to_validate = 0;

		ext_debug("depth %d: num %d, max %d\n",
			  ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));

		/* pick the child index covering @block at this level */
		ext4_ext_binsearch_idx(inode, path + ppos, block);
		path[ppos].p_block = idx_pblock(path[ppos].p_idx);
		path[ppos].p_depth = i;
		path[ppos].p_ext = NULL;

		bh = sb_getblk(inode->i_sb, path[ppos].p_block);
		if (unlikely(!bh))
			goto err;
		if (!bh_uptodate_or_lock(bh)) {
			/* block was read from disk just now, so its header
			 * has never been checked */
			if (bh_submit_read(bh) < 0) {
				put_bh(bh);
				goto err;
			}
			/* validate the extent entries */
			need_to_validate = 1;
		}
		eh = ext_block_hdr(bh);
		ppos++;
		BUG_ON(ppos > depth);
		path[ppos].p_bh = bh;
		path[ppos].p_hdr = eh;
		i--;

		if (need_to_validate && ext4_ext_check(inode, eh, i))
			goto err;
	}

	path[ppos].p_depth = i;
	path[ppos].p_ext = NULL;
	path[ppos].p_idx = NULL;

	/* find extent */
	ext4_ext_binsearch(inode, path + ppos, block);
	/* if not an empty leaf */
	if (path[ppos].p_ext)
		path[ppos].p_block = ext_pblock(path[ppos].p_ext);

	ext4_ext_show_path(inode, path);

	return path;

err:
	/* drop any buffer refs taken so far; free path only if we own it */
	ext4_ext_drop_refs(path);
	if (alloc)
		kfree(path);
	return ERR_PTR(-EIO);
}
 720
/*
 * ext4_ext_insert_index:
 * insert new index [@logical;@ptr] into the block at @curp;
 * check where to insert: before @curp or after @curp
 */
int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
				struct ext4_ext_path *curp,
				int logical, ext4_fsblk_t ptr)
{
	struct ext4_extent_idx *ix;
	int len, err;

	err = ext4_ext_get_access(handle, inode, curp);
	if (err)
		return err;

	/* a duplicate logical start would break the node's ordering */
	BUG_ON(logical == le32_to_cpu(curp->p_idx->ei_block));
	/* slots from curp to the end of the node's capacity */
	len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx;
	if (logical > le32_to_cpu(curp->p_idx->ei_block)) {
		/* insert after */
		if (curp->p_idx != EXT_LAST_INDEX(curp->p_hdr)) {
			/* shift everything after curp right by one slot */
			len = (len - 1) * sizeof(struct ext4_extent_idx);
			len = len < 0 ? 0 : len;
			ext_debug("insert new index %d after: %llu. "
					"move %d from 0x%p to 0x%p\n",
					logical, ptr, len,
					(curp->p_idx + 1), (curp->p_idx + 2));
			memmove(curp->p_idx + 2, curp->p_idx + 1, len);
		}
		ix = curp->p_idx + 1;
	} else {
		/* insert before */
		len = len * sizeof(struct ext4_extent_idx);
		len = len < 0 ? 0 : len;
		ext_debug("insert new index %d before: %llu. "
				"move %d from 0x%p to 0x%p\n",
				logical, ptr, len,
				curp->p_idx, (curp->p_idx + 1));
		memmove(curp->p_idx + 1, curp->p_idx, len);
		ix = curp->p_idx;
	}

	/* fill the freed slot and bump the entry count */
	ix->ei_block = cpu_to_le32(logical);
	ext4_idx_store_pblock(ix, ptr);
	le16_add_cpu(&curp->p_hdr->eh_entries, 1);

	BUG_ON(le16_to_cpu(curp->p_hdr->eh_entries)
			     > le16_to_cpu(curp->p_hdr->eh_max));
	BUG_ON(ix > EXT_LAST_INDEX(curp->p_hdr));

	err = ext4_ext_dirty(handle, inode, curp);
	ext4_std_error(inode->i_sb, err);

	return err;
}
 776
 777/*
 778 * ext4_ext_split:
 779 * inserts new subtree into the path, using free index entry
 780 * at depth @at:
 781 * - allocates all needed blocks (new leaf and all intermediate index blocks)
 782 * - makes decision where to split
 783 * - moves remaining extents and index entries (right to the split point)
 784 *   into the newly allocated blocks
 785 * - initializes subtree
 786 */
 787static int ext4_ext_split(handle_t *handle, struct inode *inode,
 788                                struct ext4_ext_path *path,
 789                                struct ext4_extent *newext, int at)
 790{
 791        struct buffer_head *bh = NULL;
 792        int depth = ext_depth(inode);
 793        struct ext4_extent_header *neh;
 794        struct ext4_extent_idx *fidx;
 795        struct ext4_extent *ex;
 796        int i = at, k, m, a;
 797        ext4_fsblk_t newblock, oldblock;
 798        __le32 border;
 799        ext4_fsblk_t *ablocks = NULL; /* array of allocated blocks */
 800        int err = 0;
 801
 802        /* make decision: where to split? */
 803        /* FIXME: now decision is simplest: at current extent */
 804
 805        /* if current leaf will be split, then we should use
 806         * border from split point */
 807        BUG_ON(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr));
 808        if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) {
 809                border = path[depth].p_ext[1].ee_block;
 810                ext_debug("leaf will be split."
 811                                " next leaf starts at %d\n",
 812                                  le32_to_cpu(border));
 813        } else {
 814                border = newext->ee_block;
 815                ext_debug("leaf will be added."
 816                                " next leaf starts at %d\n",
 817                                le32_to_cpu(border));
 818        }
 819
 820        /*
 821         * If error occurs, then we break processing
 822         * and mark filesystem read-only. index won't
 823         * be inserted and tree will be in consistent
 824         * state. Next mount will repair buffers too.
 825         */
 826
 827        /*
 828         * Get array to track all allocated blocks.
 829         * We need this to handle errors and free blocks
 830         * upon them.
 831         */
 832        ablocks = kzalloc(sizeof(ext4_fsblk_t) * depth, GFP_NOFS);
 833        if (!ablocks)
 834                return -ENOMEM;
 835
 836        /* allocate all needed blocks */
 837        ext_debug("allocate %d blocks for indexes/leaf\n", depth - at);
 838        for (a = 0; a < depth - at; a++) {
 839                newblock = ext4_ext_new_meta_block(handle, inode, path,
 840                                                   newext, &err);
 841                if (newblock == 0)
 842                        goto cleanup;
 843                ablocks[a] = newblock;
 844        }
 845
 846        /* initialize new leaf */
 847        newblock = ablocks[--a];
 848        BUG_ON(newblock == 0);
 849        bh = sb_getblk(inode->i_sb, newblock);
 850        if (!bh) {
 851                err = -EIO;
 852                goto cleanup;
 853        }
 854        lock_buffer(bh);
 855
 856        err = ext4_journal_get_create_access(handle, bh);
 857        if (err)
 858                goto cleanup;
 859
 860        neh = ext_block_hdr(bh);
 861        neh->eh_entries = 0;
 862        neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
 863        neh->eh_magic = EXT4_EXT_MAGIC;
 864        neh->eh_depth = 0;
 865        ex = EXT_FIRST_EXTENT(neh);
 866
 867        /* move remainder of path[depth] to the new leaf */
 868        BUG_ON(path[depth].p_hdr->eh_entries != path[depth].p_hdr->eh_max);
 869        /* start copy from next extent */
 870        /* TODO: we could do it by single memmove */
 871        m = 0;
 872        path[depth].p_ext++;
 873        while (path[depth].p_ext <=
 874                        EXT_MAX_EXTENT(path[depth].p_hdr)) {
 875                ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n",
 876                                le32_to_cpu(path[depth].p_ext->ee_block),
 877                                ext_pblock(path[depth].p_ext),
 878                                ext4_ext_is_uninitialized(path[depth].p_ext),
 879                                ext4_ext_get_actual_len(path[depth].p_ext),
 880                                newblock);
 881                /*memmove(ex++, path[depth].p_ext++,
 882                                sizeof(struct ext4_extent));
 883                neh->eh_entries++;*/
 884                path[depth].p_ext++;
 885                m++;
 886        }
 887        if (m) {
 888                memmove(ex, path[depth].p_ext-m, sizeof(struct ext4_extent)*m);
 889                le16_add_cpu(&neh->eh_entries, m);
 890        }
 891
 892        set_buffer_uptodate(bh);
 893        unlock_buffer(bh);
 894
 895        err = ext4_handle_dirty_metadata(handle, inode, bh);
 896        if (err)
 897                goto cleanup;
 898        brelse(bh);
 899        bh = NULL;
 900
 901        /* correct old leaf */
 902        if (m) {
 903                err = ext4_ext_get_access(handle, inode, path + depth);
 904                if (err)
 905                        goto cleanup;
 906                le16_add_cpu(&path[depth].p_hdr->eh_entries, -m);
 907                err = ext4_ext_dirty(handle, inode, path + depth);
 908                if (err)
 909                        goto cleanup;
 910
 911        }
 912
 913        /* create intermediate indexes */
 914        k = depth - at - 1;
 915        BUG_ON(k < 0);
 916        if (k)
 917                ext_debug("create %d intermediate indices\n", k);
 918        /* insert new index into current index block */
 919        /* current depth stored in i var */
 920        i = depth - 1;
 921        while (k--) {
 922                oldblock = newblock;
 923                newblock = ablocks[--a];
 924                bh = sb_getblk(inode->i_sb, newblock);
 925                if (!bh) {
 926                        err = -EIO;
 927                        goto cleanup;
 928                }
 929                lock_buffer(bh);
 930
 931                err = ext4_journal_get_create_access(handle, bh);
 932                if (err)
 933                        goto cleanup;
 934
 935                neh = ext_block_hdr(bh);
 936                neh->eh_entries = cpu_to_le16(1);
 937                neh->eh_magic = EXT4_EXT_MAGIC;
 938                neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
 939                neh->eh_depth = cpu_to_le16(depth - i);
 940                fidx = EXT_FIRST_INDEX(neh);
 941                fidx->ei_block = border;
 942                ext4_idx_store_pblock(fidx, oldblock);
 943
 944                ext_debug("int.index at %d (block %llu): %u -> %llu\n",
 945                                i, newblock, le32_to_cpu(border), oldblock);
 946                /* copy indexes */
 947                m = 0;
 948                path[i].p_idx++;
 949
 950                ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx,
 951                                EXT_MAX_INDEX(path[i].p_hdr));
 952                BUG_ON(EXT_MAX_INDEX(path[i].p_hdr) !=
 953                                EXT_LAST_INDEX(path[i].p_hdr));
 954                while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) {
 955                        ext_debug("%d: move %d:%llu in new index %llu\n", i,
 956                                        le32_to_cpu(path[i].p_idx->ei_block),
 957                                        idx_pblock(path[i].p_idx),
 958                                        newblock);
 959                        /*memmove(++fidx, path[i].p_idx++,
 960                                        sizeof(struct ext4_extent_idx));
 961                        neh->eh_entries++;
 962                        BUG_ON(neh->eh_entries > neh->eh_max);*/
 963                        path[i].p_idx++;
 964                        m++;
 965                }
 966                if (m) {
 967                        memmove(++fidx, path[i].p_idx - m,
 968                                sizeof(struct ext4_extent_idx) * m);
 969                        le16_add_cpu(&neh->eh_entries, m);
 970                }
 971                set_buffer_uptodate(bh);
 972                unlock_buffer(bh);
 973
 974                err = ext4_handle_dirty_metadata(handle, inode, bh);
 975                if (err)
 976                        goto cleanup;
 977                brelse(bh);
 978                bh = NULL;
 979
 980                /* correct old index */
 981                if (m) {
 982                        err = ext4_ext_get_access(handle, inode, path + i);
 983                        if (err)
 984                                goto cleanup;
 985                        le16_add_cpu(&path[i].p_hdr->eh_entries, -m);
 986                        err = ext4_ext_dirty(handle, inode, path + i);
 987                        if (err)
 988                                goto cleanup;
 989                }
 990
 991                i--;
 992        }
 993
 994        /* insert new index */
 995        err = ext4_ext_insert_index(handle, inode, path + at,
 996                                    le32_to_cpu(border), newblock);
 997
 998cleanup:
 999        if (bh) {
1000                if (buffer_locked(bh))
1001                        unlock_buffer(bh);
1002                brelse(bh);
1003        }
1004
1005        if (err) {
1006                /* free all allocated blocks in error case */
1007                for (i = 0; i < depth; i++) {
1008                        if (!ablocks[i])
1009                                continue;
1010                        ext4_free_blocks(handle, inode, ablocks[i], 1, 1);
1011                }
1012        }
1013        kfree(ablocks);
1014
1015        return err;
1016}
1017
1018/*
1019 * ext4_ext_grow_indepth:
1020 * implements tree growing procedure:
1021 * - allocates new block
1022 * - moves top-level data (index block or leaf) into the new block
1023 * - initializes new top-level, creating index that points to the
1024 *   just created block
1025 */
static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
					struct ext4_ext_path *path,
					struct ext4_extent *newext)
{
	struct ext4_ext_path *curp = path;	/* old root lives at path[0] */
	struct ext4_extent_header *neh;
	struct ext4_extent_idx *fidx;
	struct buffer_head *bh;
	ext4_fsblk_t newblock;
	int err = 0;

	/* allocate one metadata block to receive a copy of the current root */
	newblock = ext4_ext_new_meta_block(handle, inode, path, newext, &err);
	if (newblock == 0)
		return err;

	bh = sb_getblk(inode->i_sb, newblock);
	if (!bh) {
		err = -EIO;
		ext4_std_error(inode->i_sb, err);
		return err;
	}
	lock_buffer(bh);

	err = ext4_journal_get_create_access(handle, bh);
	if (err) {
		unlock_buffer(bh);
		goto out;
	}

	/* move top-level index/leaf into new block */
	memmove(bh->b_data, curp->p_hdr, sizeof(EXT4_I(inode)->i_data));

	/* set size of new block */
	neh = ext_block_hdr(bh);
	/* old root could have indexes or leaves
	 * so calculate e_max right way */
	if (ext_depth(inode))
		neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
	else
		neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
	neh->eh_magic = EXT4_EXT_MAGIC;
	set_buffer_uptodate(bh);
	unlock_buffer(bh);

	err = ext4_handle_dirty_metadata(handle, inode, bh);
	if (err)
		goto out;

	/* create index in new top-level index: num,max,pointer */
	err = ext4_ext_get_access(handle, inode, curp);
	if (err)
		goto out;

	/* the in-inode root now holds exactly one index entry */
	curp->p_hdr->eh_magic = EXT4_EXT_MAGIC;
	curp->p_hdr->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0));
	curp->p_hdr->eh_entries = cpu_to_le16(1);
	curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr);

	/* the old root's entries are still in place (they were copied out,
	 * not cleared), so the first entry's logical block can be reused
	 * as the starting block of the new index */
	if (path[0].p_hdr->eh_depth)
		curp->p_idx->ei_block =
			EXT_FIRST_INDEX(path[0].p_hdr)->ei_block;
	else
		curp->p_idx->ei_block =
			EXT_FIRST_EXTENT(path[0].p_hdr)->ee_block;
	ext4_idx_store_pblock(curp->p_idx, newblock);

	neh = ext_inode_hdr(inode);
	fidx = EXT_FIRST_INDEX(neh);
	ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n",
		  le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max),
		  le32_to_cpu(fidx->ei_block), idx_pblock(fidx));

	/* the whole tree just became one level deeper */
	neh->eh_depth = cpu_to_le16(path->p_depth + 1);
	err = ext4_ext_dirty(handle, inode, curp);
out:
	brelse(bh);

	return err;
}
1105
1106/*
1107 * ext4_ext_create_new_leaf:
1108 * finds empty index and adds new leaf.
1109 * if no free index is found, then it requests in-depth growing.
1110 */
static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode,
					struct ext4_ext_path *path,
					struct ext4_extent *newext)
{
	struct ext4_ext_path *curp;
	int depth, i, err = 0;

repeat:
	/* depth can change between passes (tree growth), so re-read it */
	i = depth = ext_depth(inode);

	/* walk up to the tree and look for free index entry */
	curp = path + depth;
	while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) {
		i--;
		curp--;
	}

	/* we use already allocated block for index block,
	 * so subsequent data blocks should be contiguous */
	if (EXT_HAS_FREE_INDEX(curp)) {
		/* if we found index with free entry, then use that
		 * entry: create all needed subtree and add new leaf */
		err = ext4_ext_split(handle, inode, path, newext, i);
		if (err)
			goto out;

		/* refill path: ext4_ext_find_extent reuses the passed-in
		 * path array, so the caller's pointer stays valid */
		ext4_ext_drop_refs(path);
		path = ext4_ext_find_extent(inode,
				    (ext4_lblk_t)le32_to_cpu(newext->ee_block),
				    path);
		if (IS_ERR(path))
			err = PTR_ERR(path);
	} else {
		/* tree is full, time to grow in depth */
		err = ext4_ext_grow_indepth(handle, inode, path, newext);
		if (err)
			goto out;

		/* refill path */
		ext4_ext_drop_refs(path);
		path = ext4_ext_find_extent(inode,
				   (ext4_lblk_t)le32_to_cpu(newext->ee_block),
				    path);
		if (IS_ERR(path)) {
			err = PTR_ERR(path);
			goto out;
		}

		/*
		 * only first (depth 0 -> 1) produces free space;
		 * in all other cases we have to split the grown tree
		 */
		depth = ext_depth(inode);
		if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) {
			/* now we need to split */
			goto repeat;
		}
	}

out:
	return err;
}
1174
/*
 * search the closest allocated block to the left for *logical
 * and returns it at @logical + its physical address at @phys.
 * if *logical is the smallest allocated block, the function
 * returns 0 at @phys.
 * return value contains 0 (success) or error code
 */
1182int
1183ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path,
1184                        ext4_lblk_t *logical, ext4_fsblk_t *phys)
1185{
1186        struct ext4_extent_idx *ix;
1187        struct ext4_extent *ex;
1188        int depth, ee_len;
1189
1190        BUG_ON(path == NULL);
1191        depth = path->p_depth;
1192        *phys = 0;
1193
1194        if (depth == 0 && path->p_ext == NULL)
1195                return 0;
1196
1197        /* usually extent in the path covers blocks smaller
1198         * then *logical, but it can be that extent is the
1199         * first one in the file */
1200
1201        ex = path[depth].p_ext;
1202        ee_len = ext4_ext_get_actual_len(ex);
1203        if (*logical < le32_to_cpu(ex->ee_block)) {
1204                BUG_ON(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex);
1205                while (--depth >= 0) {
1206                        ix = path[depth].p_idx;
1207                        BUG_ON(ix != EXT_FIRST_INDEX(path[depth].p_hdr));
1208                }
1209                return 0;
1210        }
1211
1212        BUG_ON(*logical < (le32_to_cpu(ex->ee_block) + ee_len));
1213
1214        *logical = le32_to_cpu(ex->ee_block) + ee_len - 1;
1215        *phys = ext_pblock(ex) + ee_len - 1;
1216        return 0;
1217}
1218
/*
 * search the closest allocated block to the right for *logical
 * and returns it at @logical + its physical address at @phys.
 * if *logical is the largest allocated block, the function
 * returns 0 at @phys.
 * return value contains 0 (success) or error code
 */
int
ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
			ext4_lblk_t *logical, ext4_fsblk_t *phys)
{
	struct buffer_head *bh = NULL;
	struct ext4_extent_header *eh;
	struct ext4_extent_idx *ix;
	struct ext4_extent *ex;
	ext4_fsblk_t block;
	int depth;	/* Note, NOT eh_depth; depth from top of tree */
	int ee_len;

	BUG_ON(path == NULL);
	depth = path->p_depth;
	*phys = 0;

	/* empty depth-0 tree: no allocated block exists at all */
	if (depth == 0 && path->p_ext == NULL)
		return 0;

	/* usually extent in the path covers blocks smaller
	 * than *logical, but it can be that extent is the
	 * first one in the file */

	ex = path[depth].p_ext;
	ee_len = ext4_ext_get_actual_len(ex);
	if (*logical < le32_to_cpu(ex->ee_block)) {
		/* first extent of the file: sanity-check that every path
		 * level points at its first entry; the found extent itself
		 * is then the closest one to the right */
		BUG_ON(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex);
		while (--depth >= 0) {
			ix = path[depth].p_idx;
			BUG_ON(ix != EXT_FIRST_INDEX(path[depth].p_hdr));
		}
		*logical = le32_to_cpu(ex->ee_block);
		*phys = ext_pblock(ex);
		return 0;
	}

	BUG_ON(*logical < (le32_to_cpu(ex->ee_block) + ee_len));

	if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) {
		/* next allocated block in this leaf */
		ex++;
		*logical = le32_to_cpu(ex->ee_block);
		*phys = ext_pblock(ex);
		return 0;
	}

	/* go up and search for index to the right */
	while (--depth >= 0) {
		ix = path[depth].p_idx;
		if (ix != EXT_LAST_INDEX(path[depth].p_hdr))
			goto got_index;
	}

	/* we've gone up to the root and found no index to the right */
	return 0;

got_index:
	/* we've found index to the right, let's
	 * follow it and find the closest allocated
	 * block to the right */
	ix++;
	block = idx_pblock(ix);
	/* descend along the leftmost branch of that subtree until we
	 * reach the level just above the leaves */
	while (++depth < path->p_depth) {
		bh = sb_bread(inode->i_sb, block);
		if (bh == NULL)
			return -EIO;
		eh = ext_block_hdr(bh);
		/* subtract from p_depth to get proper eh_depth */
		if (ext4_ext_check(inode, eh, path->p_depth - depth)) {
			put_bh(bh);
			return -EIO;
		}
		ix = EXT_FIRST_INDEX(eh);
		block = idx_pblock(ix);
		put_bh(bh);
	}

	/* read the leaf itself; its first extent is the answer */
	bh = sb_bread(inode->i_sb, block);
	if (bh == NULL)
		return -EIO;
	eh = ext_block_hdr(bh);
	if (ext4_ext_check(inode, eh, path->p_depth - depth)) {
		put_bh(bh);
		return -EIO;
	}
	ex = EXT_FIRST_EXTENT(eh);
	*logical = le32_to_cpu(ex->ee_block);
	*phys = ext_pblock(ex);
	put_bh(bh);
	return 0;
}
1317
1318/*
1319 * ext4_ext_next_allocated_block:
1320 * returns allocated block in subsequent extent or EXT_MAX_BLOCK.
1321 * NOTE: it considers block number from index entry as
1322 * allocated block. Thus, index entries have to be consistent
1323 * with leaves.
1324 */
1325static ext4_lblk_t
1326ext4_ext_next_allocated_block(struct ext4_ext_path *path)
1327{
1328        int depth;
1329
1330        BUG_ON(path == NULL);
1331        depth = path->p_depth;
1332
1333        if (depth == 0 && path->p_ext == NULL)
1334                return EXT_MAX_BLOCK;
1335
1336        while (depth >= 0) {
1337                if (depth == path->p_depth) {
1338                        /* leaf */
1339                        if (path[depth].p_ext !=
1340                                        EXT_LAST_EXTENT(path[depth].p_hdr))
1341                          return le32_to_cpu(path[depth].p_ext[1].ee_block);
1342                } else {
1343                        /* index */
1344                        if (path[depth].p_idx !=
1345                                        EXT_LAST_INDEX(path[depth].p_hdr))
1346                          return le32_to_cpu(path[depth].p_idx[1].ei_block);
1347                }
1348                depth--;
1349        }
1350
1351        return EXT_MAX_BLOCK;
1352}
1353
1354/*
1355 * ext4_ext_next_leaf_block:
1356 * returns first allocated block from next leaf or EXT_MAX_BLOCK
1357 */
1358static ext4_lblk_t ext4_ext_next_leaf_block(struct inode *inode,
1359                                        struct ext4_ext_path *path)
1360{
1361        int depth;
1362
1363        BUG_ON(path == NULL);
1364        depth = path->p_depth;
1365
1366        /* zero-tree has no leaf blocks at all */
1367        if (depth == 0)
1368                return EXT_MAX_BLOCK;
1369
1370        /* go to index block */
1371        depth--;
1372
1373        while (depth >= 0) {
1374                if (path[depth].p_idx !=
1375                                EXT_LAST_INDEX(path[depth].p_hdr))
1376                        return (ext4_lblk_t)
1377                                le32_to_cpu(path[depth].p_idx[1].ei_block);
1378                depth--;
1379        }
1380
1381        return EXT_MAX_BLOCK;
1382}
1383
1384/*
1385 * ext4_ext_correct_indexes:
1386 * if leaf gets modified and modified extent is first in the leaf,
1387 * then we have to correct all indexes above.
1388 * TODO: do we need to correct tree in all cases?
1389 */
static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
				struct ext4_ext_path *path)
{
	struct ext4_extent_header *eh;
	int depth = ext_depth(inode);
	struct ext4_extent *ex;
	__le32 border;
	int k, err = 0;

	eh = path[depth].p_hdr;
	ex = path[depth].p_ext;
	BUG_ON(ex == NULL);
	BUG_ON(eh == NULL);

	if (depth == 0) {
		/* there is no tree at all */
		return 0;
	}

	if (ex != EXT_FIRST_EXTENT(eh)) {
		/* we correct tree if first leaf got modified only */
		return 0;
	}

	/*
	 * TODO: we need correction if border is smaller than current one
	 */
	k = depth - 1;
	border = path[depth].p_ext->ee_block;
	/* update the index entry directly above the leaf first */
	err = ext4_ext_get_access(handle, inode, path + k);
	if (err)
		return err;
	path[k].p_idx->ei_block = border;
	err = ext4_ext_dirty(handle, inode, path + k);
	if (err)
		return err;

	while (k--) {
		/* change all left-side indexes */
		/* propagation stops at the first level whose index entry
		 * is not the leftmost one in its block */
		if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr))
			break;
		err = ext4_ext_get_access(handle, inode, path + k);
		if (err)
			break;
		path[k].p_idx->ei_block = border;
		err = ext4_ext_dirty(handle, inode, path + k);
		if (err)
			break;
	}

	return err;
}
1442
1443int
1444ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
1445                                struct ext4_extent *ex2)
1446{
1447        unsigned short ext1_ee_len, ext2_ee_len, max_len;
1448
1449        /*
1450         * Make sure that either both extents are uninitialized, or
1451         * both are _not_.
1452         */
1453        if (ext4_ext_is_uninitialized(ex1) ^ ext4_ext_is_uninitialized(ex2))
1454                return 0;
1455
1456        if (ext4_ext_is_uninitialized(ex1))
1457                max_len = EXT_UNINIT_MAX_LEN;
1458        else
1459                max_len = EXT_INIT_MAX_LEN;
1460
1461        ext1_ee_len = ext4_ext_get_actual_len(ex1);
1462        ext2_ee_len = ext4_ext_get_actual_len(ex2);
1463
1464        if (le32_to_cpu(ex1->ee_block) + ext1_ee_len !=
1465                        le32_to_cpu(ex2->ee_block))
1466                return 0;
1467
1468        /*
1469         * To allow future support for preallocated extents to be added
1470         * as an RO_COMPAT feature, refuse to merge to extents if
1471         * this can result in the top bit of ee_len being set.
1472         */
1473        if (ext1_ee_len + ext2_ee_len > max_len)
1474                return 0;
1475#ifdef AGGRESSIVE_TEST
1476        if (ext1_ee_len >= 4)
1477                return 0;
1478#endif
1479
1480        if (ext_pblock(ex1) + ext1_ee_len == ext_pblock(ex2))
1481                return 1;
1482        return 0;
1483}
1484
1485/*
1486 * This function tries to merge the "ex" extent to the next extent in the tree.
1487 * It always tries to merge towards right. If you want to merge towards
1488 * left, pass "ex - 1" as argument instead of "ex".
1489 * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns
1490 * 1 if they got merged.
1491 */
int ext4_ext_try_to_merge(struct inode *inode,
			  struct ext4_ext_path *path,
			  struct ext4_extent *ex)
{
	struct ext4_extent_header *eh;
	unsigned int depth, len;
	int merge_done = 0;
	int uninitialized = 0;

	depth = ext_depth(inode);
	BUG_ON(path[depth].p_hdr == NULL);
	eh = path[depth].p_hdr;

	/* keep merging ex with its successor while they qualify; ex is
	 * deliberately not advanced, so cascaded runs collapse into one */
	while (ex < EXT_LAST_EXTENT(eh)) {
		if (!ext4_can_extents_be_merged(inode, ex, ex + 1))
			break;
		/* merge with next extent! */
		if (ext4_ext_is_uninitialized(ex))
			uninitialized = 1;
		/* the summed actual lengths drop the uninit flag bit ... */
		ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
				+ ext4_ext_get_actual_len(ex + 1));
		/* ... so restore it if the extent was uninitialized */
		if (uninitialized)
			ext4_ext_mark_uninitialized(ex);

		/* close the gap left by the swallowed extent */
		if (ex + 1 < EXT_LAST_EXTENT(eh)) {
			len = (EXT_LAST_EXTENT(eh) - ex - 1)
				* sizeof(struct ext4_extent);
			memmove(ex + 1, ex + 2, len);
		}
		le16_add_cpu(&eh->eh_entries, -1);
		merge_done = 1;
		/* a leaf can never end up empty here; flag corruption */
		WARN_ON(eh->eh_entries == 0);
		if (!eh->eh_entries)
			ext4_error(inode->i_sb, "ext4_ext_try_to_merge",
			   "inode#%lu, eh->eh_entries = 0!", inode->i_ino);
	}

	return merge_done;
}
1531
1532/*
1533 * check if a portion of the "newext" extent overlaps with an
1534 * existing extent.
1535 *
1536 * If there is an overlap discovered, it updates the length of the newext
1537 * such that there will be no overlap, and then returns 1.
1538 * If there is no overlap found, it returns 0.
1539 */
1540unsigned int ext4_ext_check_overlap(struct inode *inode,
1541                                    struct ext4_extent *newext,
1542                                    struct ext4_ext_path *path)
1543{
1544        ext4_lblk_t b1, b2;
1545        unsigned int depth, len1;
1546        unsigned int ret = 0;
1547
1548        b1 = le32_to_cpu(newext->ee_block);
1549        len1 = ext4_ext_get_actual_len(newext);
1550        depth = ext_depth(inode);
1551        if (!path[depth].p_ext)
1552                goto out;
1553        b2 = le32_to_cpu(path[depth].p_ext->ee_block);
1554
1555        /*
1556         * get the next allocated block if the extent in the path
1557         * is before the requested block(s)
1558         */
1559        if (b2 < b1) {
1560                b2 = ext4_ext_next_allocated_block(path);
1561                if (b2 == EXT_MAX_BLOCK)
1562                        goto out;
1563        }
1564
1565        /* check for wrap through zero on extent logical start block*/
1566        if (b1 + len1 < b1) {
1567                len1 = EXT_MAX_BLOCK - b1;
1568                newext->ee_len = cpu_to_le16(len1);
1569                ret = 1;
1570        }
1571
1572        /* check for overlap */
1573        if (b1 + len1 > b2) {
1574                newext->ee_len = cpu_to_le16(b2 - b1);
1575                ret = 1;
1576        }
1577out:
1578        return ret;
1579}
1580
/*
 * ext4_ext_insert_extent:
 * tries to merge requested extent into the existing extent or
 * inserts requested extent as new one into the tree,
 * creating new leaf in the no-space case.
 */
1587int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1588                                struct ext4_ext_path *path,
1589                                struct ext4_extent *newext, int flag)
1590{
1591        struct ext4_extent_header *eh;
1592        struct ext4_extent *ex, *fex;
1593        struct ext4_extent *nearex; /* nearest extent */
1594        struct ext4_ext_path *npath = NULL;
1595        int depth, len, err;
1596        ext4_lblk_t next;
1597        unsigned uninitialized = 0;
1598
1599        BUG_ON(ext4_ext_get_actual_len(newext) == 0);
1600        depth = ext_depth(inode);
1601        ex = path[depth].p_ext;
1602        BUG_ON(path[depth].p_hdr == NULL);
1603
1604        /* try to insert block into found extent and return */
1605        if (ex && (flag != EXT4_GET_BLOCKS_DIO_CREATE_EXT)
1606                && ext4_can_extents_be_merged(inode, ex, newext)) {
1607                ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n",
1608                                ext4_ext_is_uninitialized(newext),
1609                                ext4_ext_get_actual_len(newext),
1610                                le32_to_cpu(ex->ee_block),
1611                                ext4_ext_is_uninitialized(ex),
1612                                ext4_ext_get_actual_len(ex), ext_pblock(ex));
1613                err = ext4_ext_get_access(handle, inode, path + depth);
1614                if (err)
1615                        return err;
1616
1617                /*
1618                 * ext4_can_extents_be_merged should have checked that either
1619                 * both extents are uninitialized, or both aren't. Thus we
1620                 * need to check only one of them here.
1621                 */
1622                if (ext4_ext_is_uninitialized(ex))
1623                        uninitialized = 1;
1624                ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
1625                                        + ext4_ext_get_actual_len(newext));
1626                if (uninitialized)
1627                        ext4_ext_mark_uninitialized(ex);
1628                eh = path[depth].p_hdr;
1629                nearex = ex;
1630                goto merge;
1631        }
1632
1633repeat:
1634        depth = ext_depth(inode);
1635        eh = path[depth].p_hdr;
1636        if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max))
1637                goto has_space;
1638
1639        /* probably next leaf has space for us? */
1640        fex = EXT_LAST_EXTENT(eh);
1641        next = ext4_ext_next_leaf_block(inode, path);
1642        if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block)
1643            && next != EXT_MAX_BLOCK) {
1644                ext_debug("next leaf block - %d\n", next);
1645                BUG_ON(npath != NULL);
1646                npath = ext4_ext_find_extent(inode, next, NULL);
1647                if (IS_ERR(npath))
1648                        return PTR_ERR(npath);
1649                BUG_ON(npath->p_depth != path->p_depth);
1650                eh = npath[depth].p_hdr;
1651                if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) {
1652                        ext_debug("next leaf isnt full(%d)\n",
1653                                  le16_to_cpu(eh->eh_entries));
1654                        path = npath;
1655                        goto repeat;
1656                }
1657                ext_debug("next leaf has no free space(%d,%d)\n",
1658                          le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
1659        }
1660
1661        /*
1662         * There is no free space in the found leaf.
1663         * We're gonna add a new leaf in the tree.
1664         */
1665        err = ext4_ext_create_new_leaf(handle, inode, path, newext);
1666        if (err)
1667                goto cleanup;
1668        depth = ext_depth(inode);
1669        eh = path[depth].p_hdr;
1670
1671has_space:
1672        nearex = path[depth].p_ext;
1673
1674        err = ext4_ext_get_access(handle, inode, path + depth);
1675        if (err)
1676                goto cleanup;
1677
1678        if (!nearex) {
1679                /* there is no extent in this leaf, create first one */
1680                ext_debug("first extent in the leaf: %d:%llu:[%d]%d\n",
1681                                le32_to_cpu(newext->ee_block),
1682                                ext_pblock(newext),
1683                                ext4_ext_is_uninitialized(newext),
1684                                ext4_ext_get_actual_len(newext));
1685                path[depth].p_ext = EXT_FIRST_EXTENT(eh);
1686        } else if (le32_to_cpu(newext->ee_block)
1687                           > le32_to_cpu(nearex->ee_block)) {
1688/*              BUG_ON(newext->ee_block == nearex->ee_block); */
1689                if (nearex != EXT_LAST_EXTENT(eh)) {
1690                        len = EXT_MAX_EXTENT(eh) - nearex;
1691                        len = (len - 1) * sizeof(struct ext4_extent);
1692                        len = len < 0 ? 0 : len;
1693                        ext_debug("insert %d:%llu:[%d]%d after: nearest 0x%p, "
1694                                        "move %d from 0x%p to 0x%p\n",
1695                                        le32_to_cpu(newext->ee_block),
1696                                        ext_pblock(newext),
1697                                        ext4_ext_is_uninitialized(newext),
1698                                        ext4_ext_get_actual_len(newext),
1699                                        nearex, len, nearex + 1, nearex + 2);
1700                        memmove(nearex + 2, nearex + 1, len);
1701                }
1702                path[depth].p_ext = nearex + 1;
1703        } else {
1704                BUG_ON(newext->ee_block == nearex->ee_block);
1705                len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext4_extent);
1706                len = len < 0 ? 0 : len;
1707                ext_debug("insert %d:%llu:[%d]%d before: nearest 0x%p, "
1708                                "move %d from 0x%p to 0x%p\n",
1709                                le32_to_cpu(newext->ee_block),
1710                                ext_pblock(newext),
1711                                ext4_ext_is_uninitialized(newext),
1712                                ext4_ext_get_actual_len(newext),
1713                                nearex, len, nearex + 1, nearex + 2);
1714                memmove(nearex + 1, nearex, len);
1715                path[depth].p_ext = nearex;
1716        }
1717
1718        le16_add_cpu(&eh->eh_entries, 1);
1719        nearex = path[depth].p_ext;
1720        nearex->ee_block = newext->ee_block;
1721        ext4_ext_store_pblock(nearex, ext_pblock(newext));
1722        nearex->ee_len = newext->ee_len;
1723
1724merge:
1725        /* try to merge extents to the right */
1726        if (flag != EXT4_GET_BLOCKS_DIO_CREATE_EXT)
1727                ext4_ext_try_to_merge(inode, path, nearex);
1728
1729        /* try to merge extents to the left */
1730
1731        /* time to correct all indexes above */
1732        err = ext4_ext_correct_indexes(handle, inode, path);
1733        if (err)
1734                goto cleanup;
1735
1736        err = ext4_ext_dirty(handle, inode, path + depth);
1737
1738cleanup:
1739        if (npath) {
1740                ext4_ext_drop_refs(npath);
1741                kfree(npath);
1742        }
1743        ext4_ext_invalidate_cache(inode);
1744        return err;
1745}
1746
/*
 * ext4_ext_walk_space:
 * walk the extent tree over logical blocks [block, block+num) and call
 * @func once per region, where a region is either a mapped extent
 * (EXT4_EXT_CACHE_EXTENT) or a hole between extents (EXT4_EXT_CACHE_GAP).
 * The callback may return EXT_REPEAT to re-examine the same range,
 * EXT_BREAK to stop the walk early, or a negative value to abort with
 * that error.  Returns 0 on success or a negative error code.
 */
int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
			ext4_lblk_t num, ext_prepare_callback func,
			void *cbdata)
{
	struct ext4_ext_path *path = NULL;
	struct ext4_ext_cache cbex;
	struct ext4_extent *ex;
	ext4_lblk_t next, start = 0, end = 0;
	ext4_lblk_t last = block + num;
	int depth, exists, err = 0;

	BUG_ON(func == NULL);
	BUG_ON(inode == NULL);

	while (block < last && block != EXT_MAX_BLOCK) {
		num = last - block;
		/* find extent for this block */
		path = ext4_ext_find_extent(inode, block, path);
		if (IS_ERR(path)) {
			err = PTR_ERR(path);
			path = NULL;
			break;
		}

		depth = ext_depth(inode);
		BUG_ON(path[depth].p_hdr == NULL);
		ex = path[depth].p_ext;
		next = ext4_ext_next_allocated_block(path);

		/* classify the region at 'block' relative to the found extent */
		exists = 0;
		if (!ex) {
			/* there is no extent yet, so try to allocate
			 * all requested space */
			start = block;
			end = block + num;
		} else if (le32_to_cpu(ex->ee_block) > block) {
			/* need to allocate space before found extent */
			start = block;
			end = le32_to_cpu(ex->ee_block);
			if (block + num < end)
				end = block + num;
		} else if (block >= le32_to_cpu(ex->ee_block)
					+ ext4_ext_get_actual_len(ex)) {
			/* need to allocate space after found extent */
			start = block;
			end = block + num;
			if (end >= next)
				end = next;
		} else if (block >= le32_to_cpu(ex->ee_block)) {
			/*
			 * some part of requested space is covered
			 * by found extent
			 */
			start = block;
			end = le32_to_cpu(ex->ee_block)
				+ ext4_ext_get_actual_len(ex);
			if (block + num < end)
				end = block + num;
			exists = 1;
		} else {
			BUG();
		}
		BUG_ON(end <= start);

		if (!exists) {
			/* report the hole [start, end) to the callback */
			cbex.ec_block = start;
			cbex.ec_len = end - start;
			cbex.ec_start = 0;
			cbex.ec_type = EXT4_EXT_CACHE_GAP;
		} else {
			/* report the mapped extent to the callback */
			cbex.ec_block = le32_to_cpu(ex->ee_block);
			cbex.ec_len = ext4_ext_get_actual_len(ex);
			cbex.ec_start = ext_pblock(ex);
			cbex.ec_type = EXT4_EXT_CACHE_EXTENT;
		}

		BUG_ON(cbex.ec_len == 0);
		err = func(inode, path, &cbex, ex, cbdata);
		ext4_ext_drop_refs(path);

		if (err < 0)
			break;

		if (err == EXT_REPEAT)
			continue;
		else if (err == EXT_BREAK) {
			err = 0;
			break;
		}

		if (ext_depth(inode) != depth) {
			/* depth was changed. we have to realloc path */
			kfree(path);
			path = NULL;
		}

		/* advance past the region just reported */
		block = cbex.ec_block + cbex.ec_len;
	}

	if (path) {
		ext4_ext_drop_refs(path);
		kfree(path);
	}

	return err;
}
1853
/*
 * ext4_ext_put_in_cache:
 * store a single extent or gap descriptor in the inode's one-entry
 * extent cache.  i_block_reservation_lock is borrowed to serialize
 * access to i_cached_extent (see ext4_ext_in_cache()).
 */
static void
ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block,
			__u32 len, ext4_fsblk_t start, int type)
{
	struct ext4_ext_cache *cex;
	BUG_ON(len == 0);
	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
	cex = &EXT4_I(inode)->i_cached_extent;
	cex->ec_type = type;
	cex->ec_block = block;
	cex->ec_len = len;
	cex->ec_start = start;
	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
}
1868
/*
 * ext4_ext_put_gap_in_cache:
 * calculate boundaries of the gap that the requested block fits into
 * and cache this gap
 *
 * @path must already be positioned at @block (ext4_ext_find_extent());
 * the gap is derived from path[depth].p_ext, which is NULL for an
 * empty tree.  @block must not fall inside a mapped extent.
 */
static void
ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
				ext4_lblk_t block)
{
	int depth = ext_depth(inode);
	unsigned long len;
	ext4_lblk_t lblock;
	struct ext4_extent *ex;

	ex = path[depth].p_ext;
	if (ex == NULL) {
		/* there is no extent yet, so gap is [0;-] */
		lblock = 0;
		len = EXT_MAX_BLOCK;
		ext_debug("cache gap(whole file):");
	} else if (block < le32_to_cpu(ex->ee_block)) {
		/* gap lies before the found extent */
		lblock = block;
		len = le32_to_cpu(ex->ee_block) - block;
		ext_debug("cache gap(before): %u [%u:%u]",
				block,
				le32_to_cpu(ex->ee_block),
				 ext4_ext_get_actual_len(ex));
	} else if (block >= le32_to_cpu(ex->ee_block)
			+ ext4_ext_get_actual_len(ex)) {
		/* gap lies between this extent and the next allocated block */
		ext4_lblk_t next;
		lblock = le32_to_cpu(ex->ee_block)
			+ ext4_ext_get_actual_len(ex);

		next = ext4_ext_next_allocated_block(path);
		ext_debug("cache gap(after): [%u:%u] %u",
				le32_to_cpu(ex->ee_block),
				ext4_ext_get_actual_len(ex),
				block);
		BUG_ON(next == lblock);
		len = next - lblock;
	} else {
		/* block falls inside the extent: caller misused this helper */
		lblock = len = 0;
		BUG();
	}

	ext_debug(" -> %u:%lu\n", lblock, len);
	ext4_ext_put_in_cache(inode, lblock, len, 0, EXT4_EXT_CACHE_GAP);
}
1917
/*
 * ext4_ext_in_cache:
 * look up @block in the inode's one-entry extent cache.  On a hit the
 * cached region is copied into @ex and the cached type is returned
 * (EXT4_EXT_CACHE_GAP or EXT4_EXT_CACHE_EXTENT); otherwise returns
 * EXT4_EXT_CACHE_NO.
 */
static int
ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
			struct ext4_extent *ex)
{
	struct ext4_ext_cache *cex;
	int ret = EXT4_EXT_CACHE_NO;

	/*
	 * We borrow i_block_reservation_lock to protect i_cached_extent
	 */
	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
	cex = &EXT4_I(inode)->i_cached_extent;

	/* has cache valid data? */
	if (cex->ec_type == EXT4_EXT_CACHE_NO)
		goto errout;

	BUG_ON(cex->ec_type != EXT4_EXT_CACHE_GAP &&
			cex->ec_type != EXT4_EXT_CACHE_EXTENT);
	if (block >= cex->ec_block && block < cex->ec_block + cex->ec_len) {
		/* hit: hand the cached region back as an on-disk extent */
		ex->ee_block = cpu_to_le32(cex->ec_block);
		ext4_ext_store_pblock(ex, cex->ec_start);
		ex->ee_len = cpu_to_le16(cex->ec_len);
		ext_debug("%u cached by %u:%u:%llu\n",
				block,
				cex->ec_block, cex->ec_len, cex->ec_start);
		ret = cex->ec_type;
	}
errout:
	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
	return ret;
}
1950
/*
 * ext4_ext_rm_idx:
 * removes index from the index block.
 * It's used in truncate case only, thus all requests are for
 * last index in the block only.
 *
 * @path points at the level BELOW the index being removed; the
 * function steps one level up (path--), drops the last index entry
 * there under journal protection, then forgets and frees the
 * now-unused index block on disk.
 */
static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
			struct ext4_ext_path *path)
{
	struct buffer_head *bh;
	int err;
	ext4_fsblk_t leaf;

	/* free index block */
	path--;
	leaf = idx_pblock(path->p_idx);
	BUG_ON(path->p_hdr->eh_entries == 0);
	err = ext4_ext_get_access(handle, inode, path);
	if (err)
		return err;
	le16_add_cpu(&path->p_hdr->eh_entries, -1);
	err = ext4_ext_dirty(handle, inode, path);
	if (err)
		return err;
	ext_debug("index is empty, remove it, free block %llu\n", leaf);
	bh = sb_find_get_block(inode->i_sb, leaf);
	ext4_forget(handle, 1, inode, bh, leaf);
	/* metadata flag set: the freed block is extent-tree metadata */
	ext4_free_blocks(handle, inode, leaf, 1, 1);
	return err;
}
1981
1982/*
1983 * ext4_ext_calc_credits_for_single_extent:
1984 * This routine returns max. credits that needed to insert an extent
1985 * to the extent tree.
1986 * When pass the actual path, the caller should calculate credits
1987 * under i_data_sem.
1988 */
1989int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks,
1990                                                struct ext4_ext_path *path)
1991{
1992        if (path) {
1993                int depth = ext_depth(inode);
1994                int ret = 0;
1995
1996                /* probably there is space in leaf? */
1997                if (le16_to_cpu(path[depth].p_hdr->eh_entries)
1998                                < le16_to_cpu(path[depth].p_hdr->eh_max)) {
1999
2000                        /*
2001                         *  There are some space in the leaf tree, no
2002                         *  need to account for leaf block credit
2003                         *
2004                         *  bitmaps and block group descriptor blocks
2005                         *  and other metadat blocks still need to be
2006                         *  accounted.
2007                         */
2008                        /* 1 bitmap, 1 block group descriptor */
2009                        ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb);
2010                        return ret;
2011                }
2012        }
2013
2014        return ext4_chunk_trans_blocks(inode, nrblocks);
2015}
2016
/*
 * How many index/leaf blocks need to change/allocate to modify nrblocks?
 *
 * If nrblocks fit in a single extent (chunk flag is 1), then in the
 * worst case each tree level's index/leaf needs to be changed, and if
 * the tree splits on insert the old index/leaf must be updated too.
 *
 * Discontiguous nrblocks could split the whole tree more than once,
 * but that is really rare; budget one extra change per level.
 */
int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
{
	/* two blocks per level for a contiguous chunk, three otherwise */
	return ext_depth(inode) * (chunk ? 2 : 3);
}
2040
/*
 * ext4_remove_blocks:
 * release the on-disk blocks backing logical range [from, to] of
 * extent @ex.  Only whole-extent and tail removal are implemented;
 * head and interior removals are logged and left alone.  Blocks of
 * directories and symlinks are freed with the metadata flag set.
 */
static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
				struct ext4_extent *ex,
				ext4_lblk_t from, ext4_lblk_t to)
{
	struct buffer_head *bh;
	unsigned short ee_len =  ext4_ext_get_actual_len(ex);
	int i, metadata = 0;

	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
		metadata = 1;
#ifdef EXTENTS_STATS
	{
		struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
		spin_lock(&sbi->s_ext_stats_lock);
		sbi->s_ext_blocks += ee_len;
		sbi->s_ext_extents++;
		if (ee_len < sbi->s_ext_min)
			sbi->s_ext_min = ee_len;
		if (ee_len > sbi->s_ext_max)
			sbi->s_ext_max = ee_len;
		if (ext_depth(inode) > sbi->s_depth_max)
			sbi->s_depth_max = ext_depth(inode);
		spin_unlock(&sbi->s_ext_stats_lock);
	}
#endif
	if (from >= le32_to_cpu(ex->ee_block)
	    && to == le32_to_cpu(ex->ee_block) + ee_len - 1) {
		/* tail removal */
		ext4_lblk_t num;
		ext4_fsblk_t start;

		num = le32_to_cpu(ex->ee_block) + ee_len - from;
		start = ext_pblock(ex) + ee_len - num;
		ext_debug("free last %u blocks starting %llu\n", num, start);
		/* forget any cached buffers before releasing the blocks */
		for (i = 0; i < num; i++) {
			bh = sb_find_get_block(inode->i_sb, start + i);
			ext4_forget(handle, 0, inode, bh, start + i);
		}
		ext4_free_blocks(handle, inode, start, num, metadata);
	} else if (from == le32_to_cpu(ex->ee_block)
		   && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) {
		/* head removal: not supported */
		printk(KERN_INFO "strange request: removal %u-%u from %u:%u\n",
			from, to, le32_to_cpu(ex->ee_block), ee_len);
	} else {
		/* interior removal: not supported either */
		printk(KERN_INFO "strange request: removal(2) "
				"%u-%u from %u:%u\n",
				from, to, le32_to_cpu(ex->ee_block), ee_len);
	}
	return 0;
}
2091
/*
 * ext4_ext_rm_leaf:
 * remove from a leaf block all extents (or extent tails) that lie at
 * or beyond logical block @start.  Walks the leaf from its last extent
 * backwards, extending/restarting the journal handle as needed, and
 * removes this leaf from the index above when it becomes empty.
 */
static int
ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
		struct ext4_ext_path *path, ext4_lblk_t start)
{
	int err = 0, correct_index = 0;
	int depth = ext_depth(inode), credits;
	struct ext4_extent_header *eh;
	ext4_lblk_t a, b, block;
	unsigned num;
	ext4_lblk_t ex_ee_block;
	unsigned short ex_ee_len;
	unsigned uninitialized = 0;
	struct ext4_extent *ex;

	/* the header must be checked already in ext4_ext_remove_space() */
	ext_debug("truncate since %u in leaf\n", start);
	if (!path[depth].p_hdr)
		path[depth].p_hdr = ext_block_hdr(path[depth].p_bh);
	eh = path[depth].p_hdr;
	BUG_ON(eh == NULL);

	/* find where to start removing */
	ex = EXT_LAST_EXTENT(eh);

	ex_ee_block = le32_to_cpu(ex->ee_block);
	ex_ee_len = ext4_ext_get_actual_len(ex);

	/* walk backwards while the extent overlaps [start, ...) */
	while (ex >= EXT_FIRST_EXTENT(eh) &&
			ex_ee_block + ex_ee_len > start) {

		if (ext4_ext_is_uninitialized(ex))
			uninitialized = 1;
		else
			uninitialized = 0;

		ext_debug("remove ext %u:[%d]%d\n", ex_ee_block,
			 uninitialized, ex_ee_len);
		path[depth].p_ext = ex;

		/* [a, b] is the portion of this extent being removed */
		a = ex_ee_block > start ? ex_ee_block : start;
		b = ex_ee_block + ex_ee_len - 1 < EXT_MAX_BLOCK ?
			ex_ee_block + ex_ee_len - 1 : EXT_MAX_BLOCK;

		ext_debug("  border %u:%u\n", a, b);

		if (a != ex_ee_block && b != ex_ee_block + ex_ee_len - 1) {
			/* interior removal: unsupported */
			block = 0;
			num = 0;
			BUG();
		} else if (a != ex_ee_block) {
			/* remove tail of the extent */
			block = ex_ee_block;
			num = a - block;
		} else if (b != ex_ee_block + ex_ee_len - 1) {
			/* remove head of the extent */
			block = a;
			num = b - a;
			/* there is no "make a hole" API yet */
			BUG();
		} else {
			/* remove whole extent: excellent! */
			block = ex_ee_block;
			num = 0;
			BUG_ON(a != ex_ee_block);
			BUG_ON(b != ex_ee_block + ex_ee_len - 1);
		}

		/*
		 * 3 for leaf, sb, and inode plus 2 (bmap and group
		 * descriptor) for each block group; assume two block
		 * groups plus ex_ee_len/blocks_per_block_group for
		 * the worst case
		 */
		credits = 7 + 2*(ex_ee_len/EXT4_BLOCKS_PER_GROUP(inode->i_sb));
		if (ex == EXT_FIRST_EXTENT(eh)) {
			/* removing the first extent means fixing the index */
			correct_index = 1;
			credits += (ext_depth(inode)) + 1;
		}
		credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);

		err = ext4_ext_truncate_extend_restart(handle, inode, credits);
		if (err)
			goto out;

		err = ext4_ext_get_access(handle, inode, path + depth);
		if (err)
			goto out;

		err = ext4_remove_blocks(handle, inode, ex, a, b);
		if (err)
			goto out;

		if (num == 0) {
			/* this extent is removed; mark slot entirely unused */
			ext4_ext_store_pblock(ex, 0);
			le16_add_cpu(&eh->eh_entries, -1);
		}

		ex->ee_block = cpu_to_le32(block);
		ex->ee_len = cpu_to_le16(num);
		/*
		 * Do not mark uninitialized if all the blocks in the
		 * extent have been removed.
		 */
		if (uninitialized && num)
			ext4_ext_mark_uninitialized(ex);

		err = ext4_ext_dirty(handle, inode, path + depth);
		if (err)
			goto out;

		ext_debug("new extent: %u:%u:%llu\n", block, num,
				ext_pblock(ex));
		/* step to the previous extent in this leaf */
		ex--;
		ex_ee_block = le32_to_cpu(ex->ee_block);
		ex_ee_len = ext4_ext_get_actual_len(ex);
	}

	if (correct_index && eh->eh_entries)
		err = ext4_ext_correct_indexes(handle, inode, path);

	/* if this leaf is free, then we should
	 * remove it from index block above */
	if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL)
		err = ext4_ext_rm_idx(handle, inode, path + depth);

out:
	return err;
}
2221
2222/*
2223 * ext4_ext_more_to_rm:
2224 * returns 1 if current index has to be freed (even partial)
2225 */
2226static int
2227ext4_ext_more_to_rm(struct ext4_ext_path *path)
2228{
2229        BUG_ON(path->p_idx == NULL);
2230
2231        if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr))
2232                return 0;
2233
2234        /*
2235         * if truncate on deeper level happened, it wasn't partial,
2236         * so we have to consider current index for truncation
2237         */
2238        if (le16_to_cpu(path->p_hdr->eh_entries) == path->p_block)
2239                return 0;
2240        return 1;
2241}
2242
/*
 * ext4_ext_remove_space:
 * truncate the extent tree: free every block at or beyond logical
 * block @start, walking the tree depth-first from the right, and
 * shrink the tree back to depth 0 if everything was freed.
 */
static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
{
	struct super_block *sb = inode->i_sb;
	int depth = ext_depth(inode);
	struct ext4_ext_path *path;
	handle_t *handle;
	int i = 0, err = 0;

	ext_debug("truncate since %u\n", start);

	/* probably first extent we're gonna free will be last in block */
	handle = ext4_journal_start(inode, depth + 1);
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	ext4_ext_invalidate_cache(inode);

	/*
	 * We start scanning from right side, freeing all the blocks
	 * after i_size and walking into the tree depth-wise.
	 */
	path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), GFP_NOFS);
	if (path == NULL) {
		ext4_journal_stop(handle);
		return -ENOMEM;
	}
	path[0].p_hdr = ext_inode_hdr(inode);
	if (ext4_ext_check(inode, path[0].p_hdr, depth)) {
		err = -EIO;
		goto out;
	}
	path[0].p_depth = depth;

	/* i is the tree level currently being processed (0 == root) */
	while (i >= 0 && err == 0) {
		if (i == depth) {
			/* this is leaf block */
			err = ext4_ext_rm_leaf(handle, inode, path, start);
			/* root level has p_bh == NULL, brelse() eats this */
			brelse(path[i].p_bh);
			path[i].p_bh = NULL;
			i--;
			continue;
		}

		/* this is index block */
		if (!path[i].p_hdr) {
			ext_debug("initialize header\n");
			path[i].p_hdr = ext_block_hdr(path[i].p_bh);
		}

		if (!path[i].p_idx) {
			/* this level hasn't been touched yet */
			path[i].p_idx = EXT_LAST_INDEX(path[i].p_hdr);
			path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries)+1;
			ext_debug("init index ptr: hdr 0x%p, num %d\n",
				  path[i].p_hdr,
				  le16_to_cpu(path[i].p_hdr->eh_entries));
		} else {
			/* we were already here, see at next index */
			path[i].p_idx--;
		}

		ext_debug("level %d - index, first 0x%p, cur 0x%p\n",
				i, EXT_FIRST_INDEX(path[i].p_hdr),
				path[i].p_idx);
		if (ext4_ext_more_to_rm(path + i)) {
			struct buffer_head *bh;
			/* go to the next level */
			ext_debug("move to level %d (block %llu)\n",
				  i + 1, idx_pblock(path[i].p_idx));
			memset(path + i + 1, 0, sizeof(*path));
			bh = sb_bread(sb, idx_pblock(path[i].p_idx));
			if (!bh) {
				/* should we reset i_size? */
				err = -EIO;
				break;
			}
			if (WARN_ON(i + 1 > depth)) {
				err = -EIO;
				break;
			}
			/* validate the child block before descending into it */
			if (ext4_ext_check(inode, ext_block_hdr(bh),
							depth - i - 1)) {
				err = -EIO;
				break;
			}
			path[i + 1].p_bh = bh;

			/* save actual number of indexes since this
			 * number is changed at the next iteration */
			path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries);
			i++;
		} else {
			/* we finished processing this index, go up */
			if (path[i].p_hdr->eh_entries == 0 && i > 0) {
				/* index is empty, remove it;
				 * handle must be already prepared by the
				 * truncatei_leaf() */
				err = ext4_ext_rm_idx(handle, inode, path + i);
			}
			/* root level has p_bh == NULL, brelse() eats this */
			brelse(path[i].p_bh);
			path[i].p_bh = NULL;
			i--;
			ext_debug("return to level %d\n", i);
		}
	}

	/* TODO: flexible tree reduction should be here */
	if (path->p_hdr->eh_entries == 0) {
		/*
		 * truncate to zero freed all the tree,
		 * so we need to correct eh_depth
		 */
		err = ext4_ext_get_access(handle, inode, path);
		if (err == 0) {
			ext_inode_hdr(inode)->eh_depth = 0;
			ext_inode_hdr(inode)->eh_max =
				cpu_to_le16(ext4_ext_space_root(inode, 0));
			err = ext4_ext_dirty(handle, inode, path);
		}
	}
out:
	ext4_ext_drop_refs(path);
	kfree(path);
	ext4_journal_stop(handle);

	return err;
}
2372
/*
 * ext4_ext_init:
 * called at mount time; prints the debug-build feature banner and
 * initializes extent statistics when EXTENTS_STATS is compiled in.
 */
void ext4_ext_init(struct super_block *sb)
{
	/*
	 * possible initialization would be here
	 */

	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
#if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS)
		printk(KERN_INFO "EXT4-fs: file extents enabled");
#ifdef AGGRESSIVE_TEST
		printk(", aggressive tests");
#endif
#ifdef CHECK_BINSEARCH
		printk(", check binsearch");
#endif
#ifdef EXTENTS_STATS
		printk(", stats");
#endif
		printk("\n");
#endif
#ifdef EXTENTS_STATS
		/* start with an impossible minimum so the first extent wins */
		spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock);
		EXT4_SB(sb)->s_ext_min = 1 << 30;
		EXT4_SB(sb)->s_ext_max = 0;
#endif
	}
}
2403
/*
 * ext4_ext_release:
 * called at umount time; dumps accumulated extent statistics when
 * EXTENTS_STATS is compiled in.
 */
void ext4_ext_release(struct super_block *sb)
{
	if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS))
		return;

#ifdef EXTENTS_STATS
	if (EXT4_SB(sb)->s_ext_blocks && EXT4_SB(sb)->s_ext_extents) {
		struct ext4_sb_info *sbi = EXT4_SB(sb);
		printk(KERN_ERR "EXT4-fs: %lu blocks in %lu extents (%lu ave)\n",
			sbi->s_ext_blocks, sbi->s_ext_extents,
			sbi->s_ext_blocks / sbi->s_ext_extents);
		printk(KERN_ERR "EXT4-fs: extents: %lu min, %lu max, max depth %lu\n",
			sbi->s_ext_min, sbi->s_ext_max, sbi->s_depth_max);
	}
#endif
}
2423
/*
 * bi_complete:
 * bio end_io callback used by ext4_ext_zeroout(); wakes the waiter
 * blocked in wait_for_completion().
 */
static void bi_complete(struct bio *bio, int error)
{
	complete((struct completion *)bio->bi_private);
}
2428
2429/* FIXME!! we need to try to merge to left or right after zero-out  */
2430static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
2431{
2432        int ret = -EIO;
2433        struct bio *bio;
2434        int blkbits, blocksize;
2435        sector_t ee_pblock;
2436        struct completion event;
2437        unsigned int ee_len, len, done, offset;
2438
2439
2440        blkbits   = inode->i_blkbits;
2441        blocksize = inode->i_sb->s_blocksize;
2442        ee_len    = ext4_ext_get_actual_len(ex);
2443        ee_pblock = ext_pblock(ex);
2444
2445        /* convert ee_pblock to 512 byte sectors */
2446        ee_pblock = ee_pblock << (blkbits - 9);
2447
2448        while (ee_len > 0) {
2449
2450                if (ee_len > BIO_MAX_PAGES)
2451                        len = BIO_MAX_PAGES;
2452                else
2453                        len = ee_len;
2454
2455                bio = bio_alloc(GFP_NOIO, len);
2456                bio->bi_sector = ee_pblock;
2457                bio->bi_bdev   = inode->i_sb->s_bdev;
2458
2459                done = 0;
2460                offset = 0;
2461                while (done < len) {
2462                        ret = bio_add_page(bio, ZERO_PAGE(0),
2463                                                        blocksize, offset);
2464                        if (ret != blocksize) {
2465                                /*
2466                                 * We can't add any more pages because of
2467                                 * hardware limitations.  Start a new bio.
2468                                 */
2469                                break;
2470                        }
2471                        done++;
2472                        offset += blocksize;
2473                        if (offset >= PAGE_CACHE_SIZE)
2474                                offset = 0;
2475                }
2476
2477                init_completion(&event);
2478                bio->bi_private = &event;
2479                bio->bi_end_io = bi_complete;
2480                submit_bio(WRITE, bio);
2481                wait_for_completion(&event);
2482
2483                if (test_bit(BIO_UPTODATE, &bio->bi_flags))
2484                        ret = 0;
2485                else {
2486                        ret = -EIO;
2487                        break;
2488                }
2489                bio_put(bio);
2490                ee_len    -= done;
2491                ee_pblock += done  << (blkbits - 9);
2492        }
2493        return ret;
2494}
2495
/* Extents no longer than this are zeroed out instead of being split */
#define EXT4_EXT_ZERO_LEN 7
/*
 * This function is called by ext4_ext_get_blocks() if someone tries to write
 * to an uninitialized extent. It may result in splitting the uninitialized
 * extent into multiple extents (up to three - one initialized and two
 * uninitialized).
 * There are three possibilities:
 *   a> There is no split required: Entire extent should be initialized
 *   b> Splits in two extents: Write is happening at either end of the extent
 *   c> Splits in three extents: Someone is writing in middle of the extent
 *
 * Returns the number of blocks made available at @iblock on success,
 * or a negative errno on failure.  On any failure path the original
 * extent is restored (see fix_extent_len).
 */
static int ext4_ext_convert_to_initialized(handle_t *handle,
						struct inode *inode,
						struct ext4_ext_path *path,
						ext4_lblk_t iblock,
						unsigned int max_blocks)
{
	struct ext4_extent *ex, newex, orig_ex;
	struct ext4_extent *ex1 = NULL;
	struct ext4_extent *ex2 = NULL;
	struct ext4_extent *ex3 = NULL;
	struct ext4_extent_header *eh;
	ext4_lblk_t ee_block;
	unsigned int allocated, ee_len, depth;
	ext4_fsblk_t newblock;
	int err = 0;
	int ret = 0;

	depth = ext_depth(inode);
	eh = path[depth].p_hdr;
	ex = path[depth].p_ext;
	ee_block = le32_to_cpu(ex->ee_block);
	ee_len = ext4_ext_get_actual_len(ex);
	/* blocks remaining in this extent from iblock to its end */
	allocated = ee_len - (iblock - ee_block);
	newblock = iblock - ee_block + ext_pblock(ex);
	ex2 = ex;
	/*
	 * Keep a pristine copy of the extent; it is used both to restore
	 * the extent on error and as the target of wholesale zero-outs.
	 */
	orig_ex.ee_block = ex->ee_block;
	orig_ex.ee_len   = cpu_to_le16(ee_len);
	ext4_ext_store_pblock(&orig_ex, ext_pblock(ex));

	err = ext4_ext_get_access(handle, inode, path + depth);
	if (err)
		goto out;
	/* If extent has less than 2*EXT4_EXT_ZERO_LEN blocks, zero out
	 * directly - cheaper than splitting */
	if (ee_len <= 2*EXT4_EXT_ZERO_LEN) {
		err =  ext4_ext_zeroout(inode, &orig_ex);
		if (err)
			goto fix_extent_len;
		/* update the extent length and mark as initialized */
		ex->ee_block = orig_ex.ee_block;
		ex->ee_len   = orig_ex.ee_len;
		ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
		ext4_ext_dirty(handle, inode, path + depth);
		/* zeroed the full extent */
		return allocated;
	}

	/* ex1: ee_block to iblock - 1 : uninitialized */
	if (iblock > ee_block) {
		ex1 = ex;
		ex1->ee_len = cpu_to_le16(iblock - ee_block);
		ext4_ext_mark_uninitialized(ex1);
		ex2 = &newex;
	}
	/*
	 * for sanity, update the length of the ex2 extent before
	 * we insert ex3, if ex1 is NULL. This is to avoid temporary
	 * overlap of blocks.
	 */
	if (!ex1 && allocated > max_blocks)
		ex2->ee_len = cpu_to_le16(max_blocks);
	/* ex3: to ee_block + ee_len : uninitialised */
	if (allocated > max_blocks) {
		unsigned int newdepth;
		/* If extent has less than EXT4_EXT_ZERO_LEN blocks, zero out
		 * the tail directly instead of inserting a third extent */
		if (allocated <= EXT4_EXT_ZERO_LEN) {
			/*
			 * iblock == ee_block is handled by the zeroout
			 * at the beginning.
			 * Mark first half uninitialized.
			 * Mark second half initialized and zero out the
			 * initialized extent
			 */
			ex->ee_block = orig_ex.ee_block;
			ex->ee_len   = cpu_to_le16(ee_len - allocated);
			ext4_ext_mark_uninitialized(ex);
			ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
			ext4_ext_dirty(handle, inode, path + depth);

			ex3 = &newex;
			ex3->ee_block = cpu_to_le32(iblock);
			ext4_ext_store_pblock(ex3, newblock);
			ex3->ee_len = cpu_to_le16(allocated);
			err = ext4_ext_insert_extent(handle, inode, path,
							ex3, 0);
			if (err == -ENOSPC) {
				/* no room to split: zero the whole extent */
				err =  ext4_ext_zeroout(inode, &orig_ex);
				if (err)
					goto fix_extent_len;
				ex->ee_block = orig_ex.ee_block;
				ex->ee_len   = orig_ex.ee_len;
				ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
				ext4_ext_dirty(handle, inode, path + depth);
				/* blocks available from iblock */
				return allocated;

			} else if (err)
				goto fix_extent_len;

			/*
			 * We need to zero out the second half because
			 * an fallocate request can update file size and
			 * converting the second half to initialized extent
			 * implies that we can leak some junk data to user
			 * space.
			 */
			err =  ext4_ext_zeroout(inode, ex3);
			if (err) {
				/*
				 * We should actually mark the
				 * second half as uninit and return error
				 * Insert would have changed the extent
				 */
				depth = ext_depth(inode);
				ext4_ext_drop_refs(path);
				path = ext4_ext_find_extent(inode,
								iblock, path);
				if (IS_ERR(path)) {
					err = PTR_ERR(path);
					return err;
				}
				/* get the second half extent details */
				ex = path[depth].p_ext;
				err = ext4_ext_get_access(handle, inode,
								path + depth);
				if (err)
					return err;
				ext4_ext_mark_uninitialized(ex);
				ext4_ext_dirty(handle, inode, path + depth);
				return err;
			}

			/* zeroed the second half */
			return allocated;
		}
		/* general case: insert the uninitialized tail as ex3 */
		ex3 = &newex;
		ex3->ee_block = cpu_to_le32(iblock + max_blocks);
		ext4_ext_store_pblock(ex3, newblock + max_blocks);
		ex3->ee_len = cpu_to_le16(allocated - max_blocks);
		ext4_ext_mark_uninitialized(ex3);
		err = ext4_ext_insert_extent(handle, inode, path, ex3, 0);
		if (err == -ENOSPC) {
			err =  ext4_ext_zeroout(inode, &orig_ex);
			if (err)
				goto fix_extent_len;
			/* update the extent length and mark as initialized */
			ex->ee_block = orig_ex.ee_block;
			ex->ee_len   = orig_ex.ee_len;
			ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
			ext4_ext_dirty(handle, inode, path + depth);
			/* zeroed the full extent */
			/* blocks available from iblock */
			return allocated;

		} else if (err)
			goto fix_extent_len;
		/*
		 * The depth, and hence eh & ex might change
		 * as part of the insert above.
		 */
		newdepth = ext_depth(inode);
		/*
		 * update the extent length after successful insert of the
		 * split extent
		 */
		orig_ex.ee_len = cpu_to_le16(ee_len -
						ext4_ext_get_actual_len(ex3));
		depth = newdepth;
		/* re-walk the tree: the insert may have moved our leaf */
		ext4_ext_drop_refs(path);
		path = ext4_ext_find_extent(inode, iblock, path);
		if (IS_ERR(path)) {
			err = PTR_ERR(path);
			goto out;
		}
		eh = path[depth].p_hdr;
		ex = path[depth].p_ext;
		if (ex2 != &newex)
			ex2 = ex;

		err = ext4_ext_get_access(handle, inode, path + depth);
		if (err)
			goto out;

		allocated = max_blocks;

		/* If extent has less than EXT4_EXT_ZERO_LEN and we are trying
		 * to insert an extent in the middle, zero out directly,
		 * otherwise give the extent a chance to merge to left
		 */
		if (le16_to_cpu(orig_ex.ee_len) <= EXT4_EXT_ZERO_LEN &&
							iblock != ee_block) {
			err =  ext4_ext_zeroout(inode, &orig_ex);
			if (err)
				goto fix_extent_len;
			/* update the extent length and mark as initialized */
			ex->ee_block = orig_ex.ee_block;
			ex->ee_len   = orig_ex.ee_len;
			ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
			ext4_ext_dirty(handle, inode, path + depth);
			/* zero out the first half */
			/* blocks available from iblock */
			return allocated;
		}
	}
	/*
	 * If there was a change of depth as part of the
	 * insertion of ex3 above, we need to update the length
	 * of the ex1 extent again here
	 */
	if (ex1 && ex1 != ex) {
		ex1 = ex;
		ex1->ee_len = cpu_to_le16(iblock - ee_block);
		ext4_ext_mark_uninitialized(ex1);
		ex2 = &newex;
	}
	/* ex2: iblock to iblock + maxblocks-1 : initialised */
	ex2->ee_block = cpu_to_le32(iblock);
	ext4_ext_store_pblock(ex2, newblock);
	ex2->ee_len = cpu_to_le16(allocated);
	if (ex2 != ex)
		goto insert;
	/*
	 * New (initialized) extent starts from the first block
	 * in the current extent. i.e., ex2 == ex
	 * We have to see if it can be merged with the extent
	 * on the left.
	 */
	if (ex2 > EXT_FIRST_EXTENT(eh)) {
		/*
		 * To merge left, pass "ex2 - 1" to try_to_merge(),
		 * since it merges towards right _only_.
		 */
		ret = ext4_ext_try_to_merge(inode, path, ex2 - 1);
		if (ret) {
			err = ext4_ext_correct_indexes(handle, inode, path);
			if (err)
				goto out;
			depth = ext_depth(inode);
			ex2--;
		}
	}
	/*
	 * Try to Merge towards right. This might be required
	 * only when the whole extent is being written to.
	 * i.e. ex2 == ex and ex3 == NULL.
	 */
	if (!ex3) {
		ret = ext4_ext_try_to_merge(inode, path, ex2);
		if (ret) {
			err = ext4_ext_correct_indexes(handle, inode, path);
			if (err)
				goto out;
		}
	}
	/* Mark modified extent as dirty */
	err = ext4_ext_dirty(handle, inode, path + depth);
	goto out;
insert:
	err = ext4_ext_insert_extent(handle, inode, path, &newex, 0);
	if (err == -ENOSPC) {
		err =  ext4_ext_zeroout(inode, &orig_ex);
		if (err)
			goto fix_extent_len;
		/* update the extent length and mark as initialized */
		ex->ee_block = orig_ex.ee_block;
		ex->ee_len   = orig_ex.ee_len;
		ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
		ext4_ext_dirty(handle, inode, path + depth);
		/* zero out the first half */
		return allocated;
	} else if (err)
		goto fix_extent_len;
out:
	ext4_ext_show_leaf(inode, path);
	return err ? err : allocated;

fix_extent_len:
	/* error path: restore the original extent and re-mark it uninit */
	ex->ee_block = orig_ex.ee_block;
	ex->ee_len   = orig_ex.ee_len;
	ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
	ext4_ext_mark_uninitialized(ex);
	ext4_ext_dirty(handle, inode, path + depth);
	return err;
}
2790
/*
 * This function is called by ext4_ext_get_blocks() from
 * ext4_get_blocks_dio_write() when DIO is used to write
 * to an uninitialized extent.
 *
 * Writing to an uninitialized extent may result in splitting the
 * uninitialized extent into multiple initialized/uninitialized extents
 * (up to three).
 * There are three possibilities:
 *   a> There is no split required: Entire extent should be uninitialized
 *   b> Splits in two extents: Write is happening at either end of the extent
 *   c> Splits in three extents: Someone is writing in middle of the extent
 *
 * One or more index blocks may be needed if the extent tree grows after
 * the uninitialized extent split. To prevent ENOSPC occurring at the IO
 * completion, we need to split the uninitialized extent before DIO submits
 * the IO. The uninitialized extent is split here into (at most) three
 * uninitialized extents. After IO completes, the part being filled will be
 * converted to initialized by the end_io callback function
 * via ext4_convert_unwritten_extents().
 *
 * Returns the size of uninitialized extent to be written on success,
 * or a negative errno on failure.
 */
static int ext4_split_unwritten_extents(handle_t *handle,
					struct inode *inode,
					struct ext4_ext_path *path,
					ext4_lblk_t iblock,
					unsigned int max_blocks,
					int flags)
{
	struct ext4_extent *ex, newex, orig_ex;
	struct ext4_extent *ex1 = NULL;
	struct ext4_extent *ex2 = NULL;
	struct ext4_extent *ex3 = NULL;
	struct ext4_extent_header *eh;
	ext4_lblk_t ee_block;
	unsigned int allocated, ee_len, depth;
	ext4_fsblk_t newblock;
	int err = 0;

	ext_debug("ext4_split_unwritten_extents: inode %lu,"
		  "iblock %llu, max_blocks %u\n", inode->i_ino,
		  (unsigned long long)iblock, max_blocks);
	depth = ext_depth(inode);
	eh = path[depth].p_hdr;
	ex = path[depth].p_ext;
	ee_block = le32_to_cpu(ex->ee_block);
	ee_len = ext4_ext_get_actual_len(ex);
	/* blocks remaining in this extent from iblock to its end */
	allocated = ee_len - (iblock - ee_block);
	newblock = iblock - ee_block + ext_pblock(ex);
	ex2 = ex;
	/*
	 * Keep a pristine copy of the extent so it can be restored on
	 * error (fix_extent_len) or zeroed wholesale on ENOSPC.
	 */
	orig_ex.ee_block = ex->ee_block;
	orig_ex.ee_len   = cpu_to_le16(ee_len);
	ext4_ext_store_pblock(&orig_ex, ext_pblock(ex));

	/*
	 * If the uninitialized extent begins at the same logical
	 * block where the write begins, and the write completely
	 * covers the extent, then we don't need to split it.
	 */
	if ((iblock == ee_block) && (allocated <= max_blocks))
		return allocated;

	err = ext4_ext_get_access(handle, inode, path + depth);
	if (err)
		goto out;
	/* ex1: ee_block to iblock - 1 : uninitialized */
	if (iblock > ee_block) {
		ex1 = ex;
		ex1->ee_len = cpu_to_le16(iblock - ee_block);
		ext4_ext_mark_uninitialized(ex1);
		ex2 = &newex;
	}
	/*
	 * for sanity, update the length of the ex2 extent before
	 * we insert ex3, if ex1 is NULL. This is to avoid temporary
	 * overlap of blocks.
	 */
	if (!ex1 && allocated > max_blocks)
		ex2->ee_len = cpu_to_le16(max_blocks);
	/* ex3: to ee_block + ee_len : uninitialised */
	if (allocated > max_blocks) {
		unsigned int newdepth;
		ex3 = &newex;
		ex3->ee_block = cpu_to_le32(iblock + max_blocks);
		ext4_ext_store_pblock(ex3, newblock + max_blocks);
		ex3->ee_len = cpu_to_le16(allocated - max_blocks);
		ext4_ext_mark_uninitialized(ex3);
		err = ext4_ext_insert_extent(handle, inode, path, ex3, flags);
		if (err == -ENOSPC) {
			/* no room to split: fall back to zeroing it all */
			err =  ext4_ext_zeroout(inode, &orig_ex);
			if (err)
				goto fix_extent_len;
			/* update the extent length and mark as initialized */
			ex->ee_block = orig_ex.ee_block;
			ex->ee_len   = orig_ex.ee_len;
			ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
			ext4_ext_dirty(handle, inode, path + depth);
			/* zeroed the full extent */
			/* blocks available from iblock */
			return allocated;

		} else if (err)
			goto fix_extent_len;
		/*
		 * The depth, and hence eh & ex might change
		 * as part of the insert above.
		 */
		newdepth = ext_depth(inode);
		/*
		 * update the extent length after successful insert of the
		 * split extent
		 */
		orig_ex.ee_len = cpu_to_le16(ee_len -
						ext4_ext_get_actual_len(ex3));
		depth = newdepth;
		/* re-walk the tree: the insert may have moved our leaf */
		ext4_ext_drop_refs(path);
		path = ext4_ext_find_extent(inode, iblock, path);
		if (IS_ERR(path)) {
			err = PTR_ERR(path);
			goto out;
		}
		eh = path[depth].p_hdr;
		ex = path[depth].p_ext;
		if (ex2 != &newex)
			ex2 = ex;

		err = ext4_ext_get_access(handle, inode, path + depth);
		if (err)
			goto out;

		allocated = max_blocks;
	}
	/*
	 * If there was a change of depth as part of the
	 * insertion of ex3 above, we need to update the length
	 * of the ex1 extent again here
	 */
	if (ex1 && ex1 != ex) {
		ex1 = ex;
		ex1->ee_len = cpu_to_le16(iblock - ee_block);
		ext4_ext_mark_uninitialized(ex1);
		ex2 = &newex;
	}
	/*
	 * ex2: iblock to iblock + maxblocks-1 : to be direct IO written,
	 * uninitialised still.
	 */
	ex2->ee_block = cpu_to_le32(iblock);
	ext4_ext_store_pblock(ex2, newblock);
	ex2->ee_len = cpu_to_le16(allocated);
	ext4_ext_mark_uninitialized(ex2);
	if (ex2 != ex)
		goto insert;
	/* Mark modified extent as dirty */
	err = ext4_ext_dirty(handle, inode, path + depth);
	ext_debug("out here\n");
	goto out;
insert:
	err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
	if (err == -ENOSPC) {
		err =  ext4_ext_zeroout(inode, &orig_ex);
		if (err)
			goto fix_extent_len;
		/* update the extent length and mark as initialized */
		ex->ee_block = orig_ex.ee_block;
		ex->ee_len   = orig_ex.ee_len;
		ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
		ext4_ext_dirty(handle, inode, path + depth);
		/* zero out the first half */
		return allocated;
	} else if (err)
		goto fix_extent_len;
out:
	ext4_ext_show_leaf(inode, path);
	return err ? err : allocated;

fix_extent_len:
	/* error path: restore the original extent and re-mark it uninit */
	ex->ee_block = orig_ex.ee_block;
	ex->ee_len   = orig_ex.ee_len;
	ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
	ext4_ext_mark_uninitialized(ex);
	ext4_ext_dirty(handle, inode, path + depth);
	return err;
}
2975static int ext4_convert_unwritten_extents_dio(handle_t *handle,
2976                                              struct inode *inode,
2977                                              struct ext4_ext_path *path)
2978{
2979        struct ext4_extent *ex;
2980        struct ext4_extent_header *eh;
2981        int depth;
2982        int err = 0;
2983        int ret = 0;
2984
2985        depth = ext_depth(inode);
2986        eh = path[depth].p_hdr;
2987        ex = path[depth].p_ext;
2988
2989        err = ext4_ext_get_access(handle, inode, path + depth);
2990        if (err)
2991                goto out;
2992        /* first mark the extent as initialized */
2993        ext4_ext_mark_initialized(ex);
2994
2995        /*
2996         * We have to see if it can be merged with the extent
2997         * on the left.
2998         */
2999        if (ex > EXT_FIRST_EXTENT(eh)) {
3000                /*
3001                 * To merge left, pass "ex - 1" to try_to_merge(),
3002                 * since it merges towards right _only_.
3003                 */
3004                ret = ext4_ext_try_to_merge(inode, path, ex - 1);
3005                if (ret) {
3006                        err = ext4_ext_correct_indexes(handle, inode, path);
3007                        if (err)
3008                                goto out;
3009                        depth = ext_depth(inode);
3010                        ex--;
3011                }
3012        }
3013        /*
3014         * Try to Merge towards right.
3015         */
3016        ret = ext4_ext_try_to_merge(inode, path, ex);
3017        if (ret) {
3018                err = ext4_ext_correct_indexes(handle, inode, path);
3019                if (err)
3020                        goto out;
3021                depth = ext_depth(inode);
3022        }
3023        /* Mark modified extent as dirty */
3024        err = ext4_ext_dirty(handle, inode, path + depth);
3025out:
3026        ext4_ext_show_leaf(inode, path);
3027        return err;
3028}
3029
/*
 * Dispatch a block request that landed in an uninitialized extent.
 *
 * Depending on @flags this either splits the extent before DIO submission,
 * converts a filled extent to written at DIO completion, reuses an existing
 * unwritten extent for a repeat fallocate, reports the extent as unwritten
 * for a plain lookup, or converts it to initialized for a buffered write.
 *
 * Returns the number of blocks mapped on success, or a negative errno.
 * Consumes @path (drops refs and frees it) on all exits.
 */
static int
ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
			ext4_lblk_t iblock, unsigned int max_blocks,
			struct ext4_ext_path *path, int flags,
			unsigned int allocated, struct buffer_head *bh_result,
			ext4_fsblk_t newblock)
{
	int ret = 0;
	int err = 0;
	ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;

	ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical"
		  "block %llu, max_blocks %u, flags %d, allocated %u",
		  inode->i_ino, (unsigned long long)iblock, max_blocks,
		  flags, allocated);
	ext4_ext_show_leaf(inode, path);

	/* DIO get_block() before submit the IO, split the extent */
	if (flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT) {
		ret = ext4_split_unwritten_extents(handle,
						inode, path, iblock,
						max_blocks, flags);
		/*
		 * Flag the inode (non-aio case) or end_io struct (aio case)
		 * that this IO needs conversion to written when the IO is
		 * completed
		 */
		if (io)
			io->flag = DIO_AIO_UNWRITTEN;
		else
			EXT4_I(inode)->i_state |= EXT4_STATE_DIO_UNWRITTEN;
		goto out;
	}
	/* async DIO end_io complete, convert the filled extent to written */
	if (flags == EXT4_GET_BLOCKS_DIO_CONVERT_EXT) {
		ret = ext4_convert_unwritten_extents_dio(handle, inode,
							path);
		goto out2;
	}
	/* buffered IO case */
	/*
	 * repeat fallocate creation request
	 * we already have an unwritten extent
	 */
	if (flags & EXT4_GET_BLOCKS_UNINIT_EXT)
		goto map_out;

	/* buffered READ or buffered write_begin() lookup */
	if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
		/*
		 * We have blocks reserved already.  We
		 * return allocated blocks so that delalloc
		 * won't do block reservation for us.  But
		 * the buffer head will be unmapped so that
		 * a read from the block returns 0s.
		 */
		set_buffer_unwritten(bh_result);
		goto out1;
	}

	/* buffered write, writepage time, convert*/
	ret = ext4_ext_convert_to_initialized(handle, inode,
						path, iblock,
						max_blocks);
out:
	/* common exit for the split/convert paths: ret <= 0 is an error
	 * (0 means no blocks were made available) */
	if (ret <= 0) {
		err = ret;
		goto out2;
	} else
		allocated = ret;
	set_buffer_new(bh_result);
map_out:
	set_buffer_mapped(bh_result);
out1:
	/* never report more blocks than the caller asked for */
	if (allocated > max_blocks)
		allocated = max_blocks;
	ext4_ext_show_leaf(inode, path);
	bh_result->b_bdev = inode->i_sb->s_bdev;
	bh_result->b_blocknr = newblock;
out2:
	if (path) {
		ext4_ext_drop_refs(path);
		kfree(path);
	}
	return err ? err : allocated;
}
3116/*
3117 * Block allocation/map/preallocation routine for extents based files
3118 *
3119 *
3120 * Need to be called with
3121 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block
3122 * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem)
3123 *
3124 * return > 0, number of of blocks already mapped/allocated
3125 *          if create == 0 and these are pre-allocated blocks
3126 *              buffer head is unmapped
3127 *          otherwise blocks are mapped
3128 *
3129 * return = 0, if plain look up failed (blocks have not been allocated)
3130 *          buffer head is unmapped
3131 *
3132 * return < 0, error case.
3133 */
3134int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3135                        ext4_lblk_t iblock,
3136                        unsigned int max_blocks, struct buffer_head *bh_result,
3137                        int flags)
3138{
3139        struct ext4_ext_path *path = NULL;
3140        struct ext4_extent_header *eh;
3141        struct ext4_extent newex, *ex;
3142        ext4_fsblk_t newblock;
3143        int err = 0, depth, ret, cache_type;
3144        unsigned int allocated = 0;
3145        struct ext4_allocation_request ar;
3146        ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
3147
3148        __clear_bit(BH_New, &bh_result->b_state);
3149        ext_debug("blocks %u/%u requested for inode %lu\n",
3150                        iblock, max_blocks, inode->i_ino);
3151
3152        /* check in cache */
3153        cache_type = ext4_ext_in_cache(inode, iblock, &newex);
3154        if (cache_type) {
3155                if (cache_type == EXT4_EXT_CACHE_GAP) {
3156                        if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
3157                                /*
3158                                 * block isn't allocated yet and
3159                                 * user doesn't want to allocate it
3160                                 */
3161                                goto out2;
3162                        }
3163                        /* we should allocate requested block */
3164                } else if (cache_type == EXT4_EXT_CACHE_EXTENT) {
3165                        /* block is already allocated */
3166                        newblock = iblock
3167                                   - le32_to_cpu(newex.ee_block)
3168                                   + ext_pblock(&newex);
3169                        /* number of remaining blocks in the extent */
3170                        allocated = ext4_ext_get_actual_len(&newex) -
3171                                        (iblock - le32_to_cpu(newex.ee_block));
3172                        goto out;
3173                } else {
3174                        BUG();
3175                }
3176        }
3177
3178        /* find extent for this block */
3179        path = ext4_ext_find_extent(inode, iblock, NULL);
3180        if (IS_ERR(path)) {
3181                err = PTR_ERR(path);
3182                path = NULL;
3183                goto out2;
3184        }
3185
3186        depth = ext_depth(inode);
3187
3188        /*
3189         * consistent leaf must not be empty;
3190         * this situation is possible, though, _during_ tree modification;
3191         * this is why assert can't be put in ext4_ext_find_extent()
3192         */
3193        BUG_ON(path[depth].p_ext == NULL && depth != 0);
3194        eh = path[depth].p_hdr;
3195
3196        ex = path[depth].p_ext;
3197        if (ex) {
3198                ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
3199                ext4_fsblk_t ee_start = ext_pblock(ex);
3200                unsigned short ee_len;
3201
3202                /*
3203                 * Uninitialized extents are treated as holes, except that
3204                 * we split out initialized portions during a write.
3205                 */
3206                ee_len = ext4_ext_get_actual_len(ex);
3207                /* if found extent covers block, simply return it */
3208                if (iblock >= ee_block && iblock < ee_block + ee_len) {
3209                        newblock = iblock - ee_block + ee_start;
3210                        /* number of remaining blocks in the extent */
3211                        allocated = ee_len - (iblock - ee_block);
3212                        ext_debug("%u fit into %u:%d -> %llu\n", iblock,
3213                                        ee_block, ee_len, newblock);
3214
3215                        /* Do not put uninitialized extent in the cache */
3216                        if (!ext4_ext_is_uninitialized(ex)) {
3217                                ext4_ext_put_in_cache(inode, ee_block,
3218                                                        ee_len, ee_start,
3219                                                        EXT4_EXT_CACHE_EXTENT);
3220                                goto out;
3221                        }
3222                        ret = ext4_ext_handle_uninitialized_extents(handle,
3223                                        inode, iblock, max_blocks, path,
3224                                        flags, allocated, bh_result, newblock);
3225                        return ret;
3226                }
3227        }
3228
3229        /*
3230         * requested block isn't allocated yet;
3231         * we couldn't try to create block if create flag is zero
3232         */
3233        if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
3234                /*
3235                 * put just found gap into cache to speed up
3236                 * subsequent requests
3237                 */
3238                ext4_ext_put_gap_in_cache(inode, path, iblock);
3239                goto out2;
3240        }
3241        /*
3242         * Okay, we need to do block allocation.
3243         */
3244
3245        /* find neighbour allocated blocks */
3246        ar.lleft = iblock;
3247        err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft);
3248        if (err)
3249                goto out2;
3250        ar.lright = iblock;
3251        err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright);
3252        if (err)
3253                goto out2;
3254
3255        /*
3256         * See if request is beyond maximum number of blocks we can have in
3257         * a single extent. For an initialized extent this limit is
3258         * EXT_INIT_MAX_LEN and for an uninitialized extent this limit is
3259         * EXT_UNINIT_MAX_LEN.
3260         */
3261        if (max_blocks > EXT_INIT_MAX_LEN &&
3262            !(flags & EXT4_GET_BLOCKS_UNINIT_EXT))
3263                max_blocks = EXT_INIT_MAX_LEN;
3264        else if (max_blocks > EXT_UNINIT_MAX_LEN &&
3265                 (flags & EXT4_GET_BLOCKS_UNINIT_EXT))
3266                max_blocks = EXT_UNINIT_MAX_LEN;
3267
3268        /* Check if we can really insert (iblock)::(iblock+max_blocks) extent */
3269        newex.ee_block = cpu_to_le32(iblock);
3270        newex.ee_len = cpu_to_le16(max_blocks);
3271        err = ext4_ext_check_overlap(inode, &newex, path);
3272        if (err)
3273                allocated = ext4_ext_get_actual_len(&newex);
3274        else
3275                allocated = max_blocks;
3276
3277        /* allocate new block */
3278        ar.inode = inode;
3279        ar.goal = ext4_ext_find_goal(inode, path, iblock);
3280        ar.logical = iblock;
3281        ar.len = allocated;
3282        if (S_ISREG(inode->i_mode))
3283                ar.flags = EXT4_MB_HINT_DATA;
3284        else
3285                /* disable in-core preallocation for non-regular files */
3286                ar.flags = 0;
3287        newblock = ext4_mb_new_blocks(handle, &ar, &err);
3288        if (!newblock)
3289                goto out2;
3290        ext_debug("allocate new block: goal %llu, found %llu/%u\n",
3291                  ar.goal, newblock, allocated);
3292
3293        /* try to insert new extent into found leaf and return */
3294        ext4_ext_store_pblock(&newex, newblock);
3295        newex.ee_len = cpu_to_le16(ar.len);
3296        /* Mark uninitialized */
3297        if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){
3298                ext4_ext_mark_uninitialized(&newex);
3299                /*
3300                 * io_end structure was created for every async
3301                 * direct IO write to the middle of the file.
3302                 * To avoid unecessary convertion for every aio dio rewrite
3303                 * to the mid of file, here we flag the IO that is really
3304                 * need the convertion.
3305                 * For non asycn direct IO case, flag the inode state
3306                 * that we need to perform convertion when IO is done.
3307                 */
3308                if (flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT) {
3309                        if (io)
3310                                io->flag = DIO_AIO_UNWRITTEN;
3311                        else
3312                                EXT4_I(inode)->i_state |=
3313                                        EXT4_STATE_DIO_UNWRITTEN;;
3314                }
3315        }
3316        err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
3317        if (err) {
3318                /* free data blocks we just allocated */
3319                /* not a good idea to call discard here directly,
3320                 * but otherwise we'd need to call it every free() */
3321                ext4_discard_preallocations(inode);
3322                ext4_free_blocks(handle, inode, ext_pblock(&newex),
3323                                        ext4_ext_get_actual_len(&newex), 0);
3324                goto out2;
3325        }
3326
3327        /* previous routine could use block we allocated */
3328        newblock = ext_pblock(&newex);
3329        allocated = ext4_ext_get_actual_len(&newex);
3330        set_buffer_new(bh_result);
3331
3332        /* Cache only when it is _not_ an uninitialized extent */
3333        if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0)
3334                ext4_ext_put_in_cache(inode, iblock, allocated, newblock,
3335                                                EXT4_EXT_CACHE_EXTENT);
3336out:
3337        if (allocated > max_blocks)
3338                allocated = max_blocks;
3339        ext4_ext_show_leaf(inode, path);
3340        set_buffer_mapped(bh_result);
3341        bh_result->b_bdev = inode->i_sb->s_bdev;
3342        bh_result->b_blocknr = newblock;
3343out2:
3344        if (path) {
3345                ext4_ext_drop_refs(path);
3346                kfree(path);
3347        }
3348        return err ? err : allocated;
3349}
3350
/*
 * ext4_ext_truncate - truncate an extent-mapped inode down to i_size.
 *
 * Zeroes the partial tail block, adds the inode to the orphan list so a
 * crash mid-truncate can be recovered, removes all extents past the new
 * EOF under i_data_sem, and finally drops the orphan record again if the
 * inode stays alive.  Errors are deliberately swallowed: the journal
 * handle is still stopped and the orphan list cleaned up on every path.
 */
void ext4_ext_truncate(struct inode *inode)
{
        struct address_space *mapping = inode->i_mapping;
        struct super_block *sb = inode->i_sb;
        ext4_lblk_t last_block;
        handle_t *handle;
        int err = 0;

        /*
         * probably first extent we're gonna free will be last in block
         */
        /* err is reused here as the journal credit estimate */
        err = ext4_writepage_trans_blocks(inode);
        handle = ext4_journal_start(inode, err);
        if (IS_ERR(handle))
                return;

        /* zero the tail of a partial final block so stale data isn't exposed */
        if (inode->i_size & (sb->s_blocksize - 1))
                ext4_block_truncate_page(handle, mapping, inode->i_size);

        if (ext4_orphan_add(handle, inode))
                goto out_stop;

        down_write(&EXT4_I(inode)->i_data_sem);
        ext4_ext_invalidate_cache(inode);

        ext4_discard_preallocations(inode);

        /*
         * TODO: optimization is possible here.
         * Probably we need not scan at all,
         * because page truncation is enough.
         */

        /* we have to know where to truncate from in crash case */
        EXT4_I(inode)->i_disksize = inode->i_size;
        ext4_mark_inode_dirty(handle, inode);

        /* first block at or beyond the new size; everything from here goes */
        last_block = (inode->i_size + sb->s_blocksize - 1)
                        >> EXT4_BLOCK_SIZE_BITS(sb);
        err = ext4_ext_remove_space(inode, last_block);

        /* In a multi-transaction truncate, we only make the final
         * transaction synchronous.
         */
        if (IS_SYNC(inode))
                ext4_handle_sync(handle);

out_stop:
        up_write(&EXT4_I(inode)->i_data_sem);
        /*
         * If this was a simple ftruncate() and the file will remain alive,
         * then we need to clear up the orphan record which we created above.
         * However, if this was a real unlink then we were called by
         * ext4_delete_inode(), and we allow that function to clean up the
         * orphan info for us.
         */
        if (inode->i_nlink)
                ext4_orphan_del(handle, inode);

        inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
        ext4_mark_inode_dirty(handle, inode);
        ext4_journal_stop(handle);
}
3414
3415static void ext4_falloc_update_inode(struct inode *inode,
3416                                int mode, loff_t new_size, int update_ctime)
3417{
3418        struct timespec now;
3419
3420        if (update_ctime) {
3421                now = current_fs_time(inode->i_sb);
3422                if (!timespec_equal(&inode->i_ctime, &now))
3423                        inode->i_ctime = now;
3424        }
3425        /*
3426         * Update only when preallocation was requested beyond
3427         * the file size.
3428         */
3429        if (!(mode & FALLOC_FL_KEEP_SIZE)) {
3430                if (new_size > i_size_read(inode))
3431                        i_size_write(inode, new_size);
3432                if (new_size > EXT4_I(inode)->i_disksize)
3433                        ext4_update_i_disksize(inode, new_size);
3434        }
3435
3436}
3437
3438/*
3439 * preallocate space for a file. This implements ext4's fallocate inode
3440 * operation, which gets called from sys_fallocate system call.
3441 * For block-mapped files, posix_fallocate should fall back to the method
3442 * of writing zeroes to the required new blocks (the same behavior which is
3443 * expected for file systems which do not support fallocate() system call).
3444 */
3445long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
3446{
3447        handle_t *handle;
3448        ext4_lblk_t block;
3449        loff_t new_size;
3450        unsigned int max_blocks;
3451        int ret = 0;
3452        int ret2 = 0;
3453        int retries = 0;
3454        struct buffer_head map_bh;
3455        unsigned int credits, blkbits = inode->i_blkbits;
3456
3457        /*
3458         * currently supporting (pre)allocate mode for extent-based
3459         * files _only_
3460         */
3461        if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
3462                return -EOPNOTSUPP;
3463
3464        /* preallocation to directories is currently not supported */
3465        if (S_ISDIR(inode->i_mode))
3466                return -ENODEV;
3467
3468        block = offset >> blkbits;
3469        /*
3470         * We can't just convert len to max_blocks because
3471         * If blocksize = 4096 offset = 3072 and len = 2048
3472         */
3473        max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
3474                                                        - block;
3475        /*
3476         * credits to insert 1 extent into extent tree
3477         */
3478        credits = ext4_chunk_trans_blocks(inode, max_blocks);
3479        mutex_lock(&inode->i_mutex);
3480retry:
3481        while (ret >= 0 && ret < max_blocks) {
3482                block = block + ret;
3483                max_blocks = max_blocks - ret;
3484                handle = ext4_journal_start(inode, credits);
3485                if (IS_ERR(handle)) {
3486                        ret = PTR_ERR(handle);
3487                        break;
3488                }
3489                map_bh.b_state = 0;
3490                ret = ext4_get_blocks(handle, inode, block,
3491                                      max_blocks, &map_bh,
3492                                      EXT4_GET_BLOCKS_CREATE_UNINIT_EXT);
3493                if (ret <= 0) {
3494#ifdef EXT4FS_DEBUG
3495                        WARN_ON(ret <= 0);
3496                        printk(KERN_ERR "%s: ext4_ext_get_blocks "
3497                                    "returned error inode#%lu, block=%u, "
3498                                    "max_blocks=%u", __func__,
3499                                    inode->i_ino, block, max_blocks);
3500#endif
3501                        ext4_mark_inode_dirty(handle, inode);
3502                        ret2 = ext4_journal_stop(handle);
3503                        break;
3504                }
3505                if ((block + ret) >= (EXT4_BLOCK_ALIGN(offset + len,
3506                                                blkbits) >> blkbits))
3507                        new_size = offset + len;
3508                else
3509                        new_size = (block + ret) << blkbits;
3510
3511                ext4_falloc_update_inode(inode, mode, new_size,
3512                                                buffer_new(&map_bh));
3513                ext4_mark_inode_dirty(handle, inode);
3514                ret2 = ext4_journal_stop(handle);
3515                if (ret2)
3516                        break;
3517        }
3518        if (ret == -ENOSPC &&
3519                        ext4_should_retry_alloc(inode->i_sb, &retries)) {
3520                ret = 0;
3521                goto retry;
3522        }
3523        mutex_unlock(&inode->i_mutex);
3524        return ret > 0 ? ret2 : ret;
3525}
3526
3527/*
3528 * This function convert a range of blocks to written extents
3529 * The caller of this function will pass the start offset and the size.
3530 * all unwritten extents within this range will be converted to
3531 * written extents.
3532 *
3533 * This function is called from the direct IO end io call back
3534 * function, to convert the fallocated extents after IO is completed.
3535 * Returns 0 on success.
3536 */
3537int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
3538                                    loff_t len)
3539{
3540        handle_t *handle;
3541        ext4_lblk_t block;
3542        unsigned int max_blocks;
3543        int ret = 0;
3544        int ret2 = 0;
3545        struct buffer_head map_bh;
3546        unsigned int credits, blkbits = inode->i_blkbits;
3547
3548        block = offset >> blkbits;
3549        /*
3550         * We can't just convert len to max_blocks because
3551         * If blocksize = 4096 offset = 3072 and len = 2048
3552         */
3553        max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
3554                                                        - block;
3555        /*
3556         * credits to insert 1 extent into extent tree
3557         */
3558        credits = ext4_chunk_trans_blocks(inode, max_blocks);
3559        while (ret >= 0 && ret < max_blocks) {
3560                block = block + ret;
3561                max_blocks = max_blocks - ret;
3562                handle = ext4_journal_start(inode, credits);
3563                if (IS_ERR(handle)) {
3564                        ret = PTR_ERR(handle);
3565                        break;
3566                }
3567                map_bh.b_state = 0;
3568                ret = ext4_get_blocks(handle, inode, block,
3569                                      max_blocks, &map_bh,
3570                                      EXT4_GET_BLOCKS_DIO_CONVERT_EXT);
3571                if (ret <= 0) {
3572                        WARN_ON(ret <= 0);
3573                        printk(KERN_ERR "%s: ext4_ext_get_blocks "
3574                                    "returned error inode#%lu, block=%u, "
3575                                    "max_blocks=%u", __func__,
3576                                    inode->i_ino, block, max_blocks);
3577                }
3578                ext4_mark_inode_dirty(handle, inode);
3579                ret2 = ext4_journal_stop(handle);
3580                if (ret <= 0 || ret2 )
3581                        break;
3582        }
3583        return ret > 0 ? ret2 : ret;
3584}
3585/*
3586 * Callback function called for each extent to gather FIEMAP information.
3587 */
3588static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
3589                       struct ext4_ext_cache *newex, struct ext4_extent *ex,
3590                       void *data)
3591{
3592        struct fiemap_extent_info *fieinfo = data;
3593        unsigned char blksize_bits = inode->i_sb->s_blocksize_bits;
3594        __u64   logical;
3595        __u64   physical;
3596        __u64   length;
3597        __u32   flags = 0;
3598        int     error;
3599
3600        logical =  (__u64)newex->ec_block << blksize_bits;
3601
3602        if (newex->ec_type == EXT4_EXT_CACHE_GAP) {
3603                pgoff_t offset;
3604                struct page *page;
3605                struct buffer_head *bh = NULL;
3606
3607                offset = logical >> PAGE_SHIFT;
3608                page = find_get_page(inode->i_mapping, offset);
3609                if (!page || !page_has_buffers(page))
3610                        return EXT_CONTINUE;
3611
3612                bh = page_buffers(page);
3613
3614                if (!bh)
3615                        return EXT_CONTINUE;
3616
3617                if (buffer_delay(bh)) {
3618                        flags |= FIEMAP_EXTENT_DELALLOC;
3619                        page_cache_release(page);
3620                } else {
3621                        page_cache_release(page);
3622                        return EXT_CONTINUE;
3623                }
3624        }
3625
3626        physical = (__u64)newex->ec_start << blksize_bits;
3627        length =   (__u64)newex->ec_len << blksize_bits;
3628
3629        if (ex && ext4_ext_is_uninitialized(ex))
3630                flags |= FIEMAP_EXTENT_UNWRITTEN;
3631
3632        /*
3633         * If this extent reaches EXT_MAX_BLOCK, it must be last.
3634         *
3635         * Or if ext4_ext_next_allocated_block is EXT_MAX_BLOCK,
3636         * this also indicates no more allocated blocks.
3637         *
3638         * XXX this might miss a single-block extent at EXT_MAX_BLOCK
3639         */
3640        if (ext4_ext_next_allocated_block(path) == EXT_MAX_BLOCK ||
3641            newex->ec_block + newex->ec_len - 1 == EXT_MAX_BLOCK) {
3642                loff_t size = i_size_read(inode);
3643                loff_t bs = EXT4_BLOCK_SIZE(inode->i_sb);
3644
3645                flags |= FIEMAP_EXTENT_LAST;
3646                if ((flags & FIEMAP_EXTENT_DELALLOC) &&
3647                    logical+length > size)
3648                        length = (size - logical + bs - 1) & ~(bs-1);
3649        }
3650
3651        error = fiemap_fill_next_extent(fieinfo, logical, physical,
3652                                        length, flags);
3653        if (error < 0)
3654                return error;
3655        if (error == 1)
3656                return EXT_BREAK;
3657
3658        return EXT_CONTINUE;
3659}
3660
3661/* fiemap flags we can handle specified here */
3662#define EXT4_FIEMAP_FLAGS       (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
3663
3664static int ext4_xattr_fiemap(struct inode *inode,
3665                                struct fiemap_extent_info *fieinfo)
3666{
3667        __u64 physical = 0;
3668        __u64 length;
3669        __u32 flags = FIEMAP_EXTENT_LAST;
3670        int blockbits = inode->i_sb->s_blocksize_bits;
3671        int error = 0;
3672
3673        /* in-inode? */
3674        if (EXT4_I(inode)->i_state & EXT4_STATE_XATTR) {
3675                struct ext4_iloc iloc;
3676                int offset;     /* offset of xattr in inode */
3677
3678                error = ext4_get_inode_loc(inode, &iloc);
3679                if (error)
3680                        return error;
3681                physical = iloc.bh->b_blocknr << blockbits;
3682                offset = EXT4_GOOD_OLD_INODE_SIZE +
3683                                EXT4_I(inode)->i_extra_isize;
3684                physical += offset;
3685                length = EXT4_SB(inode->i_sb)->s_inode_size - offset;
3686                flags |= FIEMAP_EXTENT_DATA_INLINE;
3687        } else { /* external block */
3688                physical = EXT4_I(inode)->i_file_acl << blockbits;
3689                length = inode->i_sb->s_blocksize;
3690        }
3691
3692        if (physical)
3693                error = fiemap_fill_next_extent(fieinfo, 0, physical,
3694                                                length, flags);
3695        return (error < 0 ? error : 0);
3696}
3697
3698int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3699                __u64 start, __u64 len)
3700{
3701        ext4_lblk_t start_blk;
3702        ext4_lblk_t len_blks;
3703        int error = 0;
3704
3705        /* fallback to generic here if not in extents fmt */
3706        if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
3707                return generic_block_fiemap(inode, fieinfo, start, len,
3708                        ext4_get_block);
3709
3710        if (fiemap_check_flags(fieinfo, EXT4_FIEMAP_FLAGS))
3711                return -EBADR;
3712
3713        if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) {
3714                error = ext4_xattr_fiemap(inode, fieinfo);
3715        } else {
3716                start_blk = start >> inode->i_sb->s_blocksize_bits;
3717                len_blks = len >> inode->i_sb->s_blocksize_bits;
3718
3719                /*
3720                 * Walk the extent tree gathering extent information.
3721                 * ext4_ext_fiemap_cb will push extents back to user.
3722                 */
3723                down_read(&EXT4_I(inode)->i_data_sem);
3724                error = ext4_ext_walk_space(inode, start_blk, len_blks,
3725                                          ext4_ext_fiemap_cb, fieinfo);
3726                up_read(&EXT4_I(inode)->i_data_sem);
3727        }
3728
3729        return error;
3730}
3731
3732