linux/fs/ext4/inode.c
   1/*
   2 *  linux/fs/ext4/inode.c
   3 *
   4 * Copyright (C) 1992, 1993, 1994, 1995
   5 * Remy Card (card@masi.ibp.fr)
   6 * Laboratoire MASI - Institut Blaise Pascal
   7 * Universite Pierre et Marie Curie (Paris VI)
   8 *
   9 *  from
  10 *
  11 *  linux/fs/minix/inode.c
  12 *
  13 *  Copyright (C) 1991, 1992  Linus Torvalds
  14 *
  15 *  Goal-directed block allocation by Stephen Tweedie
  16 *      (sct@redhat.com), 1993, 1998
  17 *  Big-endian to little-endian byte-swapping/bitmaps by
  18 *        David S. Miller (davem@caip.rutgers.edu), 1995
  19 *  64-bit file support on 64-bit platforms by Jakub Jelinek
  20 *      (jj@sunsite.ms.mff.cuni.cz)
  21 *
  22 *  Assorted race fixes, rewrite of ext4_get_block() by Al Viro, 2000
  23 */
  24
  25#include <linux/module.h>
  26#include <linux/fs.h>
  27#include <linux/time.h>
  28#include <linux/jbd2.h>
  29#include <linux/highuid.h>
  30#include <linux/pagemap.h>
  31#include <linux/quotaops.h>
  32#include <linux/string.h>
  33#include <linux/buffer_head.h>
  34#include <linux/writeback.h>
  35#include <linux/pagevec.h>
  36#include <linux/mpage.h>
  37#include <linux/namei.h>
  38#include <linux/uio.h>
  39#include <linux/bio.h>
  40#include <linux/workqueue.h>
  41#include <linux/kernel.h>
  42#include <linux/printk.h>
  43#include <linux/slab.h>
  44#include <linux/ratelimit.h>
  45
  46#include "ext4_jbd2.h"
  47#include "xattr.h"
  48#include "acl.h"
  49#include "ext4_extents.h"
  50
  51#include <trace/events/ext4.h>
  52
  53#define MPAGE_DA_EXTENT_TAIL 0x01
  54
  55static inline int ext4_begin_ordered_truncate(struct inode *inode,
  56                                              loff_t new_size)
  57{
  58        trace_ext4_begin_ordered_truncate(inode, new_size);
  59        /*
  60         * If jinode is zero, then we never opened the file for
  61         * writing, so there's no need to call
   62         * jbd2_journal_begin_ordered_truncate() since there are no
  63         * outstanding writes we need to flush.
  64         */
  65        if (!EXT4_I(inode)->jinode)
  66                return 0;
  67        return jbd2_journal_begin_ordered_truncate(EXT4_JOURNAL(inode),
  68                                                   EXT4_I(inode)->jinode,
  69                                                   new_size);
  70}
  71
  72static void ext4_invalidatepage(struct page *page, unsigned long offset);
  73static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
  74                                   struct buffer_head *bh_result, int create);
  75static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode);
  76static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
  77static int __ext4_journalled_writepage(struct page *page, unsigned int len);
  78static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh);
  79
  80/*
  81 * Test whether an inode is a fast symlink.
  82 */
  83static int ext4_inode_is_fast_symlink(struct inode *inode)
  84{
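             /*
              * Note: i_blocks is counted in 512-byte sectors, so an external
              * xattr block accounts for (block size >> 9) of those units.
              */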
  85        int ea_blocks = EXT4_I(inode)->i_file_acl ?
  86                (inode->i_sb->s_blocksize >> 9) : 0;
  87
  88        return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0);
  89}
  90
  91/*
  92 * Work out how many blocks we need to proceed with the next chunk of a
  93 * truncate transaction.
  94 */
  95static unsigned long blocks_for_truncate(struct inode *inode)
  96{
  97        ext4_lblk_t needed;
  98
  99        needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);
 100
 101        /* Give ourselves just enough room to cope with inodes in which
 102         * i_blocks is corrupt: we've seen disk corruptions in the past
 103         * which resulted in random data in an inode which looked enough
 104         * like a regular file for ext4 to try to delete it.  Things
 105         * will go a bit crazy if that happens, but at least we should
 106         * try not to panic the whole kernel. */
 107        if (needed < 2)
 108                needed = 2;
 109
 110        /* But we need to bound the transaction so we don't overflow the
 111         * journal. */
 112        if (needed > EXT4_MAX_TRANS_DATA)
 113                needed = EXT4_MAX_TRANS_DATA;
 114
 115        return EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + needed;
 116}
 117
 118/*
 119 * Truncate transactions can be complex and absolutely huge.  So we need to
  120 * be able to restart the transaction at a convenient checkpoint to make
 121 * sure we don't overflow the journal.
 122 *
 123 * start_transaction gets us a new handle for a truncate transaction,
 124 * and extend_transaction tries to extend the existing one a bit.  If
 125 * extend fails, we need to propagate the failure up and restart the
 126 * transaction in the top-level truncate loop. --sct
 127 */
 128static handle_t *start_transaction(struct inode *inode)
 129{
 130        handle_t *result;
 131
 132        result = ext4_journal_start(inode, blocks_for_truncate(inode));
 133        if (!IS_ERR(result))
 134                return result;
 135
 136        ext4_std_error(inode->i_sb, PTR_ERR(result));
 137        return result;
 138}
 139
 140/*
 141 * Try to extend this transaction for the purposes of truncation.
 142 *
 143 * Returns 0 if we managed to create more room.  If we can't create more
  144 * room and the transaction must be restarted, we return 1.
 145 */
 146static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
 147{
 148        if (!ext4_handle_valid(handle))
 149                return 0;
 150        if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1))
 151                return 0;
 152        if (!ext4_journal_extend(handle, blocks_for_truncate(inode)))
 153                return 0;
 154        return 1;
 155}
 156
 157/*
 158 * Restart the transaction associated with *handle.  This does a commit,
 159 * so before we call here everything must be consistently dirtied against
 160 * this transaction.
 161 */
 162int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,
 163                                 int nblocks)
 164{
 165        int ret;
 166
 167        /*
 168         * Drop i_data_sem to avoid deadlock with ext4_map_blocks.  At this
 169         * moment, get_block can be called only for blocks inside i_size since
 170         * page cache has been already dropped and writes are blocked by
 171         * i_mutex. So we can safely drop the i_data_sem here.
 172         */
 173        BUG_ON(EXT4_JOURNAL(inode) == NULL);
 174        jbd_debug(2, "restarting handle %p\n", handle);
 175        up_write(&EXT4_I(inode)->i_data_sem);
 176        ret = ext4_journal_restart(handle, blocks_for_truncate(inode));
 177        down_write(&EXT4_I(inode)->i_data_sem);
 178        ext4_discard_preallocations(inode);
 179
 180        return ret;
 181}
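     /*
      * Illustrative pairing of the two helpers above (a sketch, not an
      * actual call site; the truncate code uses this pattern when a handle
      * runs low on credits):
      *
      *   if (try_to_extend_transaction(handle, inode))
      *           err = ext4_truncate_restart_trans(handle, inode,
      *                                   blocks_for_truncate(inode));
      */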
 182
 183/*
 184 * Called at the last iput() if i_nlink is zero.
 185 */
 186void ext4_evict_inode(struct inode *inode)
 187{
 188        handle_t *handle;
 189        int err;
 190
 191        trace_ext4_evict_inode(inode);
 192        if (inode->i_nlink) {
 193                truncate_inode_pages(&inode->i_data, 0);
 194                goto no_delete;
 195        }
 196
 197        if (!is_bad_inode(inode))
 198                dquot_initialize(inode);
 199
 200        if (ext4_should_order_data(inode))
 201                ext4_begin_ordered_truncate(inode, 0);
 202        truncate_inode_pages(&inode->i_data, 0);
 203
 204        if (is_bad_inode(inode))
 205                goto no_delete;
 206
 207        handle = ext4_journal_start(inode, blocks_for_truncate(inode)+3);
 208        if (IS_ERR(handle)) {
 209                ext4_std_error(inode->i_sb, PTR_ERR(handle));
 210                /*
 211                 * If we're going to skip the normal cleanup, we still need to
 212                 * make sure that the in-core orphan linked list is properly
 213                 * cleaned up.
 214                 */
 215                ext4_orphan_del(NULL, inode);
 216                goto no_delete;
 217        }
 218
 219        if (IS_SYNC(inode))
 220                ext4_handle_sync(handle);
 221        inode->i_size = 0;
 222        err = ext4_mark_inode_dirty(handle, inode);
 223        if (err) {
 224                ext4_warning(inode->i_sb,
 225                             "couldn't mark inode dirty (err %d)", err);
 226                goto stop_handle;
 227        }
 228        if (inode->i_blocks)
 229                ext4_truncate(inode);
 230
 231        /*
 232         * ext4_ext_truncate() doesn't reserve any slop when it
 233         * restarts journal transactions; therefore there may not be
 234         * enough credits left in the handle to remove the inode from
 235         * the orphan list and set the dtime field.
 236         */
 237        if (!ext4_handle_has_enough_credits(handle, 3)) {
 238                err = ext4_journal_extend(handle, 3);
 239                if (err > 0)
 240                        err = ext4_journal_restart(handle, 3);
 241                if (err != 0) {
 242                        ext4_warning(inode->i_sb,
 243                                     "couldn't extend journal (err %d)", err);
 244                stop_handle:
 245                        ext4_journal_stop(handle);
 246                        ext4_orphan_del(NULL, inode);
 247                        goto no_delete;
 248                }
 249        }
 250
 251        /*
 252         * Kill off the orphan record which ext4_truncate created.
 253         * AKPM: I think this can be inside the above `if'.
 254         * Note that ext4_orphan_del() has to be able to cope with the
 255         * deletion of a non-existent orphan - this is because we don't
 256         * know if ext4_truncate() actually created an orphan record.
 257         * (Well, we could do this if we need to, but heck - it works)
 258         */
 259        ext4_orphan_del(handle, inode);
 260        EXT4_I(inode)->i_dtime  = get_seconds();
 261
 262        /*
 263         * One subtle ordering requirement: if anything has gone wrong
 264         * (transaction abort, IO errors, whatever), then we can still
 265         * do these next steps (the fs will already have been marked as
 266         * having errors), but we can't free the inode if the mark_dirty
 267         * fails.
 268         */
 269        if (ext4_mark_inode_dirty(handle, inode))
 270                /* If that failed, just do the required in-core inode clear. */
 271                ext4_clear_inode(inode);
 272        else
 273                ext4_free_inode(handle, inode);
 274        ext4_journal_stop(handle);
 275        return;
 276no_delete:
 277        ext4_clear_inode(inode);        /* We must guarantee clearing of inode... */
 278}
 279
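     /*
      * One step of an indirect-block chain (see ext4_get_branch() below):
      * @p is the address of the slot holding a block number (in the inode's
      * i_data or in an indirect block's data), @key is the little-endian
      * block number read from that slot, and @bh is the buffer_head of the
      * indirect block containing @p (NULL when @p points into the inode
      * itself).
      */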
 280typedef struct {
 281        __le32  *p;
 282        __le32  key;
 283        struct buffer_head *bh;
 284} Indirect;
 285
 286static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v)
 287{
 288        p->key = *(p->p = v);
 289        p->bh = bh;
 290}
 291
 292/**
 293 *      ext4_block_to_path - parse the block number into array of offsets
 294 *      @inode: inode in question (we are only interested in its superblock)
 295 *      @i_block: block number to be parsed
 296 *      @offsets: array to store the offsets in
 297 *      @boundary: set this non-zero if the referred-to block is likely to be
 298 *             followed (on disk) by an indirect block.
 299 *
  300 *      To store the locations of a file's data, ext4 uses a data structure common
  301 *      to UNIX filesystems - a tree of pointers anchored in the inode, with
  302 *      data blocks at the leaves and indirect blocks in intermediate nodes.
  303 *      This function translates the block number into a path in that tree -
  304 *      the return value is the path length and @offsets[n] is the offset of the
  305 *      pointer to the (n+1)th node in the nth one. If @block is out of range
  306 *      (negative or too large), a warning is printed and zero is returned.
 307 *
 308 *      Note: function doesn't find node addresses, so no IO is needed. All
 309 *      we need to know is the capacity of indirect blocks (taken from the
 310 *      inode->i_sb).
 311 */
 312
 313/*
 314 * Portability note: the last comparison (check that we fit into triple
 315 * indirect block) is spelled differently, because otherwise on an
 316 * architecture with 32-bit longs and 8Kb pages we might get into trouble
 317 * if our filesystem had 8Kb blocks. We might use long long, but that would
 318 * kill us on x86. Oh, well, at least the sign propagation does not matter -
 319 * i_block would have to be negative in the very beginning, so we would not
 320 * get there at all.
 321 */
 322
 323static int ext4_block_to_path(struct inode *inode,
 324                              ext4_lblk_t i_block,
 325                              ext4_lblk_t offsets[4], int *boundary)
 326{
 327        int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb);
 328        int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb);
 329        const long direct_blocks = EXT4_NDIR_BLOCKS,
 330                indirect_blocks = ptrs,
 331                double_blocks = (1 << (ptrs_bits * 2));
 332        int n = 0;
 333        int final = 0;
 334
 335        if (i_block < direct_blocks) {
 336                offsets[n++] = i_block;
 337                final = direct_blocks;
 338        } else if ((i_block -= direct_blocks) < indirect_blocks) {
 339                offsets[n++] = EXT4_IND_BLOCK;
 340                offsets[n++] = i_block;
 341                final = ptrs;
 342        } else if ((i_block -= indirect_blocks) < double_blocks) {
 343                offsets[n++] = EXT4_DIND_BLOCK;
 344                offsets[n++] = i_block >> ptrs_bits;
 345                offsets[n++] = i_block & (ptrs - 1);
 346                final = ptrs;
 347        } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) {
 348                offsets[n++] = EXT4_TIND_BLOCK;
 349                offsets[n++] = i_block >> (ptrs_bits * 2);
 350                offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1);
 351                offsets[n++] = i_block & (ptrs - 1);
 352                final = ptrs;
 353        } else {
 354                ext4_warning(inode->i_sb, "block %lu > max in inode %lu",
 355                             i_block + direct_blocks +
 356                             indirect_blocks + double_blocks, inode->i_ino);
 357        }
 358        if (boundary)
 359                *boundary = final - 1 - (i_block & (ptrs - 1));
 360        return n;
 361}
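     /*
      * Worked example (illustrative numbers): with 4 KiB blocks, ptrs = 1024
      * and EXT4_NDIR_BLOCKS = 12, so logical block 5000 falls in the
      * double-indirect range: 5000 - 12 - 1024 = 3964, which gives
      * offsets[] = { EXT4_DIND_BLOCK, 3964 >> 10 = 3, 3964 & 1023 = 892 }
      * and a returned depth of 3.
      */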
 362
 363static int __ext4_check_blockref(const char *function, unsigned int line,
 364                                 struct inode *inode,
 365                                 __le32 *p, unsigned int max)
 366{
 367        struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
 368        __le32 *bref = p;
 369        unsigned int blk;
 370
 371        while (bref < p+max) {
 372                blk = le32_to_cpu(*bref++);
 373                if (blk &&
 374                    unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb),
 375                                                    blk, 1))) {
 376                        es->s_last_error_block = cpu_to_le64(blk);
 377                        ext4_error_inode(inode, function, line, blk,
 378                                         "invalid block");
 379                        return -EIO;
 380                }
 381        }
 382        return 0;
 383}
 384
 385
 386#define ext4_check_indirect_blockref(inode, bh)                         \
 387        __ext4_check_blockref(__func__, __LINE__, inode,                \
 388                              (__le32 *)(bh)->b_data,                   \
 389                              EXT4_ADDR_PER_BLOCK((inode)->i_sb))
 390
 391#define ext4_check_inode_blockref(inode)                                \
 392        __ext4_check_blockref(__func__, __LINE__, inode,                \
 393                              EXT4_I(inode)->i_data,                    \
 394                              EXT4_NDIR_BLOCKS)
 395
 396/**
 397 *      ext4_get_branch - read the chain of indirect blocks leading to data
 398 *      @inode: inode in question
 399 *      @depth: depth of the chain (1 - direct pointer, etc.)
 400 *      @offsets: offsets of pointers in inode/indirect blocks
 401 *      @chain: place to store the result
 402 *      @err: here we store the error value
 403 *
 404 *      Function fills the array of triples <key, p, bh> and returns %NULL
 405 *      if everything went OK or the pointer to the last filled triple
 406 *      (incomplete one) otherwise. Upon the return chain[i].key contains
 407 *      the number of (i+1)-th block in the chain (as it is stored in memory,
 408 *      i.e. little-endian 32-bit), chain[i].p contains the address of that
 409 *      number (it points into struct inode for i==0 and into the bh->b_data
 410 *      for i>0) and chain[i].bh points to the buffer_head of i-th indirect
 411 *      block for i>0 and NULL for i==0. In other words, it holds the block
 412 *      numbers of the chain, addresses they were taken from (and where we can
 413 *      verify that chain did not change) and buffer_heads hosting these
 414 *      numbers.
 415 *
 416 *      Function stops when it stumbles upon zero pointer (absent block)
 417 *              (pointer to last triple returned, *@err == 0)
 418 *      or when it gets an IO error reading an indirect block
 419 *              (ditto, *@err == -EIO)
 420 *      or when it reads all @depth-1 indirect blocks successfully and finds
  421 *      the whole chain, all the way to the data (returns %NULL, *err == 0).
  422 *
  423 *      Needs to be called with
 424 *      down_read(&EXT4_I(inode)->i_data_sem)
 425 */
 426static Indirect *ext4_get_branch(struct inode *inode, int depth,
 427                                 ext4_lblk_t  *offsets,
 428                                 Indirect chain[4], int *err)
 429{
 430        struct super_block *sb = inode->i_sb;
 431        Indirect *p = chain;
 432        struct buffer_head *bh;
 433
 434        *err = 0;
 435        /* i_data is not going away, no lock needed */
 436        add_chain(chain, NULL, EXT4_I(inode)->i_data + *offsets);
 437        if (!p->key)
 438                goto no_block;
 439        while (--depth) {
 440                bh = sb_getblk(sb, le32_to_cpu(p->key));
 441                if (unlikely(!bh))
 442                        goto failure;
 443
 444                if (!bh_uptodate_or_lock(bh)) {
 445                        if (bh_submit_read(bh) < 0) {
 446                                put_bh(bh);
 447                                goto failure;
 448                        }
 449                        /* validate block references */
 450                        if (ext4_check_indirect_blockref(inode, bh)) {
 451                                put_bh(bh);
 452                                goto failure;
 453                        }
 454                }
 455
 456                add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets);
 457                /* Reader: end */
 458                if (!p->key)
 459                        goto no_block;
 460        }
 461        return NULL;
 462
 463failure:
 464        *err = -EIO;
 465no_block:
 466        return p;
 467}
 468
 469/**
 470 *      ext4_find_near - find a place for allocation with sufficient locality
 471 *      @inode: owner
 472 *      @ind: descriptor of indirect block.
 473 *
 474 *      This function returns the preferred place for block allocation.
 475 *      It is used when heuristic for sequential allocation fails.
 476 *      Rules are:
 477 *        + if there is a block to the left of our position - allocate near it.
 478 *        + if pointer will live in indirect block - allocate near that block.
 479 *        + if pointer will live in inode - allocate in the same
 480 *          cylinder group.
 481 *
  482 * In the latter case we colour the starting block by the caller's PID to
 483 * prevent it from clashing with concurrent allocations for a different inode
 484 * in the same block group.   The PID is used here so that functionally related
 485 * files will be close-by on-disk.
 486 *
 487 *      Caller must make sure that @ind is valid and will stay that way.
 488 */
 489static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
 490{
 491        struct ext4_inode_info *ei = EXT4_I(inode);
 492        __le32 *start = ind->bh ? (__le32 *) ind->bh->b_data : ei->i_data;
 493        __le32 *p;
 494        ext4_fsblk_t bg_start;
 495        ext4_fsblk_t last_block;
 496        ext4_grpblk_t colour;
 497        ext4_group_t block_group;
 498        int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb));
 499
 500        /* Try to find previous block */
 501        for (p = ind->p - 1; p >= start; p--) {
 502                if (*p)
 503                        return le32_to_cpu(*p);
 504        }
 505
 506        /* No such thing, so let's try location of indirect block */
 507        if (ind->bh)
 508                return ind->bh->b_blocknr;
 509
 510        /*
 511         * It is going to be referred to from the inode itself? OK, just put it
 512         * into the same cylinder group then.
 513         */
 514        block_group = ei->i_block_group;
 515        if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
 516                block_group &= ~(flex_size-1);
 517                if (S_ISREG(inode->i_mode))
 518                        block_group++;
 519        }
 520        bg_start = ext4_group_first_block_no(inode->i_sb, block_group);
 521        last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
 522
 523        /*
  524         * If we are doing delayed allocation, we don't need to take
 525         * colour into account.
 526         */
 527        if (test_opt(inode->i_sb, DELALLOC))
 528                return bg_start;
 529
 530        if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
 531                colour = (current->pid % 16) *
 532                        (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
 533        else
 534                colour = (current->pid % 16) * ((last_block - bg_start) / 16);
 535        return bg_start + colour;
 536}
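     /*
      * Illustration of the colour computation above (made-up numbers): with
      * 32768 blocks per group and delalloc disabled, a task with pid 4097
      * gets colour = (4097 % 16) * (32768 / 16) = 2048, i.e. its allocations
      * start 2048 blocks past bg_start, which keeps concurrent writers in
      * the same group apart.
      */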
 537
 538/**
 539 *      ext4_find_goal - find a preferred place for allocation.
 540 *      @inode: owner
 541 *      @block:  block we want
 542 *      @partial: pointer to the last triple within a chain
 543 *
  544 *      Normally this function finds the preferred place for block allocation
  545 *      and returns it.
 546 *      Because this is only used for non-extent files, we limit the block nr
 547 *      to 32 bits.
 548 */
 549static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
 550                                   Indirect *partial)
 551{
 552        ext4_fsblk_t goal;
 553
 554        /*
 555         * XXX need to get goal block from mballoc's data structures
 556         */
 557
 558        goal = ext4_find_near(inode, partial);
 559        goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
 560        return goal;
 561}
 562
 563/**
 564 *      ext4_blks_to_allocate - Look up the block map and count the number
  565 *      of direct blocks that need to be allocated for the given branch.
 566 *
 567 *      @branch: chain of indirect blocks
  568 *      @k: number of blocks needed for indirect blocks
 569 *      @blks: number of data blocks to be mapped.
 570 *      @blocks_to_boundary:  the offset in the indirect block
 571 *
  572 *      returns the total number of blocks to be allocated, including the
 573 *      direct and indirect blocks.
 574 */
 575static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks,
 576                                 int blocks_to_boundary)
 577{
 578        unsigned int count = 0;
 579
 580        /*
  581         * Simple case: the [t,d]indirect block(s) have not been allocated yet,
  582         * so it's clear the blocks on that path have not been allocated either
 583         */
 584        if (k > 0) {
 585                /* right now we don't handle cross boundary allocation */
 586                if (blks < blocks_to_boundary + 1)
 587                        count += blks;
 588                else
 589                        count += blocks_to_boundary + 1;
 590                return count;
 591        }
 592
 593        count++;
 594        while (count < blks && count <= blocks_to_boundary &&
 595                le32_to_cpu(*(branch[0].p + count)) == 0) {
 596                count++;
 597        }
 598        return count;
 599}
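     /*
      * For instance (hypothetical values): with k == 0, blks == 8 and
      * blocks_to_boundary == 5, if the three slots after branch[0].p are
      * still zero and the fourth is already in use, the loop above stops
      * with count == 4.
      */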
 600
 601/**
 602 *      ext4_alloc_blocks: multiple allocate blocks needed for a branch
 603 *      @handle: handle for this transaction
 604 *      @inode: inode which needs allocated blocks
 605 *      @iblock: the logical block to start allocated at
 606 *      @goal: preferred physical block of allocation
  607 *      @indirect_blks: the number of blocks needed to allocate for indirect
 608 *                      blocks
 609 *      @blks: number of desired blocks
 610 *      @new_blocks: on return it will store the new block numbers for
  611 *      the indirect blocks (if needed) and the first direct block
 612 *      @err: on return it will store the error code
 613 *
 614 *      This function will return the number of blocks allocated as
 615 *      requested by the passed-in parameters.
 616 */
 617static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
 618                             ext4_lblk_t iblock, ext4_fsblk_t goal,
 619                             int indirect_blks, int blks,
 620                             ext4_fsblk_t new_blocks[4], int *err)
 621{
 622        struct ext4_allocation_request ar;
 623        int target, i;
 624        unsigned long count = 0, blk_allocated = 0;
 625        int index = 0;
 626        ext4_fsblk_t current_block = 0;
 627        int ret = 0;
 628
 629        /*
 630         * Here we try to allocate the requested multiple blocks at once,
 631         * on a best-effort basis.
 632         * To build a branch, we should allocate blocks for
  633         * the indirect blocks (if not allocated yet) and at least
 634         * the first direct block of this branch.  That's the
  635         * minimum number of blocks that need to be allocated (required).
 636         */
 637        /* first we try to allocate the indirect blocks */
 638        target = indirect_blks;
 639        while (target > 0) {
 640                count = target;
 641                /* allocating blocks for indirect blocks and direct blocks */
 642                current_block = ext4_new_meta_blocks(handle, inode,
 643                                                        goal, &count, err);
 644                if (*err)
 645                        goto failed_out;
 646
 647                if (unlikely(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS)) {
 648                        EXT4_ERROR_INODE(inode,
 649                                         "current_block %llu + count %lu > %d!",
 650                                         current_block, count,
 651                                         EXT4_MAX_BLOCK_FILE_PHYS);
 652                        *err = -EIO;
 653                        goto failed_out;
 654                }
 655
 656                target -= count;
 657                /* allocate blocks for indirect blocks */
 658                while (index < indirect_blks && count) {
 659                        new_blocks[index++] = current_block++;
 660                        count--;
 661                }
 662                if (count > 0) {
 663                        /*
 664                         * save the new block number
 665                         * for the first direct block
 666                         */
 667                        new_blocks[index] = current_block;
 668                        printk(KERN_INFO "%s returned more blocks than "
 669                                                "requested\n", __func__);
 670                        WARN_ON(1);
 671                        break;
 672                }
 673        }
 674
  675        target = blks - count;
 676        blk_allocated = count;
 677        if (!target)
 678                goto allocated;
 679        /* Now allocate data blocks */
 680        memset(&ar, 0, sizeof(ar));
 681        ar.inode = inode;
 682        ar.goal = goal;
 683        ar.len = target;
 684        ar.logical = iblock;
 685        if (S_ISREG(inode->i_mode))
 686                /* enable in-core preallocation only for regular files */
 687                ar.flags = EXT4_MB_HINT_DATA;
 688
 689        current_block = ext4_mb_new_blocks(handle, &ar, err);
 690        if (unlikely(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS)) {
 691                EXT4_ERROR_INODE(inode,
 692                                 "current_block %llu + ar.len %d > %d!",
 693                                 current_block, ar.len,
 694                                 EXT4_MAX_BLOCK_FILE_PHYS);
 695                *err = -EIO;
 696                goto failed_out;
 697        }
 698
 699        if (*err && (target == blks)) {
 700                /*
 701                 * if the allocation failed and we didn't allocate
 702                 * any blocks before
 703                 */
 704                goto failed_out;
 705        }
 706        if (!*err) {
 707                if (target == blks) {
 708                        /*
 709                         * save the new block number
 710                         * for the first direct block
 711                         */
 712                        new_blocks[index] = current_block;
 713                }
 714                blk_allocated += ar.len;
 715        }
 716allocated:
 717        /* total number of blocks allocated for direct blocks */
 718        ret = blk_allocated;
 719        *err = 0;
 720        return ret;
 721failed_out:
 722        for (i = 0; i < index; i++)
 723                ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, 0);
 724        return ret;
 725}
 726
 727/**
 728 *      ext4_alloc_branch - allocate and set up a chain of blocks.
 729 *      @handle: handle for this transaction
 730 *      @inode: owner
 731 *      @indirect_blks: number of allocated indirect blocks
 732 *      @blks: number of allocated direct blocks
 733 *      @goal: preferred place for allocation
 734 *      @offsets: offsets (in the blocks) to store the pointers to next.
 735 *      @branch: place to store the chain in.
 736 *
 737 *      This function allocates blocks, zeroes out all but the last one,
 738 *      links them into chain and (if we are synchronous) writes them to disk.
 739 *      In other words, it prepares a branch that can be spliced onto the
 740 *      inode. It stores the information about that chain in the branch[], in
 741 *      the same format as ext4_get_branch() would do. We are calling it after
 742 *      we had read the existing part of chain and partial points to the last
 743 *      triple of that (one with zero ->key). Upon the exit we have the same
 744 *      picture as after the successful ext4_get_block(), except that in one
 745 *      place chain is disconnected - *branch->p is still zero (we did not
 746 *      set the last link), but branch->key contains the number that should
 747 *      be placed into *branch->p to fill that gap.
 748 *
 749 *      If allocation fails we free all blocks we've allocated (and forget
  750 *      their buffer_heads) and return the error value from the failed
  751 *      ext4_alloc_blocks() (normally -ENOSPC). Otherwise we set the chain
 752 *      as described above and return 0.
 753 */
 754static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
 755                             ext4_lblk_t iblock, int indirect_blks,
 756                             int *blks, ext4_fsblk_t goal,
 757                             ext4_lblk_t *offsets, Indirect *branch)
 758{
 759        int blocksize = inode->i_sb->s_blocksize;
 760        int i, n = 0;
 761        int err = 0;
 762        struct buffer_head *bh;
 763        int num;
 764        ext4_fsblk_t new_blocks[4];
 765        ext4_fsblk_t current_block;
 766
 767        num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks,
 768                                *blks, new_blocks, &err);
 769        if (err)
 770                return err;
 771
 772        branch[0].key = cpu_to_le32(new_blocks[0]);
 773        /*
 774         * metadata blocks and data blocks are allocated.
 775         */
 776        for (n = 1; n <= indirect_blks;  n++) {
 777                /*
 778                 * Get buffer_head for parent block, zero it out
 779                 * and set the pointer to new one, then send
 780                 * parent to disk.
 781                 */
 782                bh = sb_getblk(inode->i_sb, new_blocks[n-1]);
 783                if (unlikely(!bh)) {
 784                        err = -EIO;
 785                        goto failed;
 786                }
 787
 788                branch[n].bh = bh;
 789                lock_buffer(bh);
 790                BUFFER_TRACE(bh, "call get_create_access");
 791                err = ext4_journal_get_create_access(handle, bh);
 792                if (err) {
 793                        /* Don't brelse(bh) here; it's done in
 794                         * ext4_journal_forget() below */
 795                        unlock_buffer(bh);
 796                        goto failed;
 797                }
 798
 799                memset(bh->b_data, 0, blocksize);
 800                branch[n].p = (__le32 *) bh->b_data + offsets[n];
 801                branch[n].key = cpu_to_le32(new_blocks[n]);
 802                *branch[n].p = branch[n].key;
 803                if (n == indirect_blks) {
 804                        current_block = new_blocks[n];
 805                        /*
 806                         * End of chain, update the last new metablock of
 807                         * the chain to point to the new allocated
 808                         * data blocks numbers
 809                         */
 810                        for (i = 1; i < num; i++)
 811                                *(branch[n].p + i) = cpu_to_le32(++current_block);
 812                }
 813                BUFFER_TRACE(bh, "marking uptodate");
 814                set_buffer_uptodate(bh);
 815                unlock_buffer(bh);
 816
 817                BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
 818                err = ext4_handle_dirty_metadata(handle, inode, bh);
 819                if (err)
 820                        goto failed;
 821        }
 822        *blks = num;
 823        return err;
 824failed:
 825        /* Allocation failed, free what we already allocated */
 826        ext4_free_blocks(handle, inode, 0, new_blocks[0], 1, 0);
 827        for (i = 1; i <= n ; i++) {
 828                /*
 829                 * branch[i].bh is newly allocated, so there is no
 830                 * need to revoke the block, which is why we don't
 831                 * need to set EXT4_FREE_BLOCKS_METADATA.
 832                 */
 833                ext4_free_blocks(handle, inode, 0, new_blocks[i], 1,
 834                                 EXT4_FREE_BLOCKS_FORGET);
 835        }
 836        for (i = n+1; i < indirect_blks; i++)
 837                ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, 0);
 838
 839        ext4_free_blocks(handle, inode, 0, new_blocks[i], num, 0);
 840
 841        return err;
 842}
 843
 844/**
 845 * ext4_splice_branch - splice the allocated branch onto inode.
 846 * @handle: handle for this transaction
 847 * @inode: owner
 848 * @block: (logical) number of block we are adding
 849 * @chain: chain of indirect blocks (with a missing link - see
 850 *      ext4_alloc_branch)
 851 * @where: location of missing link
 852 * @num:   number of indirect blocks we are adding
 853 * @blks:  number of direct blocks we are adding
 854 *
 855 * This function fills the missing link and does all housekeeping needed in
 856 * inode (->i_blocks, etc.). In case of success we end up with the full
 857 * chain to new block and return 0.
 858 */
 859static int ext4_splice_branch(handle_t *handle, struct inode *inode,
 860                              ext4_lblk_t block, Indirect *where, int num,
 861                              int blks)
 862{
 863        int i;
 864        int err = 0;
 865        ext4_fsblk_t current_block;
 866
 867        /*
 868         * If we're splicing into a [td]indirect block (as opposed to the
 869         * inode) then we need to get write access to the [td]indirect block
 870         * before the splice.
 871         */
 872        if (where->bh) {
 873                BUFFER_TRACE(where->bh, "get_write_access");
 874                err = ext4_journal_get_write_access(handle, where->bh);
 875                if (err)
 876                        goto err_out;
 877        }
 878        /* That's it */
 879
 880        *where->p = where->key;
 881
 882        /*
  883         * Update the host buffer_head or inode to point to the just-allocated
  884         * direct blocks
 885         */
 886        if (num == 0 && blks > 1) {
 887                current_block = le32_to_cpu(where->key) + 1;
 888                for (i = 1; i < blks; i++)
 889                        *(where->p + i) = cpu_to_le32(current_block++);
 890        }
 891
 892        /* We are done with atomic stuff, now do the rest of housekeeping */
 893        /* had we spliced it onto indirect block? */
 894        if (where->bh) {
 895                /*
 896                 * If we spliced it onto an indirect block, we haven't
 897                 * altered the inode.  Note however that if it is being spliced
 898                 * onto an indirect block at the very end of the file (the
 899                 * file is growing) then we *will* alter the inode to reflect
 900                 * the new i_size.  But that is not done here - it is done in
 901                 * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode.
 902                 */
 903                jbd_debug(5, "splicing indirect only\n");
 904                BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata");
 905                err = ext4_handle_dirty_metadata(handle, inode, where->bh);
 906                if (err)
 907                        goto err_out;
 908        } else {
 909                /*
 910                 * OK, we spliced it into the inode itself on a direct block.
 911                 */
 912                ext4_mark_inode_dirty(handle, inode);
 913                jbd_debug(5, "splicing direct\n");
 914        }
 915        return err;
 916
 917err_out:
 918        for (i = 1; i <= num; i++) {
 919                /*
 920                 * branch[i].bh is newly allocated, so there is no
 921                 * need to revoke the block, which is why we don't
 922                 * need to set EXT4_FREE_BLOCKS_METADATA.
 923                 */
 924                ext4_free_blocks(handle, inode, where[i].bh, 0, 1,
 925                                 EXT4_FREE_BLOCKS_FORGET);
 926        }
 927        ext4_free_blocks(handle, inode, 0, le32_to_cpu(where[num].key),
 928                         blks, 0);
 929
 930        return err;
 931}
 932
 933/*
  934 * The ext4_ind_map_blocks() function handles non-extent inodes
 935 * (i.e., using the traditional indirect/double-indirect i_blocks
 936 * scheme) for ext4_map_blocks().
 937 *
 938 * Allocation strategy is simple: if we have to allocate something, we will
 939 * have to go the whole way to leaf. So let's do it before attaching anything
 940 * to tree, set linkage between the newborn blocks, write them if sync is
 941 * required, recheck the path, free and repeat if check fails, otherwise
 942 * set the last missing link (that will protect us from any truncate-generated
 943 * removals - all blocks on the path are immune now) and possibly force the
 944 * write on the parent block.
 945 * That has a nice additional property: no special recovery from the failed
 946 * allocations is needed - we simply release blocks and do not touch anything
 947 * reachable from inode.
 948 *
 949 * `handle' can be NULL if create == 0.
 950 *
 951 * return > 0, # of blocks mapped or allocated.
 952 * return = 0, if plain lookup failed.
 953 * return < 0, error case.
 954 *
  955 * The ext4_ind_map_blocks() function should be called with
 956 * down_write(&EXT4_I(inode)->i_data_sem) if allocating filesystem
 957 * blocks (i.e., flags has EXT4_GET_BLOCKS_CREATE set) or
 958 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system
 959 * blocks.
 960 */
 961static int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
 962                               struct ext4_map_blocks *map,
 963                               int flags)
 964{
 965        int err = -EIO;
 966        ext4_lblk_t offsets[4];
 967        Indirect chain[4];
 968        Indirect *partial;
 969        ext4_fsblk_t goal;
 970        int indirect_blks;
 971        int blocks_to_boundary = 0;
 972        int depth;
 973        int count = 0;
 974        ext4_fsblk_t first_block = 0;
 975
 976        J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)));
 977        J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0);
 978        depth = ext4_block_to_path(inode, map->m_lblk, offsets,
 979                                   &blocks_to_boundary);
 980
 981        if (depth == 0)
 982                goto out;
 983
 984        partial = ext4_get_branch(inode, depth, offsets, chain, &err);
 985
 986        /* Simplest case - block found, no allocation needed */
 987        if (!partial) {
 988                first_block = le32_to_cpu(chain[depth - 1].key);
 989                count++;
 990                /*map more blocks*/
 991                while (count < map->m_len && count <= blocks_to_boundary) {
 992                        ext4_fsblk_t blk;
 993
 994                        blk = le32_to_cpu(*(chain[depth-1].p + count));
 995
 996                        if (blk == first_block + count)
 997                                count++;
 998                        else
 999                                break;
1000                }
1001                goto got_it;
1002        }
1003
1004        /* Next simple case - plain lookup or failed read of indirect block */
1005        if ((flags & EXT4_GET_BLOCKS_CREATE) == 0 || err == -EIO)
1006                goto cleanup;
1007
1008        /*
1009         * Okay, we need to do block allocation.
 1010         */
1011        goal = ext4_find_goal(inode, map->m_lblk, partial);
1012
1013        /* the number of blocks need to allocate for [d,t]indirect blocks */
1014        indirect_blks = (chain + depth) - partial - 1;
1015
1016        /*
 1017         * Next look up the indirect map to count the total number of
1018         * direct blocks to allocate for this branch.
1019         */
1020        count = ext4_blks_to_allocate(partial, indirect_blks,
1021                                      map->m_len, blocks_to_boundary);
1022        /*
1023         * Block out ext4_truncate while we alter the tree
1024         */
1025        err = ext4_alloc_branch(handle, inode, map->m_lblk, indirect_blks,
1026                                &count, goal,
1027                                offsets + (partial - chain), partial);
1028
1029        /*
1030         * The ext4_splice_branch call will free and forget any buffers
1031         * on the new chain if there is a failure, but that risks using
1032         * up transaction credits, especially for bitmaps where the
1033         * credits cannot be returned.  Can we handle this somehow?  We
1034         * may need to return -EAGAIN upwards in the worst case.  --sct
1035         */
1036        if (!err)
1037                err = ext4_splice_branch(handle, inode, map->m_lblk,
1038                                         partial, indirect_blks, count);
1039        if (err)
1040                goto cleanup;
1041
1042        map->m_flags |= EXT4_MAP_NEW;
1043
1044        ext4_update_inode_fsync_trans(handle, inode, 1);
1045got_it:
1046        map->m_flags |= EXT4_MAP_MAPPED;
1047        map->m_pblk = le32_to_cpu(chain[depth-1].key);
1048        map->m_len = count;
1049        if (count > blocks_to_boundary)
1050                map->m_flags |= EXT4_MAP_BOUNDARY;
1051        err = count;
1052        /* Clean up and exit */
1053        partial = chain + depth - 1;    /* the whole chain */
1054cleanup:
1055        while (partial > chain) {
1056                BUFFER_TRACE(partial->bh, "call brelse");
1057                brelse(partial->bh);
1058                partial--;
1059        }
1060out:
1061        return err;
1062}
1063
1064#ifdef CONFIG_QUOTA
1065qsize_t *ext4_get_reserved_space(struct inode *inode)
1066{
1067        return &EXT4_I(inode)->i_reserved_quota;
1068}
1069#endif
1070
1071/*
 1072 * Calculate the number of metadata blocks needed to reserve
 1073 * to allocate a new block at @lblock for a non-extent-based file
1074 */
1075static int ext4_indirect_calc_metadata_amount(struct inode *inode,
1076                                              sector_t lblock)
1077{
1078        struct ext4_inode_info *ei = EXT4_I(inode);
1079        sector_t dind_mask = ~((sector_t)EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1);
1080        int blk_bits;
1081
1082        if (lblock < EXT4_NDIR_BLOCKS)
1083                return 0;
1084
1085        lblock -= EXT4_NDIR_BLOCKS;
1086
1087        if (ei->i_da_metadata_calc_len &&
1088            (lblock & dind_mask) == ei->i_da_metadata_calc_last_lblock) {
1089                ei->i_da_metadata_calc_len++;
1090                return 0;
1091        }
1092        ei->i_da_metadata_calc_last_lblock = lblock & dind_mask;
1093        ei->i_da_metadata_calc_len = 1;
1094        blk_bits = order_base_2(lblock);
1095        return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1;
1096}
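     /*
      * Rough illustration (assuming 4 KiB blocks, i.e. 1024 pointers per
      * block, so EXT4_ADDR_PER_BLOCK_BITS = 10): for a logical block 3964
      * blocks past the direct area, order_base_2(3964) = 12, and the
      * estimate is 12/10 + 1 = 2 metadata blocks (an indirect block plus a
      * double-indirect block).
      */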
1097
1098/*
 1099 * Calculate the number of metadata blocks needed to reserve
1100 * to allocate a block located at @lblock
1101 */
1102static int ext4_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)
1103{
1104        if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
1105                return ext4_ext_calc_metadata_amount(inode, lblock);
1106
1107        return ext4_indirect_calc_metadata_amount(inode, lblock);
1108}
1109
1110/*
1111 * Called with i_data_sem down, which is important since we can call
1112 * ext4_discard_preallocations() from here.
1113 */
1114void ext4_da_update_reserve_space(struct inode *inode,
1115                                        int used, int quota_claim)
1116{
1117        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1118        struct ext4_inode_info *ei = EXT4_I(inode);
1119
1120        spin_lock(&ei->i_block_reservation_lock);
1121        trace_ext4_da_update_reserve_space(inode, used);
1122        if (unlikely(used > ei->i_reserved_data_blocks)) {
1123                ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d "
1124                         "with only %d reserved data blocks\n",
1125                         __func__, inode->i_ino, used,
1126                         ei->i_reserved_data_blocks);
1127                WARN_ON(1);
1128                used = ei->i_reserved_data_blocks;
1129        }
1130
1131        /* Update per-inode reservations */
1132        ei->i_reserved_data_blocks -= used;
1133        ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks;
1134        percpu_counter_sub(&sbi->s_dirtyblocks_counter,
1135                           used + ei->i_allocated_meta_blocks);
1136        ei->i_allocated_meta_blocks = 0;
1137
1138        if (ei->i_reserved_data_blocks == 0) {
1139                /*
1140                 * We can release all of the reserved metadata blocks
1141                 * only when we have written all of the delayed
1142                 * allocation blocks.
1143                 */
1144                percpu_counter_sub(&sbi->s_dirtyblocks_counter,
1145                                   ei->i_reserved_meta_blocks);
1146                ei->i_reserved_meta_blocks = 0;
1147                ei->i_da_metadata_calc_len = 0;
1148        }
1149        spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1150
1151        /* Update quota subsystem for data blocks */
1152        if (quota_claim)
1153                dquot_claim_block(inode, used);
1154        else {
1155                /*
1156                 * We did fallocate with an offset that is already delayed
1157                 * allocated. So on delayed allocated writeback we should
1158                 * not re-claim the quota for fallocated blocks.
1159                 */
1160                dquot_release_reservation_block(inode, used);
1161        }
1162
1163        /*
1164         * If we have done all the pending block allocations and if
1165         * there aren't any writers on the inode, we can discard the
1166         * inode's preallocations.
1167         */
1168        if ((ei->i_reserved_data_blocks == 0) &&
1169            (atomic_read(&inode->i_writecount) == 0))
1170                ext4_discard_preallocations(inode);
1171}
1172
1173static int __check_block_validity(struct inode *inode, const char *func,
1174                                unsigned int line,
1175                                struct ext4_map_blocks *map)
1176{
1177        if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), map->m_pblk,
1178                                   map->m_len)) {
1179                ext4_error_inode(inode, func, line, map->m_pblk,
1180                                 "lblock %lu mapped to illegal pblock "
1181                                 "(length %d)", (unsigned long) map->m_lblk,
1182                                 map->m_len);
1183                return -EIO;
1184        }
1185        return 0;
1186}
1187
1188#define check_block_validity(inode, map)        \
1189        __check_block_validity((inode), __func__, __LINE__, (map))
1190
1191/*
1192 * Return the number of contiguous dirty pages in a given inode
1193 * starting at page frame idx.
1194 */
1195static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
1196                                    unsigned int max_pages)
1197{
1198        struct address_space *mapping = inode->i_mapping;
1199        pgoff_t index;
1200        struct pagevec pvec;
1201        pgoff_t num = 0;
1202        int i, nr_pages, done = 0;
1203
1204        if (max_pages == 0)
1205                return 0;
1206        pagevec_init(&pvec, 0);
1207        while (!done) {
1208                index = idx;
1209                nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
1210                                              PAGECACHE_TAG_DIRTY,
1211                                              (pgoff_t)PAGEVEC_SIZE);
1212                if (nr_pages == 0)
1213                        break;
1214                for (i = 0; i < nr_pages; i++) {
1215                        struct page *page = pvec.pages[i];
1216                        struct buffer_head *bh, *head;
1217
1218                        lock_page(page);
1219                        if (unlikely(page->mapping != mapping) ||
1220                            !PageDirty(page) ||
1221                            PageWriteback(page) ||
1222                            page->index != idx) {
1223                                done = 1;
1224                                unlock_page(page);
1225                                break;
1226                        }
1227                        if (page_has_buffers(page)) {
1228                                bh = head = page_buffers(page);
1229                                do {
1230                                        if (!buffer_delay(bh) &&
1231                                            !buffer_unwritten(bh))
1232                                                done = 1;
1233                                        bh = bh->b_this_page;
1234                                } while (!done && (bh != head));
1235                        }
1236                        unlock_page(page);
1237                        if (done)
1238                                break;
1239                        idx++;
1240                        num++;
1241                        if (num >= max_pages) {
1242                                done = 1;
1243                                break;
1244                        }
1245                }
1246                pagevec_release(&pvec);
1247        }
1248        return num;
1249}
1250
1251/*
1252 * The ext4_map_blocks() function tries to look up the requested blocks,
 1253 * and returns immediately if the blocks are already mapped.
 1254 *
 1255 * Otherwise it takes the write lock of i_data_sem, allocates blocks,
 1256 * stores the allocated blocks in the result buffer head and marks it
 1257 * mapped.
 1258 *
 1259 * If the file is extent-based, it will call ext4_ext_map_blocks();
 1260 * otherwise, it calls ext4_ind_map_blocks() to handle files that use
 1261 * indirect mapping.
 1262 *
 1263 * On success, it returns the number of blocks mapped or allocated.
 1264 * If create == 0 and the blocks are pre-allocated and uninitialized,
 1265 * the result buffer head is unmapped. If create == 1, it will make sure
 1266 * the buffer head is mapped.
 1267 *
 1268 * It returns 0 if a plain lookup failed (blocks have not been allocated);
 1269 * in that case, the buffer head is unmapped.
1270 *
1271 * It returns the error in case of allocation failure.
1272 */
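     /*
      * Illustrative lookup-only use (a sketch; compare _ext4_get_block()
      * below): the caller fills in m_lblk/m_len, and with flags == 0 no
      * journal handle is needed:
      *
      *   struct ext4_map_blocks map = { .m_lblk = lblk, .m_len = 1 };
      *   int ret = ext4_map_blocks(NULL, inode, &map, 0);
      *
      *   if (ret > 0)
      *           the first mapped physical block is in map.m_pblk
      */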
1273int ext4_map_blocks(handle_t *handle, struct inode *inode,
1274                    struct ext4_map_blocks *map, int flags)
1275{
1276        int retval;
1277
1278        map->m_flags = 0;
 1279        ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u, "
1280                  "logical block %lu\n", inode->i_ino, flags, map->m_len,
1281                  (unsigned long) map->m_lblk);
1282        /*
1283         * Try to see if we can get the block without requesting a new
1284         * file system block.
1285         */
1286        down_read((&EXT4_I(inode)->i_data_sem));
1287        if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
1288                retval = ext4_ext_map_blocks(handle, inode, map, 0);
1289        } else {
1290                retval = ext4_ind_map_blocks(handle, inode, map, 0);
1291        }
1292        up_read((&EXT4_I(inode)->i_data_sem));
1293
1294        if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
1295                int ret = check_block_validity(inode, map);
1296                if (ret != 0)
1297                        return ret;
1298        }
1299
1300        /* If it is only a block(s) look up */
1301        if ((flags & EXT4_GET_BLOCKS_CREATE) == 0)
1302                return retval;
1303
1304        /*
 1305         * Return if the blocks have already been allocated.
 1306         *
 1307         * Note that if blocks have been preallocated,
 1308         * ext4_ext_get_block() returns with create = 0
 1309         * and the buffer head unmapped.
1310         */
1311        if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED)
1312                return retval;
1313
1314        /*
1315         * When we call get_blocks without the create flag, the
1316         * BH_Unwritten flag could have gotten set if the blocks
 1317         * requested were part of an uninitialized extent.  We need to
1318         * clear this flag now that we are committed to convert all or
1319         * part of the uninitialized extent to be an initialized
1320         * extent.  This is because we need to avoid the combination
1321         * of BH_Unwritten and BH_Mapped flags being simultaneously
1322         * set on the buffer_head.
1323         */
1324        map->m_flags &= ~EXT4_MAP_UNWRITTEN;
1325
 1326         * New block allocation and/or writing to an uninitialized extent
 1327         * may result in updating i_data, so we take
1328         * will possibly result in updating i_data, so we take
1329         * the write lock of i_data_sem, and call get_blocks()
1330         * with create == 1 flag.
1331         */
1332        down_write((&EXT4_I(inode)->i_data_sem));
1333
1334        /*
 1335         * If the caller is from the delayed allocation writeout path, we
 1336         * have already reserved fs blocks for allocation; let the
 1337         * underlying get_block() function know to avoid double
 1338         * accounting.
1339         */
1340        if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
1341                ext4_set_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
1342        /*
1343         * We need to re-check the extents flag here because migrate
1344         * could have changed the inode type in between.
1345         */
1346        if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
1347                retval = ext4_ext_map_blocks(handle, inode, map, flags);
1348        } else {
1349                retval = ext4_ind_map_blocks(handle, inode, map, flags);
1350
1351                if (retval > 0 && map->m_flags & EXT4_MAP_NEW) {
1352                        /*
1353                         * We allocated new blocks which will result in
1354                         * i_data's format changing.  Force the migrate
1355                         * to fail by clearing migrate flags
1356                         */
1357                        ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
1358                }
1359
1360                /*
1361                 * Update reserved blocks/metadata blocks after successful
1362                 * block allocation, which had been deferred till now. We don't
1363                 * support fallocate for non-extent files, so we can update
1364                 * the reserved space here.
1365                 */
1366                if ((retval > 0) &&
1367                        (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE))
1368                        ext4_da_update_reserve_space(inode, retval, 1);
1369        }
1370        if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
1371                ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
1372
1373        up_write((&EXT4_I(inode)->i_data_sem));
1374        if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
1375                int ret = check_block_validity(inode, map);
1376                if (ret != 0)
1377                        return ret;
1378        }
1379        return retval;
1380}
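
/*
 * An illustrative sketch of a lookup-only call into ext4_map_blocks(),
 * roughly how the get_block_t helpers below use it; "lblk" is a
 * placeholder and error handling is elided:
 *
 *	struct ext4_map_blocks map;
 *	int ret;
 *
 *	map.m_lblk = lblk;
 *	map.m_len = 1;
 *	ret = ext4_map_blocks(NULL, inode, &map, 0);
 *
 * A return value < 0 is an error; ret > 0 with EXT4_MAP_MAPPED set in
 * map.m_flags means map.m_pblk holds the first physical block of the
 * mapping; anything else is a hole or a preallocated/unwritten range.
 */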
1381
1382/* Maximum number of blocks we map for direct IO at once. */
1383#define DIO_MAX_BLOCKS 4096
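/*
 * With a 4 KiB block size this caps a single direct-IO mapping request
 * at 4096 blocks, i.e. 16 MiB, which in turn bounds the journal credits
 * requested via ext4_chunk_trans_blocks() in _ext4_get_block() below.
 */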
1384
1385static int _ext4_get_block(struct inode *inode, sector_t iblock,
1386                           struct buffer_head *bh, int flags)
1387{
1388        handle_t *handle = ext4_journal_current_handle();
1389        struct ext4_map_blocks map;
1390        int ret = 0, started = 0;
1391        int dio_credits;
1392
1393        map.m_lblk = iblock;
1394        map.m_len = bh->b_size >> inode->i_blkbits;
1395
1396        if (flags && !handle) {
1397                /* Direct IO write... */
1398                if (map.m_len > DIO_MAX_BLOCKS)
1399                        map.m_len = DIO_MAX_BLOCKS;
1400                dio_credits = ext4_chunk_trans_blocks(inode, map.m_len);
1401                handle = ext4_journal_start(inode, dio_credits);
1402                if (IS_ERR(handle)) {
1403                        ret = PTR_ERR(handle);
1404                        return ret;
1405                }
1406                started = 1;
1407        }
1408
1409        ret = ext4_map_blocks(handle, inode, &map, flags);
1410        if (ret > 0) {
1411                map_bh(bh, inode->i_sb, map.m_pblk);
1412                bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
1413                bh->b_size = inode->i_sb->s_blocksize * map.m_len;
1414                ret = 0;
1415        }
1416        if (started)
1417                ext4_journal_stop(handle);
1418        return ret;
1419}
1420
1421int ext4_get_block(struct inode *inode, sector_t iblock,
1422                   struct buffer_head *bh, int create)
1423{
1424        return _ext4_get_block(inode, iblock, bh,
1425                               create ? EXT4_GET_BLOCKS_CREATE : 0);
1426}
1427
1428/*
1429 * `handle' can be NULL if create is zero
1430 */
1431struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
1432                                ext4_lblk_t block, int create, int *errp)
1433{
1434        struct ext4_map_blocks map;
1435        struct buffer_head *bh;
1436        int fatal = 0, err;
1437
1438        J_ASSERT(handle != NULL || create == 0);
1439
1440        map.m_lblk = block;
1441        map.m_len = 1;
1442        err = ext4_map_blocks(handle, inode, &map,
1443                              create ? EXT4_GET_BLOCKS_CREATE : 0);
1444
1445        if (err < 0)
1446                *errp = err;
1447        if (err <= 0)
1448                return NULL;
1449        *errp = 0;
1450
1451        bh = sb_getblk(inode->i_sb, map.m_pblk);
1452        if (!bh) {
1453                *errp = -EIO;
1454                return NULL;
1455        }
1456        if (map.m_flags & EXT4_MAP_NEW) {
1457                J_ASSERT(create != 0);
1458                J_ASSERT(handle != NULL);
1459
1460                /*
1461                 * Now that we do not always journal data, we should
1462                 * keep in mind whether this should always journal the
1463                 * new buffer as metadata.  For now, regular file
1464                 * writes use ext4_get_block instead, so it's not a
1465                 * problem.
1466                 */
1467                lock_buffer(bh);
1468                BUFFER_TRACE(bh, "call get_create_access");
1469                fatal = ext4_journal_get_create_access(handle, bh);
1470                if (!fatal && !buffer_uptodate(bh)) {
1471                        memset(bh->b_data, 0, inode->i_sb->s_blocksize);
1472                        set_buffer_uptodate(bh);
1473                }
1474                unlock_buffer(bh);
1475                BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
1476                err = ext4_handle_dirty_metadata(handle, inode, bh);
1477                if (!fatal)
1478                        fatal = err;
1479        } else {
1480                BUFFER_TRACE(bh, "not a new buffer");
1481        }
1482        if (fatal) {
1483                *errp = fatal;
1484                brelse(bh);
1485                bh = NULL;
1486        }
1487        return bh;
1488}
1489
1490struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
1491                               ext4_lblk_t block, int create, int *err)
1492{
1493        struct buffer_head *bh;
1494
1495        bh = ext4_getblk(handle, inode, block, create, err);
1496        if (!bh)
1497                return bh;
1498        if (buffer_uptodate(bh))
1499                return bh;
1500        ll_rw_block(READ_META, 1, &bh);
1501        wait_on_buffer(bh);
1502        if (buffer_uptodate(bh))
1503                return bh;
1504        put_bh(bh);
1505        *err = -EIO;
1506        return NULL;
1507}
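
/*
 * A hypothetical caller of ext4_bread() for an existing metadata block
 * (so create == 0 and the handle may be NULL) might look roughly like:
 *
 *	int err = 0;
 *	struct buffer_head *bh;
 *
 *	bh = ext4_bread(NULL, inode, blk, 0, &err);
 *	if (!bh)
 *		return err;
 *	...
 *	brelse(bh);
 *
 * On a hole ext4_bread() returns NULL with err left at 0; on a lookup
 * or read failure it returns NULL with err set to a negative errno.
 */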
1508
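/*
 * walk_page_buffers() below iterates over the buffer_heads of a page,
 * calling fn on each buffer that overlaps the byte range [from, to).
 * Buffers entirely outside that range are skipped; if partial is
 * non-NULL, *partial is set when such a skipped buffer is not uptodate.
 * The walk stops at the first buffer for which fn returns non-zero,
 * and that value is returned.
 */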
1509static int walk_page_buffers(handle_t *handle,
1510                             struct buffer_head *head,
1511                             unsigned from,
1512                             unsigned to,
1513                             int *partial,
1514                             int (*fn)(handle_t *handle,
1515                                       struct buffer_head *bh))
1516{
1517        struct buffer_head *bh;
1518        unsigned block_start, block_end;
1519        unsigned blocksize = head->b_size;
1520        int err, ret = 0;
1521        struct buffer_head *next;
1522
1523        for (bh = head, block_start = 0;
1524             ret == 0 && (bh != head || !block_start);
1525             block_start = block_end, bh = next) {
1526                next = bh->b_this_page;
1527                block_end = block_start + blocksize;
1528                if (block_end <= from || block_start >= to) {
1529                        if (partial && !buffer_uptodate(bh))
1530                                *partial = 1;
1531                        continue;
1532                }
1533                err = (*fn)(handle, bh);
1534                if (!ret)
1535                        ret = err;
1536        }
1537        return ret;
1538}
1539
1540/*
1541 * To preserve ordering, it is essential that the hole instantiation and
1542 * the data write be encapsulated in a single transaction.  We cannot
1543 * close off a transaction and start a new one between the ext4_get_block()
1544 * and the commit_write().  So doing the jbd2_journal_start at the start of
1545 * prepare_write() is the right place.
1546 *
1547 * Also, this function can nest inside ext4_writepage() ->
1548 * block_write_full_page(). In that case, we *know* that ext4_writepage()
1549 * has generated enough buffer credits to do the whole page.  So we won't
1550 * block on the journal in that case, which is good, because the caller may
1551 * be PF_MEMALLOC.
1552 *
1553 * By accident, ext4 can be reentered when a transaction is open via
1554 * quota file writes.  If we were to commit the transaction while thus
1555 * reentered, there can be a deadlock - we would be holding a quota
1556 * lock, and the commit would never complete if another thread had a
1557 * transaction open and was blocking on the quota lock - a ranking
1558 * violation.
1559 *
1560 * So what we do is to rely on the fact that jbd2_journal_stop/journal_start
1561 * will _not_ run commit under these circumstances because handle->h_ref
1562 * is elevated.  We'll still have enough credits for the tiny quotafile
1563 * write.
1564 */
1565static int do_journal_get_write_access(handle_t *handle,
1566                                       struct buffer_head *bh)
1567{
1568        int dirty = buffer_dirty(bh);
1569        int ret;
1570
1571        if (!buffer_mapped(bh) || buffer_freed(bh))
1572                return 0;
1573        /*
1574         * __block_write_begin() could have dirtied some buffers. Clean
1575         * the dirty bit as jbd2_journal_get_write_access() could complain
1576         * otherwise about fs integrity issues. Setting of the dirty bit
1577         * by __block_write_begin() isn't a real problem here as we clear
1578         * the bit before releasing a page lock and thus writeback cannot
1579         * ever write the buffer.
1580         */
1581        if (dirty)
1582                clear_buffer_dirty(bh);
1583        ret = ext4_journal_get_write_access(handle, bh);
1584        if (!ret && dirty)
1585                ret = ext4_handle_dirty_metadata(handle, NULL, bh);
1586        return ret;
1587}
1588
1589/*
1590 * Truncate blocks that were not used by write. We have to truncate the
1591 * pagecache as well so that corresponding buffers get properly unmapped.
1592 */
1593static void ext4_truncate_failed_write(struct inode *inode)
1594{
1595        truncate_inode_pages(inode->i_mapping, inode->i_size);
1596        ext4_truncate(inode);
1597}
1598
1599static int ext4_get_block_write(struct inode *inode, sector_t iblock,
1600                   struct buffer_head *bh_result, int create);
1601static int ext4_write_begin(struct file *file, struct address_space *mapping,
1602                            loff_t pos, unsigned len, unsigned flags,
1603                            struct page **pagep, void **fsdata)
1604{
1605        struct inode *inode = mapping->host;
1606        int ret, needed_blocks;
1607        handle_t *handle;
1608        int retries = 0;
1609        struct page *page;
1610        pgoff_t index;
1611        unsigned from, to;
1612
1613        trace_ext4_write_begin(inode, pos, len, flags);
1614        /*
1615         * Reserve one block more for addition to orphan list in case
1616         * we allocate blocks but write fails for some reason
1617         */
1618        needed_blocks = ext4_writepage_trans_blocks(inode) + 1;
1619        index = pos >> PAGE_CACHE_SHIFT;
1620        from = pos & (PAGE_CACHE_SIZE - 1);
1621        to = from + len;
1622
1623retry:
1624        handle = ext4_journal_start(inode, needed_blocks);
1625        if (IS_ERR(handle)) {
1626                ret = PTR_ERR(handle);
1627                goto out;
1628        }
1629
1630        /* We cannot recurse into the filesystem as the transaction is already
1631         * started */
1632        flags |= AOP_FLAG_NOFS;
1633
1634        page = grab_cache_page_write_begin(mapping, index, flags);
1635        if (!page) {
1636                ext4_journal_stop(handle);
1637                ret = -ENOMEM;
1638                goto out;
1639        }
1640        *pagep = page;
1641
1642        if (ext4_should_dioread_nolock(inode))
1643                ret = __block_write_begin(page, pos, len, ext4_get_block_write);
1644        else
1645                ret = __block_write_begin(page, pos, len, ext4_get_block);
1646
1647        if (!ret && ext4_should_journal_data(inode)) {
1648                ret = walk_page_buffers(handle, page_buffers(page),
1649                                from, to, NULL, do_journal_get_write_access);
1650        }
1651
1652        if (ret) {
1653                unlock_page(page);
1654                page_cache_release(page);
1655                /*
1656                 * __block_write_begin may have instantiated a few blocks
1657                 * outside i_size.  Trim these off again. Don't need
1658                 * i_size_read because we hold i_mutex.
1659                 *
1660                 * Add inode to orphan list in case we crash before
1661                 * truncate finishes
1662                 */
1663                if (pos + len > inode->i_size && ext4_can_truncate(inode))
1664                        ext4_orphan_add(handle, inode);
1665
1666                ext4_journal_stop(handle);
1667                if (pos + len > inode->i_size) {
1668                        ext4_truncate_failed_write(inode);
1669                        /*
1670                         * If truncate failed early the inode might
1671                         * still be on the orphan list; we need to
1672                         * make sure the inode is removed from the
1673                         * orphan list in that case.
1674                         */
1675                        if (inode->i_nlink)
1676                                ext4_orphan_del(NULL, inode);
1677                }
1678        }
1679
1680        if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
1681                goto retry;
1682out:
1683        return ret;
1684}
1685
1686/* For write_end() in data=journal mode */
1687static int write_end_fn(handle_t *handle, struct buffer_head *bh)
1688{
1689        if (!buffer_mapped(bh) || buffer_freed(bh))
1690                return 0;
1691        set_buffer_uptodate(bh);
1692        return ext4_handle_dirty_metadata(handle, NULL, bh);
1693}
1694
1695static int ext4_generic_write_end(struct file *file,
1696                                  struct address_space *mapping,
1697                                  loff_t pos, unsigned len, unsigned copied,
1698                                  struct page *page, void *fsdata)
1699{
1700        int i_size_changed = 0;
1701        struct inode *inode = mapping->host;
1702        handle_t *handle = ext4_journal_current_handle();
1703
1704        copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
1705
1706        /*
1707         * No need to use i_size_read() here, the i_size
1708         * cannot change under us because we hold i_mutex.
1709         *
1710         * But it's important to update i_size while still holding page lock:
1711         * page writeout could otherwise come in and zero beyond i_size.
1712         */
1713        if (pos + copied > inode->i_size) {
1714                i_size_write(inode, pos + copied);
1715                i_size_changed = 1;
1716        }
1717
1718        if (pos + copied >  EXT4_I(inode)->i_disksize) {
1719                /* We need to mark the inode dirty even if
1720                 * new_i_size is less than inode->i_size
1721                 * but greater than i_disksize (hint: delalloc).
1722                 */
1723                ext4_update_i_disksize(inode, (pos + copied));
1724                i_size_changed = 1;
1725        }
1726        unlock_page(page);
1727        page_cache_release(page);
1728
1729        /*
1730         * Don't mark the inode dirty under page lock. First, it unnecessarily
1731         * makes the holding time of page lock longer. Second, it forces lock
1732         * ordering of page lock and transaction start for journaling
1733         * filesystems.
1734         */
1735        if (i_size_changed)
1736                ext4_mark_inode_dirty(handle, inode);
1737
1738        return copied;
1739}
1740
1741/*
1742 * We need to pick up the new inode size which generic_commit_write gave us.
1743 * `file' can be NULL - eg, when called from page_symlink().
1744 *
1745 * ext4 never places buffers on inode->i_mapping->private_list.  Metadata
1746 * buffers are managed internally.
1747 */
1748static int ext4_ordered_write_end(struct file *file,
1749                                  struct address_space *mapping,
1750                                  loff_t pos, unsigned len, unsigned copied,
1751                                  struct page *page, void *fsdata)
1752{
1753        handle_t *handle = ext4_journal_current_handle();
1754        struct inode *inode = mapping->host;
1755        int ret = 0, ret2;
1756
1757        trace_ext4_ordered_write_end(inode, pos, len, copied);
1758        ret = ext4_jbd2_file_inode(handle, inode);
1759
1760        if (ret == 0) {
1761                ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
1762                                                        page, fsdata);
1763                copied = ret2;
1764                if (pos + len > inode->i_size && ext4_can_truncate(inode))
1765                        /* If we have allocated more blocks and copied
1766                         * less, we will have blocks allocated outside
1767                         * inode->i_size, so truncate them.
1768                         */
1769                        ext4_orphan_add(handle, inode);
1770                if (ret2 < 0)
1771                        ret = ret2;
1772        }
1773        ret2 = ext4_journal_stop(handle);
1774        if (!ret)
1775                ret = ret2;
1776
1777        if (pos + len > inode->i_size) {
1778                ext4_truncate_failed_write(inode);
1779                /*
1780                 * If truncate failed early the inode might still be
1781                 * on the orphan list; we need to make sure the inode
1782                 * is removed from the orphan list in that case.
1783                 */
1784                if (inode->i_nlink)
1785                        ext4_orphan_del(NULL, inode);
1786        }
1787
1788
1789        return ret ? ret : copied;
1790}
1791
1792static int ext4_writeback_write_end(struct file *file,
1793                                    struct address_space *mapping,
1794                                    loff_t pos, unsigned len, unsigned copied,
1795                                    struct page *page, void *fsdata)
1796{
1797        handle_t *handle = ext4_journal_current_handle();
1798        struct inode *inode = mapping->host;
1799        int ret = 0, ret2;
1800
1801        trace_ext4_writeback_write_end(inode, pos, len, copied);
1802        ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
1803                                                        page, fsdata);
1804        copied = ret2;
1805        if (pos + len > inode->i_size && ext4_can_truncate(inode))
1806                /* If we have allocated more blocks and copied
1807                 * less, we will have blocks allocated outside
1808                 * inode->i_size, so truncate them.
1809                 */
1810                ext4_orphan_add(handle, inode);
1811
1812        if (ret2 < 0)
1813                ret = ret2;
1814
1815        ret2 = ext4_journal_stop(handle);
1816        if (!ret)
1817                ret = ret2;
1818
1819        if (pos + len > inode->i_size) {
1820                ext4_truncate_failed_write(inode);
1821                /*
1822                 * If truncate failed early the inode might still be
1823                 * on the orphan list; we need to make sure the inode
1824                 * is removed from the orphan list in that case.
1825                 */
1826                if (inode->i_nlink)
1827                        ext4_orphan_del(NULL, inode);
1828        }
1829
1830        return ret ? ret : copied;
1831}
1832
1833static int ext4_journalled_write_end(struct file *file,
1834                                     struct address_space *mapping,
1835                                     loff_t pos, unsigned len, unsigned copied,
1836                                     struct page *page, void *fsdata)
1837{
1838        handle_t *handle = ext4_journal_current_handle();
1839        struct inode *inode = mapping->host;
1840        int ret = 0, ret2;
1841        int partial = 0;
1842        unsigned from, to;
1843        loff_t new_i_size;
1844
1845        trace_ext4_journalled_write_end(inode, pos, len, copied);
1846        from = pos & (PAGE_CACHE_SIZE - 1);
1847        to = from + len;
1848
1849        if (copied < len) {
1850                if (!PageUptodate(page))
1851                        copied = 0;
1852                page_zero_new_buffers(page, from+copied, to);
1853        }
1854
1855        ret = walk_page_buffers(handle, page_buffers(page), from,
1856                                to, &partial, write_end_fn);
1857        if (!partial)
1858                SetPageUptodate(page);
1859        new_i_size = pos + copied;
1860        if (new_i_size > inode->i_size)
1861                i_size_write(inode, pos+copied);
1862        ext4_set_inode_state(inode, EXT4_STATE_JDATA);
1863        if (new_i_size > EXT4_I(inode)->i_disksize) {
1864                ext4_update_i_disksize(inode, new_i_size);
1865                ret2 = ext4_mark_inode_dirty(handle, inode);
1866                if (!ret)
1867                        ret = ret2;
1868        }
1869
1870        unlock_page(page);
1871        page_cache_release(page);
1872        if (pos + len > inode->i_size && ext4_can_truncate(inode))
1873                /* If we have allocated more blocks and copied
1874                 * less, we will have blocks allocated outside
1875                 * inode->i_size, so truncate them.
1876                 */
1877                ext4_orphan_add(handle, inode);
1878
1879        ret2 = ext4_journal_stop(handle);
1880        if (!ret)
1881                ret = ret2;
1882        if (pos + len > inode->i_size) {
1883                ext4_truncate_failed_write(inode);
1884                /*
1885                 * If truncate failed early the inode might still be
1886                 * on the orphan list; we need to make sure the inode
1887                 * is removed from the orphan list in that case.
1888                 */
1889                if (inode->i_nlink)
1890                        ext4_orphan_del(NULL, inode);
1891        }
1892
1893        return ret ? ret : copied;
1894}
1895
1896/*
1897 * Reserve a single block located at lblock
1898 */
1899static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock)
1900{
1901        int retries = 0;
1902        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1903        struct ext4_inode_info *ei = EXT4_I(inode);
1904        unsigned long md_needed;
1905        int ret;
1906
1907        /*
1908         * Recalculate the amount of metadata blocks to reserve
1909         * in order to allocate nrblocks; the worst case is one
1910         * extent per block.
1911         */
1912repeat:
1913        spin_lock(&ei->i_block_reservation_lock);
1914        md_needed = ext4_calc_metadata_amount(inode, lblock);
1915        trace_ext4_da_reserve_space(inode, md_needed);
1916        spin_unlock(&ei->i_block_reservation_lock);
1917
1918        /*
1919         * We will charge metadata quota at writeout time; this saves
1920         * us from metadata over-estimation, though we may go over by
1921         * a small amount in the end.  Here we just reserve for data.
1922         */
1923        ret = dquot_reserve_block(inode, 1);
1924        if (ret)
1925                return ret;
1926        /*
1927         * We do still charge estimated metadata to the sb though;
1928         * we cannot afford to run out of free blocks.
1929         */
1930        if (ext4_claim_free_blocks(sbi, md_needed + 1)) {
1931                dquot_release_reservation_block(inode, 1);
1932                if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
1933                        yield();
1934                        goto repeat;
1935                }
1936                return -ENOSPC;
1937        }
1938        spin_lock(&ei->i_block_reservation_lock);
1939        ei->i_reserved_data_blocks++;
1940        ei->i_reserved_meta_blocks += md_needed;
1941        spin_unlock(&ei->i_block_reservation_lock);
1942
1943        return 0;       /* success */
1944}
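
/*
 * The reservation taken above is normally consumed in one of two ways:
 * if the delayed block is eventually allocated, ext4_map_blocks()
 * (called with EXT4_GET_BLOCKS_DELALLOC_RESERVE) credits it back via
 * ext4_da_update_reserve_space(); if the page is invalidated first,
 * ext4_da_release_space() below drops the reservation instead.
 */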
1945
1946static void ext4_da_release_space(struct inode *inode, int to_free)
1947{
1948        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1949        struct ext4_inode_info *ei = EXT4_I(inode);
1950
1951        if (!to_free)
1952                return;         /* Nothing to release, exit */
1953
1954        spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
1955
1956        trace_ext4_da_release_space(inode, to_free);
1957        if (unlikely(to_free > ei->i_reserved_data_blocks)) {
1958                /*
1959                 * If there aren't enough reserved blocks, then the
1960                 * counter is messed up somewhere.  Since this
1961                 * function is called from invalidatepage, it's
1962                 * harmless to return without any action.
1963                 */
1964                ext4_msg(inode->i_sb, KERN_NOTICE, "ext4_da_release_space: "
1965                         "ino %lu, to_free %d with only %d reserved "
1966                         "data blocks\n", inode->i_ino, to_free,
1967                         ei->i_reserved_data_blocks);
1968                WARN_ON(1);
1969                to_free = ei->i_reserved_data_blocks;
1970        }
1971        ei->i_reserved_data_blocks -= to_free;
1972
1973        if (ei->i_reserved_data_blocks == 0) {
1974                /*
1975                 * We can release all of the reserved metadata blocks
1976                 * only when we have written all of the delayed
1977                 * allocation blocks.
1978                 */
1979                percpu_counter_sub(&sbi->s_dirtyblocks_counter,
1980                                   ei->i_reserved_meta_blocks);
1981                ei->i_reserved_meta_blocks = 0;
1982                ei->i_da_metadata_calc_len = 0;
1983        }
1984
1985        /* update fs dirty data blocks counter */
1986        percpu_counter_sub(&sbi->s_dirtyblocks_counter, to_free);
1987
1988        spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1989
1990        dquot_release_reservation_block(inode, to_free);
1991}
1992
1993static void ext4_da_page_release_reservation(struct page *page,
1994                                             unsigned long offset)
1995{
1996        int to_release = 0;
1997        struct buffer_head *head, *bh;
1998        unsigned int curr_off = 0;
1999
2000        head = page_buffers(page);
2001        bh = head;
2002        do {
2003                unsigned int next_off = curr_off + bh->b_size;
2004
2005                if ((offset <= curr_off) && (buffer_delay(bh))) {
2006                        to_release++;
2007                        clear_buffer_delay(bh);
2008                }
2009                curr_off = next_off;
2010        } while ((bh = bh->b_this_page) != head);
2011        ext4_da_release_space(page->mapping->host, to_release);
2012}
2013
2014/*
2015 * Delayed allocation stuff
2016 */
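
/*
 * Rough shape of the writeback path below: __mpage_da_writepage()
 * collects contiguous dirty delayed/unwritten buffer_heads into an
 * mpage_da_data extent via mpage_add_bh_to_extent();
 * mpage_da_map_and_submit() then allocates blocks for that extent with
 * ext4_map_blocks() and hands the pages to mpage_da_submit_io() for
 * writeout.
 */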
2017
2018/*
2019 * mpage_da_submit_io - walks through the extent of pages and tries to
2020 * write them out with the writepage() callback
2021 *
2022 * @mpd->inode: inode
2023 * @mpd->first_page: first page of the extent
2024 * @mpd->next_page: page after the last page of the extent
2025 *
2026 * By the time mpage_da_submit_io() is called we expect all blocks
2027 * to be allocated. This may be wrong if allocation failed.
2028 *
2029 * As pages are already locked by write_cache_pages(), we can't use it
2030 */
2031static int mpage_da_submit_io(struct mpage_da_data *mpd,
2032                              struct ext4_map_blocks *map)
2033{
2034        struct pagevec pvec;
2035        unsigned long index, end;
2036        int ret = 0, err, nr_pages, i;
2037        struct inode *inode = mpd->inode;
2038        struct address_space *mapping = inode->i_mapping;
2039        loff_t size = i_size_read(inode);
2040        unsigned int len, block_start;
2041        struct buffer_head *bh, *page_bufs = NULL;
2042        int journal_data = ext4_should_journal_data(inode);
2043        sector_t pblock = 0, cur_logical = 0;
2044        struct ext4_io_submit io_submit;
2045
2046        BUG_ON(mpd->next_page <= mpd->first_page);
2047        memset(&io_submit, 0, sizeof(io_submit));
2048        /*
2049         * We need to start from the first_page to the next_page - 1
2050         * to make sure we also write the mapped dirty buffer_heads.
2051         * If we look at mpd->b_blocknr we would only be looking
2052         * at the currently mapped buffer_heads.
2053         */
2054        index = mpd->first_page;
2055        end = mpd->next_page - 1;
2056
2057        pagevec_init(&pvec, 0);
2058        while (index <= end) {
2059                nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
2060                if (nr_pages == 0)
2061                        break;
2062                for (i = 0; i < nr_pages; i++) {
2063                        int commit_write = 0, redirty_page = 0;
2064                        struct page *page = pvec.pages[i];
2065
2066                        index = page->index;
2067                        if (index > end)
2068                                break;
2069
2070                        if (index == size >> PAGE_CACHE_SHIFT)
2071                                len = size & ~PAGE_CACHE_MASK;
2072                        else
2073                                len = PAGE_CACHE_SIZE;
2074                        if (map) {
2075                                cur_logical = index << (PAGE_CACHE_SHIFT -
2076                                                        inode->i_blkbits);
2077                                pblock = map->m_pblk + (cur_logical -
2078                                                        map->m_lblk);
2079                        }
2080                        index++;
2081
2082                        BUG_ON(!PageLocked(page));
2083                        BUG_ON(PageWriteback(page));
2084
2085                        /*
2086                         * If the page does not have buffers (for
2087                         * whatever reason), try to create them using
2088                         * __block_write_begin.  If this fails,
2089                         * redirty the page and move on.
2090                         */
2091                        if (!page_has_buffers(page)) {
2092                                if (__block_write_begin(page, 0, len,
2093                                                noalloc_get_block_write)) {
2094                                redirty_page:
2095                                        redirty_page_for_writepage(mpd->wbc,
2096                                                                   page);
2097                                        unlock_page(page);
2098                                        continue;
2099                                }
2100                                commit_write = 1;
2101                        }
2102
2103                        bh = page_bufs = page_buffers(page);
2104                        block_start = 0;
2105                        do {
2106                                if (!bh)
2107                                        goto redirty_page;
2108                                if (map && (cur_logical >= map->m_lblk) &&
2109                                    (cur_logical <= (map->m_lblk +
2110                                                     (map->m_len - 1)))) {
2111                                        if (buffer_delay(bh)) {
2112                                                clear_buffer_delay(bh);
2113                                                bh->b_blocknr = pblock;
2114                                        }
2115                                        if (buffer_unwritten(bh) ||
2116                                            buffer_mapped(bh))
2117                                                BUG_ON(bh->b_blocknr != pblock);
2118                                        if (map->m_flags & EXT4_MAP_UNINIT)
2119                                                set_buffer_uninit(bh);
2120                                        clear_buffer_unwritten(bh);
2121                                }
2122
2123                                /* redirty page if block allocation undone */
2124                                if (buffer_delay(bh) || buffer_unwritten(bh))
2125                                        redirty_page = 1;
2126                                bh = bh->b_this_page;
2127                                block_start += bh->b_size;
2128                                cur_logical++;
2129                                pblock++;
2130                        } while (bh != page_bufs);
2131
2132                        if (redirty_page)
2133                                goto redirty_page;
2134
2135                        if (commit_write)
2136                                /* mark the buffer_heads as dirty & uptodate */
2137                                block_commit_write(page, 0, len);
2138
2139                        /*
2140                         * Delalloc doesn't support data journalling,
2141                         * but eventually maybe we'll lift this
2142                         * restriction.
2143                         */
2144                        if (unlikely(journal_data && PageChecked(page)))
2145                                err = __ext4_journalled_writepage(page, len);
2146                        else if (test_opt(inode->i_sb, MBLK_IO_SUBMIT))
2147                                err = ext4_bio_write_page(&io_submit, page,
2148                                                          len, mpd->wbc);
2149                        else
2150                                err = block_write_full_page(page,
2151                                        noalloc_get_block_write, mpd->wbc);
2152
2153                        if (!err)
2154                                mpd->pages_written++;
2155                        /*
2156                         * In the error case, we have to continue because
2157                         * the remaining pages are still locked.
2158                         */
2159                        if (ret == 0)
2160                                ret = err;
2161                }
2162                pagevec_release(&pvec);
2163        }
2164        ext4_io_submit(&io_submit);
2165        return ret;
2166}
2167
2168static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
2169                                        sector_t logical, long blk_cnt)
2170{
2171        int nr_pages, i;
2172        pgoff_t index, end;
2173        struct pagevec pvec;
2174        struct inode *inode = mpd->inode;
2175        struct address_space *mapping = inode->i_mapping;
2176
2177        index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
2178        end   = (logical + blk_cnt - 1) >>
2179                                (PAGE_CACHE_SHIFT - inode->i_blkbits);
2180        while (index <= end) {
2181                nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
2182                if (nr_pages == 0)
2183                        break;
2184                for (i = 0; i < nr_pages; i++) {
2185                        struct page *page = pvec.pages[i];
2186                        if (page->index > end)
2187                                break;
2188                        BUG_ON(!PageLocked(page));
2189                        BUG_ON(PageWriteback(page));
2190                        block_invalidatepage(page, 0);
2191                        ClearPageUptodate(page);
2192                        unlock_page(page);
2193                }
2194                index = pvec.pages[nr_pages - 1]->index + 1;
2195                pagevec_release(&pvec);
2196        }
2197        return;
2198}
2199
2200static void ext4_print_free_blocks(struct inode *inode)
2201{
2202        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2203        printk(KERN_CRIT "Total free blocks count %lld\n",
2204               ext4_count_free_blocks(inode->i_sb));
2205        printk(KERN_CRIT "Free/Dirty block details\n");
2206        printk(KERN_CRIT "free_blocks=%lld\n",
2207               (long long) percpu_counter_sum(&sbi->s_freeblocks_counter));
2208        printk(KERN_CRIT "dirty_blocks=%lld\n",
2209               (long long) percpu_counter_sum(&sbi->s_dirtyblocks_counter));
2210        printk(KERN_CRIT "Block reservation details\n");
2211        printk(KERN_CRIT "i_reserved_data_blocks=%u\n",
2212               EXT4_I(inode)->i_reserved_data_blocks);
2213        printk(KERN_CRIT "i_reserved_meta_blocks=%u\n",
2214               EXT4_I(inode)->i_reserved_meta_blocks);
2215        return;
2216}
2217
2218/*
2219 * mpage_da_map_and_submit - go through the given space, map it
2220 *       if necessary, and then submit it for I/O
2221 *
2222 * @mpd - bh describing space
2223 *
2224 * The function skips space we know is already mapped to disk blocks.
2225 *
2226 */
2227static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
2228{
2229        int err, blks, get_blocks_flags;
2230        struct ext4_map_blocks map, *mapp = NULL;
2231        sector_t next = mpd->b_blocknr;
2232        unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits;
2233        loff_t disksize = EXT4_I(mpd->inode)->i_disksize;
2234        handle_t *handle = NULL;
2235
2236        /*
2237         * If the blocks are mapped already, or we couldn't accumulate
2238         * any blocks, then proceed immediately to the submission stage.
2239         */
2240        if ((mpd->b_size == 0) ||
2241            ((mpd->b_state  & (1 << BH_Mapped)) &&
2242             !(mpd->b_state & (1 << BH_Delay)) &&
2243             !(mpd->b_state & (1 << BH_Unwritten))))
2244                goto submit_io;
2245
2246        handle = ext4_journal_current_handle();
2247        BUG_ON(!handle);
2248
2249        /*
2250         * Call ext4_map_blocks() to allocate any delayed allocation
2251         * blocks, or to convert an uninitialized extent to be
2252         * initialized (in the case where we have written into
2253         * one or more preallocated blocks).
2254         *
2255         * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE to
2256         * indicate that we are on the delayed allocation path.  This
2257         * affects functions in many different parts of the allocation
2258         * call path.  This flag exists primarily because we don't
2259         * want to change *many* call functions, so ext4_map_blocks()
2260         * will set the EXT4_STATE_DELALLOC_RESERVED flag once the
2261         * inode's allocation semaphore is taken.
2262         *
2263 * If the blocks in question were delalloc blocks, set
2264         * EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting
2265         * variables are updated after the blocks have been allocated.
2266         */
2267        map.m_lblk = next;
2268        map.m_len = max_blocks;
2269        get_blocks_flags = EXT4_GET_BLOCKS_CREATE;
2270        if (ext4_should_dioread_nolock(mpd->inode))
2271                get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
2272        if (mpd->b_state & (1 << BH_Delay))
2273                get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
2274
2275        blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags);
2276        if (blks < 0) {
2277                struct super_block *sb = mpd->inode->i_sb;
2278
2279                err = blks;
2280                /*
2281                 * If get_block returns EAGAIN or ENOSPC and there
2282                 * appear to be free blocks, we will call
2283                 * ext4_writepage() for all of the pages, which will
2284                 * just redirty them.
2285                 */
2286                if (err == -EAGAIN)
2287                        goto submit_io;
2288
2289                if (err == -ENOSPC &&
2290                    ext4_count_free_blocks(sb)) {
2291                        mpd->retval = err;
2292                        goto submit_io;
2293                }
2294
2295                /*
2296                 * A get_block failure will cause us to loop in
2297                 * writepages, because a_ops->writepage won't be able
2298                 * to make progress. The page will be redirtied by
2299                 * writepage and writepages will again try to write
2300                 * it.
2301                 */
2302                if (!(EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) {
2303                        ext4_msg(sb, KERN_CRIT,
2304                                 "delayed block allocation failed for inode %lu "
2305                                 "at logical offset %llu with max blocks %zd "
2306                                 "with error %d", mpd->inode->i_ino,
2307                                 (unsigned long long) next,
2308                                 mpd->b_size >> mpd->inode->i_blkbits, err);
2309                        ext4_msg(sb, KERN_CRIT,
2310                                "This should not happen!! Data will be lost\n");
2311                        if (err == -ENOSPC)
2312                                ext4_print_free_blocks(mpd->inode);
2313                }
2314                /* invalidate all the pages */
2315                ext4_da_block_invalidatepages(mpd, next,
2316                                mpd->b_size >> mpd->inode->i_blkbits);
2317                return;
2318        }
2319        BUG_ON(blks == 0);
2320
2321        mapp = &map;
2322        if (map.m_flags & EXT4_MAP_NEW) {
2323                struct block_device *bdev = mpd->inode->i_sb->s_bdev;
2324                int i;
2325
2326                for (i = 0; i < map.m_len; i++)
2327                        unmap_underlying_metadata(bdev, map.m_pblk + i);
2328        }
2329
2330        if (ext4_should_order_data(mpd->inode)) {
2331                err = ext4_jbd2_file_inode(handle, mpd->inode);
2332                if (err)
2333                        /* This only happens if the journal is aborted */
2334                        return;
2335        }
2336
2337        /*
2338         * Update on-disk size along with block allocation.
2339         */
2340        disksize = ((loff_t) next + blks) << mpd->inode->i_blkbits;
2341        if (disksize > i_size_read(mpd->inode))
2342                disksize = i_size_read(mpd->inode);
2343        if (disksize > EXT4_I(mpd->inode)->i_disksize) {
2344                ext4_update_i_disksize(mpd->inode, disksize);
2345                err = ext4_mark_inode_dirty(handle, mpd->inode);
2346                if (err)
2347                        ext4_error(mpd->inode->i_sb,
2348                                   "Failed to mark inode %lu dirty",
2349                                   mpd->inode->i_ino);
2350        }
2351
2352submit_io:
2353        mpage_da_submit_io(mpd, mapp);
2354        mpd->io_done = 1;
2355}
2356
2357#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \
2358                (1 << BH_Delay) | (1 << BH_Unwritten))
2359
2360/*
2361 * mpage_add_bh_to_extent - try to add one more block to extent of blocks
2362 *
2363 * @mpd->lbh - extent of blocks
2364 * @logical - logical number of the block in the file
2365 * @bh - bh of the block (used to access block's state)
2366 *
2367 * The function is used to collect contiguous blocks in the same state.
2368 */
2369static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
2370                                   sector_t logical, size_t b_size,
2371                                   unsigned long b_state)
2372{
2373        sector_t next;
2374        int nrblocks = mpd->b_size >> mpd->inode->i_blkbits;
2375
2376        /*
2377         * XXX Don't go larger than mballoc is willing to allocate.
2378         * This is a stopgap solution.  We eventually need to fold
2379         * mpage_da_submit_io() into this function and then call
2380         * ext4_map_blocks() multiple times in a loop
2381         */
2382        if (nrblocks >= 8*1024*1024/mpd->inode->i_sb->s_blocksize)
2383                goto flush_it;
2384
2385        /* Check if the reserved journal credits might overflow */
2386        if (!(ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS))) {
2387                if (nrblocks >= EXT4_MAX_TRANS_DATA) {
2388                        /*
2389                         * With the non-extent format we are limited by the
2390                         * journal credits available.  The total credit needed
2391                         * to insert nrblocks contiguous blocks depends on
2392                         * nrblocks, so limit nrblocks.
2393                         */
2394                        goto flush_it;
2395                } else if ((nrblocks + (b_size >> mpd->inode->i_blkbits)) >
2396                                EXT4_MAX_TRANS_DATA) {
2397                        /*
2398                         * Adding the new buffer_head would make it cross the
2399                         * allowed limit for which we have journal credit
2400                         * reserved. So limit the new bh->b_size
2401                         */
2402                        b_size = (EXT4_MAX_TRANS_DATA - nrblocks) <<
2403                                                mpd->inode->i_blkbits;
2404                        /* we will do mpage_da_submit_io in the next loop */
2405                }
2406        }
2407        /*
2408         * First block in the extent
2409         */
2410        if (mpd->b_size == 0) {
2411                mpd->b_blocknr = logical;
2412                mpd->b_size = b_size;
2413                mpd->b_state = b_state & BH_FLAGS;
2414                return;
2415        }
2416
2417        next = mpd->b_blocknr + nrblocks;
2418        /*
2419         * Can we merge the block to our big extent?
2420         */
2421        if (logical == next && (b_state & BH_FLAGS) == mpd->b_state) {
2422                mpd->b_size += b_size;
2423                return;
2424        }
2425
2426flush_it:
2427        /*
2428         * We couldn't merge the block into our extent, so we
2429         * need to flush the current extent and start a new one.
2430         */
2431        mpage_da_map_and_submit(mpd);
2432        return;
2433}
2434
2435static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh)
2436{
2437        return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh);
2438}
2439
2440/*
2441 * __mpage_da_writepage - finds extent of pages and blocks
2442 *
2443 * @page: page to consider
2444 * @wbc: not used, we just follow rules
2445 * @data: context
2446 *
2447 * The function finds extents of pages and scans them for all blocks.
2448 */
2449static int __mpage_da_writepage(struct page *page,
2450                                struct writeback_control *wbc,
2451                                struct mpage_da_data *mpd)
2452{
2453        struct inode *inode = mpd->inode;
2454        struct buffer_head *bh, *head;
2455        sector_t logical;
2456
2457        /*
2458         * Can we merge this page to current extent?
2459         */
2460        if (mpd->next_page != page->index) {
2461                /*
2462                 * Nope, we can't. So, we map non-allocated blocks
2463                 * and start IO on them
2464                 */
2465                if (mpd->next_page != mpd->first_page) {
2466                        mpage_da_map_and_submit(mpd);
2467                        /*
2468                         * Skip the rest of the pages in the page_vec.
2469                         */
2470                        redirty_page_for_writepage(wbc, page);
2471                        unlock_page(page);
2472                        return MPAGE_DA_EXTENT_TAIL;
2473                }
2474
2475                /*
2476                 * Start next extent of pages ...
2477                 */
2478                mpd->first_page = page->index;
2479
2480                /*
2481                 * ... and blocks
2482                 */
2483                mpd->b_size = 0;
2484                mpd->b_state = 0;
2485                mpd->b_blocknr = 0;
2486        }
2487
2488        mpd->next_page = page->index + 1;
2489        logical = (sector_t) page->index <<
2490                  (PAGE_CACHE_SHIFT - inode->i_blkbits);
2491
2492        if (!page_has_buffers(page)) {
2493                mpage_add_bh_to_extent(mpd, logical, PAGE_CACHE_SIZE,
2494                                       (1 << BH_Dirty) | (1 << BH_Uptodate));
2495                if (mpd->io_done)
2496                        return MPAGE_DA_EXTENT_TAIL;
2497        } else {
2498                /*
2499                 * Page with regular buffer heads, just add all dirty ones
2500                 */
2501                head = page_buffers(page);
2502                bh = head;
2503                do {
2504                        BUG_ON(buffer_locked(bh));
2505                        /*
2506                         * We need to try to allocate
2507                         * unmapped blocks in the same page.
2508                         * Otherwise we won't make progress
2509                         * with the page in ext4_writepage
2510                         */
2511                        if (ext4_bh_delay_or_unwritten(NULL, bh)) {
2512                                mpage_add_bh_to_extent(mpd, logical,
2513                                                       bh->b_size,
2514                                                       bh->b_state);
2515                                if (mpd->io_done)
2516                                        return MPAGE_DA_EXTENT_TAIL;
2517                        } else if (buffer_dirty(bh) && (buffer_mapped(bh))) {
2518                                /*
2519                                 * mapped dirty buffer. We need to update
2520                                 * the b_state because we look at
2521                                 * b_state in mpage_da_map_blocks. We don't
2522                                 * update b_size because if we find an
2523                                 * unmapped buffer_head later we need to
2524                                 * use the b_state flag of that buffer_head.
2525                                 */
2526                                if (mpd->b_size == 0)
2527                                        mpd->b_state = bh->b_state & BH_FLAGS;
2528                        }
2529                        logical++;
2530                } while ((bh = bh->b_this_page) != head);
2531        }
2532
2533        return 0;
2534}
2535
2536/*
2537 * This is a special get_block_t callback which is used by
2538 * ext4_da_write_begin().  It will either return mapped block or
2539 * reserve space for a single block.
2540 *
2541 * For delayed buffer_head we have BH_Mapped, BH_New, BH_Delay set.
2542 * We also have b_blocknr = -1 and b_bdev initialized properly
2543 *
2544 * For unwritten buffer_head we have BH_Mapped, BH_New, BH_Unwritten set.
2545 * We also have b_blocknr = physicalblock mapping unwritten extent and b_bdev
2546 * initialized properly.
2547 */
2548static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
2549                                  struct buffer_head *bh, int create)
2550{
2551        struct ext4_map_blocks map;
2552        int ret = 0;
2553        sector_t invalid_block = ~((sector_t) 0xffff);
2554
2555        if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es))
2556                invalid_block = ~0;
2557
2558        BUG_ON(create == 0);
2559        BUG_ON(bh->b_size != inode->i_sb->s_blocksize);
2560
2561        map.m_lblk = iblock;
2562        map.m_len = 1;
2563
2564        /*
2565         * First, we need to know whether the block is already allocated;
2566         * preallocated blocks are unmapped but should be treated
2567         * the same as allocated blocks.
2568         */
2569        ret = ext4_map_blocks(NULL, inode, &map, 0);
2570        if (ret < 0)
2571                return ret;
2572        if (ret == 0) {
2573                if (buffer_delay(bh))
2574                        return 0; /* Not sure this could or should happen */
2575                /*
2576                 * XXX: __block_write_begin() unmaps passed block, is it OK?
2577                 */
2578                ret = ext4_da_reserve_space(inode, iblock);
2579                if (ret)
2580                        /* not enough space to reserve */
2581                        return ret;
2582
2583                map_bh(bh, inode->i_sb, invalid_block);
2584                set_buffer_new(bh);
2585                set_buffer_delay(bh);
2586                return 0;
2587        }
2588
2589        map_bh(bh, inode->i_sb, map.m_pblk);
2590        bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
2591
2592        if (buffer_unwritten(bh)) {
2593                /* A delayed write to unwritten bh should be marked
2594                 * new and mapped.  Mapped ensures that we don't do
2595                 * get_block multiple times when we write to the same
2596                 * offset and new ensures that we do proper zero out
2597                 * for partial write.
2598                 */
2599                set_buffer_new(bh);
2600                set_buffer_mapped(bh);
2601        }
2602        return 0;
2603}
2604
2605/*
2606 * This function is used as a standard get_block_t callback function
2607 * when there is no desire to allocate any blocks.  It is used as a
2608 * callback function for block_write_begin() and block_write_full_page().
2609 * These functions should only try to map a single block at a time.
2610 *
2611 * Since this function doesn't do block allocations even if the caller
2612 * requests it by passing in create=1, it is critically important that
2613 * any caller checks to make sure that any buffer heads returned
2614 * by this function are either all already mapped or marked for
2615 * delayed allocation before calling block_write_full_page().  Otherwise,
2616 * b_blocknr could be left uninitialized, and the page write functions will
2617 * be taken by surprise.
2618 */
2619static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
2620                                   struct buffer_head *bh_result, int create)
2621{
2622        BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
2623        return _ext4_get_block(inode, iblock, bh_result, 0);
2624}
2625
2626static int bget_one(handle_t *handle, struct buffer_head *bh)
2627{
2628        get_bh(bh);
2629        return 0;
2630}
2631
2632static int bput_one(handle_t *handle, struct buffer_head *bh)
2633{
2634        put_bh(bh);
2635        return 0;
2636}
2637
2638static int __ext4_journalled_writepage(struct page *page,
2639                                       unsigned int len)
2640{
2641        struct address_space *mapping = page->mapping;
2642        struct inode *inode = mapping->host;
2643        struct buffer_head *page_bufs;
2644        handle_t *handle = NULL;
2645        int ret = 0;
2646        int err;
2647
2648        ClearPageChecked(page);
2649        page_bufs = page_buffers(page);
2650        BUG_ON(!page_bufs);
2651        walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one);
2652        /* As soon as we unlock the page, it can go away, but we have
2653         * references to buffers so we are safe */
2654        unlock_page(page);
2655
2656        handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
2657        if (IS_ERR(handle)) {
2658                ret = PTR_ERR(handle);
2659                goto out;
2660        }
2661
2662        ret = walk_page_buffers(handle, page_bufs, 0, len, NULL,
2663                                do_journal_get_write_access);
2664
2665        err = walk_page_buffers(handle, page_bufs, 0, len, NULL,
2666                                write_end_fn);
2667        if (ret == 0)
2668                ret = err;
2669        err = ext4_journal_stop(handle);
2670        if (!ret)
2671                ret = err;
2672
2673        walk_page_buffers(handle, page_bufs, 0, len, NULL, bput_one);
2674        ext4_set_inode_state(inode, EXT4_STATE_JDATA);
2675out:
2676        return ret;
2677}
2678
2679static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode);
2680static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
2681
2682/*
2683 * Note that we don't need to start a transaction unless we're journaling data
2684 * because we should have holes filled from ext4_page_mkwrite(). We don't even
2685 * need to add the inode to the transaction's list in ordered mode because if
2686 * we are writing back data added by write(), the inode is already there and if
2687 * we are writing back data modified via mmap(), no one guarantees in which
2688 * transaction the data will hit the disk. In case we are journaling data, we
2689 * cannot start transaction directly because transaction start ranks above page
2690 * lock so we have to do some magic.
2691 *
2692 * This function can get called via...
2693 *   - ext4_da_writepages after taking page lock (have journal handle)
2694 *   - journal_submit_inode_data_buffers (no journal handle)
2695 *   - shrink_page_list via pdflush (no journal handle)
2696 *   - grab_page_cache when doing write_begin (have journal handle)
2697 *
2698 * We don't do any block allocation in this function. If we have a page with
2699 * multiple blocks we need to write those buffer_heads that are mapped. This
2700 * is important for mmap-based writes. So if we do, with a blocksize of 1K,
2701 * truncate(f, 1024);
2702 * a = mmap(f, 0, 4096);
2703 * a[0] = 'a';
2704 * truncate(f, 4096);
2705 * we have the first buffer_head in the page mapped via the page_mkwrite
2706 * callback, but the other buffer_heads would be unmapped but dirty (dirtied
2707 * via do_wp_page). So writepage should write the first block. If we modify
2708 * the mmap area beyond 1024 we will again get a page fault and the
2709 * page_mkwrite callback will do the block allocation and mark the
2710 * buffer_heads mapped.
2711 *
2712 * We redirty the page if it has any buffer_heads that are either delayed
2713 * or unwritten.
2714 *
2715 * We can get recursively called as shown below.
2716 *
2717 *      ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
2718 *              ext4_writepage()
2719 *
2720 * But since we don't do any block allocation we should not deadlock.
2721 * The page also has its dirty flag cleared, so we don't take the page lock recursively.
2722 */
2723static int ext4_writepage(struct page *page,
2724                          struct writeback_control *wbc)
2725{
2726        int ret = 0, commit_write = 0;
2727        loff_t size;
2728        unsigned int len;
2729        struct buffer_head *page_bufs = NULL;
2730        struct inode *inode = page->mapping->host;
2731
2732        trace_ext4_writepage(inode, page);
2733        size = i_size_read(inode);
2734        if (page->index == size >> PAGE_CACHE_SHIFT)
2735                len = size & ~PAGE_CACHE_MASK;
2736        else
2737                len = PAGE_CACHE_SIZE;
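        /*
         * For illustration (assuming 4K pages): with i_size = 10000, the
         * page at index 2 is the last one containing data (file bytes
         * 8192..9999), so len = 10000 & (PAGE_CACHE_SIZE - 1) = 1808;
         * every earlier page gets the full PAGE_CACHE_SIZE.
         */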
2738
2739        /*
2740         * If the page does not have buffers (for whatever reason),
2741         * try to create them using __block_write_begin.  If this
2742         * fails, redirty the page and move on.
2743         */
2744        if (!page_has_buffers(page)) {
2745                if (__block_write_begin(page, 0, len,
2746                                        noalloc_get_block_write)) {
2747                redirty_page:
2748                        redirty_page_for_writepage(wbc, page);
2749                        unlock_page(page);
2750                        return 0;
2751                }
2752                commit_write = 1;
2753        }
2754        page_bufs = page_buffers(page);
2755        if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
2756                              ext4_bh_delay_or_unwritten)) {
2757                /*
2758                 * We don't want to do block allocation, so redirty
2759                 * the page and return.  We may reach here when we do
2760                 * a journal commit via journal_submit_inode_data_buffers.
2761                 * We can also reach here via shrink_page_list
2762                 */
2763                goto redirty_page;
2764        }
2765        if (commit_write)
2766                /* now mark the buffer_heads as dirty and uptodate */
2767                block_commit_write(page, 0, len);
2768
2769        if (PageChecked(page) && ext4_should_journal_data(inode))
2770                /*
2771                 * It's mmapped pagecache.  Add buffers and journal it.  There
2772                 * doesn't seem much point in redirtying the page here.
2773                 */
2774                return __ext4_journalled_writepage(page, len);
2775
2776        if (buffer_uninit(page_bufs)) {
2777                ext4_set_bh_endio(page_bufs, inode);
2778                ret = block_write_full_page_endio(page, noalloc_get_block_write,
2779                                            wbc, ext4_end_io_buffer_write);
2780        } else
2781                ret = block_write_full_page(page, noalloc_get_block_write,
2782                                            wbc);
2783
2784        return ret;
2785}
2786
2787/*
2788 * This is called via ext4_da_writepages() to
2789 * calculate the total number of credits to reserve to fit
2790 * a single extent allocation into a single transaction;
2791 * ext4_da_writepages() will loop calling this before
2792 * the block allocation.
2793 */
2794
2795static int ext4_da_writepages_trans_blocks(struct inode *inode)
2796{
2797        int max_blocks = EXT4_I(inode)->i_reserved_data_blocks;
2798
2799        /*
2800         * With the non-extent format, the journal credit needed to
2801         * insert nrblocks contiguous blocks depends on the
2802         * number of contiguous blocks. So we limit the
2803         * number of contiguous blocks to a sane value.
2804         */
2805        if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) &&
2806            (max_blocks > EXT4_MAX_TRANS_DATA))
2807                max_blocks = EXT4_MAX_TRANS_DATA;
2808
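        /*
         * The value returned here becomes the credit request passed to
         * ext4_journal_start() for each transaction in the
         * ext4_da_writepages() loop below.
         */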
2809        return ext4_chunk_trans_blocks(inode, max_blocks);
2810}
2811
2812/*
2813 * write_cache_pages_da - walk the list of dirty pages of the given
2814 * address space and call the callback function (which usually writes
2815 * the pages).
2816 *
2817 * This is a forked version of write_cache_pages().  Differences:
2818 *      Range cyclic is ignored.
2819 *      no_nrwrite_index_update is always presumed true
2820 */
2821static int write_cache_pages_da(struct address_space *mapping,
2822                                struct writeback_control *wbc,
2823                                struct mpage_da_data *mpd,
2824                                pgoff_t *done_index)
2825{
2826        int ret = 0;
2827        int done = 0;
2828        struct pagevec pvec;
2829        unsigned nr_pages;
2830        pgoff_t index;
2831        pgoff_t end;            /* Inclusive */
2832        long nr_to_write = wbc->nr_to_write;
2833        int tag;
2834
2835        pagevec_init(&pvec, 0);
2836        index = wbc->range_start >> PAGE_CACHE_SHIFT;
2837        end = wbc->range_end >> PAGE_CACHE_SHIFT;
2838
2839        if (wbc->sync_mode == WB_SYNC_ALL)
2840                tag = PAGECACHE_TAG_TOWRITE;
2841        else
2842                tag = PAGECACHE_TAG_DIRTY;
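        /*
         * For WB_SYNC_ALL (data integrity) sync we only visit pages that
         * were tagged TOWRITE by tag_pages_for_writeback() in
         * ext4_da_writepages(), so pages dirtied after the sync started do
         * not keep us looping forever; background writeback just follows
         * the DIRTY tag.
         */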
2843
2844        *done_index = index;
2845        while (!done && (index <= end)) {
2846                int i;
2847
2848                nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
2849                              min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
2850                if (nr_pages == 0)
2851                        break;
2852
2853                for (i = 0; i < nr_pages; i++) {
2854                        struct page *page = pvec.pages[i];
2855
2856                        /*
2857                         * At this point, the page may be truncated or
2858                         * invalidated (changing page->mapping to NULL), or
2859                         * even swizzled back from swapper_space to tmpfs file
2860                         * mapping. However, page->index will not change
2861                         * because we have a reference on the page.
2862                         */
2863                        if (page->index > end) {
2864                                done = 1;
2865                                break;
2866                        }
2867
2868                        *done_index = page->index + 1;
2869
2870                        lock_page(page);
2871
2872                        /*
2873                         * Page truncated or invalidated. We can freely skip it
2874                         * then, even for data integrity operations: the page
2875                         * has disappeared concurrently, so there could be no
2876                         * real expectation of this data integrity operation
2877                         * even if there is now a new, dirty page at the same
2878                         * pagecache address.
2879                         */
2880                        if (unlikely(page->mapping != mapping)) {
2881continue_unlock:
2882                                unlock_page(page);
2883                                continue;
2884                        }
2885
2886                        if (!PageDirty(page)) {
2887                                /* someone wrote it for us */
2888                                goto continue_unlock;
2889                        }
2890
2891                        if (PageWriteback(page)) {
2892                                if (wbc->sync_mode != WB_SYNC_NONE)
2893                                        wait_on_page_writeback(page);
2894                                else
2895                                        goto continue_unlock;
2896                        }
2897
2898                        BUG_ON(PageWriteback(page));
2899                        if (!clear_page_dirty_for_io(page))
2900                                goto continue_unlock;
2901
2902                        ret = __mpage_da_writepage(page, wbc, mpd);
2903                        if (unlikely(ret)) {
2904                                if (ret == AOP_WRITEPAGE_ACTIVATE) {
2905                                        unlock_page(page);
2906                                        ret = 0;
2907                                } else {
2908                                        done = 1;
2909                                        break;
2910                                }
2911                        }
2912
2913                        if (nr_to_write > 0) {
2914                                nr_to_write--;
2915                                if (nr_to_write == 0 &&
2916                                    wbc->sync_mode == WB_SYNC_NONE) {
2917                                        /*
2918                                         * We stop writing back only if we are
2919                                         * not doing integrity sync. In case of
2920                                         * integrity sync we have to keep going
2921                                         * because someone may be concurrently
2922                                         * dirtying pages, and we might have
2923                                         * synced a lot of newly appeared dirty
2924                                         * pages, but have not synced all of the
2925                                         * old dirty pages.
2926                                         */
2927                                        done = 1;
2928                                        break;
2929                                }
2930                        }
2931                }
2932                pagevec_release(&pvec);
2933                cond_resched();
2934        }
2935        return ret;
2936}
2937
2938
2939static int ext4_da_writepages(struct address_space *mapping,
2940                              struct writeback_control *wbc)
2941{
2942        pgoff_t index;
2943        int range_whole = 0;
2944        handle_t *handle = NULL;
2945        struct mpage_da_data mpd;
2946        struct inode *inode = mapping->host;
2947        int pages_written = 0;
2948        long pages_skipped;
2949        unsigned int max_pages;
2950        int range_cyclic, cycled = 1, io_done = 0;
2951        int needed_blocks, ret = 0;
2952        long desired_nr_to_write, nr_to_writebump = 0;
2953        loff_t range_start = wbc->range_start;
2954        struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
2955        pgoff_t done_index = 0;
2956        pgoff_t end;
2957
2958        trace_ext4_da_writepages(inode, wbc);
2959
2960        /*
2961         * No pages to write? This is mainly a kludge to avoid starting
2962         * a transaction for special inodes like journal inode on last iput()
2963         * because that could violate lock ordering on umount
2964         */
2965        if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
2966                return 0;
2967
2968        /*
2969         * If the filesystem has aborted, it is read-only, so return
2970         * right away instead of dumping stack traces later on that
2971         * will obscure the real source of the problem.  We test
2972         * EXT4_MF_FS_ABORTED instead of sb->s_flag's MS_RDONLY because
2973         * the latter could be true if the filesystem is mounted
2974         * read-only, and in that case, ext4_da_writepages should
2975         * *never* be called, so if that ever happens, we would want
2976         * the stack trace.
2977         */
2978        if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED))
2979                return -EROFS;
2980
2981        if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
2982                range_whole = 1;
2983
2984        range_cyclic = wbc->range_cyclic;
2985        if (wbc->range_cyclic) {
2986                index = mapping->writeback_index;
2987                if (index)
2988                        cycled = 0;
2989                wbc->range_start = index << PAGE_CACHE_SHIFT;
2990                wbc->range_end  = LLONG_MAX;
2991                wbc->range_cyclic = 0;
2992                end = -1;
2993        } else {
2994                index = wbc->range_start >> PAGE_CACHE_SHIFT;
2995                end = wbc->range_end >> PAGE_CACHE_SHIFT;
2996        }
2997
2998        /*
2999         * This works around two forms of stupidity.  The first is in
3000         * the writeback code, which caps the maximum number of pages
3001         * written to be 1024 pages.  This is wrong on multiple
3002         * levels; different architectures have a different page size,
3003         * which changes the maximum amount of data which gets
3004         * written.  Secondly, 4 megabytes is way too small.  XFS
3005         * forces this value to be 16 megabytes by multiplying
3006         * nr_to_write parameter by four, and then relies on its
3007         * allocator to allocate larger extents to make them
3008         * contiguous.  Unfortunately this brings us to the second
3009         * stupidity, which is that ext4's mballoc code only allocates
3010         * at most 2048 blocks.  So we force contiguous writes up to
3011         * the number of dirty blocks in the inode, or
3012         * sbi->s_max_writeback_mb_bump, whichever is smaller.
3013         */
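        /*
         * s_max_writeback_mb_bump is expressed in megabytes; shifting by
         * (20 - PAGE_CACHE_SHIFT) below converts it into pages.  For
         * illustration, with 4K pages a 128 MB bump works out to
         * 128 << 8 = 32768 pages.
         */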
3014        max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT);
3015        if (!range_cyclic && range_whole) {
3016                if (wbc->nr_to_write == LONG_MAX)
3017                        desired_nr_to_write = wbc->nr_to_write;
3018                else
3019                        desired_nr_to_write = wbc->nr_to_write * 8;
3020        } else
3021                desired_nr_to_write = ext4_num_dirty_pages(inode, index,
3022                                                           max_pages);
3023        if (desired_nr_to_write > max_pages)
3024                desired_nr_to_write = max_pages;
3025
3026        if (wbc->nr_to_write < desired_nr_to_write) {
3027                nr_to_writebump = desired_nr_to_write - wbc->nr_to_write;
3028                wbc->nr_to_write = desired_nr_to_write;
3029        }
3030
3031        mpd.wbc = wbc;
3032        mpd.inode = mapping->host;
3033
3034        pages_skipped = wbc->pages_skipped;
3035
3036retry:
3037        if (wbc->sync_mode == WB_SYNC_ALL)
3038                tag_pages_for_writeback(mapping, index, end);
3039
3040        while (!ret && wbc->nr_to_write > 0) {
3041
3042                /*
3043                 * We insert one extent at a time, so we need the
3044                 * credits for a single extent allocation.
3045                 * Journalled mode is currently not supported
3046                 * by delalloc.
3047                 */
3048                BUG_ON(ext4_should_journal_data(inode));
3049                needed_blocks = ext4_da_writepages_trans_blocks(inode);
3050
3051                /* start a new transaction*/
3052                handle = ext4_journal_start(inode, needed_blocks);
3053                if (IS_ERR(handle)) {
3054                        ret = PTR_ERR(handle);
3055                        ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
3056                               "%ld pages, ino %lu; err %d", __func__,
3057                                wbc->nr_to_write, inode->i_ino, ret);
3058                        goto out_writepages;
3059                }
3060
3061                /*
3062                 * Now call __mpage_da_writepage to find the next
3063                 * contiguous region of logical blocks that need
3064                 * blocks to be allocated by ext4.  We don't actually
3065                 * submit the blocks for I/O here, even though
3066                 * write_cache_pages thinks it will, and will set the
3067                 * pages as clean for write before calling
3068                 * __mpage_da_writepage().
3069                 */
3070                mpd.b_size = 0;
3071                mpd.b_state = 0;
3072                mpd.b_blocknr = 0;
3073                mpd.first_page = 0;
3074                mpd.next_page = 0;
3075                mpd.io_done = 0;
3076                mpd.pages_written = 0;
3077                mpd.retval = 0;
3078                ret = write_cache_pages_da(mapping, wbc, &mpd, &done_index);
3079                /*
3080                 * If we have a contiguous extent of pages and we
3081                 * haven't done the I/O yet, map the blocks and submit
3082                 * them for I/O.
3083                 */
3084                if (!mpd.io_done && mpd.next_page != mpd.first_page) {
3085                        mpage_da_map_and_submit(&mpd);
3086                        ret = MPAGE_DA_EXTENT_TAIL;
3087                }
3088                trace_ext4_da_write_pages(inode, &mpd);
3089                wbc->nr_to_write -= mpd.pages_written;
3090
3091                ext4_journal_stop(handle);
3092
3093                if ((mpd.retval == -ENOSPC) && sbi->s_journal) {
3094                        /* commit the transaction which would
3095                         * free blocks released in the transaction
3096                         * and try again
3097                         */
3098                        jbd2_journal_force_commit_nested(sbi->s_journal);
3099                        wbc->pages_skipped = pages_skipped;
3100                        ret = 0;
3101                } else if (ret == MPAGE_DA_EXTENT_TAIL) {
3102                        /*
3103                         * got one extent now try with
3104                         * rest of the pages
3105                         */
3106                        pages_written += mpd.pages_written;
3107                        wbc->pages_skipped = pages_skipped;
3108                        ret = 0;
3109                        io_done = 1;
3110                } else if (wbc->nr_to_write)
3111                        /*
3112                         * There is no more writeout needed,
3113                         * or we requested a non-blocking writeout
3114                         * and found the device congested.
3115                         */
3116                        break;
3117        }
3118        if (!io_done && !cycled) {
3119                cycled = 1;
3120                index = 0;
3121                wbc->range_start = index << PAGE_CACHE_SHIFT;
3122                wbc->range_end  = mapping->writeback_index - 1;
3123                goto retry;
3124        }
3125        if (pages_skipped != wbc->pages_skipped)
3126                ext4_msg(inode->i_sb, KERN_CRIT,
3127                         "This should not happen leaving %s "
3128                         "with nr_to_write = %ld ret = %d",
3129                         __func__, wbc->nr_to_write, ret);
3130
3131        /* Update index */
3132        wbc->range_cyclic = range_cyclic;
3133        if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
3134                /*
3135                 * set the writeback_index so that range_cyclic
3136                 * mode will write it back later
3137                 */
3138                mapping->writeback_index = done_index;
3139
3140out_writepages:
3141        wbc->nr_to_write -= nr_to_writebump;
3142        wbc->range_start = range_start;
3143        trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
3144        return ret;
3145}
3146
3147#define FALL_BACK_TO_NONDELALLOC 1
3148static int ext4_nonda_switch(struct super_block *sb)
3149{
3150        s64 free_blocks, dirty_blocks;
3151        struct ext4_sb_info *sbi = EXT4_SB(sb);
3152
3153        /*
3154         * Switch to non-delalloc mode if we are running low
3155         * on free blocks. The free block accounting via percpu
3156         * counters can get slightly wrong with percpu_counter_batch getting
3157         * accumulated on each CPU without updating global counters.
3158         * Delalloc needs an accurate free block accounting, so switch
3159         * to non-delalloc when we are near the error range.
3160         */
3161        free_blocks  = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
3162        dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyblocks_counter);
3163        if (2 * free_blocks < 3 * dirty_blocks ||
3164                free_blocks < (dirty_blocks + EXT4_FREEBLOCKS_WATERMARK)) {
3165                /*
3166                 * free block count is less than 150% of dirty blocks
3167                 * or free blocks are less than the watermark
3168                 */
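                /*
                 * For illustration: free_blocks = 1000 and dirty_blocks = 800
                 * trips the first test (2 * 1000 < 3 * 800), so the caller
                 * falls back to non-delalloc writes.
                 */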
3169                return 1;
3170        }
3171        /*
3172         * Even if we don't switch but are nearing capacity,
3173         * start pushing delalloc when 1/2 of free blocks are dirty.
3174         */
3175        if (free_blocks < 2 * dirty_blocks)
3176                writeback_inodes_sb_if_idle(sb);
3177
3178        return 0;
3179}
3180
3181static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
3182                               loff_t pos, unsigned len, unsigned flags,
3183                               struct page **pagep, void **fsdata)
3184{
3185        int ret, retries = 0;
3186        struct page *page;
3187        pgoff_t index;
3188        struct inode *inode = mapping->host;
3189        handle_t *handle;
3190
3191        index = pos >> PAGE_CACHE_SHIFT;
3192
3193        if (ext4_nonda_switch(inode->i_sb)) {
3194                *fsdata = (void *)FALL_BACK_TO_NONDELALLOC;
3195                return ext4_write_begin(file, mapping, pos,
3196                                        len, flags, pagep, fsdata);
3197        }
3198        *fsdata = (void *)0;
3199        trace_ext4_da_write_begin(inode, pos, len, flags);
3200retry:
3201        /*
3202         * With delayed allocation, we don't log the i_disksize update
3203         * if there is delayed block allocation. But we still need
3204         * to journal the i_disksize update if the write goes to the
3205         * end of file through an already mapped buffer.
3206         */
3207        handle = ext4_journal_start(inode, 1);
3208        if (IS_ERR(handle)) {
3209                ret = PTR_ERR(handle);
3210                goto out;
3211        }
3212        /* We cannot recurse into the filesystem as the transaction is already
3213         * started */
3214        flags |= AOP_FLAG_NOFS;
3215
3216        page = grab_cache_page_write_begin(mapping, index, flags);
3217        if (!page) {
3218                ext4_journal_stop(handle);
3219                ret = -ENOMEM;
3220                goto out;
3221        }
3222        *pagep = page;
3223
3224        ret = __block_write_begin(page, pos, len, ext4_da_get_block_prep);
3225        if (ret < 0) {
3226                unlock_page(page);
3227                ext4_journal_stop(handle);
3228                page_cache_release(page);
3229                /*
3230                 * block_write_begin may have instantiated a few blocks
3231                 * outside i_size.  Trim these off again. Don't need
3232                 * i_size_read because we hold i_mutex.
3233                 */
3234                if (pos + len > inode->i_size)
3235                        ext4_truncate_failed_write(inode);
3236        }
3237
3238        if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
3239                goto retry;
3240out:
3241        return ret;
3242}
3243
3244/*
3245 * Check if we should update i_disksize
3246 * when writing to the end of file but not requiring block allocation
3247 */
3248static int ext4_da_should_update_i_disksize(struct page *page,
3249                                            unsigned long offset)
3250{
3251        struct buffer_head *bh;
3252        struct inode *inode = page->mapping->host;
3253        unsigned int idx;
3254        int i;
3255
3256        bh = page_buffers(page);
3257        idx = offset >> inode->i_blkbits;
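        /*
         * For illustration, with 1K blocks (i_blkbits == 10): an offset of
         * 3000 within the page gives idx = 3000 >> 10 = 2, so the loop
         * below advances to the third buffer_head on the page.
         */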
3258
3259        for (i = 0; i < idx; i++)
3260                bh = bh->b_this_page;
3261
3262        if (!buffer_mapped(bh) || (buffer_delay(bh)) || buffer_unwritten(bh))
3263                return 0;
3264        return 1;
3265}
3266
3267static int ext4_da_write_end(struct file *file,
3268                             struct address_space *mapping,
3269                             loff_t pos, unsigned len, unsigned copied,
3270                             struct page *page, void *fsdata)
3271{
3272        struct inode *inode = mapping->host;
3273        int ret = 0, ret2;
3274        handle_t *handle = ext4_journal_current_handle();
3275        loff_t new_i_size;
3276        unsigned long start, end;
3277        int write_mode = (int)(unsigned long)fsdata;
3278
3279        if (write_mode == FALL_BACK_TO_NONDELALLOC) {
3280                if (ext4_should_order_data(inode)) {
3281                        return ext4_ordered_write_end(file, mapping, pos,
3282                                        len, copied, page, fsdata);
3283                } else if (ext4_should_writeback_data(inode)) {
3284                        return ext4_writeback_write_end(file, mapping, pos,
3285                                        len, copied, page, fsdata);
3286                } else {
3287                        BUG();
3288                }
3289        }
3290
3291        trace_ext4_da_write_end(inode, pos, len, copied);
3292        start = pos & (PAGE_CACHE_SIZE - 1);
3293        end = start + copied - 1;
3294
3295        /*
3296         * generic_write_end() will run mark_inode_dirty() if i_size
3297         * changes.  So let's piggyback the i_disksize mark_inode_dirty
3298         * into that.
3299         */
3300
3301        new_i_size = pos + copied;
3302        if (new_i_size > EXT4_I(inode)->i_disksize) {
3303                if (ext4_da_should_update_i_disksize(page, end)) {
3304                        down_write(&EXT4_I(inode)->i_data_sem);
3305                        if (new_i_size > EXT4_I(inode)->i_disksize) {
3306                                /*
3307                                 * Updating i_disksize when extending file
3308                                 * without needing block allocation
3309                                 */
3310                                if (ext4_should_order_data(inode))
3311                                        ret = ext4_jbd2_file_inode(handle,
3312                                                                   inode);
3313
3314                                EXT4_I(inode)->i_disksize = new_i_size;
3315                        }
3316                        up_write(&EXT4_I(inode)->i_data_sem);
3317                        /* We need to mark inode dirty even if
3318                         * new_i_size is less than inode->i_size
3319                         * but greater than i_disksize (hint: delalloc).
3320                         */
3321                        ext4_mark_inode_dirty(handle, inode);
3322                }
3323        }
3324        ret2 = generic_write_end(file, mapping, pos, len, copied,
3325                                                        page, fsdata);
3326        copied = ret2;
3327        if (ret2 < 0)
3328                ret = ret2;
3329        ret2 = ext4_journal_stop(handle);
3330        if (!ret)
3331                ret = ret2;
3332
3333        return ret ? ret : copied;
3334}
3335
3336static void ext4_da_invalidatepage(struct page *page, unsigned long offset)
3337{
3338        /*
3339         * Drop reserved blocks
3340         */
3341        BUG_ON(!PageLocked(page));
3342        if (!page_has_buffers(page))
3343                goto out;
3344
3345        ext4_da_page_release_reservation(page, offset);
3346
3347out:
3348        ext4_invalidatepage(page, offset);
3349
3350        return;
3351}
3352
3353/*
3354 * Force all delayed allocation blocks to be allocated for a given inode.
3355 */
3356int ext4_alloc_da_blocks(struct inode *inode)
3357{
3358        trace_ext4_alloc_da_blocks(inode);
3359
3360        if (!EXT4_I(inode)->i_reserved_data_blocks &&
3361            !EXT4_I(inode)->i_reserved_meta_blocks)
3362                return 0;
3363
3364        /*
3365         * We do something simple for now.  The filemap_flush() will
3366         * also start triggering a write of the data blocks, which is
3367         * not strictly speaking necessary (and for users of
3368         * laptop_mode, not even desirable).  However, to do otherwise
3369         * would require replicating code paths in:
3370         *
3371         * ext4_da_writepages() ->
3372         *    write_cache_pages() ---> (via passed in callback function)
3373         *        __mpage_da_writepage() -->
3374         *           mpage_add_bh_to_extent()
3375         *           mpage_da_map_blocks()
3376         *
3377         * The problem is that write_cache_pages(), located in
3378         * mm/page-writeback.c, marks pages clean in preparation for
3379         * doing I/O, which is not desirable if we're not planning on
3380         * doing I/O at all.
3381         *
3382         * We could call write_cache_pages(), and then redirty all of
3383         * the pages by calling redirty_page_for_writepage() but that
3384         * would be ugly in the extreme.  So instead we would need to
3385         * replicate parts of the code in the above functions,
3386         * simplifying them because we wouldn't actually intend to
3387         * write out the pages, but rather only collect contiguous
3388         * logical block extents, call the multi-block allocator, and
3389         * then update the buffer heads with the block allocations.
3390         *
3391         * For now, though, we'll cheat by calling filemap_flush(),
3392         * which will map the blocks, and start the I/O, but not
3393         * actually wait for the I/O to complete.
3394         */
3395        return filemap_flush(inode->i_mapping);
3396}
3397
3398/*
3399 * bmap() is special.  It gets used by applications such as lilo and by
3400 * the swapper to find the on-disk block of a specific piece of data.
3401 *
3402 * Naturally, this is dangerous if the block concerned is still in the
3403 * journal.  If somebody makes a swapfile on an ext4 data-journaling
3404 * filesystem and enables swap, then they may get a nasty shock when the
3405 * data getting swapped to that swapfile suddenly gets overwritten by
3406 * the original zeros written out previously to the journal and
3407 * awaiting writeback in the kernel's buffer cache.
3408 *
3409 * So, if we see any bmap calls here on a modified, data-journaled file,
3410 * take extra steps to flush any blocks which might be in the cache.
3411 */
3412static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
3413{
3414        struct inode *inode = mapping->host;
3415        journal_t *journal;
3416        int err;
3417
3418        if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) &&
3419                        test_opt(inode->i_sb, DELALLOC)) {
3420                /*
3421                 * With delalloc we want to sync the file
3422                 * so that we can make sure we allocate
3423                 * blocks for file
3424                 * blocks for the file.
3425                filemap_write_and_wait(mapping);
3426        }
3427
3428        if (EXT4_JOURNAL(inode) &&
3429            ext4_test_inode_state(inode, EXT4_STATE_JDATA)) {
3430                /*
3431                 * This is a REALLY heavyweight approach, but the use of
3432                 * bmap on dirty files is expected to be extremely rare:
3433                 * only if we run lilo or swapon on a freshly made file
3434                 * do we expect this to happen.
3435                 *
3436                 * (bmap requires CAP_SYS_RAWIO so this does not
3437                 * represent an unprivileged user DOS attack --- we'd be
3438                 * in trouble if mortal users could trigger this path at
3439                 * will.)
3440                 *
3441                 * NB. EXT4_STATE_JDATA is not set on files other than
3442                 * regular files.  If somebody wants to bmap a directory
3443                 * or symlink and gets confused because the buffer
3444                 * hasn't yet been flushed to disk, they deserve
3445                 * everything they get.
3446                 */
3447
3448                ext4_clear_inode_state(inode, EXT4_STATE_JDATA);
3449                journal = EXT4_JOURNAL(inode);
3450                jbd2_journal_lock_updates(journal);
3451                err = jbd2_journal_flush(journal);
3452                jbd2_journal_unlock_updates(journal);
3453
3454                if (err)
3455                        return 0;
3456        }
3457
3458        return generic_block_bmap(mapping, block, ext4_get_block);
3459}
3460
3461static int ext4_readpage(struct file *file, struct page *page)
3462{
3463        return mpage_readpage(page, ext4_get_block);
3464}
3465
3466static int
3467ext4_readpages(struct file *file, struct address_space *mapping,
3468                struct list_head *pages, unsigned nr_pages)
3469{
3470        return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);
3471}
3472
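/*
 * Helper for ext4_invalidatepage() on dioread_nolock inodes: walk the
 * buffers from 'offset' onwards and free any io_end structure still
 * attached via b_private, so it is not leaked when the buffers are
 * discarded.
 */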
3473static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offset)
3474{
3475        struct buffer_head *head, *bh;
3476        unsigned int curr_off = 0;
3477
3478        if (!page_has_buffers(page))
3479                return;
3480        head = bh = page_buffers(page);
3481        do {
3482                if (offset <= curr_off && test_clear_buffer_uninit(bh)
3483                                        && bh->b_private) {
3484                        ext4_free_io_end(bh->b_private);
3485                        bh->b_private = NULL;
3486                        bh->b_end_io = NULL;
3487                }
3488                curr_off = curr_off + bh->b_size;
3489                bh = bh->b_this_page;
3490        } while (bh != head);
3491}
3492
3493static void ext4_invalidatepage(struct page *page, unsigned long offset)
3494{
3495        journal_t *journal = EXT4_JOURNAL(page->mapping->host);
3496
3497        /*
3498         * free any io_end structure allocated for buffers to be discarded
3499         */
3500        if (ext4_should_dioread_nolock(page->mapping->host))
3501                ext4_invalidatepage_free_endio(page, offset);
3502        /*
3503         * If it's a full truncate we just forget about the pending dirtying
3504         */
3505        if (offset == 0)
3506                ClearPageChecked(page);
3507
3508        if (journal)
3509                jbd2_journal_invalidatepage(journal, page, offset);
3510        else
3511                block_invalidatepage(page, offset);
3512}
3513
3514static int ext4_releasepage(struct page *page, gfp_t wait)
3515{
3516        journal_t *journal = EXT4_JOURNAL(page->mapping->host);
3517
3518        WARN_ON(PageChecked(page));
3519        if (!page_has_buffers(page))
3520                return 0;
3521        if (journal)
3522                return jbd2_journal_try_to_free_buffers(journal, page, wait);
3523        else
3524                return try_to_free_buffers(page);
3525}
3526
3527/*
3528 * O_DIRECT for ext3 (or indirect map) based files
3529 *
3530 * If the O_DIRECT write will extend the file then add this inode to the
3531 * orphan list.  So recovery will truncate it back to the original size
3532 * if the machine crashes during the write.
3533 *
3534 * If the O_DIRECT write is instantiating holes inside i_size and the machine
3535 * crashes then stale disk data _may_ be exposed inside the file. But current
3536 * VFS code falls back to the buffered path in that case so we are safe.
3537 */
3538static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
3539                              const struct iovec *iov, loff_t offset,
3540                              unsigned long nr_segs)
3541{
3542        struct file *file = iocb->ki_filp;
3543        struct inode *inode = file->f_mapping->host;
3544        struct ext4_inode_info *ei = EXT4_I(inode);
3545        handle_t *handle;
3546        ssize_t ret;
3547        int orphan = 0;
3548        size_t count = iov_length(iov, nr_segs);
3549        int retries = 0;
3550
3551        if (rw == WRITE) {
3552                loff_t final_size = offset + count;
3553
3554                if (final_size > inode->i_size) {
3555                        /* Credits for sb + inode write */
3556                        handle = ext4_journal_start(inode, 2);
3557                        if (IS_ERR(handle)) {
3558                                ret = PTR_ERR(handle);
3559                                goto out;
3560                        }
3561                        ret = ext4_orphan_add(handle, inode);
3562                        if (ret) {
3563                                ext4_journal_stop(handle);
3564                                goto out;
3565                        }
3566                        orphan = 1;
3567                        ei->i_disksize = inode->i_size;
3568                        ext4_journal_stop(handle);
3569                }
3570        }
3571
3572retry:
3573        if (rw == READ && ext4_should_dioread_nolock(inode))
3574                ret = __blockdev_direct_IO(rw, iocb, inode,
3575                                 inode->i_sb->s_bdev, iov,
3576                                 offset, nr_segs,
3577                                 ext4_get_block, NULL, NULL, 0);
3578        else {
3579                ret = blockdev_direct_IO(rw, iocb, inode,
3580                                 inode->i_sb->s_bdev, iov,
3581                                 offset, nr_segs,
3582                                 ext4_get_block, NULL);
3583
3584                if (unlikely((rw & WRITE) && ret < 0)) {
3585                        loff_t isize = i_size_read(inode);
3586                        loff_t end = offset + iov_length(iov, nr_segs);
3587
3588                        if (end > isize)
3589                                vmtruncate(inode, isize);
3590                }
3591        }
3592        if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
3593                goto retry;
3594
3595        if (orphan) {
3596                int err;
3597
3598                /* Credits for sb + inode write */
3599                handle = ext4_journal_start(inode, 2);
3600                if (IS_ERR(handle)) {
3601                        /* This is really bad luck. We've written the data
3602                         * but cannot extend i_size. Bail out and pretend
3603                         * the write failed... */
3604                        ret = PTR_ERR(handle);
3605                        if (inode->i_nlink)
3606                                ext4_orphan_del(NULL, inode);
3607
3608                        goto out;
3609                }
3610                if (inode->i_nlink)
3611                        ext4_orphan_del(handle, inode);
3612                if (ret > 0) {
3613                        loff_t end = offset + ret;
3614                        if (end > inode->i_size) {
3615                                ei->i_disksize = end;
3616                                i_size_write(inode, end);
3617                                /*
3618                                 * We're going to return a positive `ret'
3619                                 * here due to non-zero-length I/O, so there's
3620                                 * no way of reporting error returns from
3621                                 * ext4_mark_inode_dirty() to userspace.  So
3622                                 * ignore it.
3623                                 */
3624                                ext4_mark_inode_dirty(handle, inode);
3625                        }
3626                }
3627                err = ext4_journal_stop(handle);
3628                if (ret == 0)
3629                        ret = err;
3630        }
3631out:
3632        return ret;
3633}
3634
3635/*
3636 * ext4_get_block used when preparing for a DIO write or buffer write.
3637 * We allocate an uninitialized extent if blocks haven't been allocated.
3638 * The extent will be converted to initialized after the IO is complete.
3639 */
3640static int ext4_get_block_write(struct inode *inode, sector_t iblock,
3641                   struct buffer_head *bh_result, int create)
3642{
3643        ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n",
3644                   inode->i_ino, create);
3645        return _ext4_get_block(inode, iblock, bh_result,
3646                               EXT4_GET_BLOCKS_IO_CREATE_EXT);
3647}
3648
3649static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
3650                            ssize_t size, void *private, int ret,
3651                            bool is_async)
3652{
3653        ext4_io_end_t *io_end = iocb->private;
3654        struct workqueue_struct *wq;
3655        unsigned long flags;
3656        struct ext4_inode_info *ei;
3657
3658        /* if not async direct IO or dio with 0 bytes write, just return */
3659        if (!io_end || !size)
3660                goto out;
3661
3662        ext_debug("ext4_end_io_dio(): io_end 0x%p "
3663                  "for inode %lu, iocb 0x%p, offset %llu, size %llu\n",
3664                  iocb->private, io_end->inode->i_ino, iocb, offset,
3665                  size);
3666
3667        /* if not aio dio with unwritten extents, just free io and return */
3668        if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
3669                ext4_free_io_end(io_end);
3670                iocb->private = NULL;
3671out:
3672                if (is_async)
3673                        aio_complete(iocb, ret, 0);
3674                return;
3675        }
3676
3677        io_end->offset = offset;
3678        io_end->size = size;
3679        if (is_async) {
3680                io_end->iocb = iocb;
3681                io_end->result = ret;
3682        }
3683        wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
3684
3685        /* Add the io_end to per-inode completed aio dio list*/
3686        ei = EXT4_I(io_end->inode);
3687        spin_lock_irqsave(&ei->i_completed_io_lock, flags);
3688        list_add_tail(&io_end->list, &ei->i_completed_io_list);
3689        spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
3690
3691        /* queue the work to convert unwritten extents to written */
3692        queue_work(wq, &io_end->work);
3693        iocb->private = NULL;
3694}
3695
3696static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
3697{
3698        ext4_io_end_t *io_end = bh->b_private;
3699        struct workqueue_struct *wq;
3700        struct inode *inode;
3701        unsigned long flags;
3702
3703        if (!test_clear_buffer_uninit(bh) || !io_end)
3704                goto out;
3705
3706        if (!(io_end->inode->i_sb->s_flags & MS_ACTIVE)) {
3707                printk("sb umounted, discard end_io request for inode %lu\n",
3708                        io_end->inode->i_ino);
3709                ext4_free_io_end(io_end);
3710                goto out;
3711        }
3712
3713        io_end->flag = EXT4_IO_END_UNWRITTEN;
3714        inode = io_end->inode;
3715
3716        /* Add the io_end to per-inode completed io list*/
3717        spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
3718        list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list);
3719        spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
3720
3721        wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq;
3722        /* queue the work to convert unwritten extents to written */
3723        queue_work(wq, &io_end->work);
3724out:
3725        bh->b_private = NULL;
3726        bh->b_end_io = NULL;
3727        clear_buffer_uninit(bh);
3728        end_buffer_async_write(bh, uptodate);
3729}
3730
3731static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode)
3732{
3733        ext4_io_end_t *io_end;
3734        struct page *page = bh->b_page;
3735        loff_t offset = (sector_t)page->index << PAGE_CACHE_SHIFT;
3736        size_t size = bh->b_size;
3737
3738retry:
3739        io_end = ext4_init_io_end(inode, GFP_ATOMIC);
3740        if (!io_end) {
3741                pr_warn_ratelimited("%s: allocation fail\n", __func__);
3742                schedule();
3743                goto retry;
3744        }
3745        io_end->offset = offset;
3746        io_end->size = size;
3747        /*
3748         * We need to hold a reference to the page to make sure it
3749         * doesn't get evicted before ext4_end_io_work() has a chance
3750         * to convert the extent from unwritten to written.
3751         */
3752        io_end->page = page;
3753        get_page(io_end->page);
3754
3755        bh->b_private = io_end;
3756        bh->b_end_io = ext4_end_io_buffer_write;
3757        return 0;
3758}
3759
3760/*
3761 * For ext4 extent files, ext4 will do direct-io writes to holes,
3762 * preallocated extents, and writes that extend the file; no need to
3763 * fall back to buffered IO.
3764 *
3765 * For holes, we fallocate those blocks and mark them as uninitialized.
3766 * If those blocks were preallocated, we make sure they are split, but
3767 * still keep the range to write as uninitialized.
3768 *
3769 * The unwritten extents will be converted to written when DIO is completed.
3770 * For async direct IO, since the IO may still be pending when we return, we
3771 * set up an end_io callback function, which will do the conversion
3772 * when the async direct IO is completed.
3773 *
3774 * If the O_DIRECT write will extend the file then add this inode to the
3775 * orphan list.  So recovery will truncate it back to the original size
3776 * if the machine crashes during the write.
3777 *
3778 */
3779static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3780                              const struct iovec *iov, loff_t offset,
3781                              unsigned long nr_segs)
3782{
3783        struct file *file = iocb->ki_filp;
3784        struct inode *inode = file->f_mapping->host;
3785        ssize_t ret;
3786        size_t count = iov_length(iov, nr_segs);
3787
3788        loff_t final_size = offset + count;
3789        if (rw == WRITE && final_size <= inode->i_size) {
3790                /*
3791                 * We could direct write to holes and fallocate.
3792                 *
3793                 * Allocated blocks to fill the hole are marked as uninitialized
3794                 * to prevent a parallel buffered read from exposing the
3795                 * stale data before DIO completes the data IO.
3796                 *
3797                 * As to previously fallocated extents, ext4 get_block
3798                 * will just simply mark the buffer mapped but still
3799                 * keep the extents uninitialized.
3800                 *
3801                 * For the non-AIO case, we will convert those unwritten
3802                 * extents to written after returning from blockdev_direct_IO.
3803                 *
3804                 * For async DIO, the conversion needs to be deferred until
3805                 * the IO is completed. The ext4 end_io callback function
3806                 * will be called to take care of the conversion work.
3807                 * Here for async case, we allocate an io_end structure to
3808                 * hook to the iocb.
3809                 */
3810                iocb->private = NULL;
3811                EXT4_I(inode)->cur_aio_dio = NULL;
3812                if (!is_sync_kiocb(iocb)) {
3813                        iocb->private = ext4_init_io_end(inode, GFP_NOFS);
3814                        if (!iocb->private)
3815                                return -ENOMEM;
3816                        /*
3817                         * We save the io structure for the current async
3818                         * direct IO, so that later ext4_map_blocks()
3819                         * can flag in the io structure whether there
3820                         * are unwritten extents that need to be converted
3821                         * when the IO is completed.
3822                         */
3823                        EXT4_I(inode)->cur_aio_dio = iocb->private;
3824                }
3825
3826                ret = blockdev_direct_IO(rw, iocb, inode,
3827                                         inode->i_sb->s_bdev, iov,
3828                                         offset, nr_segs,
3829                                         ext4_get_block_write,
3830                                         ext4_end_io_dio);
3831                if (iocb->private)
3832                        EXT4_I(inode)->cur_aio_dio = NULL;
3833                /*
3834                 * The io_end structure takes a reference to the inode;
3835                 * that structure needs to be destroyed and the
3836                 * reference to the inode needs to be dropped when IO is
3837                 * complete, even for a 0-byte write or a failure.
3838                 *
3839                 * In the successful AIO DIO case, the io_end structure will be
3840                 * destroyed and the reference to the inode will be dropped
3841                 * after the end_io callback function is called.
3842                 *
3843                 * In the case of a 0-byte write or an error, since
3844                 * VFS direct IO won't invoke the end_io callback function,
3845                 * we need to free the io_end structure here.
3846                 */
3847                if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) {
3848                        ext4_free_io_end(iocb->private);
3849                        iocb->private = NULL;
3850                } else if (ret > 0 && ext4_test_inode_state(inode,
3851                                                EXT4_STATE_DIO_UNWRITTEN)) {
3852                        int err;
3853                        /*
3854                         * For the non-AIO case, since the IO is already
3855                         * completed, we can do the conversion right here.
3856                         */
3857                        err = ext4_convert_unwritten_extents(inode,
3858                                                             offset, ret);
3859                        if (err < 0)
3860                                ret = err;
3861                        ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
3862                }
3863                return ret;
3864        }
3865
3866        /* for writes that extend the file, we fall back to the old way */
3867        return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
3868}
3869
3870static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
3871                              const struct iovec *iov, loff_t offset,
3872                              unsigned long nr_segs)
3873{
3874        struct file *file = iocb->ki_filp;
3875        struct inode *inode = file->f_mapping->host;
3876
3877        if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
3878                return ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);
3879
3880        return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
3881}
3882
3883/*
3884 * Pages can be marked dirty completely asynchronously from ext4's journalling
3885 * activity.  By filemap_sync_pte(), try_to_unmap_one(), etc.  We cannot do
3886 * much here because ->set_page_dirty is called under VFS locks.  The page is
3887 * not necessarily locked.
3888 *
3889 * We cannot just dirty the page and leave attached buffers clean, because the
3890 * buffers' dirty state is "definitive".  We cannot just set the buffers dirty
3891 * or jbddirty because all the journalling code will explode.
3892 *
3893 * So what we do is to mark the page "pending dirty" and next time writepage
3894 * is called, propagate that into the buffers appropriately.
3895 */
3896static int ext4_journalled_set_page_dirty(struct page *page)
3897{
3898        SetPageChecked(page);
3899        return __set_page_dirty_nobuffers(page);
3900}
3901
3902static const struct address_space_operations ext4_ordered_aops = {
3903        .readpage               = ext4_readpage,
3904        .readpages              = ext4_readpages,
3905        .writepage              = ext4_writepage,
3906        .sync_page              = block_sync_page,
3907        .write_begin            = ext4_write_begin,
3908        .write_end              = ext4_ordered_write_end,
3909        .bmap                   = ext4_bmap,
3910        .invalidatepage         = ext4_invalidatepage,
3911        .releasepage            = ext4_releasepage,
3912        .direct_IO              = ext4_direct_IO,
3913        .migratepage            = buffer_migrate_page,
3914        .is_partially_uptodate  = block_is_partially_uptodate,
3915        .error_remove_page      = generic_error_remove_page,
3916};
3917
3918static const struct address_space_operations ext4_writeback_aops = {
3919        .readpage               = ext4_readpage,
3920        .readpages              = ext4_readpages,
3921        .writepage              = ext4_writepage,
3922        .sync_page              = block_sync_page,
3923        .write_begin            = ext4_write_begin,
3924        .write_end              = ext4_writeback_write_end,
3925        .bmap                   = ext4_bmap,
3926        .invalidatepage         = ext4_invalidatepage,
3927        .releasepage            = ext4_releasepage,
3928        .direct_IO              = ext4_direct_IO,
3929        .migratepage            = buffer_migrate_page,
3930        .is_partially_uptodate  = block_is_partially_uptodate,
3931        .error_remove_page      = generic_error_remove_page,
3932};
3933
3934static const struct address_space_operations ext4_journalled_aops = {
3935        .readpage               = ext4_readpage,
3936        .readpages              = ext4_readpages,
3937        .writepage              = ext4_writepage,
3938        .sync_page              = block_sync_page,
3939        .write_begin            = ext4_write_begin,
3940        .write_end              = ext4_journalled_write_end,
3941        .set_page_dirty         = ext4_journalled_set_page_dirty,
3942        .bmap                   = ext4_bmap,
3943        .invalidatepage         = ext4_invalidatepage,
3944        .releasepage            = ext4_releasepage,
3945        .is_partially_uptodate  = block_is_partially_uptodate,
3946        .error_remove_page      = generic_error_remove_page,
3947};
3948
3949static const struct address_space_operations ext4_da_aops = {
3950        .readpage               = ext4_readpage,
3951        .readpages              = ext4_readpages,
3952        .writepage              = ext4_writepage,
3953        .writepages             = ext4_da_writepages,
3954        .sync_page              = block_sync_page,
3955        .write_begin            = ext4_da_write_begin,
3956        .write_end              = ext4_da_write_end,
3957        .bmap                   = ext4_bmap,
3958        .invalidatepage         = ext4_da_invalidatepage,
3959        .releasepage            = ext4_releasepage,
3960        .direct_IO              = ext4_direct_IO,
3961        .migratepage            = buffer_migrate_page,
3962        .is_partially_uptodate  = block_is_partially_uptodate,
3963        .error_remove_page      = generic_error_remove_page,
3964};
3965
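    /*
     * A summary of the checks below: with the DELALLOC mount option, both
     * data=ordered and data=writeback inodes use the delayed-allocation aops;
     * without it, data=ordered uses ext4_ordered_aops and data=writeback uses
     * ext4_writeback_aops; data=journal always falls through to
     * ext4_journalled_aops.
     */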
3966void ext4_set_aops(struct inode *inode)
3967{
3968        if (ext4_should_order_data(inode) &&
3969                test_opt(inode->i_sb, DELALLOC))
3970                inode->i_mapping->a_ops = &ext4_da_aops;
3971        else if (ext4_should_order_data(inode))
3972                inode->i_mapping->a_ops = &ext4_ordered_aops;
3973        else if (ext4_should_writeback_data(inode) &&
3974                 test_opt(inode->i_sb, DELALLOC))
3975                inode->i_mapping->a_ops = &ext4_da_aops;
3976        else if (ext4_should_writeback_data(inode))
3977                inode->i_mapping->a_ops = &ext4_writeback_aops;
3978        else
3979                inode->i_mapping->a_ops = &ext4_journalled_aops;
3980}
3981
3982/*
3983 * ext4_block_truncate_page() zeroes out a mapping from file offset `from'
3984 * up to the end of the block which corresponds to `from'.
3985 * This is required during truncate. We need to physically zero the tail end
3986 * of that block so it doesn't yield old data if the file is later grown.
3987 */
3988int ext4_block_truncate_page(handle_t *handle,
3989                struct address_space *mapping, loff_t from)
3990{
3991        ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
3992        unsigned offset = from & (PAGE_CACHE_SIZE-1);
3993        unsigned blocksize, length, pos;
3994        ext4_lblk_t iblock;
3995        struct inode *inode = mapping->host;
3996        struct buffer_head *bh;
3997        struct page *page;
3998        int err = 0;
3999
4000        page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT,
4001                                   mapping_gfp_mask(mapping) & ~__GFP_FS);
4002        if (!page)
4003                return -EINVAL;
4004
4005        blocksize = inode->i_sb->s_blocksize;
4006        length = blocksize - (offset & (blocksize - 1));
4007        iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
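            /*
             * A worked example with illustrative values: assuming 4k pages and
             * a 1k block size, from == 5000 gives index 1, offset 904, iblock 4
             * and length 120, so we end up zeroing bytes 904..1023 of the page,
             * i.e. the tail of the block that contains the new EOF.
             */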
4008
4009        if (!page_has_buffers(page))
4010                create_empty_buffers(page, blocksize, 0);
4011
4012        /* Find the buffer that contains "offset" */
4013        bh = page_buffers(page);
4014        pos = blocksize;
4015        while (offset >= pos) {
4016                bh = bh->b_this_page;
4017                iblock++;
4018                pos += blocksize;
4019        }
4020
4021        err = 0;
4022        if (buffer_freed(bh)) {
4023                BUFFER_TRACE(bh, "freed: skip");
4024                goto unlock;
4025        }
4026
4027        if (!buffer_mapped(bh)) {
4028                BUFFER_TRACE(bh, "unmapped");
4029                ext4_get_block(inode, iblock, bh, 0);
4030                /* unmapped? It's a hole - nothing to do */
4031                if (!buffer_mapped(bh)) {
4032                        BUFFER_TRACE(bh, "still unmapped");
4033                        goto unlock;
4034                }
4035        }
4036
4037        /* Ok, it's mapped. Make sure it's up-to-date */
4038        if (PageUptodate(page))
4039                set_buffer_uptodate(bh);
4040
4041        if (!buffer_uptodate(bh)) {
4042                err = -EIO;
4043                ll_rw_block(READ, 1, &bh);
4044                wait_on_buffer(bh);
4045                /* Uhhuh. Read error. Complain and punt. */
4046                if (!buffer_uptodate(bh))
4047                        goto unlock;
4048        }
4049
4050        if (ext4_should_journal_data(inode)) {
4051                BUFFER_TRACE(bh, "get write access");
4052                err = ext4_journal_get_write_access(handle, bh);
4053                if (err)
4054                        goto unlock;
4055        }
4056
4057        zero_user(page, offset, length);
4058
4059        BUFFER_TRACE(bh, "zeroed end of block");
4060
4061        err = 0;
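            /*
             * How the zeroed range reaches disk depends on the journalling
             * mode (a summary of the branches below): data=journal logs the
             * buffer as metadata; data=ordered ties the inode's data to the
             * running transaction via ext4_jbd2_file_inode() and then dirties
             * the buffer; data=writeback just dirties the buffer.
             */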
4062        if (ext4_should_journal_data(inode)) {
4063                err = ext4_handle_dirty_metadata(handle, inode, bh);
4064        } else {
4065                if (ext4_should_order_data(inode) && EXT4_I(inode)->jinode)
4066                        err = ext4_jbd2_file_inode(handle, inode);
4067                mark_buffer_dirty(bh);
4068        }
4069
4070unlock:
4071        unlock_page(page);
4072        page_cache_release(page);
4073        return err;
4074}
4075
4076/*
4077 * Probably it should be a library function... search for first non-zero word
4078 * or memcmp with zero_page, whatever is better for particular architecture.
4079 * Linus?
4080 */
4081static inline int all_zeroes(__le32 *p, __le32 *q)
4082{
4083        while (p < q)
4084                if (*p++)
4085                        return 0;
4086        return 1;
4087}
4088
4089/**
4090 *      ext4_find_shared - find the indirect blocks for partial truncation.
4091 *      @inode:   inode in question
4092 *      @depth:   depth of the affected branch
4093 *      @offsets: offsets of pointers in that branch (see ext4_block_to_path)
4094 *      @chain:   place to store the pointers to partial indirect blocks
4095 *      @top:     place to store the (detached) top of branch
4096 *
4097 *      This is a helper function used by ext4_truncate().
4098 *
4099 *      When we do truncate() we may have to clean the ends of several
4100 *      indirect blocks but leave the blocks themselves alive. A block is
4101 *      partially truncated if some data below the new i_size is referred
4102 *      to from it (and it is on the path to the first completely truncated
4103 *      data block, indeed).  We have to free the top of that path along
4104 *      with everything to the right of the path. Since no allocation
4105 *      past the truncation point is possible until ext4_truncate()
4106 *      finishes, we may safely do the latter, but top of branch may
4107 *      require special attention - pageout below the truncation point
4108 *      might try to populate it.
4109 *
4110 *      We atomically detach the top of branch from the tree, store the
4111 *      block number of its root in *@top, pointers to buffer_heads of
4112 *      partially truncated blocks - in @chain[].bh and pointers to
4113 *      their last elements that should not be removed - in
4114 *      @chain[].p. Return value is the pointer to last filled element
4115 *      of @chain.
4116 *
4117 *      The actual freeing of the subtrees is left to the caller:
4118 *              a) free the subtree starting from *@top
4119 *              b) free the subtrees whose roots are stored in
4120 *                      (@chain[i].p+1 .. end of @chain[i].bh->b_data)
4121 *              c) free the subtrees growing from the inode past the @chain[0].
4122 *                      (no partially truncated stuff there).  */
4123
4124static Indirect *ext4_find_shared(struct inode *inode, int depth,
4125                                  ext4_lblk_t offsets[4], Indirect chain[4],
4126                                  __le32 *top)
4127{
4128        Indirect *partial, *p;
4129        int k, err;
4130
4131        *top = 0;
4132        /* Make k index the deepest non-null offset + 1 */
4133        for (k = depth; k > 1 && !offsets[k-1]; k--)
4134                ;
4135        partial = ext4_get_branch(inode, k, offsets, chain, &err);
4136        /* Writer: pointers */
4137        if (!partial)
4138                partial = chain + k-1;
4139        /*
4140         * If the branch has acquired a continuation since we looked at it,
4141         * fine: it should all survive and the (new) top doesn't belong to us.
4142         */
4143        if (!partial->key && *partial->p)
4144                /* Writer: end */
4145                goto no_top;
4146        for (p = partial; (p > chain) && all_zeroes((__le32 *) p->bh->b_data, p->p); p--)
4147                ;
4148        /*
4149         * OK, we've found the last block that must survive. The rest of our
4150         * branch should be detached before unlocking. However, if that rest
4151         * of branch is all ours and does not grow immediately from the inode
4152         * it's easier to cheat and just decrement partial->p.
4153         */
4154        if (p == chain + k - 1 && p > chain) {
4155                p->p--;
4156        } else {
4157                *top = *p->p;
4158                /* Nope, don't do this in ext4.  Must leave the tree intact */
4159#if 0
4160                *p->p = 0;
4161#endif
4162        }
4163        /* Writer: end */
4164
4165        while (partial > p) {
4166                brelse(partial->bh);
4167                partial--;
4168        }
4169no_top:
4170        return partial;
4171}
4172
4173/*
4174 * Zero a number of block pointers in either an inode or an indirect block.
4175 * If we restart the transaction we must again get write access to the
4176 * indirect block for further modification.
4177 *
4178 * We release `count' blocks on disk, but (last - first) may be greater
4179 * than `count' because there can be holes in there.
4180 */
4181static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
4182                             struct buffer_head *bh,
4183                             ext4_fsblk_t block_to_free,
4184                             unsigned long count, __le32 *first,
4185                             __le32 *last)
4186{
4187        __le32 *p;
4188        int     flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED;
4189        int     err;
4190
4191        if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
4192                flags |= EXT4_FREE_BLOCKS_METADATA;
4193
4194        if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free,
4195                                   count)) {
4196                EXT4_ERROR_INODE(inode, "attempt to clear invalid "
4197                                 "blocks %llu len %lu",
4198                                 (unsigned long long) block_to_free, count);
4199                return 1;
4200        }
4201
4202        if (try_to_extend_transaction(handle, inode)) {
4203                if (bh) {
4204                        BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
4205                        err = ext4_handle_dirty_metadata(handle, inode, bh);
4206                        if (unlikely(err)) {
4207                                ext4_std_error(inode->i_sb, err);
4208                                return 1;
4209                        }
4210                }
4211                err = ext4_mark_inode_dirty(handle, inode);
4212                if (unlikely(err)) {
4213                        ext4_std_error(inode->i_sb, err);
4214                        return 1;
4215                }
4216                err = ext4_truncate_restart_trans(handle, inode,
4217                                                  blocks_for_truncate(inode));
4218                if (unlikely(err)) {
4219                        ext4_std_error(inode->i_sb, err);
4220                        return 1;
4221                }
4222                if (bh) {
4223                        BUFFER_TRACE(bh, "retaking write access");
4224                        ext4_journal_get_write_access(handle, bh);
4225                }
4226        }
4227
4228        for (p = first; p < last; p++)
4229                *p = 0;
4230
4231        ext4_free_blocks(handle, inode, 0, block_to_free, count, flags);
4232        return 0;
4233}
4234
4235/**
4236 * ext4_free_data - free a list of data blocks
4237 * @handle:     handle for this transaction
4238 * @inode:      inode we are dealing with
4239 * @this_bh:    indirect buffer_head which contains *@first and *@last
4240 * @first:      array of block numbers
4241 * @last:       points immediately past the end of array
4242 *
4243 * We are freeing all blocks referred to from that array (numbers are stored as
4244 * little-endian 32-bit) and updating @inode->i_blocks appropriately.
4245 *
4246 * We accumulate contiguous runs of blocks to free.  Conveniently, if these
4247 * blocks are contiguous then releasing them at one time will only affect one
4248 * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't
4249 * actually use a lot of journal space.
4250 *
4251 * @this_bh will be %NULL if @first and @last point into the inode's direct
4252 * block pointers.
4253 */
4254static void ext4_free_data(handle_t *handle, struct inode *inode,
4255                           struct buffer_head *this_bh,
4256                           __le32 *first, __le32 *last)
4257{
4258        ext4_fsblk_t block_to_free = 0;    /* Starting block # of a run */
4259        unsigned long count = 0;            /* Number of blocks in the run */
4260        __le32 *block_to_free_p = NULL;     /* Pointer into inode/ind
4261                                               corresponding to
4262                                               block_to_free */
4263        ext4_fsblk_t nr;                    /* Current block # */
4264        __le32 *p;                          /* Pointer into inode/ind
4265                                               for current block */
4266        int err;
4267
4268        if (this_bh) {                          /* For indirect block */
4269                BUFFER_TRACE(this_bh, "get_write_access");
4270                err = ext4_journal_get_write_access(handle, this_bh);
4271                /* Important: if we can't update the indirect pointers
4272                 * to the blocks, we can't free them. */
4273                if (err)
4274                        return;
4275        }
4276
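        /*
         * Accumulate contiguous runs.  For example (illustrative numbers),
         * pointers 100, 101, 102, 0, 200 produce one ext4_clear_blocks() call
         * for the run starting at 100 with count 3, the hole is skipped, and
         * a final call below the loop frees the run starting at 200 with
         * count 1.
         */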
4277        for (p = first; p < last; p++) {
4278                nr = le32_to_cpu(*p);
4279                if (nr) {
4280                        /* accumulate blocks to free if they're contiguous */
4281                        if (count == 0) {
4282                                block_to_free = nr;
4283                                block_to_free_p = p;
4284                                count = 1;
4285                        } else if (nr == block_to_free + count) {
4286                                count++;
4287                        } else {
4288                                if (ext4_clear_blocks(handle, inode, this_bh,
4289                                                      block_to_free, count,
4290                                                      block_to_free_p, p))
4291                                        break;
4292                                block_to_free = nr;
4293                                block_to_free_p = p;
4294                                count = 1;
4295                        }
4296                }
4297        }
4298
4299        if (count > 0)
4300                ext4_clear_blocks(handle, inode, this_bh, block_to_free,
4301                                  count, block_to_free_p, p);
4302
4303        if (this_bh) {
4304                BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata");
4305
4306                /*
4307                 * The buffer head should have an attached journal head at this
4308                 * point. However, if the data is corrupted and an indirect
4309                 * block pointed to itself, it would have been detached when
4310                 * the block was cleared. Check for this instead of OOPSing.
4311                 */
4312                if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh))
4313                        ext4_handle_dirty_metadata(handle, inode, this_bh);
4314                else
4315                        EXT4_ERROR_INODE(inode,
4316                                         "circular indirect block detected at "
4317                                         "block %llu",
4318                                (unsigned long long) this_bh->b_blocknr);
4319        }
4320}
4321
4322/**
4323 *      ext4_free_branches - free an array of branches
4324 *      @handle: JBD handle for this transaction
4325 *      @inode: inode we are dealing with
4326 *      @parent_bh: the buffer_head which contains *@first and *@last
4327 *      @first: array of block numbers
4328 *      @last:  pointer immediately past the end of array
4329 *      @depth: depth of the branches to free
4330 *
4331 *      We are freeing all blocks referred to from these branches (numbers are
4332 *      stored as little-endian 32-bit) and updating @inode->i_blocks
4333 *      appropriately.
4334 */
4335static void ext4_free_branches(handle_t *handle, struct inode *inode,
4336                               struct buffer_head *parent_bh,
4337                               __le32 *first, __le32 *last, int depth)
4338{
4339        ext4_fsblk_t nr;
4340        __le32 *p;
4341
4342        if (ext4_handle_is_aborted(handle))
4343                return;
4344
4345        if (depth--) {
4346                struct buffer_head *bh;
4347                int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
4348                p = last;
4349                while (--p >= first) {
4350                        nr = le32_to_cpu(*p);
4351                        if (!nr)
4352                                continue;               /* A hole */
4353
4354                        if (!ext4_data_block_valid(EXT4_SB(inode->i_sb),
4355                                                   nr, 1)) {
4356                                EXT4_ERROR_INODE(inode,
4357                                                 "invalid indirect mapped "
4358                                                 "block %lu (level %d)",
4359                                                 (unsigned long) nr, depth);
4360                                break;
4361                        }
4362
4363                        /* Go read the buffer for the next level down */
4364                        bh = sb_bread(inode->i_sb, nr);
4365
4366                        /*
4367                         * A read failure? Report the error and skip the slot
4368                         * (should be rare).
4369                         */
4370                        if (!bh) {
4371                                EXT4_ERROR_INODE_BLOCK(inode, nr,
4372                                                       "Read failure");
4373                                continue;
4374                        }
4375
4376                        /* This zaps the entire block.  Bottom up. */
4377                        BUFFER_TRACE(bh, "free child branches");
4378                        ext4_free_branches(handle, inode, bh,
4379                                        (__le32 *) bh->b_data,
4380                                        (__le32 *) bh->b_data + addr_per_block,
4381                                        depth);
4382                        brelse(bh);
4383
4384                        /*
4385                         * Everything below this pointer has been
4386                         * released.  Now let this top-of-subtree go.
4387                         *
4388                         * We want the freeing of this indirect block to be
4389                         * atomic in the journal with the updating of the
4390                         * bitmap block which owns it.  So make some room in
4391                         * the journal.
4392                         *
4393                         * We zero the parent pointer *after* freeing its
4394                         * pointee in the bitmaps, so if extend_transaction()
4395                         * for some reason fails to put the bitmap changes and
4396                         * the release into the same transaction, recovery
4397                         * will merely complain about releasing a free block,
4398                         * rather than leaking blocks.
4399                         */
4400                        if (ext4_handle_is_aborted(handle))
4401                                return;
4402                        if (try_to_extend_transaction(handle, inode)) {
4403                                ext4_mark_inode_dirty(handle, inode);
4404                                ext4_truncate_restart_trans(handle, inode,
4405                                            blocks_for_truncate(inode));
4406                        }
4407
4408                        /*
4409                         * The forget flag here is critical because if
4410                         * we are journaling (and not doing data
4411                         * journaling), we have to make sure a revoke
4412                         * record is written to prevent the journal
4413                         * replay from overwriting the (former)
4414                         * indirect block if it gets reallocated as a
4415                         * data block.  This must happen in the same
4416                         * transaction where the data blocks are
4417                         * actually freed.
4418                         */
4419                        ext4_free_blocks(handle, inode, 0, nr, 1,
4420                                         EXT4_FREE_BLOCKS_METADATA|
4421                                         EXT4_FREE_BLOCKS_FORGET);
4422
4423                        if (parent_bh) {
4424                                /*
4425                                 * The block which we have just freed is
4426                                 * pointed to by an indirect block: journal it
4427                                 */
4428                                BUFFER_TRACE(parent_bh, "get_write_access");
4429                                if (!ext4_journal_get_write_access(handle,
4430                                                                   parent_bh)){
4431                                        *p = 0;
4432                                        BUFFER_TRACE(parent_bh,
4433                                        "call ext4_handle_dirty_metadata");
4434                                        ext4_handle_dirty_metadata(handle,
4435                                                                   inode,
4436                                                                   parent_bh);
4437                                }
4438                        }
4439                }
4440        } else {
4441                /* We have reached the bottom of the tree. */
4442                BUFFER_TRACE(parent_bh, "free data blocks");
4443                ext4_free_data(handle, inode, parent_bh, first, last);
4444        }
4445}
4446
4447int ext4_can_truncate(struct inode *inode)
4448{
4449        if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
4450                return 0;
4451        if (S_ISREG(inode->i_mode))
4452                return 1;
4453        if (S_ISDIR(inode->i_mode))
4454                return 1;
4455        if (S_ISLNK(inode->i_mode))
4456                return !ext4_inode_is_fast_symlink(inode);
4457        return 0;
4458}
4459
4460/*
4461 * ext4_truncate()
4462 *
4463 * We block out ext4_get_block() block instantiations across the entire
4464 * transaction, and VFS/VM ensures that ext4_truncate() cannot run
4465 * simultaneously on behalf of the same inode.
4466 *
4467 * As we work through the truncate and commit bits of it to the journal there
4468 * is one core, guiding principle: the file's tree must always be consistent on
4469 * disk.  We must be able to restart the truncate after a crash.
4470 *
4471 * The file's tree may be transiently inconsistent in memory (although it
4472 * probably isn't), but whenever we close off and commit a journal transaction,
4473 * the contents of (the filesystem + the journal) must be consistent and
4474 * restartable.  It's pretty simple, really: bottom up, right to left (although
4475 * left-to-right works OK too).
4476 *
4477 * Note that at recovery time, journal replay occurs *before* the restart of
4478 * truncate against the orphan inode list.
4479 *
4480 * The committed inode has the new, desired i_size (which is the same as
4481 * i_disksize in this case).  After a crash, ext4_orphan_cleanup() will see
4482 * that this inode's truncate did not complete and it will again call
4483 * ext4_truncate() to have another go.  So there will be instantiated blocks
4484 * to the right of the truncation point in a crashed ext4 filesystem.  But
4485 * that's fine - as long as they are linked from the inode, the post-crash
4486 * ext4_truncate() run will find them and release them.
4487 */
4488void ext4_truncate(struct inode *inode)
4489{
4490        handle_t *handle;
4491        struct ext4_inode_info *ei = EXT4_I(inode);
4492        __le32 *i_data = ei->i_data;
4493        int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
4494        struct address_space *mapping = inode->i_mapping;
4495        ext4_lblk_t offsets[4];
4496        Indirect chain[4];
4497        Indirect *partial;
4498        __le32 nr = 0;
4499        int n;
4500        ext4_lblk_t last_block;
4501        unsigned blocksize = inode->i_sb->s_blocksize;
4502
4503        if (!ext4_can_truncate(inode))
4504                return;
4505
4506        ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
4507
4508        if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
4509                ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
4510
4511        if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
4512                ext4_ext_truncate(inode);
4513                return;
4514        }
4515
4516        handle = start_transaction(inode);
4517        if (IS_ERR(handle))
4518                return;         /* AKPM: return what? */
4519
4520        last_block = (inode->i_size + blocksize-1)
4521                                        >> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
4522
4523        if (inode->i_size & (blocksize - 1))
4524                if (ext4_block_truncate_page(handle, mapping, inode->i_size))
4525                        goto out_stop;
4526
4527        n = ext4_block_to_path(inode, last_block, offsets, NULL);
4528        if (n == 0)
4529                goto out_stop;  /* error */
4530
4531        /*
4532         * OK.  This truncate is going to happen.  We add the inode to the
4533         * orphan list, so that if this truncate spans multiple transactions,
4534         * and we crash, we will resume the truncate when the filesystem
4535         * recovers.  It also marks the inode dirty, to catch the new size.
4536         *
4537         * Implication: the file must always be in a sane, consistent
4538         * truncatable state while each transaction commits.
4539         */
4540        if (ext4_orphan_add(handle, inode))
4541                goto out_stop;
4542
4543        /*
4544         * From here we block out all ext4_get_block() callers who want to
4545         * modify the block allocation tree.
4546         */
4547        down_write(&ei->i_data_sem);
4548
4549        ext4_discard_preallocations(inode);
4550
4551        /*
4552         * The orphan list entry will now protect us from any crash which
4553         * occurs before the truncate completes, so it is now safe to propagate
4554         * the new, shorter inode size (held for now in i_size) into the
4555         * on-disk inode. We do this via i_disksize, which is the value which
4556         * ext4 *really* writes onto the disk inode.
4557         */
4558        ei->i_disksize = inode->i_size;
4559
4560        if (n == 1) {           /* direct blocks */
4561                ext4_free_data(handle, inode, NULL, i_data+offsets[0],
4562                               i_data + EXT4_NDIR_BLOCKS);
4563                goto do_indirects;
4564        }
4565
4566        partial = ext4_find_shared(inode, n, offsets, chain, &nr);
4567        /* Kill the top of shared branch (not detached) */
4568        if (nr) {
4569                if (partial == chain) {
4570                        /* Shared branch grows from the inode */
4571                        ext4_free_branches(handle, inode, NULL,
4572                                           &nr, &nr+1, (chain+n-1) - partial);
4573                        *partial->p = 0;
4574                        /*
4575                         * We mark the inode dirty prior to restart,
4576                         * and prior to stop.  No need for it here.
4577                         */
4578                } else {
4579                        /* Shared branch grows from an indirect block */
4580                        BUFFER_TRACE(partial->bh, "get_write_access");
4581                        ext4_free_branches(handle, inode, partial->bh,
4582                                        partial->p,
4583                                        partial->p+1, (chain+n-1) - partial);
4584                }
4585        }
4586        /* Clear the ends of indirect blocks on the shared branch */
4587        while (partial > chain) {
4588                ext4_free_branches(handle, inode, partial->bh, partial->p + 1,
4589                                   (__le32*)partial->bh->b_data+addr_per_block,
4590                                   (chain+n-1) - partial);
4591                BUFFER_TRACE(partial->bh, "call brelse");
4592                brelse(partial->bh);
4593                partial--;
4594        }
4595do_indirects:
4596        /* Kill the remaining (whole) subtrees */
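        /*
         * The switch below deliberately falls through: e.g. if offsets[0] is
         * EXT4_IND_BLOCK, the partially-truncated single-indirect tree was
         * handled above, so only the (whole) double- and triple-indirect
         * trees are freed here.
         */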
4597        switch (offsets[0]) {
4598        default:
4599                nr = i_data[EXT4_IND_BLOCK];
4600                if (nr) {
4601                        ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1);
4602                        i_data[EXT4_IND_BLOCK] = 0;
4603                }
4604        case EXT4_IND_BLOCK:
4605                nr = i_data[EXT4_DIND_BLOCK];
4606                if (nr) {
4607                        ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2);
4608                        i_data[EXT4_DIND_BLOCK] = 0;
4609                }
4610        case EXT4_DIND_BLOCK:
4611                nr = i_data[EXT4_TIND_BLOCK];
4612                if (nr) {
4613                        ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3);
4614                        i_data[EXT4_TIND_BLOCK] = 0;
4615                }
4616        case EXT4_TIND_BLOCK:
4617                ;
4618        }
4619
4620        up_write(&ei->i_data_sem);
4621        inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
4622        ext4_mark_inode_dirty(handle, inode);
4623
4624        /*
4625         * In a multi-transaction truncate, we only make the final transaction
4626         * synchronous
4627         */
4628        if (IS_SYNC(inode))
4629                ext4_handle_sync(handle);
4630out_stop:
4631        /*
4632         * If this was a simple ftruncate(), and the file will remain alive
4633         * then we need to clear up the orphan record which we created above.
4634         * However, if this was a real unlink then we were called by
4635         * ext4_delete_inode(), and we allow that function to clean up the
4636         * orphan info for us.
4637         */
4638        if (inode->i_nlink)
4639                ext4_orphan_del(handle, inode);
4640
4641        ext4_journal_stop(handle);
4642}
4643
4644/*
4645 * ext4_get_inode_loc returns with an extra refcount against the inode's
4646 * underlying buffer_head on success. If 'in_mem' is true, we have all
4647 * data in memory that is needed to recreate the on-disk version of this
4648 * inode.
4649 */
4650static int __ext4_get_inode_loc(struct inode *inode,
4651                                struct ext4_iloc *iloc, int in_mem)
4652{
4653        struct ext4_group_desc  *gdp;
4654        struct buffer_head      *bh;
4655        struct super_block      *sb = inode->i_sb;
4656        ext4_fsblk_t            block;
4657        int                     inodes_per_block, inode_offset;
4658
4659        iloc->bh = NULL;
4660        if (!ext4_valid_inum(sb, inode->i_ino))
4661                return -EIO;
4662
4663        iloc->block_group = (inode->i_ino - 1) / EXT4_INODES_PER_GROUP(sb);
4664        gdp = ext4_get_group_desc(sb, iloc->block_group, NULL);
4665        if (!gdp)
4666                return -EIO;
4667
4668        /*
4669         * Figure out the offset within the block group inode table
4670         */
4671        inodes_per_block = (EXT4_BLOCK_SIZE(sb) / EXT4_INODE_SIZE(sb));
4672        inode_offset = ((inode->i_ino - 1) %
4673                        EXT4_INODES_PER_GROUP(sb));
4674        block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block);
4675        iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb);
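        /*
         * Illustrative example: with 4k blocks, 256-byte inodes (16 per block)
         * and 32768 inodes per group, inode 40000 lands in group 1, is
         * inode_offset 7231 within that group, lives in itable block
         * 7231 / 16 = 451 of the group, at byte offset (7231 % 16) * 256 =
         * 3840 within that block.
         */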
4676
4677        bh = sb_getblk(sb, block);
4678        if (!bh) {
4679                EXT4_ERROR_INODE_BLOCK(inode, block,
4680                                       "unable to read itable block");
4681                return -EIO;
4682        }
4683        if (!buffer_uptodate(bh)) {
4684                lock_buffer(bh);
4685
4686                /*
4687                 * If the buffer has the write error flag, we have failed
4688                 * to write out another inode in the same block.  In this
4689                 * case, we don't have to read the block because we may
4690                 * read the old inode data successfully.
4691                 */
4692                if (buffer_write_io_error(bh) && !buffer_uptodate(bh))
4693                        set_buffer_uptodate(bh);
4694
4695                if (buffer_uptodate(bh)) {
4696                        /* someone brought it uptodate while we waited */
4697                        unlock_buffer(bh);
4698                        goto has_buffer;
4699                }
4700
4701                /*
4702                 * If we have all information of the inode in memory and this
4703                 * is the only valid inode in the block, we need not read the
4704                 * block.
4705                 */
4706                if (in_mem) {
4707                        struct buffer_head *bitmap_bh;
4708                        int i, start;
4709
4710                        start = inode_offset & ~(inodes_per_block - 1);
4711
4712                        /* Is the inode bitmap in cache? */
4713                        bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp));
4714                        if (!bitmap_bh)
4715                                goto make_io;
4716
4717                        /*
4718                         * If the inode bitmap isn't in cache then the
4719                         * optimisation may end up performing two reads instead
4720                         * of one, so skip it.
4721                         */
4722                        if (!buffer_uptodate(bitmap_bh)) {
4723                                brelse(bitmap_bh);
4724                                goto make_io;
4725                        }
4726                        for (i = start; i < start + inodes_per_block; i++) {
4727                                if (i == inode_offset)
4728                                        continue;
4729                                if (ext4_test_bit(i, bitmap_bh->b_data))
4730                                        break;
4731                        }
4732                        brelse(bitmap_bh);
4733                        if (i == start + inodes_per_block) {
4734                                /* all other inodes are free, so skip I/O */
4735                                memset(bh->b_data, 0, bh->b_size);
4736                                set_buffer_uptodate(bh);
4737                                unlock_buffer(bh);
4738                                goto has_buffer;
4739                        }
4740                }
4741
4742make_io:
4743                /*
4744                 * If we need to do any I/O, try to pre-readahead extra
4745                 * blocks from the inode table.
4746                 */
4747                if (EXT4_SB(sb)->s_inode_readahead_blks) {
4748                        ext4_fsblk_t b, end, table;
4749                        unsigned num;
4750
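                            /*
                             * Read ahead a window of s_inode_readahead_blks
                             * itable blocks: align the target block down to
                             * the window size, clamp the window to the start
                             * of the inode table, and don't read past the
                             * portion of the table that can hold initialized
                             * inodes (the GDT_CSUM unused count is subtracted
                             * below).
                             */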
4751                        table = ext4_inode_table(sb, gdp);
4752                        /* s_inode_readahead_blks is always a power of 2 */
4753                        b = block & ~(EXT4_SB(sb)->s_inode_readahead_blks-1);
4754                        if (table > b)
4755                                b = table;
4756                        end = b + EXT4_SB(sb)->s_inode_readahead_blks;
4757                        num = EXT4_INODES_PER_GROUP(sb);
4758                        if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
4759                                       EXT4_FEATURE_RO_COMPAT_GDT_CSUM))
4760                                num -= ext4_itable_unused_count(sb, gdp);
4761                        table += num / inodes_per_block;
4762                        if (end > table)
4763                                end = table;
4764                        while (b <= end)
4765                                sb_breadahead(sb, b++);
4766                }
4767
4768                /*
4769                 * There are other valid inodes in the buffer, this inode
4770                 * has in-inode xattrs, or we don't have this inode in memory.
4771                 * Read the block from disk.
4772                 */
4773                get_bh(bh);
4774                bh->b_end_io = end_buffer_read_sync;
4775                submit_bh(READ_META, bh);
4776                wait_on_buffer(bh);
4777                if (!buffer_uptodate(bh)) {
4778                        EXT4_ERROR_INODE_BLOCK(inode, block,
4779                                               "unable to read itable block");
4780                        brelse(bh);
4781                        return -EIO;
4782                }
4783        }
4784has_buffer:
4785        iloc->bh = bh;
4786        return 0;
4787}
4788
4789int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc)
4790{
4791        /* We have all inode data except xattrs in memory here. */
4792        return __ext4_get_inode_loc(inode, iloc,
4793                !ext4_test_inode_state(inode, EXT4_STATE_XATTR));
4794}
4795
4796void ext4_set_inode_flags(struct inode *inode)
4797{
4798        unsigned int flags = EXT4_I(inode)->i_flags;
4799
4800        inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
4801        if (flags & EXT4_SYNC_FL)
4802                inode->i_flags |= S_SYNC;
4803        if (flags & EXT4_APPEND_FL)
4804                inode->i_flags |= S_APPEND;
4805        if (flags & EXT4_IMMUTABLE_FL)
4806                inode->i_flags |= S_IMMUTABLE;
4807        if (flags & EXT4_NOATIME_FL)
4808                inode->i_flags |= S_NOATIME;
4809        if (flags & EXT4_DIRSYNC_FL)
4810                inode->i_flags |= S_DIRSYNC;
4811}
4812
4813/* Propagate flags from i_flags to EXT4_I(inode)->i_flags */
4814void ext4_get_inode_flags(struct ext4_inode_info *ei)
4815{
4816        unsigned int vfs_fl;
4817        unsigned long old_fl, new_fl;
4818
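        /*
         * Update ei->i_flags locklessly: if some other task changes i_flags
         * between our read of old_fl and the cmpxchg() below, the compare
         * fails and we simply recompute new_fl from the fresh value.
         */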
4819        do {
4820                vfs_fl = ei->vfs_inode.i_flags;
4821                old_fl = ei->i_flags;
4822                new_fl = old_fl & ~(EXT4_SYNC_FL|EXT4_APPEND_FL|
4823                                EXT4_IMMUTABLE_FL|EXT4_NOATIME_FL|
4824                                EXT4_DIRSYNC_FL);
4825                if (vfs_fl & S_SYNC)
4826                        new_fl |= EXT4_SYNC_FL;
4827                if (vfs_fl & S_APPEND)
4828                        new_fl |= EXT4_APPEND_FL;
4829                if (vfs_fl & S_IMMUTABLE)
4830                        new_fl |= EXT4_IMMUTABLE_FL;
4831                if (vfs_fl & S_NOATIME)
4832                        new_fl |= EXT4_NOATIME_FL;
4833                if (vfs_fl & S_DIRSYNC)
4834                        new_fl |= EXT4_DIRSYNC_FL;
4835        } while (cmpxchg(&ei->i_flags, old_fl, new_fl) != old_fl);
4836}
4837
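    /*
     * Decode the on-disk block count.  With the HUGE_FILE feature the count
     * is a 48-bit value split across i_blocks_lo (low 32 bits) and
     * i_blocks_high (next 16 bits); normally it counts 512-byte units, but if
     * the inode also carries the HUGE_FILE flag it counts filesystem blocks
     * and is scaled back to 512-byte units here.  Without the feature only
     * the 32-bit i_blocks_lo is used.
     */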
4838static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
4839                                  struct ext4_inode_info *ei)
4840{
4841        blkcnt_t i_blocks ;
4842        struct inode *inode = &(ei->vfs_inode);
4843        struct super_block *sb = inode->i_sb;
4844
4845        if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
4846                                EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) {
4847                /* we are using combined 48 bit field */
4848                i_blocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 |
4849                                        le32_to_cpu(raw_inode->i_blocks_lo);
4850                if (ext4_test_inode_flag(inode, EXT4_INODE_HUGE_FILE)) {
4851                        /* i_blocks is in units of the filesystem block size */
4852                        return i_blocks  << (inode->i_blkbits - 9);
4853                } else {
4854                        return i_blocks;
4855                }
4856        } else {
4857                return le32_to_cpu(raw_inode->i_blocks_lo);
4858        }
4859}
4860
4861struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4862{
4863        struct ext4_iloc iloc;
4864        struct ext4_inode *raw_inode;
4865        struct ext4_inode_info *ei;
4866        struct inode *inode;
4867        journal_t *journal = EXT4_SB(sb)->s_journal;
4868        long ret;
4869        int block;
4870
4871        inode = iget_locked(sb, ino);
4872        if (!inode)
4873                return ERR_PTR(-ENOMEM);
4874        if (!(inode->i_state & I_NEW))
4875                return inode;
4876
4877        ei = EXT4_I(inode);
4878        iloc.bh = 0;
4879
4880        ret = __ext4_get_inode_loc(inode, &iloc, 0);
4881        if (ret < 0)
4882                goto bad_inode;
4883        raw_inode = ext4_raw_inode(&iloc);
4884        inode->i_mode = le16_to_cpu(raw_inode->i_mode);
4885        inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
4886        inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
4887        if (!(test_opt(inode->i_sb, NO_UID32))) {
4888                inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
4889                inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
4890        }
4891        inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
4892
4893        ext4_clear_state_flags(ei);     /* Only relevant on 32-bit archs */
4894        ei->i_dir_start_lookup = 0;
4895        ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
4896        /* We now have enough fields to check if the inode was active or not.
4897         * This is needed because nfsd might try to access dead inodes;
4898         * the test is the same one that e2fsck uses.
4899         * NeilBrown 1999oct15
4900         */
4901        if (inode->i_nlink == 0) {
4902                if (inode->i_mode == 0 ||
4903                    !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) {
4904                        /* this inode is deleted */
4905                        ret = -ESTALE;
4906                        goto bad_inode;
4907                }
4908                /* The only unlinked inodes we let through here have
4909                 * valid i_mode and are being read by the orphan
4910                 * recovery code: that's fine, we're about to complete
4911                 * the process of deleting those. */
4912        }
4913        ei->i_flags = le32_to_cpu(raw_inode->i_flags);
4914        inode->i_blocks = ext4_inode_blocks(raw_inode, ei);
4915        ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo);
4916        if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT))
4917                ei->i_file_acl |=
4918                        ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
4919        inode->i_size = ext4_isize(raw_inode);
4920        ei->i_disksize = inode->i_size;
4921#ifdef CONFIG_QUOTA
4922        ei->i_reserved_quota = 0;
4923#endif
4924        inode->i_generation = le32_to_cpu(raw_inode->i_generation);
4925        ei->i_block_group = iloc.block_group;
4926        ei->i_last_alloc_group = ~0;
4927        /*
4928         * NOTE! The in-memory inode i_data array is in little-endian order
4929         * even on big-endian machines: we do NOT byteswap the block numbers!
4930         */
4931        for (block = 0; block < EXT4_N_BLOCKS; block++)
4932                ei->i_data[block] = raw_inode->i_block[block];
4933        INIT_LIST_HEAD(&ei->i_orphan);
4934
4935        /*
4936         * Set transaction id's of transactions that have to be committed
4937         * to finish f[data]sync. We set them to currently running transaction
4938         * as we cannot be sure that the inode or some of its metadata isn't
4939         * part of the transaction - the inode could have been reclaimed and
4940         * now it is reread from disk.
4941         */
4942        if (journal) {
4943                transaction_t *transaction;
4944                tid_t tid;
4945
4946                read_lock(&journal->j_state_lock);
4947                if (journal->j_running_transaction)
4948                        transaction = journal->j_running_transaction;
4949                else
4950                        transaction = journal->j_committing_transaction;
4951                if (transaction)
4952                        tid = transaction->t_tid;
4953                else
4954                        tid = journal->j_commit_sequence;
4955                read_unlock(&journal->j_state_lock);
4956                ei->i_sync_tid = tid;
4957                ei->i_datasync_tid = tid;
4958        }
4959
4960        if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
4961                ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
4962                if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
4963                    EXT4_INODE_SIZE(inode->i_sb)) {
4964                        ret = -EIO;
4965                        goto bad_inode;
4966                }
4967                if (ei->i_extra_isize == 0) {
4968                        /* The extra space is currently unused. Use it. */
4969                        ei->i_extra_isize = sizeof(struct ext4_inode) -
4970                                            EXT4_GOOD_OLD_INODE_SIZE;
4971                } else {
4972                        __le32 *magic = (void *)raw_inode +
4973                                        EXT4_GOOD_OLD_INODE_SIZE +
4974                                        ei->i_extra_isize;
4975                        if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC))
4976                                ext4_set_inode_state(inode, EXT4_STATE_XATTR);
4977                }
4978        } else
4979                ei->i_extra_isize = 0;
4980
4981        EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode);
4982        EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode);
4983        EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode);
4984        EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode);
4985
4986        inode->i_version = le32_to_cpu(raw_inode->i_disk_version);
4987        if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
4988                if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
4989                        inode->i_version |=
4990                        (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32;
4991        }
4992
4993        ret = 0;
4994        if (ei->i_file_acl &&
4995            !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) {
4996                EXT4_ERROR_INODE(inode, "bad extended attribute block %llu",
4997                                 ei->i_file_acl);
4998                ret = -EIO;
4999                goto bad_inode;
5000        } else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
5001                if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
5002                    (S_ISLNK(inode->i_mode) &&
5003                     !ext4_inode_is_fast_symlink(inode)))
5004                        /* Validate extent which is part of inode */
5005                        ret = ext4_ext_check_inode(inode);
5006        } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
5007                   (S_ISLNK(inode->i_mode) &&
5008                    !ext4_inode_is_fast_symlink(inode))) {
5009                /* Validate block references which are part of inode */
5010                ret = ext4_check_inode_blockref(inode);
5011        }
5012        if (ret)
5013                goto bad_inode;
5014
5015        if (S_ISREG(inode->i_mode)) {
5016                inode->i_op = &ext4_file_inode_operations;
5017                inode->i_fop = &ext4_file_operations;
5018                ext4_set_aops(inode);
5019        } else if (S_ISDIR(inode->i_mode)) {
5020                inode->i_op = &ext4_dir_inode_operations;
5021                inode->i_fop = &ext4_dir_operations;
5022        } else if (S_ISLNK(inode->i_mode)) {
5023                if (ext4_inode_is_fast_symlink(inode)) {
5024                        inode->i_op = &ext4_fast_symlink_inode_operations;
5025                        nd_terminate_link(ei->i_data, inode->i_size,
5026                                sizeof(ei->i_data) - 1);
5027                } else {
5028                        inode->i_op = &ext4_symlink_inode_operations;
5029                        ext4_set_aops(inode);
5030                }
5031        } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
5032              S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
5033                inode->i_op = &ext4_special_inode_operations;
5034                if (raw_inode->i_block[0])
5035                        init_special_inode(inode, inode->i_mode,
5036                           old_decode_dev(le32_to_cpu(raw_inode->i_block[0])));
5037                else
5038                        init_special_inode(inode, inode->i_mode,
5039                           new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
5040        } else {
5041                ret = -EIO;
5042                EXT4_ERROR_INODE(inode, "bogus i_mode (%o)", inode->i_mode);
5043                goto bad_inode;
5044        }
5045        brelse(iloc.bh);
5046        ext4_set_inode_flags(inode);
5047        unlock_new_inode(inode);
5048        return inode;
5049
5050bad_inode:
5051        brelse(iloc.bh);
5052        iget_failed(inode);
5053        return ERR_PTR(ret);
5054}
5055
5056static int ext4_inode_blocks_set(handle_t *handle,
5057                                struct ext4_inode *raw_inode,
5058                                struct ext4_inode_info *ei)
5059{
5060        struct inode *inode = &(ei->vfs_inode);
5061        u64 i_blocks = inode->i_blocks;
5062        struct super_block *sb = inode->i_sb;
5063
5064        if (i_blocks <= ~0U) {
5065                /*
5066                 * i_blocks can be represented in a 32 bit variable
5067                 * as a multiple of 512 bytes
5068                 */
5069                raw_inode->i_blocks_lo   = cpu_to_le32(i_blocks);
5070                raw_inode->i_blocks_high = 0;
5071                ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE);
5072                return 0;
5073        }
5074        if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE))
5075                return -EFBIG;
5076
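        /*
         * 0xffffffffffff is 2^48 - 1: the largest count that still fits in
         * i_blocks_lo + i_blocks_high when expressed in 512-byte units.
         * Anything bigger is stored in filesystem-block units instead, and
         * the HUGE_FILE inode flag records that choice.
         */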
5077        if (i_blocks <= 0xffffffffffffULL) {
5078                /*
5079                 * i_blocks can be represented in a 48 bit variable
5080                 * as a multiple of 512 bytes
5081                 */
5082                raw_inode->i_blocks_lo   = cpu_to_le32(i_blocks);
5083                raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
5084                ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE);
5085        } else {
5086                ext4_set_inode_flag(inode, EXT4_INODE_HUGE_FILE);
5087                /* i_blocks is stored in units of the filesystem block size */
5088                i_blocks = i_blocks >> (inode->i_blkbits - 9);
5089                raw_inode->i_blocks_lo   = cpu_to_le32(i_blocks);
5090                raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
5091        }
5092        return 0;
5093}
5094
5095/*
5096 * Post the struct inode info into an on-disk inode location in the
5097 * buffer-cache.  This gobbles the caller's reference to the
5098 * buffer_head in the inode location struct.
5099 *
5100 * The caller must have write access to iloc->bh.
5101 */
5102static int ext4_do_update_inode(handle_t *handle,
5103                                struct inode *inode,
5104                                struct ext4_iloc *iloc)
5105{
5106        struct ext4_inode *raw_inode = ext4_raw_inode(iloc);
5107        struct ext4_inode_info *ei = EXT4_I(inode);
5108        struct buffer_head *bh = iloc->bh;
5109        int err = 0, rc, block;
5110
5111        /* For fields not tracked in the in-memory inode,
5112         * initialise them to zero for new inodes. */
5113        if (ext4_test_inode_state(inode, EXT4_STATE_NEW))
5114                memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
5115
5116        ext4_get_inode_flags(ei);
5117        raw_inode->i_mode = cpu_to_le16(inode->i_mode);
5118        if (!(test_opt(inode->i_sb, NO_UID32))) {
5119                raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid));
5120                raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid));
5121/*
5122 * Fix up interoperability with old kernels. Otherwise, old inodes get
5123 * re-used with the upper 16 bits of the uid/gid intact
5124 */
5125                if (!ei->i_dtime) {
5126                        raw_inode->i_uid_high =
5127                                cpu_to_le16(high_16_bits(inode->i_uid));
5128                        raw_inode->i_gid_high =
5129                                cpu_to_le16(high_16_bits(inode->i_gid));
5130                } else {
5131                        raw_inode->i_uid_high = 0;
5132                        raw_inode->i_gid_high = 0;
5133                }
5134        } else {
5135                raw_inode->i_uid_low =
5136                        cpu_to_le16(fs_high2lowuid(inode->i_uid));
5137                raw_inode->i_gid_low =
5138                        cpu_to_le16(fs_high2lowgid(inode->i_gid));
5139                raw_inode->i_uid_high = 0;
5140                raw_inode->i_gid_high = 0;
5141        }
5142        raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
5143
5144        EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode);
5145        EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode);
5146        EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode);
5147        EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode);
5148
5149        if (ext4_inode_blocks_set(handle, raw_inode, ei))
5150                goto out_brelse;
5151        raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
5152        raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF);
5153        if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
5154            cpu_to_le32(EXT4_OS_HURD))
5155                raw_inode->i_file_acl_high =
5156                        cpu_to_le16(ei->i_file_acl >> 32);
5157        raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
5158        ext4_isize_set(raw_inode, ei->i_disksize);
5159        if (ei->i_disksize > 0x7fffffffULL) {
5160                struct super_block *sb = inode->i_sb;
5161                if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
5162                                EXT4_FEATURE_RO_COMPAT_LARGE_FILE) ||
5163                                EXT4_SB(sb)->s_es->s_rev_level ==
5164                                cpu_to_le32(EXT4_GOOD_OLD_REV)) {
5165                        /* If this is the first large file
5166                         * created, add a flag to the superblock.
5167                         */
5168                        err = ext4_journal_get_write_access(handle,
5169                                        EXT4_SB(sb)->s_sbh);
5170                        if (err)
5171                                goto out_brelse;
5172                        ext4_update_dynamic_rev(sb);
5173                        EXT4_SET_RO_COMPAT_FEATURE(sb,
5174                                        EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
5175                        sb->s_dirt = 1;
5176                        ext4_handle_sync(handle);
5177                        err = ext4_handle_dirty_metadata(handle, NULL,
5178                                        EXT4_SB(sb)->s_sbh);
5179                }
5180        }
5181        raw_inode->i_generation = cpu_to_le32(inode->i_generation);
5182        if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
5183                if (old_valid_dev(inode->i_rdev)) {
5184                        raw_inode->i_block[0] =
5185                                cpu_to_le32(old_encode_dev(inode->i_rdev));
5186                        raw_inode->i_block[1] = 0;
5187                } else {
5188                        raw_inode->i_block[0] = 0;
5189                        raw_inode->i_block[1] =
5190                                cpu_to_le32(new_encode_dev(inode->i_rdev));
5191                        raw_inode->i_block[2] = 0;
5192                }
5193        } else
5194                for (block = 0; block < EXT4_N_BLOCKS; block++)
5195                        raw_inode->i_block[block] = ei->i_data[block];
5196
5197        raw_inode->i_disk_version = cpu_to_le32(inode->i_version);
5198        if (ei->i_extra_isize) {
5199                if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
5200                        raw_inode->i_version_hi =
5201                        cpu_to_le32(inode->i_version >> 32);
5202                raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
5203        }
5204
5205        BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
5206        rc = ext4_handle_dirty_metadata(handle, NULL, bh);
5207        if (!err)
5208                err = rc;
5209        ext4_clear_inode_state(inode, EXT4_STATE_NEW);
5210
5211        ext4_update_inode_fsync_trans(handle, inode, 0);
5212out_brelse:
5213        brelse(bh);
5214        ext4_std_error(inode->i_sb, err);
5215        return err;
5216}
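
/*
 * Illustrative note: the 0x7fffffffULL check above is the historic
 * 2^31 - 1 byte (2 GiB - 1) ext2 file size limit.  Roughly, the first time
 * any inode's i_disksize crosses that boundary the superblock buffer is
 * journalled, EXT4_FEATURE_RO_COMPAT_LARGE_FILE is set and a synchronous
 * commit is requested, so that older kernels which cannot interpret large
 * i_size values will only be able to mount the filesystem read-only.
 */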
5217
5218/*
5219 * ext4_write_inode()
5220 *
5221 * We are called from a few places:
5222 *
5223 * - Within generic_file_write() for O_SYNC files.
5224 *   Here, there will be no transaction running. We wait for any running
5225 *   transaction to commit.
5226 *
5227 * - Within sys_sync(), kupdate and such.
5228 *   We wait on commit, if told to.
5229 *
5230 * - Within prune_icache() (PF_MEMALLOC == true)
5231 *   Here we simply return.  We can't afford to block kswapd on the
5232 *   journal commit.
5233 *
5234 * In all cases it is actually safe for us to return without doing anything,
5235 * because the inode has been copied into a raw inode buffer in
5236 * ext4_mark_inode_dirty().  This is a correctness thing for O_SYNC and for
5237 * knfsd.
5238 *
5239 * Note that we are absolutely dependent upon all inode dirtiers doing the
5240 * right thing: they *must* call mark_inode_dirty() after dirtying info in
5241 * which we are interested.
5242 *
5243 * It would be a bug for them to not do this.  The code:
5244 *
5245 *      mark_inode_dirty(inode)
5246 *      stuff();
5247 *      inode->i_size = expr;
5248 *
5249 * is in error because a kswapd-driven write_inode() could occur while
5250 * `stuff()' is running, and the new i_size will be lost.  Plus the inode
5251 * will no longer be on the superblock's dirty inode list.
5252 */
5253int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
5254{
5255        int err;
5256
5257        if (current->flags & PF_MEMALLOC)
5258                return 0;
5259
5260        if (EXT4_SB(inode->i_sb)->s_journal) {
5261                if (ext4_journal_current_handle()) {
5262                        jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n");
5263                        dump_stack();
5264                        return -EIO;
5265                }
5266
5267                if (wbc->sync_mode != WB_SYNC_ALL)
5268                        return 0;
5269
5270                err = ext4_force_commit(inode->i_sb);
5271        } else {
5272                struct ext4_iloc iloc;
5273
5274                err = __ext4_get_inode_loc(inode, &iloc, 0);
5275                if (err)
5276                        return err;
5277                if (wbc->sync_mode == WB_SYNC_ALL)
5278                        sync_dirty_buffer(iloc.bh);
5279                if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
5280                        EXT4_ERROR_INODE_BLOCK(inode, iloc.bh->b_blocknr,
5281                                         "IO error syncing inode");
5282                        err = -EIO;
5283                }
5284                brelse(iloc.bh);
5285        }
5286        return err;
5287}
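
/*
 * Illustrative note on the two paths above: with a journal, WB_SYNC_ALL
 * (data integrity writeback, e.g. sys_sync or unmount) is handled by
 * forcing a commit, since ext4_mark_inode_dirty() already copied the inode
 * into the journalled buffer, while WB_SYNC_NONE (background writeback) can
 * simply return.  Without a journal we write the raw inode buffer directly
 * and, for WB_SYNC_ALL, wait on it with sync_dirty_buffer().
 */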
5288
5289/*
5290 * ext4_setattr()
5291 *
5292 * Called from notify_change.
5293 *
5294 * We want to trap VFS attempts to truncate the file as soon as
5295 * possible.  In particular, we want to make sure that when the VFS
5296 * shrinks i_size, we put the inode on the orphan list and modify
5297 * i_disksize immediately, so that during the subsequent flushing of
5298 * dirty pages and freeing of disk blocks, we can guarantee that any
5299 * commit will leave the blocks being flushed in an unused state on
5300 * disk.  (On recovery, the inode will get truncated and the blocks will
5301 * be freed, so we have a strong guarantee that no future commit will
5302 * leave these blocks visible to the user.)
5303 *
5304 * Another thing we have to ensure is that if we are in ordered mode
5305 * and the inode is still attached to the committing transaction, we must
5306 * start writeout of all the dirty pages which are being truncated.
5307 * This way we are sure that all the data written in the previous
5308 * transaction are already on disk (truncate waits for pages under
5309 * writeback).
5310 *
5311 * Called with inode->i_mutex down.
5312 */
5313int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5314{
5315        struct inode *inode = dentry->d_inode;
5316        int error, rc = 0;
5317        int orphan = 0;
5318        const unsigned int ia_valid = attr->ia_valid;
5319
5320        error = inode_change_ok(inode, attr);
5321        if (error)
5322                return error;
5323
5324        if (is_quota_modification(inode, attr))
5325                dquot_initialize(inode);
5326        if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
5327                (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
5328                handle_t *handle;
5329
5330                /* (user+group)*(old+new) structure, inode write (sb,
5331                 * inode block, ? - but truncate inode update has it) */
5332                handle = ext4_journal_start(inode, (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+
5333                                        EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb))+3);
5334                if (IS_ERR(handle)) {
5335                        error = PTR_ERR(handle);
5336                        goto err_out;
5337                }
5338                error = dquot_transfer(inode, attr);
5339                if (error) {
5340                        ext4_journal_stop(handle);
5341                        return error;
5342                }
5343                /* Update corresponding info in inode so that everything is in
5344                 * one transaction */
5345                if (attr->ia_valid & ATTR_UID)
5346                        inode->i_uid = attr->ia_uid;
5347                if (attr->ia_valid & ATTR_GID)
5348                        inode->i_gid = attr->ia_gid;
5349                error = ext4_mark_inode_dirty(handle, inode);
5350                ext4_journal_stop(handle);
5351        }
5352
5353        if (attr->ia_valid & ATTR_SIZE) {
5354                if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
5355                        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
5356
5357                        if (attr->ia_size > sbi->s_bitmap_maxbytes)
5358                                return -EFBIG;
5359                }
5360        }
5361
5362        if (S_ISREG(inode->i_mode) &&
5363            attr->ia_valid & ATTR_SIZE &&
5364            (attr->ia_size < inode->i_size ||
5365             (ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)))) {
5366                handle_t *handle;
5367
5368                handle = ext4_journal_start(inode, 3);
5369                if (IS_ERR(handle)) {
5370                        error = PTR_ERR(handle);
5371                        goto err_out;
5372                }
5373                if (ext4_handle_valid(handle)) {
5374                        error = ext4_orphan_add(handle, inode);
5375                        orphan = 1;
5376                }
5377                EXT4_I(inode)->i_disksize = attr->ia_size;
5378                rc = ext4_mark_inode_dirty(handle, inode);
5379                if (!error)
5380                        error = rc;
5381                ext4_journal_stop(handle);
5382
5383                if (ext4_should_order_data(inode)) {
5384                        error = ext4_begin_ordered_truncate(inode,
5385                                                            attr->ia_size);
5386                        if (error) {
5387                                /* Do as much error cleanup as possible */
5388                                handle = ext4_journal_start(inode, 3);
5389                                if (IS_ERR(handle)) {
5390                                        ext4_orphan_del(NULL, inode);
5391                                        goto err_out;
5392                                }
5393                                ext4_orphan_del(handle, inode);
5394                                orphan = 0;
5395                                ext4_journal_stop(handle);
5396                                goto err_out;
5397                        }
5398                }
5399                /* ext4_truncate will clear the flag */
5400                if ((ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)))
5401                        ext4_truncate(inode);
5402        }
5403
5404        if ((attr->ia_valid & ATTR_SIZE) &&
5405            attr->ia_size != i_size_read(inode))
5406                rc = vmtruncate(inode, attr->ia_size);
5407
5408        if (!rc) {
5409                setattr_copy(inode, attr);
5410                mark_inode_dirty(inode);
5411        }
5412
5413        /*
5414         * If the call to ext4_truncate failed to get a transaction handle at
5415         * all, we need to clean up the in-core orphan list manually.
5416         */
5417        if (orphan && inode->i_nlink)
5418                ext4_orphan_del(NULL, inode);
5419
5420        if (!rc && (ia_valid & ATTR_MODE))
5421                rc = ext4_acl_chmod(inode);
5422
5423err_out:
5424        ext4_std_error(inode->i_sb, error);
5425        if (!error)
5426                error = rc;
5427        return error;
5428}
5429
5430int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
5431                 struct kstat *stat)
5432{
5433        struct inode *inode;
5434        unsigned long delalloc_blocks;
5435
5436        inode = dentry->d_inode;
5437        generic_fillattr(inode, stat);
5438
5439        /*
5440         * We can't update i_blocks if the block allocation is delayed;
5441         * otherwise, in the case of a system crash before the real block
5442         * allocation is done, we would have i_blocks inconsistent with
5443         * the on-disk file blocks.
5444         * We always keep i_blocks updated together with the real
5445         * allocation. But to avoid confusing userspace, stat
5446         * will return a block count that includes the delayed allocation
5447         * blocks for this file.
5448         */
5449        delalloc_blocks = EXT4_I(inode)->i_reserved_data_blocks;
5450
5451        stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9;
5452        return 0;
5453}
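
/*
 * Worked example (illustrative): the shift above converts filesystem blocks
 * into the 512-byte sectors that stat->blocks is expressed in.  With a 4k
 * block size (s_blocksize_bits == 12) each reserved delayed-allocation
 * block contributes
 *
 *	(1 << 12) >> 9 == 8
 *
 * sectors, so 100 reserved blocks show up as an extra 800 in st_blocks.
 */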
5454
5455static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks,
5456                                      int chunk)
5457{
5458        int indirects;
5459
5460        /* if nrblocks are contiguous */
5461        if (chunk) {
5462                /*
5463                 * With N contiguous data blocks, we need at most
5464                 * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) indirect blocks
5465                 * 2 dindirect blocks
5466                 * 1 tindirect block
5467                 */
5468                indirects = nrblocks / EXT4_ADDR_PER_BLOCK(inode->i_sb);
5469                return indirects + 3;
5470        }
5471        /*
5472         * If nrblocks are not contiguous, then in the worst case each block
5473         * touches an indirect block, and each indirect block touches a double
5474         * indirect block, plus a triple indirect block.
5475         */
5476        indirects = nrblocks * 2 + 1;
5477        return indirects;
5478}
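
/*
 * Worked example (illustrative): with 4k blocks EXT4_ADDR_PER_BLOCK() is
 * 1024, so a contiguous chunk of 64 blocks is charged at most
 * 64/1024 == 0 indirect blocks plus the 3 blocks reserved for the double
 * and triple indirect levels, i.e. 3 metadata blocks.  The same 64 blocks
 * scattered across the file get the pessimistic 64 * 2 + 1 == 129.
 */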
5479
5480static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
5481{
5482        if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
5483                return ext4_indirect_trans_blocks(inode, nrblocks, chunk);
5484        return ext4_ext_index_trans_blocks(inode, nrblocks, chunk);
5485}
5486
5487/*
5488 * Account for index blocks, block group bitmaps and block group
5489 * descriptor blocks if we modify data blocks and index blocks.
5490 * In the worst case, the index blocks spread over different block groups.
5491 *
5492 * If the data blocks are discontiguous, they may spread over
5493 * different block groups too. If they are contiguous, with flexbg,
5494 * they could still cross a block group boundary.
5495 *
5496 * Also account for the superblock, inode, quota and xattr blocks.
5497 */
5498static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
5499{
5500        ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb);
5501        int gdpblocks;
5502        int idxblocks;
5503        int ret = 0;
5504
5505        /*
5506         * How many index blocks do we need to touch to modify nrblocks?
5507         * The "Chunk" flag indicates whether the nrblocks are
5508         * physically contiguous on disk.
5509         *
5510         * Direct IO and fallocate call get_block() to allocate
5511         * a single extent at a time, so they can set the "Chunk" flag.
5512         */
5513        idxblocks = ext4_index_trans_blocks(inode, nrblocks, chunk);
5514
5515        ret = idxblocks;
5516
5517        /*
5518         * Now let's see how many group bitmaps and group descriptors we
5519         * need to account for.
5520         */
5521        groups = idxblocks;
5522        if (chunk)
5523                groups += 1;
5524        else
5525                groups += nrblocks;
5526
5527        gdpblocks = groups;
5528        if (groups > ngroups)
5529                groups = ngroups;
5530        if (groups > EXT4_SB(inode->i_sb)->s_gdb_count)
5531                gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count;
5532
5533        /* bitmaps and block group descriptor blocks */
5534        ret += groups + gdpblocks;
5535
5536        /* Blocks for super block, inode, quota and xattr blocks */
5537        ret += EXT4_META_TRANS_BLOCKS(inode->i_sb);
5538
5539        return ret;
5540}
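
/*
 * Worked example (illustrative), for an indirect-mapped inode and a single
 * discontiguous block (nrblocks == 1, chunk == 0):
 *
 *	idxblocks = 1 * 2 + 1 = 3
 *	groups    = idxblocks + nrblocks = 4
 *	ret       = idxblocks + groups + gdpblocks +
 *		    EXT4_META_TRANS_BLOCKS(sb)
 *
 * where groups is capped at the number of block groups in the filesystem
 * and gdpblocks at the number of group descriptor blocks.
 */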
5541
5542/*
5543 * Calculate the total number of credits to reserve to fit
5544 * the modification of a single page into a single transaction,
5545 * which may include multiple chunks of block allocations.
5546 *
5547 * This could be called via ext4_write_begin()
5548 *
5549 * We need to consider the worst case, when
5550 * one new block is allocated per extent.
5551 */
5552int ext4_writepage_trans_blocks(struct inode *inode)
5553{
5554        int bpp = ext4_journal_blocks_per_page(inode);
5555        int ret;
5556
5557        ret = ext4_meta_trans_blocks(inode, bpp, 0);
5558
5559        /* Account for data blocks for journalled mode */
5560        if (ext4_should_journal_data(inode))
5561                ret += bpp;
5562        return ret;
5563}
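
/*
 * Illustrative example: with 4k pages and 4k blocks there is one block per
 * page, so the estimate is ext4_meta_trans_blocks(inode, 1, 0), plus one
 * extra credit for the data block itself when the inode is in data=journal
 * mode.  With 1k blocks on the same machine bpp is 4 and the estimate grows
 * accordingly.
 */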
5564
5565/*
5566 * Calculate the journal credits for a chunk of data modification.
5567 *
5568 * This is called from DIO, fallocate or whoever calling
5569 * ext4_map_blocks() to map/allocate a chunk of contiguous disk blocks.
5570 *
5571 * journal buffers for data blocks are not included here, as DIO
5572 * and fallocate do not need to journal data buffers.
5573 */
5574int ext4_chunk_trans_blocks(struct inode *inode, int nrblocks)
5575{
5576        return ext4_meta_trans_blocks(inode, nrblocks, 1);
5577}
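
/*
 * Sketch of the typical caller pattern (illustrative only; see the extents
 * and direct IO code for the real thing).  A writer about to map/allocate
 * up to "max_blocks" contiguous blocks would size its handle as:
 *
 *	credits = ext4_chunk_trans_blocks(inode, max_blocks);
 *	handle = ext4_journal_start(inode, credits);
 *	if (IS_ERR(handle))
 *		return PTR_ERR(handle);
 *	ret = ext4_map_blocks(handle, inode, &map, EXT4_GET_BLOCKS_CREATE);
 *	...
 *	ext4_journal_stop(handle);
 */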
5578
5579/*
5580 * The caller must have previously called ext4_reserve_inode_write().
5581 * Given this, we know that the caller already has write access to iloc->bh.
5582 */
5583int ext4_mark_iloc_dirty(handle_t *handle,
5584                         struct inode *inode, struct ext4_iloc *iloc)
5585{
5586        int err = 0;
5587
5588        if (test_opt(inode->i_sb, I_VERSION))
5589                inode_inc_iversion(inode);
5590
5591        /* ext4_do_update_inode() consumes one bh reference (bh->b_count) */
5592        get_bh(iloc->bh);
5593
5594        /* ext4_do_update_inode() does jbd2_journal_dirty_metadata */
5595        err = ext4_do_update_inode(handle, inode, iloc);
5596        put_bh(iloc->bh);
5597        return err;
5598}
5599
5600/*
5601 * On success, we end up with an outstanding reference count against
5602 * iloc->bh.  This _must_ be cleaned up later.
5603 */
5604
5605int
5606ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
5607                         struct ext4_iloc *iloc)
5608{
5609        int err;
5610
5611        err = ext4_get_inode_loc(inode, iloc);
5612        if (!err) {
5613                BUFFER_TRACE(iloc->bh, "get_write_access");
5614                err = ext4_journal_get_write_access(handle, iloc->bh);
5615                if (err) {
5616                        brelse(iloc->bh);
5617                        iloc->bh = NULL;
5618                }
5619        }
5620        ext4_std_error(inode->i_sb, err);
5621        return err;
5622}
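
/*
 * Sketch of the usual calling convention (illustrative): reserve write
 * access to the inode's buffer, update the in-core inode, then push the
 * change through the iloc:
 *
 *	struct ext4_iloc iloc;
 *	int err = ext4_reserve_inode_write(handle, inode, &iloc);
 *
 *	if (!err) {
 *		... update in-core inode fields here ...
 *		err = ext4_mark_iloc_dirty(handle, inode, &iloc);
 *	}
 *
 * ext4_mark_inode_dirty() below is essentially this pattern plus an
 * opportunistic attempt to expand i_extra_isize.
 */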
5623
5624/*
5625 * Expand an inode by new_extra_isize bytes.
5626 * Returns 0 on success or negative error number on failure.
5627 */
5628static int ext4_expand_extra_isize(struct inode *inode,
5629                                   unsigned int new_extra_isize,
5630                                   struct ext4_iloc iloc,
5631                                   handle_t *handle)
5632{
5633        struct ext4_inode *raw_inode;
5634        struct ext4_xattr_ibody_header *header;
5635
5636        if (EXT4_I(inode)->i_extra_isize >= new_extra_isize)
5637                return 0;
5638
5639        raw_inode = ext4_raw_inode(&iloc);
5640
5641        header = IHDR(inode, raw_inode);
5642
5643        /* No extended attributes present */
5644        if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) ||
5645            header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) {
5646                memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0,
5647                        new_extra_isize);
5648                EXT4_I(inode)->i_extra_isize = new_extra_isize;
5649                return 0;
5650        }
5651
5652        /* try to expand with EAs present */
5653        return ext4_expand_extra_isize_ea(inode, new_extra_isize,
5654                                          raw_inode, handle);
5655}
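
/*
 * Illustrative note: the "extra" space being expanded here is the region
 * between the original 128-byte ext2/ext3 inode (EXT4_GOOD_OLD_INODE_SIZE)
 * and the larger on-disk inode ext4 normally uses (256 bytes by default).
 * It holds fields such as the nanosecond timestamps and i_crtime, followed
 * by any in-inode extended attributes, which is why the xattr code has to
 * get involved when EAs are already present.
 */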
5656
5657/*
5658 * What we do here is to mark the in-core inode as clean with respect to inode
5659 * dirtiness (it may still be data-dirty).
5660 * This means that the in-core inode may be reaped by prune_icache
5661 * without having to perform any I/O.  This is a very good thing,
5662 * because *any* task may call prune_icache - even ones which
5663 * have a transaction open against a different journal.
5664 *
5665 * Is this cheating?  Not really.  Sure, we haven't written the
5666 * inode out, but prune_icache isn't a user-visible syncing function.
5667 * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync)
5668 * we start and wait on commits.
5669 *
5670 * Is this efficient/effective?  Well, we're being nice to the system
5671 * by cleaning up our inodes proactively so they can be reaped
5672 * without I/O.  But we are potentially leaving up to five seconds'
5673 * worth of inodes floating about which prune_icache wants us to
5674 * write out.  One way to fix that would be to get prune_icache()
5675 * to do a write_super() to free up some memory.  It has the desired
5676 * effect.
5677 */
5678int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
5679{
5680        struct ext4_iloc iloc;
5681        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
5682        static unsigned int mnt_count;
5683        int err, ret;
5684
5685        might_sleep();
5686        trace_ext4_mark_inode_dirty(inode, _RET_IP_);
5687        err = ext4_reserve_inode_write(handle, inode, &iloc);
5688        if (ext4_handle_valid(handle) &&
5689            EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&
5690            !ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) {
5691                /*
5692                 * We need extra buffer credits since we may write into EA block
5693                 * with this same handle. If journal_extend fails, then it will
5694                 * only result in a minor loss of functionality for that inode.
5695                 * If this is felt to be critical, then e2fsck should be run to
5696                 * force a large enough s_min_extra_isize.
5697                 */
5698                if ((jbd2_journal_extend(handle,
5699                             EXT4_DATA_TRANS_BLOCKS(inode->i_sb))) == 0) {
5700                        ret = ext4_expand_extra_isize(inode,
5701                                                      sbi->s_want_extra_isize,
5702                                                      iloc, handle);
5703                        if (ret) {
5704                                ext4_set_inode_state(inode,
5705                                                     EXT4_STATE_NO_EXPAND);
5706                                if (mnt_count !=
5707                                        le16_to_cpu(sbi->s_es->s_mnt_count)) {
5708                                        ext4_warning(inode->i_sb,
5709                                        "Unable to expand inode %lu. Delete"
5710                                        " some EAs or run e2fsck.",
5711                                        inode->i_ino);
5712                                        mnt_count =
5713                                          le16_to_cpu(sbi->s_es->s_mnt_count);
5714                                }
5715                        }
5716                }
5717        }
5718        if (!err)
5719                err = ext4_mark_iloc_dirty(handle, inode, &iloc);
5720        return err;
5721}
5722
5723/*
5724 * ext4_dirty_inode() is called from __mark_inode_dirty()
5725 *
5726 * We're really interested in the case where a file is being extended.
5727 * i_size has been changed by generic_commit_write() and we thus need
5728 * to include the updated inode in the current transaction.
5729 *
5730 * Also, dquot_alloc_block() will always dirty the inode when blocks
5731 * are allocated to the file.
5732 *
5733 * If the inode is marked synchronous, we don't honour that here - doing
5734 * so would cause a commit on atime updates, which we don't bother doing.
5735 * We handle synchronous inodes at the highest possible level.
5736 */
5737void ext4_dirty_inode(struct inode *inode)
5738{
5739        handle_t *handle;
5740
5741        handle = ext4_journal_start(inode, 2);
5742        if (IS_ERR(handle))
5743                goto out;
5744
5745        ext4_mark_inode_dirty(handle, inode);
5746
5747        ext4_journal_stop(handle);
5748out:
5749        return;
5750}
5751
5752#if 0
5753/*
5754 * Bind an inode's backing buffer_head into this transaction, to prevent
5755 * it from being flushed to disk early.  Unlike
5756 * ext4_reserve_inode_write, this leaves behind no bh reference and
5757 * returns no iloc structure, so the caller needs to repeat the iloc
5758 * lookup to mark the inode dirty later.
5759 */
5760static int ext4_pin_inode(handle_t *handle, struct inode *inode)
5761{
5762        struct ext4_iloc iloc;
5763
5764        int err = 0;
5765        if (handle) {
5766                err = ext4_get_inode_loc(inode, &iloc);
5767                if (!err) {
5768                        BUFFER_TRACE(iloc.bh, "get_write_access");
5769                        err = jbd2_journal_get_write_access(handle, iloc.bh);
5770                        if (!err)
5771                                err = ext4_handle_dirty_metadata(handle,
5772                                                                 NULL,
5773                                                                 iloc.bh);
5774                        brelse(iloc.bh);
5775                }
5776        }
5777        ext4_std_error(inode->i_sb, err);
5778        return err;
5779}
5780#endif
5781
5782int ext4_change_inode_journal_flag(struct inode *inode, int val)
5783{
5784        journal_t *journal;
5785        handle_t *handle;
5786        int err;
5787
5788        /*
5789         * We have to be very careful here: changing a data block's
5790         * journaling status dynamically is dangerous.  If we write a
5791         * data block to the journal, change the status and then delete
5792         * that block, we risk forgetting to revoke the old log record
5793         * from the journal and so a subsequent replay can corrupt data.
5794         * So, first we make sure that the journal is empty and that
5795         * nobody is changing anything.
5796         */
5797
5798        journal = EXT4_JOURNAL(inode);
5799        if (!journal)
5800                return 0;
5801        if (is_journal_aborted(journal))
5802                return -EROFS;
5803
5804        jbd2_journal_lock_updates(journal);
5805        jbd2_journal_flush(journal);
5806
5807        /*
5808         * OK, there are no updates running now, and all cached data is
5809         * synced to disk.  We are now in a completely consistent state
5810         * which doesn't have anything in the journal, and we know that
5811         * no filesystem updates are running, so it is safe to modify
5812         * the inode's in-core data-journaling state flag now.
5813         */
5814
5815        if (val)
5816                ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
5817        else
5818                ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
5819        ext4_set_aops(inode);
5820
5821        jbd2_journal_unlock_updates(journal);
5822
5823        /* Finally we can mark the inode as dirty. */
5824
5825        handle = ext4_journal_start(inode, 1);
5826        if (IS_ERR(handle))
5827                return PTR_ERR(handle);
5828
5829        err = ext4_mark_inode_dirty(handle, inode);
5830        ext4_handle_sync(handle);
5831        ext4_journal_stop(handle);
5832        ext4_std_error(inode->i_sb, err);
5833
5834        return err;
5835}
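
/*
 * Illustrative note: this path is taken when the per-inode "journal data"
 * flag is toggled after the fact, typically via the SETFLAGS ioctl (what
 * "chattr +j" / "chattr -j" do).  A hypothetical userspace sketch:
 *
 *	int flags;
 *
 *	ioctl(fd, FS_IOC_GETFLAGS, &flags);
 *	flags |= FS_JOURNAL_DATA_FL;
 *	ioctl(fd, FS_IOC_SETFLAGS, &flags);
 *
 * The flush-then-flip sequence above is what makes such a switch safe.
 */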
5836
5837static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh)
5838{
5839        return !buffer_mapped(bh);
5840}
5841
5842int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
5843{
5844        struct page *page = vmf->page;
5845        loff_t size;
5846        unsigned long len;
5847        int ret = -EINVAL;
5848        void *fsdata;
5849        struct file *file = vma->vm_file;
5850        struct inode *inode = file->f_path.dentry->d_inode;
5851        struct address_space *mapping = inode->i_mapping;
5852
5853        /*
5854         * Get i_alloc_sem to stop truncates messing with the inode. We cannot
5855         * get i_mutex because we are already holding mmap_sem.
5856         */
5857        down_read(&inode->i_alloc_sem);
5858        size = i_size_read(inode);
5859        if (page->mapping != mapping || size <= page_offset(page)
5860            || !PageUptodate(page)) {
5861                /* page got truncated from under us? */
5862                goto out_unlock;
5863        }
5864        ret = 0;
5865        if (PageMappedToDisk(page))
5866                goto out_unlock;
5867
5868        if (page->index == size >> PAGE_CACHE_SHIFT)
5869                len = size & ~PAGE_CACHE_MASK;
5870        else
5871                len = PAGE_CACHE_SIZE;
5872
5873        lock_page(page);
5874        /*
5875         * Return if we have all the buffers mapped. This avoids
5876         * the need to call write_begin/write_end, which does a
5877         * journal_start/journal_stop that can block and take a
5878         * long time.
5879         */
5880        if (page_has_buffers(page)) {
5881                if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
5882                                        ext4_bh_unmapped)) {
5883                        unlock_page(page);
5884                        goto out_unlock;
5885                }
5886        }
5887        unlock_page(page);
5888        /*
5889         * OK, we need to fill the hole... Do write_begin/write_end
5890         * to do block allocation/reservation. We are not holding
5891         * inode->i_mutex here. That allows parallel write_begin,
5892         * write_end calls. lock_page prevents this from happening
5893         * on the same page, though.
5894         */
5895        ret = mapping->a_ops->write_begin(file, mapping, page_offset(page),
5896                        len, AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata);
5897        if (ret < 0)
5898                goto out_unlock;
5899        ret = mapping->a_ops->write_end(file, mapping, page_offset(page),
5900                        len, len, page, fsdata);
5901        if (ret < 0)
5902                goto out_unlock;
5903        ret = 0;
5904out_unlock:
5905        if (ret)
5906                ret = VM_FAULT_SIGBUS;
5907        up_read(&inode->i_alloc_sem);
5908        return ret;
5909}
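
/*
 * Worked example (illustrative) for the length computation above: with 4k
 * pages, a 10000-byte file has its last page at index 2, so for that page
 * len = 10000 & ~PAGE_CACHE_MASK = 1808 bytes, while every earlier page is
 * written back in full (len = PAGE_CACHE_SIZE).
 */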
5910