linux/fs/gfs2/bmap.c
<<
>>
Prefs
   1/*
   2 * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
   3 * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
   4 *
   5 * This copyrighted material is made available to anyone wishing to use,
   6 * modify, copy, or redistribute it subject to the terms and conditions
   7 * of the GNU General Public License version 2.
   8 */
   9
  10#include <linux/spinlock.h>
  11#include <linux/completion.h>
  12#include <linux/buffer_head.h>
  13#include <linux/blkdev.h>
  14#include <linux/gfs2_ondisk.h>
  15#include <linux/crc32.h>
  16#include <linux/iomap.h>
  17#include <linux/ktime.h>
  18
  19#include "gfs2.h"
  20#include "incore.h"
  21#include "bmap.h"
  22#include "glock.h"
  23#include "inode.h"
  24#include "meta_io.h"
  25#include "quota.h"
  26#include "rgrp.h"
  27#include "log.h"
  28#include "super.h"
  29#include "trans.h"
  30#include "dir.h"
  31#include "util.h"
  32#include "aops.h"
  33#include "trace_gfs2.h"
  34
  35/* This doesn't need to be that large as max 64 bit pointers in a 4k
  36 * block is 512, so __u16 is fine for that. It saves stack space to
  37 * keep it small.
     *
     * A metapath describes one path through an inode's metadata tree:
     * mp_bh[h] is the buffer at height h (mp_bh[0] is the dinode buffer)
     * and mp_list[h] is the index of the pointer to follow in that buffer.
  38 */
  39struct metapath {
  40        struct buffer_head *mp_bh[GFS2_MAX_META_HEIGHT];
  41        __u16 mp_list[GFS2_MAX_META_HEIGHT];
  42        int mp_fheight; /* find_metapath height */
  43        int mp_aheight; /* actual height (lookup height) */
  44};
  45
  46static int punch_hole(struct gfs2_inode *ip, u64 offset, u64 length);
  47
  48/**
  49 * gfs2_unstuffer_page - unstuff a stuffed inode into a block cached by a page
  50 * @ip: the inode
  51 * @dibh: the dinode buffer
  52 * @block: the block number that was allocated
  53 * @page: The (optional) page. This is looked up if @page is NULL
  54 *
  55 * Returns: errno
  56 */
  57
  58static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
  59                               u64 block, struct page *page)
  60{
  61        struct inode *inode = &ip->i_inode;
  62        struct buffer_head *bh;
  63        int release = 0;
  64
  65        if (!page || page->index) {
  66                page = find_or_create_page(inode->i_mapping, 0, GFP_NOFS);
  67                if (!page)
  68                        return -ENOMEM;
  69                release = 1;
  70        }
  71
  72        if (!PageUptodate(page)) {
  73                void *kaddr = kmap(page);
  74                u64 dsize = i_size_read(inode);
  75 
  76                if (dsize > gfs2_max_stuffed_size(ip))
  77                        dsize = gfs2_max_stuffed_size(ip);
  78
  79                memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize);
  80                memset(kaddr + dsize, 0, PAGE_SIZE - dsize);
  81                kunmap(page);
  82
  83                SetPageUptodate(page);
  84        }
  85
  86        if (!page_has_buffers(page))
  87                create_empty_buffers(page, BIT(inode->i_blkbits),
  88                                     BIT(BH_Uptodate));
  89
  90        bh = page_buffers(page);
  91
  92        if (!buffer_mapped(bh))
  93                map_bh(bh, inode->i_sb, block);
  94
  95        set_buffer_uptodate(bh);
  96        if (gfs2_is_jdata(ip))
  97                gfs2_trans_add_data(ip->i_gl, bh);
  98        else {
  99                mark_buffer_dirty(bh);
 100                gfs2_ordered_add_inode(ip);
 101        }
 102
 103        if (release) {
 104                unlock_page(page);
 105                put_page(page);
 106        }
 107
 108        return 0;
 109}
 110
 111/**
 112 * gfs2_unstuff_dinode - Unstuff a dinode when the data has grown too big
 113 * @ip: The GFS2 inode to unstuff
 114 * @page: The (optional) page. This is looked up if the @page is NULL
 115 *
 116 * This routine unstuffs a dinode and returns it to a "normal" state such
 117 * that the height can be grown in the traditional way.
 118 *
 119 * Returns: errno
 120 */
 121
 122int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
 123{
 124        struct buffer_head *bh, *dibh;
 125        struct gfs2_dinode *di;
 126        u64 block = 0;
 127        int isdir = gfs2_is_dir(ip);
 128        int error;
 129
 130        down_write(&ip->i_rw_mutex);
 131
 132        error = gfs2_meta_inode_buffer(ip, &dibh);
 133        if (error)
 134                goto out;
 135
 136        if (i_size_read(&ip->i_inode)) {
 137                /* Get a free block, fill it with the stuffed data,
 138                   and write it out to disk */
 139
 140                unsigned int n = 1;
 141                error = gfs2_alloc_blocks(ip, &block, &n, 0, NULL);
 142                if (error)
 143                        goto out_brelse;
 144                if (isdir) {
 145                        gfs2_trans_add_unrevoke(GFS2_SB(&ip->i_inode), block, 1);
 146                        error = gfs2_dir_get_new_buffer(ip, block, &bh);
 147                        if (error)
 148                                goto out_brelse;
 149                        gfs2_buffer_copy_tail(bh, sizeof(struct gfs2_meta_header),
 150                                              dibh, sizeof(struct gfs2_dinode));
 151                        brelse(bh);
 152                } else {
 153                        error = gfs2_unstuffer_page(ip, dibh, block, page);
 154                        if (error)
 155                                goto out_brelse;
 156                }
 157        }
 158
 159        /*  Set up the pointer to the new block  */
 160
 161        gfs2_trans_add_meta(ip->i_gl, dibh);
 162        di = (struct gfs2_dinode *)dibh->b_data;
 163        gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
 164
 165        if (i_size_read(&ip->i_inode)) {
 166                *(__be64 *)(di + 1) = cpu_to_be64(block);
 167                gfs2_add_inode_blocks(&ip->i_inode, 1);
 168                di->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
 169        }
 170
 171        ip->i_height = 1;
 172        di->di_height = cpu_to_be16(1);
 173
 174out_brelse:
 175        brelse(dibh);
 176out:
 177        up_write(&ip->i_rw_mutex);
 178        return error;
 179}
 180
 181
 182/**
 183 * find_metapath - Find path through the metadata tree
 184 * @sdp: The superblock
 185 * @block: The logical block (file offset in blocks) to look up
 186 * @mp: The metapath to return the result in
 187 * @height: The pre-calculated height of the metadata tree
 188 *
 189 *   This routine returns a struct metapath structure that defines a path
 190 *   through the metadata of inode "ip" to get to block "block".
 191 *
 192 *   Example:
 193 *   Given:  "ip" is a height 3 file, "offset" is 101342453, and this is a
 194 *   filesystem with a blocksize of 4096.
 195 *
 196 *   find_metapath() would return a struct metapath structure set to:
 197 *   mp_fheight = 3, mp_list[0] = 0, mp_list[1] = 48, and mp_list[2] = 165.
 198 *
 199 *   That means that in order to get to the block containing the byte at
 200 *   offset 101342453, we would load the indirect block pointed to by pointer
 201 *   0 in the dinode.  We would then load the indirect block pointed to by
 202 *   pointer 48 in that indirect block.  We would then load the data block
 203 *   pointed to by pointer 165 in that indirect block.
 204 *
 205 *             ----------------------------------------
 206 *             | Dinode |                             |
 207 *             |        |                            4|
 208 *             |        |0 1 2 3 4 5                 9|
 209 *             |        |                            6|
 210 *             ----------------------------------------
 211 *                       |
 212 *                       |
 213 *                       V
 214 *             ----------------------------------------
 215 *             | Indirect Block                       |
 216 *             |                                     5|
 217 *             |            4 4 4 4 4 5 5            1|
 218 *             |0           5 6 7 8 9 0 1            2|
 219 *             ----------------------------------------
 220 *                                |
 221 *                                |
 222 *                                V
 223 *             ----------------------------------------
 224 *             | Indirect Block                       |
 225 *             |                         1 1 1 1 1   5|
 226 *             |                         6 6 6 6 6   1|
 227 *             |0                        3 4 5 6 7   2|
 228 *             ----------------------------------------
 229 *                                           |
 230 *                                           |
 231 *                                           V
 232 *             ----------------------------------------
 233 *             | Data block containing offset         |
 234 *             |            101342453                 |
 235 *             |                                      |
 236 *             |                                      |
 237 *             ----------------------------------------
 238 *
 239 */
 240
 241static void find_metapath(const struct gfs2_sbd *sdp, u64 block,
 242                          struct metapath *mp, unsigned int height)
 243{
 244        unsigned int i;
 245
 246        mp->mp_fheight = height;
 247        for (i = height; i--;)
 248                mp->mp_list[i] = do_div(block, sdp->sd_inptrs);
 249}
 250
 251static inline unsigned int metapath_branch_start(const struct metapath *mp)
 252{
 253        if (mp->mp_list[0] == 0)
 254                return 2;
 255        return 1;
 256}
 257
 258/**
 259 * metaptr1 - Return the first possible metadata pointer in a metapath buffer
 260 * @height: The metadata height (0 = dinode)
 261 * @mp: The metapath
 262 */
 263static inline __be64 *metaptr1(unsigned int height, const struct metapath *mp)
 264{
 265        struct buffer_head *bh = mp->mp_bh[height];
 266        if (height == 0)
 267                return ((__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)));
 268        return ((__be64 *)(bh->b_data + sizeof(struct gfs2_meta_header)));
 269}
 270
 271/**
 272 * metapointer - Return pointer to start of metadata in a buffer
 273 * @height: The metadata height (0 = dinode)
 274 * @mp: The metapath
 275 *
 276 * Return a pointer to the block number of the next height of the metadata
 277 * tree given a buffer containing the pointer to the current height of the
 278 * metadata tree.
 279 */
 280
 281static inline __be64 *metapointer(unsigned int height, const struct metapath *mp)
 282{
 283        __be64 *p = metaptr1(height, mp);
 284        return p + mp->mp_list[height];
 285}
 286
 287static inline const __be64 *metaend(unsigned int height, const struct metapath *mp)
 288{
 289        const struct buffer_head *bh = mp->mp_bh[height];
 290        return (const __be64 *)(bh->b_data + bh->b_size);
 291}
 292
 293static void clone_metapath(struct metapath *clone, struct metapath *mp)
 294{
 295        unsigned int hgt;
 296
 297        *clone = *mp;
 298        for (hgt = 0; hgt < mp->mp_aheight; hgt++)
 299                get_bh(clone->mp_bh[hgt]);
 300}
 301
 302static void gfs2_metapath_ra(struct gfs2_glock *gl, __be64 *start, __be64 *end)
 303{
 304        const __be64 *t;
 305
 306        for (t = start; t < end; t++) {
 307                struct buffer_head *rabh;
 308
 309                if (!*t)
 310                        continue;
 311
 312                rabh = gfs2_getbuf(gl, be64_to_cpu(*t), CREATE);
 313                if (trylock_buffer(rabh)) {
 314                        if (!buffer_uptodate(rabh)) {
 315                                rabh->b_end_io = end_buffer_read_sync;
 316                                submit_bh(REQ_OP_READ,
 317                                          REQ_RAHEAD | REQ_META | REQ_PRIO,
 318                                          rabh);
 319                                continue;
 320                        }
 321                        unlock_buffer(rabh);
 322                }
 323                brelse(rabh);
 324        }
 325}
 326
 327static int __fillup_metapath(struct gfs2_inode *ip, struct metapath *mp,
 328                             unsigned int x, unsigned int h)
 329{
 330        for (; x < h; x++) {
 331                __be64 *ptr = metapointer(x, mp);
 332                u64 dblock = be64_to_cpu(*ptr);
 333                int ret;
 334
 335                if (!dblock)
 336                        break;
 337                ret = gfs2_meta_indirect_buffer(ip, x + 1, dblock, &mp->mp_bh[x + 1]);
 338                if (ret)
 339                        return ret;
 340        }
 341        mp->mp_aheight = x + 1;
 342        return 0;
 343}
 344
 345/**
 346 * lookup_metapath - Walk the metadata tree to a specific point
 347 * @ip: The inode
 348 * @mp: The metapath
 349 *
 350 * Assumes that the inode's buffer has already been looked up and
 351 * hooked onto mp->mp_bh[0] and that the metapath has been initialised
 352 * by find_metapath().
 353 *
 354 * If this function encounters part of the tree which has not been
 355 * allocated, it returns the current height of the tree at the point
 356 * at which it found the unallocated block. Blocks which are found are
 357 * added to the mp->mp_bh[] list.
 358 *
 359 * Returns: error
 360 */
 361
 362static int lookup_metapath(struct gfs2_inode *ip, struct metapath *mp)
 363{
 364        return __fillup_metapath(ip, mp, 0, ip->i_height - 1);
 365}
 366
 367/**
 368 * fillup_metapath - fill up buffers for the metadata path to a specific height
 369 * @ip: The inode
 370 * @mp: The metapath
 371 * @h: The height to which it should be mapped
 372 *
 373 * Similar to lookup_metapath, but does lookups for a range of heights
 374 *
 375 * Returns: error or the number of buffers filled
 376 */
 377
 378static int fillup_metapath(struct gfs2_inode *ip, struct metapath *mp, int h)
 379{
 380        unsigned int x = 0;
 381        int ret;
 382
 383        if (h) {
 384                /* find the first buffer we need to look up. */
 385                for (x = h - 1; x > 0; x--) {
 386                        if (mp->mp_bh[x])
 387                                break;
 388                }
 389        }
 390        ret = __fillup_metapath(ip, mp, x, h);
 391        if (ret)
 392                return ret;
 393        return mp->mp_aheight - x - 1;
 394}
 395
 396static void release_metapath(struct metapath *mp)
 397{
 398        int i;
 399
 400        for (i = 0; i < GFS2_MAX_META_HEIGHT; i++) {
 401                if (mp->mp_bh[i] == NULL)
 402                        break;
 403                brelse(mp->mp_bh[i]);
 404                mp->mp_bh[i] = NULL;
 405        }
 406}
 407
 408/**
 409 * gfs2_extent_length - Returns length of an extent of blocks
 410 * @bh: The metadata block
 411 * @ptr: Current position in @bh
 412 * @limit: Max extent length to return
 413 * @eob: Set to 1 if we hit "end of block"
 414 *
 415 * Returns: The length of the extent (minimum of one block)
 416 */
 417
 418static inline unsigned int gfs2_extent_length(struct buffer_head *bh, __be64 *ptr, size_t limit, int *eob)
 419{
 420        const __be64 *end = (__be64 *)(bh->b_data + bh->b_size);
 421        const __be64 *first = ptr;
 422        u64 d = be64_to_cpu(*ptr);
 423
 424        *eob = 0;
 425        do {
 426                ptr++;
 427                if (ptr >= end)
 428                        break;
 429                d++;
 430        } while(be64_to_cpu(*ptr) == d);
 431        if (ptr >= end)
 432                *eob = 1;
 433        return ptr - first;
 434}
 435
    /*
     * Callback used by gfs2_walk_metadata(). It is handed the metapath, a
     * range of pointers [start, end) in the buffer at the current height,
     * the number of blocks each of those pointers accounts for (factor) and
     * the caller's opaque data.
     *
     * It returns WALK_STOP to end the walk, WALK_NEXT to continue with the
     * next part of the tree, or a pointer inside [start, end) at which the
     * walk should descend to the next height.
     */
 436typedef const __be64 *(*gfs2_metadata_walker)(
 437                struct metapath *mp,
 438                const __be64 *start, const __be64 *end,
 439                u64 factor, void *data);
 440
    /* Sentinel return values for a gfs2_metadata_walker; never dereferenced. */
 441#define WALK_STOP ((__be64 *)0)
 442#define WALK_NEXT ((__be64 *)1)
 443
    /*
     * gfs2_walk_metadata - walk the metadata tree from @mp, calling @walker
     * @inode: the inode
     * @lblock: logical start block (not referenced in the function body)
     * @len: number of blocks the walk should cover
     * @mp: metapath to start from; its mp_list[] may be advanced, but
     *      buffers are only dropped or added on a private clone
     * @walker: callback invoked on each range of pointers
     * @data: opaque argument handed to @walker
     *
     * Returns 0, or the negative error returned by fillup_metapath().
     */
 444static int gfs2_walk_metadata(struct inode *inode, sector_t lblock,
 445                u64 len, struct metapath *mp, gfs2_metadata_walker walker,
 446                void *data)
 447{
 448        struct metapath clone;
 449        struct gfs2_inode *ip = GFS2_I(inode);
 450        struct gfs2_sbd *sdp = GFS2_SB(inode);
 451        const __be64 *start, *end, *ptr;
 452        u64 factor = 1;
 453        unsigned int hgt;
 454        int ret = 0;
 455
            /*
             * Start at the deepest height the metapath has a buffer for;
             * factor becomes the number of blocks one pointer at that
             * height accounts for.
             */
 456        for (hgt = ip->i_height - 1; hgt >= mp->mp_aheight; hgt--)
 457                factor *= sdp->sd_inptrs;
 458
 459        for (;;) {
 460                u64 step;
 461
 462                /* Walk indirect block. */
 463                start = metapointer(hgt, mp);
 464                end = metaend(hgt, mp);
 465
                    /* Blocks covered by the rest of this buffer; clamp to len. */
 466                step = (end - start) * factor;
 467                if (step > len)
 468                        end = start + DIV_ROUND_UP_ULL(len, factor);
 469
 470                ptr = walker(mp, start, end, factor, data);
 471                if (ptr == WALK_STOP)
 472                        break;
                    /* The requested range ends within this buffer. */
 473                if (step >= len)
 474                        break;
 475                len -= step;
 476                if (ptr != WALK_NEXT) {
                            /* Walker asked to descend at ptr, which must be set. */
 477                        BUG_ON(!*ptr);
 478                        mp->mp_list[hgt] += ptr - start;
 479                        goto fill_up_metapath;
 480                }
 481
 482lower_metapath:
 483                /* Decrease height of metapath. */
                    /* Work on a private copy before dropping any buffer. */
 484                if (mp != &clone) {
 485                        clone_metapath(&clone, mp);
 486                        mp = &clone;
 487                }
 488                brelse(mp->mp_bh[hgt]);
 489                mp->mp_bh[hgt] = NULL;
 490                if (!hgt)
 491                        break;
 492                hgt--;
 493                factor *= sdp->sd_inptrs;
 494
 495                /* Advance in metadata tree. */
 496                (mp->mp_list[hgt])++;
 497                start = metapointer(hgt, mp);
 498                end = metaend(hgt, mp);
 499                if (start >= end) {
                            /* Ran off the end of this buffer too: go up again. */
 500                        mp->mp_list[hgt] = 0;
 501                        if (!hgt)
 502                                break;
 503                        goto lower_metapath;
 504                }
 505
 506fill_up_metapath:
 507                /* Increase height of metapath. */
 508                if (mp != &clone) {
 509                        clone_metapath(&clone, mp);
 510                        mp = &clone;
 511                }
 512                ret = fillup_metapath(ip, mp, ip->i_height - 1);
 513                if (ret < 0)
 514                        break;
                    /* ret buffers were added: descend and shrink factor to match. */
 515                hgt += ret;
 516                for (; ret; ret--)
 517                        do_div(factor, sdp->sd_inptrs);
 518                mp->mp_aheight = hgt + 1;
 519        }
            /* Only the private clone's buffer references are dropped here. */
 520        if (mp == &clone)
 521                release_metapath(mp);
 522        return ret;
 523}
 524
    /* Accumulator handed to gfs2_hole_walker() through its data argument. */
 525struct gfs2_hole_walker_args {
 526        u64 blocks;     /* running count of unallocated blocks seen */
 527};
 528
 529static const __be64 *gfs2_hole_walker(struct metapath *mp,
 530                const __be64 *start, const __be64 *end,
 531                u64 factor, void *data)
 532{
 533        struct gfs2_hole_walker_args *args = data;
 534        const __be64 *ptr;
 535
 536        for (ptr = start; ptr < end; ptr++) {
 537                if (*ptr) {
 538                        args->blocks += (ptr - start) * factor;
 539                        if (mp->mp_aheight == mp->mp_fheight)
 540                                return WALK_STOP;
 541                        return ptr;  /* increase height */
 542                }
 543        }
 544        args->blocks += (end - start) * factor;
 545        return WALK_NEXT;
 546}
 547
 548/**
 549 * gfs2_hole_size - figure out the size of a hole
 550 * @inode: The inode
 551 * @lblock: The logical starting block number
 552 * @len: How far to look (in blocks)
 553 * @mp: The metapath at lblock
 554 * @iomap: The iomap to store the hole size in
 555 *
 556 * This function modifies @mp.
 557 *
 558 * Returns: errno on error
 559 */
 560static int gfs2_hole_size(struct inode *inode, sector_t lblock, u64 len,
 561                          struct metapath *mp, struct iomap *iomap)
 562{
 563        struct gfs2_hole_walker_args args = { };
 564        int ret = 0;
 565
 566        ret = gfs2_walk_metadata(inode, lblock, len, mp, gfs2_hole_walker, &args);
 567        if (!ret)
 568                iomap->length = args.blocks << inode->i_blkbits;
 569        return ret;
 570}
 571
 572static inline __be64 *gfs2_indirect_init(struct metapath *mp,
 573                                         struct gfs2_glock *gl, unsigned int i,
 574                                         unsigned offset, u64 bn)
 575{
 576        __be64 *ptr = (__be64 *)(mp->mp_bh[i - 1]->b_data +
 577                       ((i > 1) ? sizeof(struct gfs2_meta_header) :
 578                                 sizeof(struct gfs2_dinode)));
 579        BUG_ON(i < 1);
 580        BUG_ON(mp->mp_bh[i] != NULL);
 581        mp->mp_bh[i] = gfs2_meta_new(gl, bn);
 582        gfs2_trans_add_meta(gl, mp->mp_bh[i]);
 583        gfs2_metatype_set(mp->mp_bh[i], GFS2_METATYPE_IN, GFS2_FORMAT_IN);
 584        gfs2_buffer_clear_tail(mp->mp_bh[i], sizeof(struct gfs2_meta_header));
 585        ptr += offset;
 586        *ptr = cpu_to_be64(bn);
 587        return ptr;
 588}
 589
    /* States of the allocation state machine in gfs2_iomap_alloc(). */
 590enum alloc_state {
 591        ALLOC_DATA = 0,         /* tree complete, adding data blocks */
 592        ALLOC_GROW_DEPTH = 1,   /* branching from the existing tree */
 593        ALLOC_GROW_HEIGHT = 2,  /* growing the height of the tree */
 594        /* ALLOC_UNSTUFF = 3,   TBD and rather complicated */
 595};
 596
 597/**
 598 * gfs2_iomap_alloc - Build a metadata tree of the requested height
 599 * @inode: The GFS2 inode
 600 * @iomap: The iomap structure
 601 * @flags: iomap flags
 602 * @mp: The metapath, with proper height information calculated
 603 *
 604 * In this routine we may have to alloc:
 605 *   i) Indirect blocks to grow the metadata tree height
 606 *  ii) Indirect blocks to fill in lower part of the metadata tree
 607 * iii) Data blocks
 608 *
 609 * This function is called after gfs2_iomap_get, which works out the
 610 * total number of blocks which we need via gfs2_alloc_size.
 611 *
 612 * We then do the actual allocation asking for an extent at a time (if
 613 * enough contiguous free blocks are available, there will only be one
 614 * allocation request per call) and uses the state machine to initialise
 615 * the blocks in order.
 616 *
 617 * Right now, this function will allocate at most one indirect block
 618 * worth of data -- with a default block size of 4K, that's slightly
 619 * less than 2M.  If this limitation is ever removed to allow huge
 620 * allocations, we would probably still want to limit the iomap size we
 621 * return to avoid stalling other tasks during huge writes; the next
 622 * iomap iteration would then find the blocks already allocated.
 623 *
 624 * Returns: errno on error
 625 */
 626
 627static int gfs2_iomap_alloc(struct inode *inode, struct iomap *iomap,
 628                            unsigned flags, struct metapath *mp)
 629{
            /* NOTE(review): @flags is not referenced anywhere in this body. */
 630        struct gfs2_inode *ip = GFS2_I(inode);
 631        struct gfs2_sbd *sdp = GFS2_SB(inode);
 632        struct buffer_head *dibh = mp->mp_bh[0];
 633        u64 bn;
 634        unsigned n, i, blks, alloced = 0, iblks = 0, branch_start = 0;
            /* Number of data blocks requested, taken from the iomap length. */
 635        size_t dblks = iomap->length >> inode->i_blkbits;
            /* Height of the lowest indirect block, which holds data pointers. */
 636        const unsigned end_of_metadata = mp->mp_fheight - 1;
 637        int ret;
 638        enum alloc_state state;
 639        __be64 *ptr;
 640        __be64 zero_bn = 0;
 641
 642        BUG_ON(mp->mp_aheight < 1);
 643        BUG_ON(dibh == NULL);
 644        BUG_ON(dblks < 1);
 645
 646        gfs2_trans_add_meta(ip->i_gl, dibh);
 647
 648        down_write(&ip->i_rw_mutex);
 649
            /* Pick the starting state and count the indirect blocks needed. */
 650        if (mp->mp_fheight == mp->mp_aheight) {
 651                /* Bottom indirect block exists */
 652                state = ALLOC_DATA;
 653        } else {
 654                /* Need to allocate indirect blocks */
 655                if (mp->mp_fheight == ip->i_height) {
 656                        /* Writing into existing tree, extend tree down */
 657                        iblks = mp->mp_fheight - mp->mp_aheight;
 658                        state = ALLOC_GROW_DEPTH;
 659                } else {
 660                        /* Building up tree height */
 661                        state = ALLOC_GROW_HEIGHT;
 662                        iblks = mp->mp_fheight - ip->i_height;
 663                        branch_start = metapath_branch_start(mp);
 664                        iblks += (mp->mp_fheight - branch_start);
 665                }
 666        }
 667
 668        /* start of the second part of the function (state machine) */
 669
 670        blks = dblks + iblks;
 671        i = mp->mp_aheight;
            /*
             * Each pass asks for all blocks still outstanding; n is passed by
             * address and is then used as the size of the extent obtained.
             * The loop ends once the ALLOC_DATA case has set iomap->addr
             * (presumably the caller initialised it to IOMAP_NULL_ADDR --
             * TODO confirm).
             */
 672        do {
 673                n = blks - alloced;
 674                ret = gfs2_alloc_blocks(ip, &bn, &n, 0, NULL);
 675                if (ret)
 676                        goto out;
 677                alloced += n;
 678                if (state != ALLOC_DATA || gfs2_is_jdata(ip))
 679                        gfs2_trans_add_unrevoke(sdp, bn, n);
 680                switch (state) {
 681                /* Growing height of tree */
 682                case ALLOC_GROW_HEIGHT:
 683                        if (i == 1) {
                                    /*
                                     * Remember the dinode's first pointer:
                                     * the first new indirect block is linked
                                     * in through that very slot below.
                                     */
 684                                ptr = (__be64 *)(dibh->b_data +
 685                                                 sizeof(struct gfs2_dinode));
 686                                zero_bn = *ptr;
 687                        }
                            /* Chain the new top levels together through slot 0. */
 688                        for (; i - 1 < mp->mp_fheight - ip->i_height && n > 0;
 689                             i++, n--)
 690                                gfs2_indirect_init(mp, ip->i_gl, i, 0, bn++);
 691                        if (i - 1 == mp->mp_fheight - ip->i_height) {
                                    /*
                                     * All new levels exist. Move the dinode's
                                     * old pointer area into the lowest new
                                     * block, keep only the first pointer in
                                     * the dinode, and put the saved first
                                     * pointer back in the new block.
                                     * (Semantics of the copy/clear helpers
                                     * are inferred from their arguments.)
                                     */
 692                                i--;
 693                                gfs2_buffer_copy_tail(mp->mp_bh[i],
 694                                                sizeof(struct gfs2_meta_header),
 695                                                dibh, sizeof(struct gfs2_dinode));
 696                                gfs2_buffer_clear_tail(dibh,
 697                                                sizeof(struct gfs2_dinode) +
 698                                                sizeof(__be64));
 699                                ptr = (__be64 *)(mp->mp_bh[i]->b_data +
 700                                        sizeof(struct gfs2_meta_header));
 701                                *ptr = zero_bn;
 702                                state = ALLOC_GROW_DEPTH;
                                    /* Drop buffers from branch_start down and restart there. */
 703                                for(i = branch_start; i < mp->mp_fheight; i++) {
 704                                        if (mp->mp_bh[i] == NULL)
 705                                                break;
 706                                        brelse(mp->mp_bh[i]);
 707                                        mp->mp_bh[i] = NULL;
 708                                }
 709                                i = branch_start;
 710                        }
 711                        if (n == 0)
 712                                break;
                            /* fall through: blocks left over from this extent */
 713                /* Branching from existing tree */
 714                case ALLOC_GROW_DEPTH:
 715                        if (i > 1 && i < mp->mp_fheight)
 716                                gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[i-1]);
 717                        for (; i < mp->mp_fheight && n > 0; i++, n--)
 718                                gfs2_indirect_init(mp, ip->i_gl, i,
 719                                                   mp->mp_list[i-1], bn++);
 720                        if (i == mp->mp_fheight)
 721                                state = ALLOC_DATA;
 722                        if (n == 0)
 723                                break;
                            /* fall through: blocks left over from this extent */
 724                /* Tree complete, adding data blocks */
 725                case ALLOC_DATA:
 726                        BUG_ON(n > dblks);
 727                        BUG_ON(mp->mp_bh[end_of_metadata] == NULL);
 728                        gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[end_of_metadata]);
                            /* Only the n blocks of this extent are mapped. */
 729                        dblks = n;
 730                        ptr = metapointer(end_of_metadata, mp);
 731                        iomap->addr = bn << inode->i_blkbits;
 732                        iomap->flags |= IOMAP_F_MERGED | IOMAP_F_NEW;
 733                        while (n-- > 0)
 734                                *ptr++ = cpu_to_be64(bn++);
 735                        break;
 736                }
 737        } while (iomap->addr == IOMAP_NULL_ADDR);
 738
            /* Report what was mapped and write the updated inode to dibh. */
 739        iomap->type = IOMAP_MAPPED;
 740        iomap->length = (u64)dblks << inode->i_blkbits;
 741        ip->i_height = mp->mp_fheight;
 742        gfs2_add_inode_blocks(&ip->i_inode, alloced);
 743        gfs2_dinode_out(ip, dibh->b_data);
 744out:
 745        up_write(&ip->i_rw_mutex);
 746        return ret;
 747}
 748
 749#define IOMAP_F_GFS2_BOUNDARY IOMAP_F_PRIVATE
 750
 751/**
 752 * gfs2_alloc_size - Compute the maximum allocation size
 753 * @inode: The inode
 754 * @mp: The metapath
 755 * @size: Requested size in blocks
 756 *
 757 * Compute the maximum size of the next allocation at @mp.
 758 *
 759 * Returns: size in blocks
 760 */
 761static u64 gfs2_alloc_size(struct inode *inode, struct metapath *mp, u64 size)
 762{
 763        struct gfs2_inode *ip = GFS2_I(inode);
 764        struct gfs2_sbd *sdp = GFS2_SB(inode);
 765        const __be64 *first, *ptr, *end;
 766
 767        /*
 768         * For writes to stuffed files, this function is called twice via
 769         * gfs2_iomap_get, before and after unstuffing. The size we return the
 770         * first time needs to be large enough to get the reservation and
 771         * allocation sizes right.  The size we return the second time must
 772         * be exact or else gfs2_iomap_alloc won't do the right thing.
 773         */
 774
 775        if (gfs2_is_stuffed(ip) || mp->mp_fheight != mp->mp_aheight) {
 776                unsigned int maxsize = mp->mp_fheight > 1 ?
 777                        sdp->sd_inptrs : sdp->sd_diptrs;
 778                maxsize -= mp->mp_list[mp->mp_fheight - 1];
 779                if (size > maxsize)
 780                        size = maxsize;
 781                return size;
 782        }
 783
 784        first = metapointer(ip->i_height - 1, mp);
 785        end = metaend(ip->i_height - 1, mp);
 786        if (end - first > size)
 787                end = first + size;
 788        for (ptr = first; ptr < end; ptr++) {
 789                if (*ptr)
 790                        break;
 791        }
 792        return ptr - first;
 793}
 794
 795/**
 796 * gfs2_iomap_get - Map blocks from an inode to disk blocks
 797 * @inode: The inode
 798 * @pos: Starting position in bytes
 799 * @length: Length to map, in bytes
 800 * @flags: iomap flags
 801 * @iomap: The iomap structure
 802 * @mp: The metapath
 803 *
 804 * Returns: errno
 805 */
 806static int gfs2_iomap_get(struct inode *inode, loff_t pos, loff_t length,
 807                          unsigned flags, struct iomap *iomap,
 808                          struct metapath *mp)
 809{
 810        struct gfs2_inode *ip = GFS2_I(inode);
 811        struct gfs2_sbd *sdp = GFS2_SB(inode);
 812        loff_t size = i_size_read(inode);
 813        __be64 *ptr;
 814        sector_t lblock;
 815        sector_t lblock_stop;
 816        int ret;
 817        int eob;
 818        u64 len;
 819        struct buffer_head *dibh = NULL, *bh;
 820        u8 height;
 821
 822        if (!length)
 823                return -EINVAL;
 824
 825        down_read(&ip->i_rw_mutex);
 826
 827        ret = gfs2_meta_inode_buffer(ip, &dibh);
 828        if (ret)
 829                goto unlock;
 830        mp->mp_bh[0] = dibh;
 831
 832        if (gfs2_is_stuffed(ip)) {
 833                if (flags & IOMAP_WRITE) {
 834                        loff_t max_size = gfs2_max_stuffed_size(ip);
 835
 836                        if (pos + length > max_size)
 837                                goto unstuff;
 838                        iomap->length = max_size;
 839                } else {
 840                        if (pos >= size) {
 841                                if (flags & IOMAP_REPORT) {
 842                                        ret = -ENOENT;
 843                                        goto unlock;
 844                                } else {
 845                                        /* report a hole */
 846                                        iomap->offset = pos;
 847                                        iomap->length = length;
 848                                        goto do_alloc;
 849                                }
 850                        }
 851                        iomap->length = size;
 852                }
 853                iomap->addr = (ip->i_no_addr << inode->i_blkbits) +
 854                              sizeof(struct gfs2_dinode);
 855                iomap->type = IOMAP_INLINE;
 856                iomap->inline_data = dibh->b_data + sizeof(struct gfs2_dinode);
 857                goto out;
 858        }
 859
 860unstuff:
 861        lblock = pos >> inode->i_blkbits;
 862        iomap->offset = lblock << inode->i_blkbits;
 863        lblock_stop = (pos + length - 1) >> inode->i_blkbits;
 864        len = lblock_stop - lblock + 1;
 865        iomap->length = len << inode->i_blkbits;
 866
 867        height = ip->i_height;
 868        while ((lblock + 1) * sdp->sd_sb.sb_bsize > sdp->sd_heightsize[height])
 869                height++;
 870        find_metapath(sdp, lblock, mp, height);
 871        if (height > ip->i_height || gfs2_is_stuffed(ip))
 872                goto do_alloc;
 873
 874        ret = lookup_metapath(ip, mp);
 875        if (ret)
 876                goto unlock;
 877
 878        if (mp->mp_aheight != ip->i_height)
 879                goto do_alloc;
 880
 881        ptr = metapointer(ip->i_height - 1, mp);
 882        if (*ptr == 0)
 883                goto do_alloc;
 884
 885        bh = mp->mp_bh[ip->i_height - 1];
 886        len = gfs2_extent_length(bh, ptr, len, &eob);
 887
 888        iomap->addr = be64_to_cpu(*ptr) << inode->i_blkbits;
 889        iomap->length = len << inode->i_blkbits;
 890        iomap->type = IOMAP_MAPPED;
 891        iomap->flags |= IOMAP_F_MERGED;
 892        if (eob)
 893                iomap->flags |= IOMAP_F_GFS2_BOUNDARY;
 894
 895out:
 896        iomap->bdev = inode->i_sb->s_bdev;
 897unlock:
 898        up_read(&ip->i_rw_mutex);
 899        return ret;
 900
 901do_alloc:
 902        iomap->addr = IOMAP_NULL_ADDR;
 903        iomap->type = IOMAP_HOLE;
 904        if (flags & IOMAP_REPORT) {
 905                if (pos >= size)
 906                        ret = -ENOENT;
 907                else if (height == ip->i_height)
 908                        ret = gfs2_hole_size(inode, lblock, len, mp, iomap);
 909                else
 910                        iomap->length = size - pos;
 911        } else if (flags & IOMAP_WRITE) {
 912                u64 alloc_size;
 913
 914                if (flags & IOMAP_DIRECT)
 915                        goto out;  /* (see gfs2_file_direct_write) */
 916
 917                len = gfs2_alloc_size(inode, mp, len);
 918                alloc_size = len << inode->i_blkbits;
 919                if (alloc_size < iomap->length)
 920                        iomap->length = alloc_size;
 921        } else {
 922                if (pos < size && height == ip->i_height)
 923                        ret = gfs2_hole_size(inode, lblock, len, mp, iomap);
 924        }
 925        goto out;
 926}
 927
 928static int gfs2_write_lock(struct inode *inode)
 929{
 930        struct gfs2_inode *ip = GFS2_I(inode);
 931        struct gfs2_sbd *sdp = GFS2_SB(inode);
 932        int error;
 933
 934        gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh);
 935        error = gfs2_glock_nq(&ip->i_gh);
 936        if (error)
 937                goto out_uninit;
 938        if (&ip->i_inode == sdp->sd_rindex) {
 939                struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
 940
 941                error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE,
 942                                           GL_NOCACHE, &m_ip->i_gh);
 943                if (error)
 944                        goto out_unlock;
 945        }
 946        return 0;
 947
 948out_unlock:
 949        gfs2_glock_dq(&ip->i_gh);
 950out_uninit:
 951        gfs2_holder_uninit(&ip->i_gh);
 952        return error;
 953}
 954
 955static void gfs2_write_unlock(struct inode *inode)
 956{
 957        struct gfs2_inode *ip = GFS2_I(inode);
 958        struct gfs2_sbd *sdp = GFS2_SB(inode);
 959
 960        if (&ip->i_inode == sdp->sd_rindex) {
 961                struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
 962
 963                gfs2_glock_dq_uninit(&m_ip->i_gh);
 964        }
 965        gfs2_glock_dq_uninit(&ip->i_gh);
 966}
 967
 968static void gfs2_iomap_journaled_page_done(struct inode *inode, loff_t pos,
 969                                unsigned copied, struct page *page,
 970                                struct iomap *iomap)
 971{
 972        struct gfs2_inode *ip = GFS2_I(inode);
 973
 974        gfs2_page_add_databufs(ip, page, offset_in_page(pos), copied);
 975}
 976
 977static int gfs2_iomap_begin_write(struct inode *inode, loff_t pos,
 978                                  loff_t length, unsigned flags,
 979                                  struct iomap *iomap,
 980                                  struct metapath *mp)
 981{
 982        struct gfs2_inode *ip = GFS2_I(inode);
 983        struct gfs2_sbd *sdp = GFS2_SB(inode);
 984        unsigned int data_blocks = 0, ind_blocks = 0, rblocks;
 985        bool unstuff, alloc_required;
 986        int ret;
 987
 988        ret = gfs2_write_lock(inode);
 989        if (ret)
 990                return ret;
 991
 992        unstuff = gfs2_is_stuffed(ip) &&
 993                  pos + length > gfs2_max_stuffed_size(ip);
 994
 995        ret = gfs2_iomap_get(inode, pos, length, flags, iomap, mp);
 996        if (ret)
 997                goto out_unlock;
 998
 999        alloc_required = unstuff || iomap->type == IOMAP_HOLE;
1000
1001        if (alloc_required || gfs2_is_jdata(ip))
1002                gfs2_write_calc_reserv(ip, iomap->length, &data_blocks,
1003                                       &ind_blocks);
1004
1005        if (alloc_required) {
1006                struct gfs2_alloc_parms ap = {
1007                        .target = data_blocks + ind_blocks
1008                };
1009
1010                ret = gfs2_quota_lock_check(ip, &ap);
1011                if (ret)
1012                        goto out_unlock;
1013
1014                ret = gfs2_inplace_reserve(ip, &ap);
1015                if (ret)
1016                        goto out_qunlock;
1017        }
1018
1019        rblocks = RES_DINODE + ind_blocks;
1020        if (gfs2_is_jdata(ip))
1021                rblocks += data_blocks;
1022        if (ind_blocks || data_blocks)
1023                rblocks += RES_STATFS + RES_QUOTA;
1024        if (inode == sdp->sd_rindex)
1025                rblocks += 2 * RES_STATFS;
1026        if (alloc_required)
1027                rblocks += gfs2_rg_blocks(ip, data_blocks + ind_blocks);
1028
1029        ret = gfs2_trans_begin(sdp, rblocks, iomap->length >> inode->i_blkbits);
1030        if (ret)
1031                goto out_trans_fail;
1032
1033        if (unstuff) {
1034                ret = gfs2_unstuff_dinode(ip, NULL);
1035                if (ret)
1036                        goto out_trans_end;
1037                release_metapath(mp);
1038                ret = gfs2_iomap_get(inode, iomap->offset, iomap->length,
1039                                     flags, iomap, mp);
1040                if (ret)
1041                        goto out_trans_end;
1042        }
1043
1044        if (iomap->type == IOMAP_HOLE) {
1045                ret = gfs2_iomap_alloc(inode, iomap, flags, mp);
1046                if (ret) {
1047                        gfs2_trans_end(sdp);
1048                        gfs2_inplace_release(ip);
1049                        punch_hole(ip, iomap->offset, iomap->length);
1050                        goto out_qunlock;
1051                }
1052        }
1053        if (!gfs2_is_stuffed(ip) && gfs2_is_jdata(ip))
1054                iomap->page_done = gfs2_iomap_journaled_page_done;
1055        return 0;
1056
1057out_trans_end:
1058        gfs2_trans_end(sdp);
1059out_trans_fail:
1060        if (alloc_required)
1061                gfs2_inplace_release(ip);
1062out_qunlock:
1063        if (alloc_required)
1064                gfs2_quota_unlock(ip);
1065out_unlock:
1066        gfs2_write_unlock(inode);
1067        return ret;
1068}
1069
1070static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
1071                            unsigned flags, struct iomap *iomap)
1072{
1073        struct gfs2_inode *ip = GFS2_I(inode);
1074        struct metapath mp = { .mp_aheight = 1, };
1075        int ret;
1076
1077        iomap->flags |= IOMAP_F_BUFFER_HEAD;
1078
1079        trace_gfs2_iomap_start(ip, pos, length, flags);
1080        if ((flags & IOMAP_WRITE) && !(flags & IOMAP_DIRECT)) {
1081                ret = gfs2_iomap_begin_write(inode, pos, length, flags, iomap, &mp);
1082        } else {
1083                ret = gfs2_iomap_get(inode, pos, length, flags, iomap, &mp);
1084
1085                /*
1086                 * Silently fall back to buffered I/O for stuffed files or if
1087                 * we've hot a hole (see gfs2_file_direct_write).
1088                 */
1089                if ((flags & IOMAP_WRITE) && (flags & IOMAP_DIRECT) &&
1090                    iomap->type != IOMAP_MAPPED)
1091                        ret = -ENOTBLK;
1092        }
1093        if (!ret) {
1094                get_bh(mp.mp_bh[0]);
1095                iomap->private = mp.mp_bh[0];
1096        }
1097        release_metapath(&mp);
1098        trace_gfs2_iomap_end(ip, iomap, ret);
1099        return ret;
1100}
1101
1102static int gfs2_iomap_end(struct inode *inode, loff_t pos, loff_t length,
1103                          ssize_t written, unsigned flags, struct iomap *iomap)
1104{
1105        struct gfs2_inode *ip = GFS2_I(inode);
1106        struct gfs2_sbd *sdp = GFS2_SB(inode);
1107        struct gfs2_trans *tr = current->journal_info;
1108        struct buffer_head *dibh = iomap->private;
1109
1110        if ((flags & (IOMAP_WRITE | IOMAP_DIRECT)) != IOMAP_WRITE)
1111                goto out;
1112
1113        if (iomap->type != IOMAP_INLINE) {
1114                gfs2_ordered_add_inode(ip);
1115
1116                if (tr->tr_num_buf_new)
1117                        __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
1118                else
1119                        gfs2_trans_add_meta(ip->i_gl, dibh);
1120        }
1121
1122        if (inode == sdp->sd_rindex) {
1123                adjust_fs_space(inode);
1124                sdp->sd_rindex_uptodate = 0;
1125        }
1126
1127        gfs2_trans_end(sdp);
1128        gfs2_inplace_release(ip);
1129
1130        if (length != written && (iomap->flags & IOMAP_F_NEW)) {
1131                /* Deallocate blocks that were just allocated. */
1132                loff_t blockmask = i_blocksize(inode) - 1;
1133                loff_t end = (pos + length) & ~blockmask;
1134
1135                pos = (pos + written + blockmask) & ~blockmask;
1136                if (pos < end) {
1137                        truncate_pagecache_range(inode, pos, end - 1);
1138                        punch_hole(ip, pos, end - pos);
1139                }
1140        }
1141
1142        if (ip->i_qadata && ip->i_qadata->qa_qd_num)
1143                gfs2_quota_unlock(ip);
1144        gfs2_write_unlock(inode);
1145
1146out:
1147        if (dibh)
1148                brelse(dibh);
1149        return 0;
1150}
1151
1152const struct iomap_ops gfs2_iomap_ops = {
1153        .iomap_begin = gfs2_iomap_begin,
1154        .iomap_end = gfs2_iomap_end,
1155};
1156
1157/**
1158 * gfs2_block_map - Map one or more blocks of an inode to a disk block
1159 * @inode: The inode
1160 * @lblock: The logical block number
1161 * @bh_map: The bh to be mapped
1162 * @create: True if its ok to alloc blocks to satify the request
1163 *
1164 * The size of the requested mapping is defined in bh_map->b_size.
1165 *
1166 * Clears buffer_mapped(bh_map) and leaves bh_map->b_size unchanged
1167 * when @lblock is not mapped.  Sets buffer_mapped(bh_map) and
1168 * bh_map->b_size to indicate the size of the mapping when @lblock and
1169 * successive blocks are mapped, up to the requested size.
1170 *
1171 * Sets buffer_boundary() if a read of metadata will be required
1172 * before the next block can be mapped. Sets buffer_new() if new
1173 * blocks were allocated.
1174 *
1175 * Returns: errno
1176 */
1177
1178int gfs2_block_map(struct inode *inode, sector_t lblock,
1179                   struct buffer_head *bh_map, int create)
1180{
1181        struct gfs2_inode *ip = GFS2_I(inode);
1182        loff_t pos = (loff_t)lblock << inode->i_blkbits;
1183        loff_t length = bh_map->b_size;
1184        struct metapath mp = { .mp_aheight = 1, };
1185        struct iomap iomap = { };
1186        int ret;
1187
1188        clear_buffer_mapped(bh_map);
1189        clear_buffer_new(bh_map);
1190        clear_buffer_boundary(bh_map);
1191        trace_gfs2_bmap(ip, bh_map, lblock, create, 1);
1192
1193        if (create) {
1194                ret = gfs2_iomap_get(inode, pos, length, IOMAP_WRITE, &iomap, &mp);
1195                if (!ret && iomap.type == IOMAP_HOLE)
1196                        ret = gfs2_iomap_alloc(inode, &iomap, IOMAP_WRITE, &mp);
1197                release_metapath(&mp);
1198        } else {
1199                ret = gfs2_iomap_get(inode, pos, length, 0, &iomap, &mp);
1200                release_metapath(&mp);
1201        }
1202        if (ret)
1203                goto out;
1204
1205        if (iomap.length > bh_map->b_size) {
1206                iomap.length = bh_map->b_size;
1207                iomap.flags &= ~IOMAP_F_GFS2_BOUNDARY;
1208        }
1209        if (iomap.addr != IOMAP_NULL_ADDR)
1210                map_bh(bh_map, inode->i_sb, iomap.addr >> inode->i_blkbits);
1211        bh_map->b_size = iomap.length;
1212        if (iomap.flags & IOMAP_F_GFS2_BOUNDARY)
1213                set_buffer_boundary(bh_map);
1214        if (iomap.flags & IOMAP_F_NEW)
1215                set_buffer_new(bh_map);
1216
1217out:
1218        trace_gfs2_bmap(ip, bh_map, lblock, create, ret);
1219        return ret;
1220}
1221
1222/*
1223 * Deprecated: do not use in new code
1224 */
1225int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen)
1226{
1227        struct buffer_head bh = { .b_state = 0, .b_blocknr = 0 };
1228        int ret;
1229        int create = *new;
1230
1231        BUG_ON(!extlen);
1232        BUG_ON(!dblock);
1233        BUG_ON(!new);
1234
1235        bh.b_size = BIT(inode->i_blkbits + (create ? 0 : 5));
1236        ret = gfs2_block_map(inode, lblock, &bh, create);
1237        *extlen = bh.b_size >> inode->i_blkbits;
1238        *dblock = bh.b_blocknr;
1239        if (buffer_new(&bh))
1240                *new = 1;
1241        else
1242                *new = 0;
1243        return ret;
1244}
1245
1246/**
1247 * gfs2_block_zero_range - Deal with zeroing out data
1248 *
1249 * This is partly borrowed from ext3.
1250 */
1251static int gfs2_block_zero_range(struct inode *inode, loff_t from,
1252                                 unsigned int length)
1253{
1254        struct address_space *mapping = inode->i_mapping;
1255        struct gfs2_inode *ip = GFS2_I(inode);
1256        unsigned long index = from >> PAGE_SHIFT;
1257        unsigned offset = from & (PAGE_SIZE-1);
1258        unsigned blocksize, iblock, pos;
1259        struct buffer_head *bh;
1260        struct page *page;
1261        int err;
1262
1263        page = find_or_create_page(mapping, index, GFP_NOFS);
1264        if (!page)
1265                return 0;
1266
1267        blocksize = inode->i_sb->s_blocksize;
1268        iblock = index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits);
1269
1270        if (!page_has_buffers(page))
1271                create_empty_buffers(page, blocksize, 0);
1272
1273        /* Find the buffer that contains "offset" */
1274        bh = page_buffers(page);
1275        pos = blocksize;
1276        while (offset >= pos) {
1277                bh = bh->b_this_page;
1278                iblock++;
1279                pos += blocksize;
1280        }
1281
1282        err = 0;
1283
1284        if (!buffer_mapped(bh)) {
1285                gfs2_block_map(inode, iblock, bh, 0);
1286                /* unmapped? It's a hole - nothing to do */
1287                if (!buffer_mapped(bh))
1288                        goto unlock;
1289        }
1290
1291        /* Ok, it's mapped. Make sure it's up-to-date */
1292        if (PageUptodate(page))
1293                set_buffer_uptodate(bh);
1294
1295        if (!buffer_uptodate(bh)) {
1296                err = -EIO;
1297                ll_rw_block(REQ_OP_READ, 0, 1, &bh);
1298                wait_on_buffer(bh);
1299                /* Uhhuh. Read error. Complain and punt. */
1300                if (!buffer_uptodate(bh))
1301                        goto unlock;
1302                err = 0;
1303        }
1304
1305        if (gfs2_is_jdata(ip))
1306                gfs2_trans_add_data(ip->i_gl, bh);
1307        else
1308                gfs2_ordered_add_inode(ip);
1309
1310        zero_user(page, offset, length);
1311        mark_buffer_dirty(bh);
1312unlock:
1313        unlock_page(page);
1314        put_page(page);
1315        return err;
1316}
1317
1318#define GFS2_JTRUNC_REVOKES 8192
1319
1320/**
1321 * gfs2_journaled_truncate - Wrapper for truncate_pagecache for jdata files
1322 * @inode: The inode being truncated
1323 * @oldsize: The original (larger) size
1324 * @newsize: The new smaller size
1325 *
1326 * With jdata files, we have to journal a revoke for each block which is
1327 * truncated. As a result, we need to split this into separate transactions
1328 * if the number of pages being truncated gets too large.
1329 */
1330
1331static int gfs2_journaled_truncate(struct inode *inode, u64 oldsize, u64 newsize)
1332{
1333        struct gfs2_sbd *sdp = GFS2_SB(inode);
1334        u64 max_chunk = GFS2_JTRUNC_REVOKES * sdp->sd_vfs->s_blocksize;
1335        u64 chunk;
1336        int error;
1337
1338        while (oldsize != newsize) {
1339                struct gfs2_trans *tr;
1340                unsigned int offs;
1341
1342                chunk = oldsize - newsize;
1343                if (chunk > max_chunk)
1344                        chunk = max_chunk;
1345
1346                offs = oldsize & ~PAGE_MASK;
1347                if (offs && chunk > PAGE_SIZE)
1348                        chunk = offs + ((chunk - offs) & PAGE_MASK);
1349
1350                truncate_pagecache(inode, oldsize - chunk);
1351                oldsize -= chunk;
1352
1353                tr = current->journal_info;
1354                if (!test_bit(TR_TOUCHED, &tr->tr_flags))
1355                        continue;
1356
1357                gfs2_trans_end(sdp);
1358                error = gfs2_trans_begin(sdp, RES_DINODE, GFS2_JTRUNC_REVOKES);
1359                if (error)
1360                        return error;
1361        }
1362
1363        return 0;
1364}
1365
1366static int trunc_start(struct inode *inode, u64 newsize)
1367{
1368        struct gfs2_inode *ip = GFS2_I(inode);
1369        struct gfs2_sbd *sdp = GFS2_SB(inode);
1370        struct buffer_head *dibh = NULL;
1371        int journaled = gfs2_is_jdata(ip);
1372        u64 oldsize = inode->i_size;
1373        int error;
1374
1375        if (journaled)
1376                error = gfs2_trans_begin(sdp, RES_DINODE + RES_JDATA, GFS2_JTRUNC_REVOKES);
1377        else
1378                error = gfs2_trans_begin(sdp, RES_DINODE, 0);
1379        if (error)
1380                return error;
1381
1382        error = gfs2_meta_inode_buffer(ip, &dibh);
1383        if (error)
1384                goto out;
1385
1386        gfs2_trans_add_meta(ip->i_gl, dibh);
1387
1388        if (gfs2_is_stuffed(ip)) {
1389                gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + newsize);
1390        } else {
1391                unsigned int blocksize = i_blocksize(inode);
1392                unsigned int offs = newsize & (blocksize - 1);
1393                if (offs) {
1394                        error = gfs2_block_zero_range(inode, newsize,
1395                                                      blocksize - offs);
1396                        if (error)
1397                                goto out;
1398                }
1399                ip->i_diskflags |= GFS2_DIF_TRUNC_IN_PROG;
1400        }
1401
1402        i_size_write(inode, newsize);
1403        ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
1404        gfs2_dinode_out(ip, dibh->b_data);
1405
1406        if (journaled)
1407                error = gfs2_journaled_truncate(inode, oldsize, newsize);
1408        else
1409                truncate_pagecache(inode, newsize);
1410
1411out:
1412        brelse(dibh);
1413        if (current->journal_info)
1414                gfs2_trans_end(sdp);
1415        return error;
1416}
1417
1418int gfs2_iomap_get_alloc(struct inode *inode, loff_t pos, loff_t length,
1419                         struct iomap *iomap)
1420{
1421        struct metapath mp = { .mp_aheight = 1, };
1422        int ret;
1423
1424        ret = gfs2_iomap_get(inode, pos, length, IOMAP_WRITE, iomap, &mp);
1425        if (!ret && iomap->type == IOMAP_HOLE)
1426                ret = gfs2_iomap_alloc(inode, iomap, IOMAP_WRITE, &mp);
1427        release_metapath(&mp);
1428        return ret;
1429}
1430
1431/**
1432 * sweep_bh_for_rgrps - find an rgrp in a meta buffer and free blocks therein
1433 * @ip: inode
1434 * @rg_gh: holder of resource group glock
1435 * @bh: buffer head to sweep
1436 * @start: starting point in bh
1437 * @end: end point in bh
1438 * @meta: true if bh points to metadata (rather than data)
1439 * @btotal: place to keep count of total blocks freed
1440 *
1441 * We sweep a metadata buffer (provided by the metapath) for blocks we need to
1442 * free, and free them all. However, we do it one rgrp at a time. If this
1443 * block has references to multiple rgrps, we break it into individual
1444 * transactions. This allows other processes to use the rgrps while we're
1445 * focused on a single one, for better concurrency / performance.
1446 * At every transaction boundary, we rewrite the inode into the journal.
1447 * That way the bitmaps are kept consistent with the inode and we can recover
1448 * if we're interrupted by power-outages.
1449 *
1450 * Returns: 0, or return code if an error occurred.
1451 *          *btotal has the total number of blocks freed
1452 */
1453static int sweep_bh_for_rgrps(struct gfs2_inode *ip, struct gfs2_holder *rd_gh,
1454                              struct buffer_head *bh, __be64 *start, __be64 *end,
1455                              bool meta, u32 *btotal)
1456{
1457        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1458        struct gfs2_rgrpd *rgd;
1459        struct gfs2_trans *tr;
1460        __be64 *p;
1461        int blks_outside_rgrp;
1462        u64 bn, bstart, isize_blks;
1463        s64 blen; /* needs to be s64 or gfs2_add_inode_blocks breaks */
1464        int ret = 0;
1465        bool buf_in_tr = false; /* buffer was added to transaction */
1466
1467more_rgrps:
1468        rgd = NULL;
1469        if (gfs2_holder_initialized(rd_gh)) {
1470                rgd = gfs2_glock2rgrp(rd_gh->gh_gl);
1471                gfs2_assert_withdraw(sdp,
1472                             gfs2_glock_is_locked_by_me(rd_gh->gh_gl));
1473        }
1474        blks_outside_rgrp = 0;
1475        bstart = 0;
1476        blen = 0;
1477
1478        for (p = start; p < end; p++) {
1479                if (!*p)
1480                        continue;
1481                bn = be64_to_cpu(*p);
1482
1483                if (rgd) {
1484                        if (!rgrp_contains_block(rgd, bn)) {
1485                                blks_outside_rgrp++;
1486                                continue;
1487                        }
1488                } else {
1489                        rgd = gfs2_blk2rgrpd(sdp, bn, true);
1490                        if (unlikely(!rgd)) {
1491                                ret = -EIO;
1492                                goto out;
1493                        }
1494                        ret = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE,
1495                                                 0, rd_gh);
1496                        if (ret)
1497                                goto out;
1498
1499                        /* Must be done with the rgrp glock held: */
1500                        if (gfs2_rs_active(&ip->i_res) &&
1501                            rgd == ip->i_res.rs_rbm.rgd)
1502                                gfs2_rs_deltree(&ip->i_res);
1503                }
1504
1505                /* The size of our transactions will be unknown until we
1506                   actually process all the metadata blocks that relate to
1507                   the rgrp. So we estimate. We know it can't be more than
1508                   the dinode's i_blocks and we don't want to exceed the
1509                   journal flush threshold, sd_log_thresh2. */
1510                if (current->journal_info == NULL) {
1511                        unsigned int jblocks_rqsted, revokes;
1512
1513                        jblocks_rqsted = rgd->rd_length + RES_DINODE +
1514                                RES_INDIRECT;
1515                        isize_blks = gfs2_get_inode_blocks(&ip->i_inode);
1516                        if (isize_blks > atomic_read(&sdp->sd_log_thresh2))
1517                                jblocks_rqsted +=
1518                                        atomic_read(&sdp->sd_log_thresh2);
1519                        else
1520                                jblocks_rqsted += isize_blks;
1521                        revokes = jblocks_rqsted;
1522                        if (meta)
1523                                revokes += end - start;
1524                        else if (ip->i_depth)
1525                                revokes += sdp->sd_inptrs;
1526                        ret = gfs2_trans_begin(sdp, jblocks_rqsted, revokes);
1527                        if (ret)
1528                                goto out_unlock;
1529                        down_write(&ip->i_rw_mutex);
1530                }
1531                /* check if we will exceed the transaction blocks requested */
1532                tr = current->journal_info;
1533                if (tr->tr_num_buf_new + RES_STATFS +
1534                    RES_QUOTA >= atomic_read(&sdp->sd_log_thresh2)) {
1535                        /* We set blks_outside_rgrp to ensure the loop will
1536                           be repeated for the same rgrp, but with a new
1537                           transaction. */
1538                        blks_outside_rgrp++;
1539                        /* This next part is tricky. If the buffer was added
1540                           to the transaction, we've already set some block
1541                           pointers to 0, so we better follow through and free
1542                           them, or we will introduce corruption (so break).
1543                           This may be impossible, or at least rare, but I
1544                           decided to cover the case regardless.
1545
1546                           If the buffer was not added to the transaction
1547                           (this call), doing so would exceed our transaction
1548                           size, so we need to end the transaction and start a
1549                           new one (so goto). */
1550
1551                        if (buf_in_tr)
1552                                break;
1553                        goto out_unlock;
1554                }
1555
1556                gfs2_trans_add_meta(ip->i_gl, bh);
1557                buf_in_tr = true;
1558                *p = 0;
1559                if (bstart + blen == bn) {
1560                        blen++;
1561                        continue;
1562                }
1563                if (bstart) {
1564                        __gfs2_free_blocks(ip, rgd, bstart, (u32)blen, meta);
1565                        (*btotal) += blen;
1566                        gfs2_add_inode_blocks(&ip->i_inode, -blen);
1567                }
1568                bstart = bn;
1569                blen = 1;
1570        }
1571        if (bstart) {
1572                __gfs2_free_blocks(ip, rgd, bstart, (u32)blen, meta);
1573                (*btotal) += blen;
1574                gfs2_add_inode_blocks(&ip->i_inode, -blen);
1575        }
1576out_unlock:
1577        if (!ret && blks_outside_rgrp) { /* If buffer still has non-zero blocks
1578                                            outside the rgrp we just processed,
1579                                            do it all over again. */
1580                if (current->journal_info) {
1581                        struct buffer_head *dibh;
1582
1583                        ret = gfs2_meta_inode_buffer(ip, &dibh);
1584                        if (ret)
1585                                goto out;
1586
1587                        /* Every transaction boundary, we rewrite the dinode
1588                           to keep its di_blocks current in case of failure. */
1589                        ip->i_inode.i_mtime = ip->i_inode.i_ctime =
1590                                current_time(&ip->i_inode);
1591                        gfs2_trans_add_meta(ip->i_gl, dibh);
1592                        gfs2_dinode_out(ip, dibh->b_data);
1593                        brelse(dibh);
1594                        up_write(&ip->i_rw_mutex);
1595                        gfs2_trans_end(sdp);
1596                }
1597                gfs2_glock_dq_uninit(rd_gh);
1598                cond_resched();
1599                goto more_rgrps;
1600        }
1601out:
1602        return ret;
1603}
1604
1605static bool mp_eq_to_hgt(struct metapath *mp, __u16 *list, unsigned int h)
1606{
1607        if (memcmp(mp->mp_list, list, h * sizeof(mp->mp_list[0])))
1608                return false;
1609        return true;
1610}
1611
1612/**
1613 * find_nonnull_ptr - find a non-null pointer given a metapath and height
1614 * @mp: starting metapath
1615 * @h: desired height to search
1616 *
1617 * Assumes the metapath is valid (with buffers) out to height h.
1618 * Returns: true if a non-null pointer was found in the metapath buffer
1619 *          false if all remaining pointers are NULL in the buffer
1620 */
1621static bool find_nonnull_ptr(struct gfs2_sbd *sdp, struct metapath *mp,
1622                             unsigned int h,
1623                             __u16 *end_list, unsigned int end_aligned)
1624{
1625        struct buffer_head *bh = mp->mp_bh[h];
1626        __be64 *first, *ptr, *end;
1627
1628        first = metaptr1(h, mp);
1629        ptr = first + mp->mp_list[h];
1630        end = (__be64 *)(bh->b_data + bh->b_size);
1631        if (end_list && mp_eq_to_hgt(mp, end_list, h)) {
1632                bool keep_end = h < end_aligned;
1633                end = first + end_list[h] + keep_end;
1634        }
1635
1636        while (ptr < end) {
1637                if (*ptr) { /* if we have a non-null pointer */
1638                        mp->mp_list[h] = ptr - first;
1639                        h++;
1640                        if (h < GFS2_MAX_META_HEIGHT)
1641                                mp->mp_list[h] = 0;
1642                        return true;
1643                }
1644                ptr++;
1645        }
1646        return false;
1647}
1648
/* States of the deallocation state machine driven by punch_hole(). */
enum dealloc_states {
	/* Strip a metapath with all buffers read in */
	DEALLOC_MP_FULL = 0,
	/* lower the metapath strip height */
	DEALLOC_MP_LOWER = 1,
	/* Fill in the metapath to the given height. */
	DEALLOC_FILL_MP = 2,
	/* process complete */
	DEALLOC_DONE = 3,
};
1655
1656static inline void
1657metapointer_range(struct metapath *mp, int height,
1658                  __u16 *start_list, unsigned int start_aligned,
1659                  __u16 *end_list, unsigned int end_aligned,
1660                  __be64 **start, __be64 **end)
1661{
1662        struct buffer_head *bh = mp->mp_bh[height];
1663        __be64 *first;
1664
1665        first = metaptr1(height, mp);
1666        *start = first;
1667        if (mp_eq_to_hgt(mp, start_list, height)) {
1668                bool keep_start = height < start_aligned;
1669                *start = first + start_list[height] + keep_start;
1670        }
1671        *end = (__be64 *)(bh->b_data + bh->b_size);
1672        if (end_list && mp_eq_to_hgt(mp, end_list, height)) {
1673                bool keep_end = height < end_aligned;
1674                *end = first + end_list[height] + keep_end;
1675        }
1676}
1677
1678static inline bool walk_done(struct gfs2_sbd *sdp,
1679                             struct metapath *mp, int height,
1680                             __u16 *end_list, unsigned int end_aligned)
1681{
1682        __u16 end;
1683
1684        if (end_list) {
1685                bool keep_end = height < end_aligned;
1686                if (!mp_eq_to_hgt(mp, end_list, height))
1687                        return false;
1688                end = end_list[height] + keep_end;
1689        } else
1690                end = (height > 0) ? sdp->sd_inptrs : sdp->sd_diptrs;
1691        return mp->mp_list[height] >= end;
1692}
1693
1694/**
1695 * punch_hole - deallocate blocks in a file
1696 * @ip: inode to truncate
1697 * @offset: the start of the hole
1698 * @length: the size of the hole (or 0 for truncate)
1699 *
1700 * Punch a hole into a file or truncate a file at a given position.  This
1701 * function operates in whole blocks (@offset and @length are rounded
1702 * accordingly); partially filled blocks must be cleared otherwise.
1703 *
1704 * This function works from the bottom up, and from the right to the left. In
1705 * other words, it strips off the highest layer (data) before stripping any of
1706 * the metadata. Doing it this way is best in case the operation is interrupted
1707 * by power failure, etc.  The dinode is rewritten in every transaction to
1708 * guarantee integrity.
1709 */
1710static int punch_hole(struct gfs2_inode *ip, u64 offset, u64 length)
1711{
1712        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1713        u64 maxsize = sdp->sd_heightsize[ip->i_height];
1714        struct metapath mp = {};
1715        struct buffer_head *dibh, *bh;
1716        struct gfs2_holder rd_gh;
1717        unsigned int bsize_shift = sdp->sd_sb.sb_bsize_shift;
1718        u64 lblock = (offset + (1 << bsize_shift) - 1) >> bsize_shift;
1719        __u16 start_list[GFS2_MAX_META_HEIGHT];
1720        __u16 __end_list[GFS2_MAX_META_HEIGHT], *end_list = NULL;
1721        unsigned int start_aligned, uninitialized_var(end_aligned);
1722        unsigned int strip_h = ip->i_height - 1;
1723        u32 btotal = 0;
1724        int ret, state;
1725        int mp_h; /* metapath buffers are read in to this height */
1726        u64 prev_bnr = 0;
1727        __be64 *start, *end;
1728
1729        if (offset >= maxsize) {
1730                /*
1731                 * The starting point lies beyond the allocated meta-data;
1732                 * there are no blocks do deallocate.
1733                 */
1734                return 0;
1735        }
1736
1737        /*
1738         * The start position of the hole is defined by lblock, start_list, and
1739         * start_aligned.  The end position of the hole is defined by lend,
1740         * end_list, and end_aligned.
1741         *
1742         * start_aligned and end_aligned define down to which height the start
1743         * and end positions are aligned to the metadata tree (i.e., the
1744         * position is a multiple of the metadata granularity at the height
1745         * above).  This determines at which heights additional meta pointers
1746         * needs to be preserved for the remaining data.
1747         */
1748
1749        if (length) {
1750                u64 end_offset = offset + length;
1751                u64 lend;
1752
1753                /*
1754                 * Clip the end at the maximum file size for the given height:
1755                 * that's how far the metadata goes; files bigger than that
1756                 * will have additional layers of indirection.
1757                 */
1758                if (end_offset > maxsize)
1759                        end_offset = maxsize;
1760                lend = end_offset >> bsize_shift;
1761
1762                if (lblock >= lend)
1763                        return 0;
1764
1765                find_metapath(sdp, lend, &mp, ip->i_height);
1766                end_list = __end_list;
1767                memcpy(end_list, mp.mp_list, sizeof(mp.mp_list));
1768
1769                for (mp_h = ip->i_height - 1; mp_h > 0; mp_h--) {
1770                        if (end_list[mp_h])
1771                                break;
1772                }
1773                end_aligned = mp_h;
1774        }
1775
1776        find_metapath(sdp, lblock, &mp, ip->i_height);
1777        memcpy(start_list, mp.mp_list, sizeof(start_list));
1778
1779        for (mp_h = ip->i_height - 1; mp_h > 0; mp_h--) {
1780                if (start_list[mp_h])
1781                        break;
1782        }
1783        start_aligned = mp_h;
1784
1785        ret = gfs2_meta_inode_buffer(ip, &dibh);
1786        if (ret)
1787                return ret;
1788
1789        mp.mp_bh[0] = dibh;
1790        ret = lookup_metapath(ip, &mp);
1791        if (ret)
1792                goto out_metapath;
1793
1794        /* issue read-ahead on metadata */
1795        for (mp_h = 0; mp_h < mp.mp_aheight - 1; mp_h++) {
1796                metapointer_range(&mp, mp_h, start_list, start_aligned,
1797                                  end_list, end_aligned, &start, &end);
1798                gfs2_metapath_ra(ip->i_gl, start, end);
1799        }
1800
1801        if (mp.mp_aheight == ip->i_height)
1802                state = DEALLOC_MP_FULL; /* We have a complete metapath */
1803        else
1804                state = DEALLOC_FILL_MP; /* deal with partial metapath */
1805
1806        ret = gfs2_rindex_update(sdp);
1807        if (ret)
1808                goto out_metapath;
1809
1810        ret = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE);
1811        if (ret)
1812                goto out_metapath;
1813        gfs2_holder_mark_uninitialized(&rd_gh);
1814
1815        mp_h = strip_h;
1816
1817        while (state != DEALLOC_DONE) {
1818                switch (state) {
1819                /* Truncate a full metapath at the given strip height.
1820                 * Note that strip_h == mp_h in order to be in this state. */
1821                case DEALLOC_MP_FULL:
1822                        bh = mp.mp_bh[mp_h];
1823                        gfs2_assert_withdraw(sdp, bh);
1824                        if (gfs2_assert_withdraw(sdp,
1825                                                 prev_bnr != bh->b_blocknr)) {
1826                                printk(KERN_EMERG "GFS2: fsid=%s:inode %llu, "
1827                                       "block:%llu, i_h:%u, s_h:%u, mp_h:%u\n",
1828                                       sdp->sd_fsname,
1829                                       (unsigned long long)ip->i_no_addr,
1830                                       prev_bnr, ip->i_height, strip_h, mp_h);
1831                        }
1832                        prev_bnr = bh->b_blocknr;
1833
1834                        if (gfs2_metatype_check(sdp, bh,
1835                                                (mp_h ? GFS2_METATYPE_IN :
1836                                                        GFS2_METATYPE_DI))) {
1837                                ret = -EIO;
1838                                goto out;
1839                        }
1840
1841                        /*
1842                         * Below, passing end_aligned as 0 gives us the
1843                         * metapointer range excluding the end point: the end
1844                         * point is the first metapath we must not deallocate!
1845                         */
1846
1847                        metapointer_range(&mp, mp_h, start_list, start_aligned,
1848                                          end_list, 0 /* end_aligned */,
1849                                          &start, &end);
1850                        ret = sweep_bh_for_rgrps(ip, &rd_gh, mp.mp_bh[mp_h],
1851                                                 start, end,
1852                                                 mp_h != ip->i_height - 1,
1853                                                 &btotal);
1854
1855                        /* If we hit an error or just swept dinode buffer,
1856                           just exit. */
1857                        if (ret || !mp_h) {
1858                                state = DEALLOC_DONE;
1859                                break;
1860                        }
1861                        state = DEALLOC_MP_LOWER;
1862                        break;
1863
1864                /* lower the metapath strip height */
1865                case DEALLOC_MP_LOWER:
1866                        /* We're done with the current buffer, so release it,
1867                           unless it's the dinode buffer. Then back up to the
1868                           previous pointer. */
1869                        if (mp_h) {
1870                                brelse(mp.mp_bh[mp_h]);
1871                                mp.mp_bh[mp_h] = NULL;
1872                        }
1873                        /* If we can't get any lower in height, we've stripped
1874                           off all we can. Next step is to back up and start
1875                           stripping the previous level of metadata. */
1876                        if (mp_h == 0) {
1877                                strip_h--;
1878                                memcpy(mp.mp_list, start_list, sizeof(start_list));
1879                                mp_h = strip_h;
1880                                state = DEALLOC_FILL_MP;
1881                                break;
1882                        }
1883                        mp.mp_list[mp_h] = 0;
1884                        mp_h--; /* search one metadata height down */
1885                        mp.mp_list[mp_h]++;
1886                        if (walk_done(sdp, &mp, mp_h, end_list, end_aligned))
1887                                break;
1888                        /* Here we've found a part of the metapath that is not
1889                         * allocated. We need to search at that height for the
1890                         * next non-null pointer. */
1891                        if (find_nonnull_ptr(sdp, &mp, mp_h, end_list, end_aligned)) {
1892                                state = DEALLOC_FILL_MP;
1893                                mp_h++;
1894                        }
1895                        /* No more non-null pointers at this height. Back up
1896                           to the previous height and try again. */
1897                        break; /* loop around in the same state */
1898
1899                /* Fill the metapath with buffers to the given height. */
1900                case DEALLOC_FILL_MP:
1901                        /* Fill the buffers out to the current height. */
1902                        ret = fillup_metapath(ip, &mp, mp_h);
1903                        if (ret < 0)
1904                                goto out;
1905
1906                        /* On the first pass, issue read-ahead on metadata. */
1907                        if (mp.mp_aheight > 1 && strip_h == ip->i_height - 1) {
1908                                unsigned int height = mp.mp_aheight - 1;
1909
1910                                /* No read-ahead for data blocks. */
1911                                if (mp.mp_aheight - 1 == strip_h)
1912                                        height--;
1913
1914                                for (; height >= mp.mp_aheight - ret; height--) {
1915                                        metapointer_range(&mp, height,
1916                                                          start_list, start_aligned,
1917                                                          end_list, end_aligned,
1918                                                          &start, &end);
1919                                        gfs2_metapath_ra(ip->i_gl, start, end);
1920                                }
1921                        }
1922
1923                        /* If buffers found for the entire strip height */
1924                        if (mp.mp_aheight - 1 == strip_h) {
1925                                state = DEALLOC_MP_FULL;
1926                                break;
1927                        }
1928                        if (mp.mp_aheight < ip->i_height) /* We have a partial height */
1929                                mp_h = mp.mp_aheight - 1;
1930
1931                        /* If we find a non-null block pointer, crawl a bit
1932                           higher up in the metapath and try again, otherwise
1933                           we need to look lower for a new starting point. */
1934                        if (find_nonnull_ptr(sdp, &mp, mp_h, end_list, end_aligned))
1935                                mp_h++;
1936                        else
1937                                state = DEALLOC_MP_LOWER;
1938                        break;
1939                }
1940        }
1941
1942        if (btotal) {
1943                if (current->journal_info == NULL) {
1944                        ret = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS +
1945                                               RES_QUOTA, 0);
1946                        if (ret)
1947                                goto out;
1948                        down_write(&ip->i_rw_mutex);
1949                }
1950                gfs2_statfs_change(sdp, 0, +btotal, 0);
1951                gfs2_quota_change(ip, -(s64)btotal, ip->i_inode.i_uid,
1952                                  ip->i_inode.i_gid);
1953                ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
1954                gfs2_trans_add_meta(ip->i_gl, dibh);
1955                gfs2_dinode_out(ip, dibh->b_data);
1956                up_write(&ip->i_rw_mutex);
1957                gfs2_trans_end(sdp);
1958        }
1959
1960out:
1961        if (gfs2_holder_initialized(&rd_gh))
1962                gfs2_glock_dq_uninit(&rd_gh);
1963        if (current->journal_info) {
1964                up_write(&ip->i_rw_mutex);
1965                gfs2_trans_end(sdp);
1966                cond_resched();
1967        }
1968        gfs2_quota_unhold(ip);
1969out_metapath:
1970        release_metapath(&mp);
1971        return ret;
1972}
1973
1974static int trunc_end(struct gfs2_inode *ip)
1975{
1976        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1977        struct buffer_head *dibh;
1978        int error;
1979
1980        error = gfs2_trans_begin(sdp, RES_DINODE, 0);
1981        if (error)
1982                return error;
1983
1984        down_write(&ip->i_rw_mutex);
1985
1986        error = gfs2_meta_inode_buffer(ip, &dibh);
1987        if (error)
1988                goto out;
1989
1990        if (!i_size_read(&ip->i_inode)) {
1991                ip->i_height = 0;
1992                ip->i_goal = ip->i_no_addr;
1993                gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
1994                gfs2_ordered_del_inode(ip);
1995        }
1996        ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
1997        ip->i_diskflags &= ~GFS2_DIF_TRUNC_IN_PROG;
1998
1999        gfs2_trans_add_meta(ip->i_gl, dibh);
2000        gfs2_dinode_out(ip, dibh->b_data);
2001        brelse(dibh);
2002
2003out:
2004        up_write(&ip->i_rw_mutex);
2005        gfs2_trans_end(sdp);
2006        return error;
2007}
2008
2009/**
2010 * do_shrink - make a file smaller
2011 * @inode: the inode
2012 * @newsize: the size to make the file
2013 *
2014 * Called with an exclusive lock on @inode. The @size must
2015 * be equal to or smaller than the current inode size.
2016 *
2017 * Returns: errno
2018 */
2019
2020static int do_shrink(struct inode *inode, u64 newsize)
2021{
2022        struct gfs2_inode *ip = GFS2_I(inode);
2023        int error;
2024
2025        error = trunc_start(inode, newsize);
2026        if (error < 0)
2027                return error;
2028        if (gfs2_is_stuffed(ip))
2029                return 0;
2030
2031        error = punch_hole(ip, newsize, 0);
2032        if (error == 0)
2033                error = trunc_end(ip);
2034
2035        return error;
2036}
2037
2038void gfs2_trim_blocks(struct inode *inode)
2039{
2040        int ret;
2041
2042        ret = do_shrink(inode, inode->i_size);
2043        WARN_ON(ret != 0);
2044}
2045
2046/**
2047 * do_grow - Touch and update inode size
2048 * @inode: The inode
2049 * @size: The new size
2050 *
2051 * This function updates the timestamps on the inode and
2052 * may also increase the size of the inode. This function
2053 * must not be called with @size any smaller than the current
2054 * inode size.
2055 *
2056 * Although it is not strictly required to unstuff files here,
2057 * earlier versions of GFS2 have a bug in the stuffed file reading
2058 * code which will result in a buffer overrun if the size is larger
2059 * than the max stuffed file size. In order to prevent this from
2060 * occurring, such files are unstuffed, but in other cases we can
2061 * just update the inode size directly.
2062 *
2063 * Returns: 0 on success, or -ve on error
2064 */
2065
2066static int do_grow(struct inode *inode, u64 size)
2067{
2068        struct gfs2_inode *ip = GFS2_I(inode);
2069        struct gfs2_sbd *sdp = GFS2_SB(inode);
2070        struct gfs2_alloc_parms ap = { .target = 1, };
2071        struct buffer_head *dibh;
2072        int error;
2073        int unstuff = 0;
2074
2075        if (gfs2_is_stuffed(ip) && size > gfs2_max_stuffed_size(ip)) {
2076                error = gfs2_quota_lock_check(ip, &ap);
2077                if (error)
2078                        return error;
2079
2080                error = gfs2_inplace_reserve(ip, &ap);
2081                if (error)
2082                        goto do_grow_qunlock;
2083                unstuff = 1;
2084        }
2085
2086        error = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS + RES_RG_BIT +
2087                                 (unstuff &&
2088                                  gfs2_is_jdata(ip) ? RES_JDATA : 0) +
2089                                 (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF ?
2090                                  0 : RES_QUOTA), 0);
2091        if (error)
2092                goto do_grow_release;
2093
2094        if (unstuff) {
2095                error = gfs2_unstuff_dinode(ip, NULL);
2096                if (error)
2097                        goto do_end_trans;
2098        }
2099
2100        error = gfs2_meta_inode_buffer(ip, &dibh);
2101        if (error)
2102                goto do_end_trans;
2103
2104        i_size_write(inode, size);
2105        ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
2106        gfs2_trans_add_meta(ip->i_gl, dibh);
2107        gfs2_dinode_out(ip, dibh->b_data);
2108        brelse(dibh);
2109
2110do_end_trans:
2111        gfs2_trans_end(sdp);
2112do_grow_release:
2113        if (unstuff) {
2114                gfs2_inplace_release(ip);
2115do_grow_qunlock:
2116                gfs2_quota_unlock(ip);
2117        }
2118        return error;
2119}
2120
2121/**
2122 * gfs2_setattr_size - make a file a given size
2123 * @inode: the inode
2124 * @newsize: the size to make the file
2125 *
2126 * The file size can grow, shrink, or stay the same size. This
2127 * is called holding i_rwsem and an exclusive glock on the inode
2128 * in question.
2129 *
2130 * Returns: errno
2131 */
2132
2133int gfs2_setattr_size(struct inode *inode, u64 newsize)
2134{
2135        struct gfs2_inode *ip = GFS2_I(inode);
2136        int ret;
2137
2138        BUG_ON(!S_ISREG(inode->i_mode));
2139
2140        ret = inode_newsize_ok(inode, newsize);
2141        if (ret)
2142                return ret;
2143
2144        inode_dio_wait(inode);
2145
2146        ret = gfs2_rsqa_alloc(ip);
2147        if (ret)
2148                goto out;
2149
2150        if (newsize >= inode->i_size) {
2151                ret = do_grow(inode, newsize);
2152                goto out;
2153        }
2154
2155        ret = do_shrink(inode, newsize);
2156out:
2157        gfs2_rsqa_delete(ip, NULL);
2158        return ret;
2159}
2160
2161int gfs2_truncatei_resume(struct gfs2_inode *ip)
2162{
2163        int error;
2164        error = punch_hole(ip, i_size_read(&ip->i_inode), 0);
2165        if (!error)
2166                error = trunc_end(ip);
2167        return error;
2168}
2169
2170int gfs2_file_dealloc(struct gfs2_inode *ip)
2171{
2172        return punch_hole(ip, 0, 0);
2173}
2174
2175/**
2176 * gfs2_free_journal_extents - Free cached journal bmap info
2177 * @jd: The journal
2178 *
2179 */
2180
2181void gfs2_free_journal_extents(struct gfs2_jdesc *jd)
2182{
2183        struct gfs2_journal_extent *jext;
2184
2185        while(!list_empty(&jd->extent_list)) {
2186                jext = list_entry(jd->extent_list.next, struct gfs2_journal_extent, list);
2187                list_del(&jext->list);
2188                kfree(jext);
2189        }
2190}
2191
2192/**
2193 * gfs2_add_jextent - Add or merge a new extent to extent cache
2194 * @jd: The journal descriptor
2195 * @lblock: The logical block at start of new extent
2196 * @dblock: The physical block at start of new extent
2197 * @blocks: Size of extent in fs blocks
2198 *
2199 * Returns: 0 on success or -ENOMEM
2200 */
2201
2202static int gfs2_add_jextent(struct gfs2_jdesc *jd, u64 lblock, u64 dblock, u64 blocks)
2203{
2204        struct gfs2_journal_extent *jext;
2205
2206        if (!list_empty(&jd->extent_list)) {
2207                jext = list_entry(jd->extent_list.prev, struct gfs2_journal_extent, list);
2208                if ((jext->dblock + jext->blocks) == dblock) {
2209                        jext->blocks += blocks;
2210                        return 0;
2211                }
2212        }
2213
2214        jext = kzalloc(sizeof(struct gfs2_journal_extent), GFP_NOFS);
2215        if (jext == NULL)
2216                return -ENOMEM;
2217        jext->dblock = dblock;
2218        jext->lblock = lblock;
2219        jext->blocks = blocks;
2220        list_add_tail(&jext->list, &jd->extent_list);
2221        jd->nr_extents++;
2222        return 0;
2223}
2224
2225/**
2226 * gfs2_map_journal_extents - Cache journal bmap info
2227 * @sdp: The super block
2228 * @jd: The journal to map
2229 *
2230 * Create a reusable "extent" mapping from all logical
2231 * blocks to all physical blocks for the given journal.  This will save
2232 * us time when writing journal blocks.  Most journals will have only one
2233 * extent that maps all their logical blocks.  That's because gfs2.mkfs
2234 * arranges the journal blocks sequentially to maximize performance.
2235 * So the extent would map the first block for the entire file length.
2236 * However, gfs2_jadd can happen while file activity is happening, so
2237 * those journals may not be sequential.  Less likely is the case where
2238 * the users created their own journals by mounting the metafs and
2239 * laying it out.  But it's still possible.  These journals might have
2240 * several extents.
2241 *
2242 * Returns: 0 on success, or error on failure
2243 */
2244
2245int gfs2_map_journal_extents(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd)
2246{
2247        u64 lblock = 0;
2248        u64 lblock_stop;
2249        struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
2250        struct buffer_head bh;
2251        unsigned int shift = sdp->sd_sb.sb_bsize_shift;
2252        u64 size;
2253        int rc;
2254        ktime_t start, end;
2255
2256        start = ktime_get();
2257        lblock_stop = i_size_read(jd->jd_inode) >> shift;
2258        size = (lblock_stop - lblock) << shift;
2259        jd->nr_extents = 0;
2260        WARN_ON(!list_empty(&jd->extent_list));
2261
2262        do {
2263                bh.b_state = 0;
2264                bh.b_blocknr = 0;
2265                bh.b_size = size;
2266                rc = gfs2_block_map(jd->jd_inode, lblock, &bh, 0);
2267                if (rc || !buffer_mapped(&bh))
2268                        goto fail;
2269                rc = gfs2_add_jextent(jd, lblock, bh.b_blocknr, bh.b_size >> shift);
2270                if (rc)
2271                        goto fail;
2272                size -= bh.b_size;
2273                lblock += (bh.b_size >> ip->i_inode.i_blkbits);
2274        } while(size > 0);
2275
2276        end = ktime_get();
2277        fs_info(sdp, "journal %d mapped with %u extents in %lldms\n", jd->jd_jid,
2278                jd->nr_extents, ktime_ms_delta(end, start));
2279        return 0;
2280
2281fail:
2282        fs_warn(sdp, "error %d mapping journal %u at offset %llu (extent %u)\n",
2283                rc, jd->jd_jid,
2284                (unsigned long long)(i_size_read(jd->jd_inode) - size),
2285                jd->nr_extents);
2286        fs_warn(sdp, "bmap=%d lblock=%llu block=%llu, state=0x%08lx, size=%llu\n",
2287                rc, (unsigned long long)lblock, (unsigned long long)bh.b_blocknr,
2288                bh.b_state, (unsigned long long)bh.b_size);
2289        gfs2_free_journal_extents(jd);
2290        return rc;
2291}
2292
2293/**
2294 * gfs2_write_alloc_required - figure out if a write will require an allocation
2295 * @ip: the file being written to
2296 * @offset: the offset to write to
2297 * @len: the number of bytes being written
2298 *
2299 * Returns: 1 if an alloc is required, 0 otherwise
2300 */
2301
2302int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
2303                              unsigned int len)
2304{
2305        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2306        struct buffer_head bh;
2307        unsigned int shift;
2308        u64 lblock, lblock_stop, size;
2309        u64 end_of_file;
2310
2311        if (!len)
2312                return 0;
2313
2314        if (gfs2_is_stuffed(ip)) {
2315                if (offset + len > gfs2_max_stuffed_size(ip))
2316                        return 1;
2317                return 0;
2318        }
2319
2320        shift = sdp->sd_sb.sb_bsize_shift;
2321        BUG_ON(gfs2_is_dir(ip));
2322        end_of_file = (i_size_read(&ip->i_inode) + sdp->sd_sb.sb_bsize - 1) >> shift;
2323        lblock = offset >> shift;
2324        lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift;
2325        if (lblock_stop > end_of_file && ip != GFS2_I(sdp->sd_rindex))
2326                return 1;
2327
2328        size = (lblock_stop - lblock) << shift;
2329        do {
2330                bh.b_state = 0;
2331                bh.b_size = size;
2332                gfs2_block_map(&ip->i_inode, lblock, &bh, 0);
2333                if (!buffer_mapped(&bh))
2334                        return 1;
2335                size -= bh.b_size;
2336                lblock += (bh.b_size >> ip->i_inode.i_blkbits);
2337        } while(size > 0);
2338
2339        return 0;
2340}
2341
2342static int stuffed_zero_range(struct inode *inode, loff_t offset, loff_t length)
2343{
2344        struct gfs2_inode *ip = GFS2_I(inode);
2345        struct buffer_head *dibh;
2346        int error;
2347
2348        if (offset >= inode->i_size)
2349                return 0;
2350        if (offset + length > inode->i_size)
2351                length = inode->i_size - offset;
2352
2353        error = gfs2_meta_inode_buffer(ip, &dibh);
2354        if (error)
2355                return error;
2356        gfs2_trans_add_meta(ip->i_gl, dibh);
2357        memset(dibh->b_data + sizeof(struct gfs2_dinode) + offset, 0,
2358               length);
2359        brelse(dibh);
2360        return 0;
2361}
2362
2363static int gfs2_journaled_truncate_range(struct inode *inode, loff_t offset,
2364                                         loff_t length)
2365{
2366        struct gfs2_sbd *sdp = GFS2_SB(inode);
2367        loff_t max_chunk = GFS2_JTRUNC_REVOKES * sdp->sd_vfs->s_blocksize;
2368        int error;
2369
2370        while (length) {
2371                struct gfs2_trans *tr;
2372                loff_t chunk;
2373                unsigned int offs;
2374
2375                chunk = length;
2376                if (chunk > max_chunk)
2377                        chunk = max_chunk;
2378
2379                offs = offset & ~PAGE_MASK;
2380                if (offs && chunk > PAGE_SIZE)
2381                        chunk = offs + ((chunk - offs) & PAGE_MASK);
2382
2383                truncate_pagecache_range(inode, offset, chunk);
2384                offset += chunk;
2385                length -= chunk;
2386
2387                tr = current->journal_info;
2388                if (!test_bit(TR_TOUCHED, &tr->tr_flags))
2389                        continue;
2390
2391                gfs2_trans_end(sdp);
2392                error = gfs2_trans_begin(sdp, RES_DINODE, GFS2_JTRUNC_REVOKES);
2393                if (error)
2394                        return error;
2395        }
2396        return 0;
2397}
2398
2399int __gfs2_punch_hole(struct file *file, loff_t offset, loff_t length)
2400{
2401        struct inode *inode = file_inode(file);
2402        struct gfs2_inode *ip = GFS2_I(inode);
2403        struct gfs2_sbd *sdp = GFS2_SB(inode);
2404        int error;
2405
2406        if (gfs2_is_jdata(ip))
2407                error = gfs2_trans_begin(sdp, RES_DINODE + 2 * RES_JDATA,
2408                                         GFS2_JTRUNC_REVOKES);
2409        else
2410                error = gfs2_trans_begin(sdp, RES_DINODE, 0);
2411        if (error)
2412                return error;
2413
2414        if (gfs2_is_stuffed(ip)) {
2415                error = stuffed_zero_range(inode, offset, length);
2416                if (error)
2417                        goto out;
2418        } else {
2419                unsigned int start_off, end_len, blocksize;
2420
2421                blocksize = i_blocksize(inode);
2422                start_off = offset & (blocksize - 1);
2423                end_len = (offset + length) & (blocksize - 1);
2424                if (start_off) {
2425                        unsigned int len = length;
2426                        if (length > blocksize - start_off)
2427                                len = blocksize - start_off;
2428                        error = gfs2_block_zero_range(inode, offset, len);
2429                        if (error)
2430                                goto out;
2431                        if (start_off + length < blocksize)
2432                                end_len = 0;
2433                }
2434                if (end_len) {
2435                        error = gfs2_block_zero_range(inode,
2436                                offset + length - end_len, end_len);
2437                        if (error)
2438                                goto out;
2439                }
2440        }
2441
2442        if (gfs2_is_jdata(ip)) {
2443                BUG_ON(!current->journal_info);
2444                gfs2_journaled_truncate_range(inode, offset, length);
2445        } else
2446                truncate_pagecache_range(inode, offset, offset + length - 1);
2447
2448        file_update_time(file);
2449        mark_inode_dirty(inode);
2450
2451        if (current->journal_info)
2452                gfs2_trans_end(sdp);
2453
2454        if (!gfs2_is_stuffed(ip))
2455                error = punch_hole(ip, offset, length);
2456
2457out:
2458        if (current->journal_info)
2459                gfs2_trans_end(sdp);
2460        return error;
2461}
2462