linux/fs/gfs2/bmap.c
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
 * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
 */

#include <linux/spinlock.h>
#include <linux/completion.h>
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
#include <linux/gfs2_ondisk.h>
#include <linux/crc32.h>
#include <linux/iomap.h>
#include <linux/ktime.h>

#include "gfs2.h"
#include "incore.h"
#include "bmap.h"
#include "glock.h"
#include "inode.h"
#include "meta_io.h"
#include "quota.h"
#include "rgrp.h"
#include "log.h"
#include "super.h"
#include "trans.h"
#include "dir.h"
#include "util.h"
#include "aops.h"
#include "trace_gfs2.h"

/* This doesn't need to be that large as the maximum number of 64 bit
 * pointers in a 4k block is 512, so __u16 is fine for that. It saves
 * stack space to keep it small.
 */
struct metapath {
        struct buffer_head *mp_bh[GFS2_MAX_META_HEIGHT];
        __u16 mp_list[GFS2_MAX_META_HEIGHT];
        int mp_fheight; /* find_metapath height */
        int mp_aheight; /* actual height (lookup height) */
};

static int punch_hole(struct gfs2_inode *ip, u64 offset, u64 length);

/**
 * gfs2_unstuffer_page - unstuff a stuffed inode into a block cached by a page
 * @ip: the inode
 * @dibh: the dinode buffer
 * @block: the block number that was allocated
 * @page: The (locked) page to unstuff into
 *
 * Returns: errno
 */

static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
                               u64 block, struct page *page)
{
        struct inode *inode = &ip->i_inode;

        if (!PageUptodate(page)) {
                void *kaddr = kmap(page);
                u64 dsize = i_size_read(inode);

                if (dsize > gfs2_max_stuffed_size(ip))
                        dsize = gfs2_max_stuffed_size(ip);

                memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize);
                memset(kaddr + dsize, 0, PAGE_SIZE - dsize);
                kunmap(page);

                SetPageUptodate(page);
        }

        if (gfs2_is_jdata(ip)) {
                struct buffer_head *bh;

                if (!page_has_buffers(page))
                        create_empty_buffers(page, BIT(inode->i_blkbits),
                                             BIT(BH_Uptodate));

                bh = page_buffers(page);
                if (!buffer_mapped(bh))
                        map_bh(bh, inode->i_sb, block);

                set_buffer_uptodate(bh);
                gfs2_trans_add_data(ip->i_gl, bh);
        } else {
                set_page_dirty(page);
                gfs2_ordered_add_inode(ip);
        }

        return 0;
}

static int __gfs2_unstuff_inode(struct gfs2_inode *ip, struct page *page)
{
        struct buffer_head *bh, *dibh;
        struct gfs2_dinode *di;
        u64 block = 0;
        int isdir = gfs2_is_dir(ip);
        int error;

        error = gfs2_meta_inode_buffer(ip, &dibh);
        if (error)
                return error;

        if (i_size_read(&ip->i_inode)) {
                /* Get a free block, fill it with the stuffed data,
                   and write it out to disk */

                unsigned int n = 1;
                error = gfs2_alloc_blocks(ip, &block, &n, 0, NULL);
                if (error)
                        goto out_brelse;
                if (isdir) {
                        gfs2_trans_remove_revoke(GFS2_SB(&ip->i_inode), block, 1);
                        error = gfs2_dir_get_new_buffer(ip, block, &bh);
                        if (error)
                                goto out_brelse;
                        gfs2_buffer_copy_tail(bh, sizeof(struct gfs2_meta_header),
                                              dibh, sizeof(struct gfs2_dinode));
                        brelse(bh);
                } else {
                        error = gfs2_unstuffer_page(ip, dibh, block, page);
                        if (error)
                                goto out_brelse;
                }
        }

        /*  Set up the pointer to the new block  */

        gfs2_trans_add_meta(ip->i_gl, dibh);
        di = (struct gfs2_dinode *)dibh->b_data;
        gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));

        if (i_size_read(&ip->i_inode)) {
                *(__be64 *)(di + 1) = cpu_to_be64(block);
                gfs2_add_inode_blocks(&ip->i_inode, 1);
                di->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
        }

        ip->i_height = 1;
        di->di_height = cpu_to_be16(1);

out_brelse:
        brelse(dibh);
        return error;
}

/**
 * gfs2_unstuff_dinode - Unstuff a dinode when the data has grown too big
 * @ip: The GFS2 inode to unstuff
 *
 * This routine unstuffs a dinode and returns it to a "normal" state such
 * that the height can be grown in the traditional way.
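 *
 * As a worked example of when this happens (the figures assume the common
 * 4KiB block size and the 232-byte on-disk dinode): a stuffed inode holds
 * at most sb_bsize - sizeof(struct gfs2_dinode) = 4096 - 232 = 3864 bytes
 * of data in the dinode block itself (see gfs2_max_stuffed_size()), so the
 * first write extending the file past that limit must unstuff it first.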
 *
 * Returns: errno
 */

int gfs2_unstuff_dinode(struct gfs2_inode *ip)
{
        struct inode *inode = &ip->i_inode;
        struct page *page;
        int error;

        down_write(&ip->i_rw_mutex);
        page = find_or_create_page(inode->i_mapping, 0, GFP_NOFS);
        error = -ENOMEM;
        if (!page)
                goto out;
        error = __gfs2_unstuff_inode(ip, page);
        unlock_page(page);
        put_page(page);
out:
        up_write(&ip->i_rw_mutex);
        return error;
}

/**
 * find_metapath - Find path through the metadata tree
 * @sdp: The superblock
 * @block: The disk block to look up
 * @mp: The metapath to return the result in
 * @height: The pre-calculated height of the metadata tree
 *
 *   This routine returns a struct metapath structure that defines a path
 *   through the metadata of inode "ip" to get to block "block".
 *
 *   Example:
 *   Given:  "ip" is a height 3 file, "offset" is 101342453, and this is a
 *   filesystem with a blocksize of 4096.
 *
 *   find_metapath() would return a struct metapath structure set to:
 *   mp_fheight = 3, mp_list[0] = 0, mp_list[1] = 48, and mp_list[2] = 165.
 *
 *   That means that in order to get to the block containing the byte at
 *   offset 101342453, we would load the indirect block pointed to by pointer
 *   0 in the dinode.  We would then load the indirect block pointed to by
 *   pointer 48 in that indirect block.  We would then load the data block
 *   pointed to by pointer 165 in that indirect block.
 *
 *             ----------------------------------------
 *             | Dinode |                             |
 *             |        |                            4|
 *             |        |0 1 2 3 4 5                 9|
 *             |        |                            6|
 *             ----------------------------------------
 *                       |
 *                       |
 *                       V
 *             ----------------------------------------
 *             | Indirect Block                       |
 *             |                                     5|
 *             |            4 4 4 4 4 5 5            1|
 *             |0           5 6 7 8 9 0 1            2|
 *             ----------------------------------------
 *                                |
 *                                |
 *                                V
 *             ----------------------------------------
 *             | Indirect Block                       |
 *             |                         1 1 1 1 1   5|
 *             |                         6 6 6 6 6   1|
 *             |0                        3 4 5 6 7   2|
 *             ----------------------------------------
 *                                           |
 *                                           |
 *                                           V
 *             ----------------------------------------
 *             | Data block containing offset         |
 *             |            101342453                 |
 *             |                                      |
 *             |                                      |
 *             ----------------------------------------
 *
 */

static void find_metapath(const struct gfs2_sbd *sdp, u64 block,
                          struct metapath *mp, unsigned int height)
{
        unsigned int i;

        mp->mp_fheight = height;
        for (i = height; i--;)
                mp->mp_list[i] = do_div(block, sdp->sd_inptrs);
}
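
/*
 * For illustration, a userspace sketch of the decomposition above, using the
 * simplified figure of 512 pointers per 4KiB block from the example (the
 * real sd_inptrs is slightly smaller, since each indirect block starts with
 * a struct gfs2_meta_header):
 *
 *      u64 block = 101342453 / 4096;   // logical block 24741
 *      u16 list[3];
 *      int i;
 *
 *      for (i = 3; i--;) {
 *              list[i] = block % 512;  // do_div() returns the remainder
 *              block /= 512;           // ... and leaves the quotient behind
 *      }
 *      // list[] = { 0, 48, 165 }, matching the example above
 */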

/*
 * Determine the height at which a new branch will diverge from the existing
 * metadata tree when the tree height is grown.  The old tree becomes the
 * subtree behind pointer 0 of each new top-level block, so if the target
 * block also lies below pointer 0 (mp_list[0] == 0), the new branch only
 * splits off at height 2; otherwise it branches directly at height 1.
 */
static inline unsigned int metapath_branch_start(const struct metapath *mp)
{
        if (mp->mp_list[0] == 0)
                return 2;
        return 1;
}

/**
 * metaptr1 - Return the first possible metadata pointer in a metapath buffer
 * @height: The metadata height (0 = dinode)
 * @mp: The metapath
 */
static inline __be64 *metaptr1(unsigned int height, const struct metapath *mp)
{
        struct buffer_head *bh = mp->mp_bh[height];
        if (height == 0)
                return ((__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)));
        return ((__be64 *)(bh->b_data + sizeof(struct gfs2_meta_header)));
}

/**
 * metapointer - Return pointer to start of metadata in a buffer
 * @height: The metadata height (0 = dinode)
 * @mp: The metapath
 *
 * Return a pointer to the block number of the next height of the metadata
 * tree given a buffer containing the pointer to the current height of the
 * metadata tree.
 */

static inline __be64 *metapointer(unsigned int height, const struct metapath *mp)
{
        __be64 *p = metaptr1(height, mp);
        return p + mp->mp_list[height];
}

static inline const __be64 *metaend(unsigned int height, const struct metapath *mp)
{
        const struct buffer_head *bh = mp->mp_bh[height];
        return (const __be64 *)(bh->b_data + bh->b_size);
}

static void clone_metapath(struct metapath *clone, struct metapath *mp)
{
        unsigned int hgt;

        *clone = *mp;
        for (hgt = 0; hgt < mp->mp_aheight; hgt++)
                get_bh(clone->mp_bh[hgt]);
}

static void gfs2_metapath_ra(struct gfs2_glock *gl, __be64 *start, __be64 *end)
{
        const __be64 *t;

        for (t = start; t < end; t++) {
                struct buffer_head *rabh;

                if (!*t)
                        continue;

                rabh = gfs2_getbuf(gl, be64_to_cpu(*t), CREATE);
                if (trylock_buffer(rabh)) {
                        if (!buffer_uptodate(rabh)) {
                                rabh->b_end_io = end_buffer_read_sync;
                                submit_bh(REQ_OP_READ,
                                          REQ_RAHEAD | REQ_META | REQ_PRIO,
                                          rabh);
                                continue;
                        }
                        unlock_buffer(rabh);
                }
                brelse(rabh);
        }
}

static int __fillup_metapath(struct gfs2_inode *ip, struct metapath *mp,
                             unsigned int x, unsigned int h)
{
        for (; x < h; x++) {
                __be64 *ptr = metapointer(x, mp);
                u64 dblock = be64_to_cpu(*ptr);
                int ret;

                if (!dblock)
                        break;
                ret = gfs2_meta_buffer(ip, GFS2_METATYPE_IN, dblock, &mp->mp_bh[x + 1]);
                if (ret)
                        return ret;
        }
        mp->mp_aheight = x + 1;
        return 0;
}

/**
 * lookup_metapath - Walk the metadata tree to a specific point
 * @ip: The inode
 * @mp: The metapath
 *
 * Assumes that the inode's buffer has already been looked up and
 * hooked onto mp->mp_bh[0] and that the metapath has been initialised
 * by find_metapath().
 *
 * If this function encounters part of the tree which has not been
 * allocated, it returns the current height of the tree at the point
 * at which it found the unallocated block. Blocks which are found are
 * added to the mp->mp_bh[] list.
 *
 * Returns: error
 */

static int lookup_metapath(struct gfs2_inode *ip, struct metapath *mp)
{
        return __fillup_metapath(ip, mp, 0, ip->i_height - 1);
}

/**
 * fillup_metapath - fill up buffers for the metadata path to a specific height
 * @ip: The inode
 * @mp: The metapath
 * @h: The height to which it should be mapped
 *
 * Similar to lookup_metapath, but does lookups for a range of heights
 *
 * Returns: error or the number of buffers filled
 */

static int fillup_metapath(struct gfs2_inode *ip, struct metapath *mp, int h)
{
        unsigned int x = 0;
        int ret;

        if (h) {
                /* find the first buffer we need to look up. */
                for (x = h - 1; x > 0; x--) {
                        if (mp->mp_bh[x])
                                break;
                }
        }
        ret = __fillup_metapath(ip, mp, x, h);
        if (ret)
                return ret;
        return mp->mp_aheight - x - 1;
}

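/*
 * Reconstruct the logical block number that a metapath refers to: the
 * inverse of find_metapath().  With the simplified 512-pointer example
 * above, { 0, 48, 165 } maps back to (0 * 512 + 48) * 512 + 165 = 24741.
 * Offsets at unallocated heights (hgt >= mp_aheight) are treated as zero,
 * rounding down to the first block covered by the lowest allocated pointer.
 */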
static sector_t metapath_to_block(struct gfs2_sbd *sdp, struct metapath *mp)
{
        sector_t factor = 1, block = 0;
        int hgt;

        for (hgt = mp->mp_fheight - 1; hgt >= 0; hgt--) {
                if (hgt < mp->mp_aheight)
                        block += mp->mp_list[hgt] * factor;
                factor *= sdp->sd_inptrs;
        }
        return block;
}

static void release_metapath(struct metapath *mp)
{
        int i;

        for (i = 0; i < GFS2_MAX_META_HEIGHT; i++) {
                if (mp->mp_bh[i] == NULL)
                        break;
                brelse(mp->mp_bh[i]);
                mp->mp_bh[i] = NULL;
        }
}

/**
 * gfs2_extent_length - Returns length of an extent of blocks
 * @bh: The metadata block
 * @ptr: Current position in @bh
 * @limit: Max extent length to return
 * @eob: Set to 1 if we hit "end of block"
 *
 * Returns: The length of the extent (minimum of one block)
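 *
 * For example (illustrative numbers): if the pointers starting at @ptr hold
 * disk blocks 1000, 1001, 1002, 2000, the run of consecutive addresses ends
 * after three pointers and the function returns 3.  A zero (unallocated)
 * pointer ends the run in the same way.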
 */

static inline unsigned int gfs2_extent_length(struct buffer_head *bh, __be64 *ptr, size_t limit, int *eob)
{
        const __be64 *end = (__be64 *)(bh->b_data + bh->b_size);
        const __be64 *first = ptr;
        u64 d = be64_to_cpu(*ptr);

        *eob = 0;
        do {
                ptr++;
                if (ptr >= end)
                        break;
                d++;
        } while (be64_to_cpu(*ptr) == d);
        if (ptr >= end)
                *eob = 1;
        return ptr - first;
}

enum walker_status { WALK_STOP, WALK_FOLLOW, WALK_CONTINUE };

/*
 * gfs2_metadata_walker - walk an indirect block
 * @mp: Metapath to indirect block
 * @ptrs: Number of pointers to look at
 *
 * When returning WALK_FOLLOW, the walker must update @mp to point at the right
 * indirect block to follow.
 */
typedef enum walker_status (*gfs2_metadata_walker)(struct metapath *mp,
                                                   unsigned int ptrs);

/*
 * gfs2_walk_metadata - walk a tree of indirect blocks
 * @inode: The inode
 * @mp: Starting point of walk
 * @max_len: Maximum number of blocks to walk
 * @walker: Called during the walk
 *
 * Returns 1 if the walk was stopped by @walker, 0 if we went past @max_len or
 * past the end of metadata, and a negative error code otherwise.
 */

static int gfs2_walk_metadata(struct inode *inode, struct metapath *mp,
                u64 max_len, gfs2_metadata_walker walker)
{
        struct gfs2_inode *ip = GFS2_I(inode);
        struct gfs2_sbd *sdp = GFS2_SB(inode);
        u64 factor = 1;
        unsigned int hgt;
        int ret;

        /*
         * The walk starts in the lowest allocated indirect block, which may be
         * before the position indicated by @mp.  Adjust @max_len accordingly
         * to avoid a short walk.
         */
        for (hgt = mp->mp_fheight - 1; hgt >= mp->mp_aheight; hgt--) {
                max_len += mp->mp_list[hgt] * factor;
                mp->mp_list[hgt] = 0;
                factor *= sdp->sd_inptrs;
        }

        for (;;) {
                u16 start = mp->mp_list[hgt];
                enum walker_status status;
                unsigned int ptrs;
                u64 len;

                /* Walk indirect block. */
                ptrs = (hgt >= 1 ? sdp->sd_inptrs : sdp->sd_diptrs) - start;
                len = ptrs * factor;
                if (len > max_len)
                        ptrs = DIV_ROUND_UP_ULL(max_len, factor);
                status = walker(mp, ptrs);
                switch (status) {
                case WALK_STOP:
                        return 1;
                case WALK_FOLLOW:
                        BUG_ON(mp->mp_aheight == mp->mp_fheight);
                        ptrs = mp->mp_list[hgt] - start;
                        len = ptrs * factor;
                        break;
                case WALK_CONTINUE:
                        break;
                }
                if (len >= max_len)
                        break;
                max_len -= len;
                if (status == WALK_FOLLOW)
                        goto fill_up_metapath;

lower_metapath:
                /* Decrease height of metapath. */
                brelse(mp->mp_bh[hgt]);
                mp->mp_bh[hgt] = NULL;
                mp->mp_list[hgt] = 0;
                if (!hgt)
                        break;
                hgt--;
                factor *= sdp->sd_inptrs;

                /* Advance in metadata tree. */
                (mp->mp_list[hgt])++;
                if (hgt) {
                        if (mp->mp_list[hgt] >= sdp->sd_inptrs)
                                goto lower_metapath;
                } else {
                        if (mp->mp_list[hgt] >= sdp->sd_diptrs)
                                break;
                }

fill_up_metapath:
                /* Increase height of metapath. */
                ret = fillup_metapath(ip, mp, ip->i_height - 1);
                if (ret < 0)
                        return ret;
                hgt += ret;
                for (; ret; ret--)
                        do_div(factor, sdp->sd_inptrs);
                mp->mp_aheight = hgt + 1;
        }
        return 0;
}

static enum walker_status gfs2_hole_walker(struct metapath *mp,
                                           unsigned int ptrs)
{
        const __be64 *start, *ptr, *end;
        unsigned int hgt;

        hgt = mp->mp_aheight - 1;
        start = metapointer(hgt, mp);
        end = start + ptrs;

        for (ptr = start; ptr < end; ptr++) {
                if (*ptr) {
                        mp->mp_list[hgt] += ptr - start;
                        if (mp->mp_aheight == mp->mp_fheight)
                                return WALK_STOP;
                        return WALK_FOLLOW;
                }
        }
        return WALK_CONTINUE;
}

/**
 * gfs2_hole_size - figure out the size of a hole
 * @inode: The inode
 * @lblock: The logical starting block number
 * @len: How far to look (in blocks)
 * @mp: The metapath at lblock
 * @iomap: The iomap to store the hole size in
 *
 * This function modifies @mp.
 *
 * Returns: errno on error
 */
static int gfs2_hole_size(struct inode *inode, sector_t lblock, u64 len,
                          struct metapath *mp, struct iomap *iomap)
{
        struct metapath clone;
        u64 hole_size;
        int ret;

        clone_metapath(&clone, mp);
        ret = gfs2_walk_metadata(inode, &clone, len, gfs2_hole_walker);
        if (ret < 0)
                goto out;

        if (ret == 1)
                hole_size = metapath_to_block(GFS2_SB(inode), &clone) - lblock;
        else
                hole_size = len;
        iomap->length = hole_size << inode->i_blkbits;
        ret = 0;

out:
        release_metapath(&clone);
        return ret;
}

static inline __be64 *gfs2_indirect_init(struct metapath *mp,
                                         struct gfs2_glock *gl, unsigned int i,
                                         unsigned offset, u64 bn)
{
        __be64 *ptr = (__be64 *)(mp->mp_bh[i - 1]->b_data +
                       ((i > 1) ? sizeof(struct gfs2_meta_header) :
                                 sizeof(struct gfs2_dinode)));
        BUG_ON(i < 1);
        BUG_ON(mp->mp_bh[i] != NULL);
        mp->mp_bh[i] = gfs2_meta_new(gl, bn);
        gfs2_trans_add_meta(gl, mp->mp_bh[i]);
        gfs2_metatype_set(mp->mp_bh[i], GFS2_METATYPE_IN, GFS2_FORMAT_IN);
        gfs2_buffer_clear_tail(mp->mp_bh[i], sizeof(struct gfs2_meta_header));
        ptr += offset;
        *ptr = cpu_to_be64(bn);
        return ptr;
}

enum alloc_state {
        ALLOC_DATA = 0,
        ALLOC_GROW_DEPTH = 1,
        ALLOC_GROW_HEIGHT = 2,
        /* ALLOC_UNSTUFF = 3,   TBD and rather complicated */
};

/**
 * __gfs2_iomap_alloc - Build a metadata tree of the requested height
 * @inode: The GFS2 inode
 * @iomap: The iomap structure
 * @mp: The metapath, with proper height information calculated
 *
 * In this routine we may have to alloc:
 *   i) Indirect blocks to grow the metadata tree height
 *  ii) Indirect blocks to fill in lower part of the metadata tree
 * iii) Data blocks
 *
 * This function is called after __gfs2_iomap_get, which works out the
 * total number of blocks which we need via gfs2_alloc_size.
 *
 * We then do the actual allocation, asking for an extent at a time (if
 * enough contiguous free blocks are available, there will only be one
 * allocation request per call) and use the state machine to initialise
 * the blocks in order.
 *
 * Right now, this function will allocate at most one indirect block
 * worth of data -- with a default block size of 4K, that's slightly
 * less than 2M.  If this limitation is ever removed to allow huge
 * allocations, we would probably still want to limit the iomap size we
 * return to avoid stalling other tasks during huge writes; the next
 * iomap iteration would then find the blocks already allocated.
 *
 * Returns: errno on error
 */

static int __gfs2_iomap_alloc(struct inode *inode, struct iomap *iomap,
                              struct metapath *mp)
{
        struct gfs2_inode *ip = GFS2_I(inode);
        struct gfs2_sbd *sdp = GFS2_SB(inode);
        struct buffer_head *dibh = mp->mp_bh[0];
        u64 bn;
        unsigned n, i, blks, alloced = 0, iblks = 0, branch_start = 0;
        size_t dblks = iomap->length >> inode->i_blkbits;
        const unsigned end_of_metadata = mp->mp_fheight - 1;
        int ret;
        enum alloc_state state;
        __be64 *ptr;
        __be64 zero_bn = 0;

        BUG_ON(mp->mp_aheight < 1);
        BUG_ON(dibh == NULL);
        BUG_ON(dblks < 1);

        gfs2_trans_add_meta(ip->i_gl, dibh);

        down_write(&ip->i_rw_mutex);

        if (mp->mp_fheight == mp->mp_aheight) {
                /* Bottom indirect block exists */
                state = ALLOC_DATA;
        } else {
                /* Need to allocate indirect blocks */
                if (mp->mp_fheight == ip->i_height) {
                        /* Writing into existing tree, extend tree down */
                        iblks = mp->mp_fheight - mp->mp_aheight;
                        state = ALLOC_GROW_DEPTH;
                } else {
                        /* Building up tree height */
                        state = ALLOC_GROW_HEIGHT;
                        iblks = mp->mp_fheight - ip->i_height;
                        branch_start = metapath_branch_start(mp);
                        iblks += (mp->mp_fheight - branch_start);
                }
        }

        /* start of the second part of the function (state machine) */

        blks = dblks + iblks;
        i = mp->mp_aheight;
        do {
                n = blks - alloced;
                ret = gfs2_alloc_blocks(ip, &bn, &n, 0, NULL);
                if (ret)
                        goto out;
                alloced += n;
                if (state != ALLOC_DATA || gfs2_is_jdata(ip))
                        gfs2_trans_remove_revoke(sdp, bn, n);
                switch (state) {
                /* Growing height of tree */
                case ALLOC_GROW_HEIGHT:
                        if (i == 1) {
                                ptr = (__be64 *)(dibh->b_data +
                                                 sizeof(struct gfs2_dinode));
                                zero_bn = *ptr;
                        }
                        for (; i - 1 < mp->mp_fheight - ip->i_height && n > 0;
                             i++, n--)
                                gfs2_indirect_init(mp, ip->i_gl, i, 0, bn++);
                        if (i - 1 == mp->mp_fheight - ip->i_height) {
                                i--;
                                gfs2_buffer_copy_tail(mp->mp_bh[i],
                                                sizeof(struct gfs2_meta_header),
                                                dibh, sizeof(struct gfs2_dinode));
                                gfs2_buffer_clear_tail(dibh,
                                                sizeof(struct gfs2_dinode) +
                                                sizeof(__be64));
                                ptr = (__be64 *)(mp->mp_bh[i]->b_data +
                                        sizeof(struct gfs2_meta_header));
                                *ptr = zero_bn;
                                state = ALLOC_GROW_DEPTH;
                                for (i = branch_start; i < mp->mp_fheight; i++) {
                                        if (mp->mp_bh[i] == NULL)
                                                break;
                                        brelse(mp->mp_bh[i]);
                                        mp->mp_bh[i] = NULL;
                                }
                                i = branch_start;
                        }
                        if (n == 0)
                                break;
                        fallthrough;    /* To branching from existing tree */
                case ALLOC_GROW_DEPTH:
                        if (i > 1 && i < mp->mp_fheight)
                                gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[i-1]);
                        for (; i < mp->mp_fheight && n > 0; i++, n--)
                                gfs2_indirect_init(mp, ip->i_gl, i,
                                                   mp->mp_list[i-1], bn++);
                        if (i == mp->mp_fheight)
                                state = ALLOC_DATA;
                        if (n == 0)
                                break;
                        fallthrough;    /* To tree complete, adding data blocks */
                case ALLOC_DATA:
                        BUG_ON(n > dblks);
                        BUG_ON(mp->mp_bh[end_of_metadata] == NULL);
                        gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[end_of_metadata]);
                        dblks = n;
                        ptr = metapointer(end_of_metadata, mp);
                        iomap->addr = bn << inode->i_blkbits;
                        iomap->flags |= IOMAP_F_MERGED | IOMAP_F_NEW;
                        while (n-- > 0)
                                *ptr++ = cpu_to_be64(bn++);
                        break;
                }
        } while (iomap->addr == IOMAP_NULL_ADDR);

        iomap->type = IOMAP_MAPPED;
        iomap->length = (u64)dblks << inode->i_blkbits;
        ip->i_height = mp->mp_fheight;
        gfs2_add_inode_blocks(&ip->i_inode, alloced);
        gfs2_dinode_out(ip, dibh->b_data);
out:
        up_write(&ip->i_rw_mutex);
        return ret;
}

#define IOMAP_F_GFS2_BOUNDARY IOMAP_F_PRIVATE

/**
 * gfs2_alloc_size - Compute the maximum allocation size
 * @inode: The inode
 * @mp: The metapath
 * @size: Requested size in blocks
 *
 * Compute the maximum size of the next allocation at @mp.
 *
 * Returns: size in blocks
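 *
 * For example (illustrative figures, assuming 509 pointers per indirect
 * block): if @mp points at slot 505 of a bottom indirect block that still
 * needs to be allocated, at most 509 - 505 = 4 blocks can be mapped by this
 * allocation, no matter how large @size is.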
 */
static u64 gfs2_alloc_size(struct inode *inode, struct metapath *mp, u64 size)
{
        struct gfs2_inode *ip = GFS2_I(inode);
        struct gfs2_sbd *sdp = GFS2_SB(inode);
        const __be64 *first, *ptr, *end;

        /*
         * For writes to stuffed files, this function is called twice via
         * __gfs2_iomap_get, before and after unstuffing. The size we return the
         * first time needs to be large enough to get the reservation and
         * allocation sizes right.  The size we return the second time must
         * be exact or else __gfs2_iomap_alloc won't do the right thing.
         */

        if (gfs2_is_stuffed(ip) || mp->mp_fheight != mp->mp_aheight) {
                unsigned int maxsize = mp->mp_fheight > 1 ?
                        sdp->sd_inptrs : sdp->sd_diptrs;
                maxsize -= mp->mp_list[mp->mp_fheight - 1];
                if (size > maxsize)
                        size = maxsize;
                return size;
        }

        first = metapointer(ip->i_height - 1, mp);
        end = metaend(ip->i_height - 1, mp);
        if (end - first > size)
                end = first + size;
        for (ptr = first; ptr < end; ptr++) {
                if (*ptr)
                        break;
        }
        return ptr - first;
}

/**
 * __gfs2_iomap_get - Map blocks from an inode to disk blocks
 * @inode: The inode
 * @pos: Starting position in bytes
 * @length: Length to map, in bytes
 * @flags: iomap flags
 * @iomap: The iomap structure
 * @mp: The metapath
 *
 * Returns: errno
 */
static int __gfs2_iomap_get(struct inode *inode, loff_t pos, loff_t length,
                            unsigned flags, struct iomap *iomap,
                            struct metapath *mp)
{
        struct gfs2_inode *ip = GFS2_I(inode);
        struct gfs2_sbd *sdp = GFS2_SB(inode);
        loff_t size = i_size_read(inode);
        __be64 *ptr;
        sector_t lblock;
        sector_t lblock_stop;
        int ret;
        int eob;
        u64 len;
        struct buffer_head *dibh = NULL, *bh;
        u8 height;

        if (!length)
                return -EINVAL;

        down_read(&ip->i_rw_mutex);

        ret = gfs2_meta_inode_buffer(ip, &dibh);
        if (ret)
                goto unlock;
        mp->mp_bh[0] = dibh;

        if (gfs2_is_stuffed(ip)) {
                if (flags & IOMAP_WRITE) {
                        loff_t max_size = gfs2_max_stuffed_size(ip);

                        if (pos + length > max_size)
                                goto unstuff;
                        iomap->length = max_size;
                } else {
                        if (pos >= size) {
                                if (flags & IOMAP_REPORT) {
                                        ret = -ENOENT;
                                        goto unlock;
                                } else {
                                        iomap->offset = pos;
                                        iomap->length = length;
                                        goto hole_found;
                                }
                        }
                        iomap->length = size;
                }
                iomap->addr = (ip->i_no_addr << inode->i_blkbits) +
                              sizeof(struct gfs2_dinode);
                iomap->type = IOMAP_INLINE;
                iomap->inline_data = dibh->b_data + sizeof(struct gfs2_dinode);
                goto out;
        }

unstuff:
        lblock = pos >> inode->i_blkbits;
        iomap->offset = lblock << inode->i_blkbits;
        lblock_stop = (pos + length - 1) >> inode->i_blkbits;
        len = lblock_stop - lblock + 1;
        iomap->length = len << inode->i_blkbits;

        height = ip->i_height;
        while ((lblock + 1) * sdp->sd_sb.sb_bsize > sdp->sd_heightsize[height])
                height++;
        find_metapath(sdp, lblock, mp, height);
        if (height > ip->i_height || gfs2_is_stuffed(ip))
                goto do_alloc;

        ret = lookup_metapath(ip, mp);
        if (ret)
                goto unlock;

        if (mp->mp_aheight != ip->i_height)
                goto do_alloc;

        ptr = metapointer(ip->i_height - 1, mp);
        if (*ptr == 0)
                goto do_alloc;

        bh = mp->mp_bh[ip->i_height - 1];
        len = gfs2_extent_length(bh, ptr, len, &eob);

        iomap->addr = be64_to_cpu(*ptr) << inode->i_blkbits;
        iomap->length = len << inode->i_blkbits;
        iomap->type = IOMAP_MAPPED;
        iomap->flags |= IOMAP_F_MERGED;
        if (eob)
                iomap->flags |= IOMAP_F_GFS2_BOUNDARY;

out:
        iomap->bdev = inode->i_sb->s_bdev;
unlock:
        up_read(&ip->i_rw_mutex);
        return ret;

do_alloc:
        if (flags & IOMAP_REPORT) {
                if (pos >= size)
                        ret = -ENOENT;
                else if (height == ip->i_height)
                        ret = gfs2_hole_size(inode, lblock, len, mp, iomap);
                else
                        iomap->length = size - pos;
        } else if (flags & IOMAP_WRITE) {
                u64 alloc_size;

                if (flags & IOMAP_DIRECT)
                        goto out;  /* (see gfs2_file_direct_write) */

                len = gfs2_alloc_size(inode, mp, len);
                alloc_size = len << inode->i_blkbits;
                if (alloc_size < iomap->length)
                        iomap->length = alloc_size;
        } else {
                if (pos < size && height == ip->i_height)
                        ret = gfs2_hole_size(inode, lblock, len, mp, iomap);
        }
hole_found:
        iomap->addr = IOMAP_NULL_ADDR;
        iomap->type = IOMAP_HOLE;
        goto out;
}

static int gfs2_write_lock(struct inode *inode)
{
        struct gfs2_inode *ip = GFS2_I(inode);
        struct gfs2_sbd *sdp = GFS2_SB(inode);
        int error;

        gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh);
        error = gfs2_glock_nq(&ip->i_gh);
        if (error)
                goto out_uninit;
        if (&ip->i_inode == sdp->sd_rindex) {
                struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);

                error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE,
                                           GL_NOCACHE, &m_ip->i_gh);
                if (error)
                        goto out_unlock;
        }
        return 0;

out_unlock:
        gfs2_glock_dq(&ip->i_gh);
out_uninit:
        gfs2_holder_uninit(&ip->i_gh);
        return error;
}

static void gfs2_write_unlock(struct inode *inode)
{
        struct gfs2_inode *ip = GFS2_I(inode);
        struct gfs2_sbd *sdp = GFS2_SB(inode);

        if (&ip->i_inode == sdp->sd_rindex) {
                struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);

                gfs2_glock_dq_uninit(&m_ip->i_gh);
        }
        gfs2_glock_dq_uninit(&ip->i_gh);
}

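/*
 * Open a transaction sized to cover the data blocks that a page-cache write
 * will touch.  As a worked example (illustrative figures, 4KiB blocks): a
 * write at pos = 4094 of len = 10 straddles a block boundary, so
 * blocks = ((4094 & 4095) + 10 + 4095) >> 12 = 2, and the transaction
 * reserves RES_DINODE + 2 journal blocks.
 */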
static int gfs2_iomap_page_prepare(struct inode *inode, loff_t pos,
                                   unsigned len)
{
        unsigned int blockmask = i_blocksize(inode) - 1;
        struct gfs2_sbd *sdp = GFS2_SB(inode);
        unsigned int blocks;

        blocks = ((pos & blockmask) + len + blockmask) >> inode->i_blkbits;
        return gfs2_trans_begin(sdp, RES_DINODE + blocks, 0);
}

static void gfs2_iomap_page_done(struct inode *inode, loff_t pos,
                                 unsigned copied, struct page *page)
{
        struct gfs2_trans *tr = current->journal_info;
        struct gfs2_inode *ip = GFS2_I(inode);
        struct gfs2_sbd *sdp = GFS2_SB(inode);

        if (page && !gfs2_is_stuffed(ip))
                gfs2_page_add_databufs(ip, page, offset_in_page(pos), copied);

        if (tr->tr_num_buf_new)
                __mark_inode_dirty(inode, I_DIRTY_DATASYNC);

        gfs2_trans_end(sdp);
}

static const struct iomap_page_ops gfs2_iomap_page_ops = {
        .page_prepare = gfs2_iomap_page_prepare,
        .page_done = gfs2_iomap_page_done,
};

static int gfs2_iomap_begin_write(struct inode *inode, loff_t pos,
                                  loff_t length, unsigned flags,
                                  struct iomap *iomap,
                                  struct metapath *mp)
{
        struct gfs2_inode *ip = GFS2_I(inode);
        struct gfs2_sbd *sdp = GFS2_SB(inode);
        bool unstuff;
        int ret;

        unstuff = gfs2_is_stuffed(ip) &&
                  pos + length > gfs2_max_stuffed_size(ip);

        if (unstuff || iomap->type == IOMAP_HOLE) {
                unsigned int data_blocks, ind_blocks;
                struct gfs2_alloc_parms ap = {};
                unsigned int rblocks;
                struct gfs2_trans *tr;

                gfs2_write_calc_reserv(ip, iomap->length, &data_blocks,
                                       &ind_blocks);
                ap.target = data_blocks + ind_blocks;
                ret = gfs2_quota_lock_check(ip, &ap);
                if (ret)
                        return ret;

                ret = gfs2_inplace_reserve(ip, &ap);
                if (ret)
                        goto out_qunlock;

                rblocks = RES_DINODE + ind_blocks;
                if (gfs2_is_jdata(ip))
                        rblocks += data_blocks;
                if (ind_blocks || data_blocks)
                        rblocks += RES_STATFS + RES_QUOTA;
                if (inode == sdp->sd_rindex)
                        rblocks += 2 * RES_STATFS;
                rblocks += gfs2_rg_blocks(ip, data_blocks + ind_blocks);

                ret = gfs2_trans_begin(sdp, rblocks,
                                       iomap->length >> inode->i_blkbits);
                if (ret)
                        goto out_trans_fail;

                if (unstuff) {
                        ret = gfs2_unstuff_dinode(ip);
                        if (ret)
                                goto out_trans_end;
                        release_metapath(mp);
                        ret = __gfs2_iomap_get(inode, iomap->offset,
                                               iomap->length, flags, iomap, mp);
                        if (ret)
                                goto out_trans_end;
                }

                if (iomap->type == IOMAP_HOLE) {
                        ret = __gfs2_iomap_alloc(inode, iomap, mp);
                        if (ret) {
                                gfs2_trans_end(sdp);
                                gfs2_inplace_release(ip);
                                punch_hole(ip, iomap->offset, iomap->length);
                                goto out_qunlock;
                        }
                }

                tr = current->journal_info;
                if (tr->tr_num_buf_new)
                        __mark_inode_dirty(inode, I_DIRTY_DATASYNC);

                gfs2_trans_end(sdp);
        }

        if (gfs2_is_stuffed(ip) || gfs2_is_jdata(ip))
                iomap->page_ops = &gfs2_iomap_page_ops;
        return 0;

out_trans_end:
        gfs2_trans_end(sdp);
out_trans_fail:
        gfs2_inplace_release(ip);
out_qunlock:
        gfs2_quota_unlock(ip);
        return ret;
}

static inline bool gfs2_iomap_need_write_lock(unsigned flags)
{
        return (flags & IOMAP_WRITE) && !(flags & IOMAP_DIRECT);
}

static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
                            unsigned flags, struct iomap *iomap,
                            struct iomap *srcmap)
{
        struct gfs2_inode *ip = GFS2_I(inode);
        struct metapath mp = { .mp_aheight = 1, };
        int ret;

        if (gfs2_is_jdata(ip))
                iomap->flags |= IOMAP_F_BUFFER_HEAD;

        trace_gfs2_iomap_start(ip, pos, length, flags);
        if (gfs2_iomap_need_write_lock(flags)) {
                ret = gfs2_write_lock(inode);
                if (ret)
                        goto out;
        }

        ret = __gfs2_iomap_get(inode, pos, length, flags, iomap, &mp);
        if (ret)
                goto out_unlock;

        switch (flags & (IOMAP_WRITE | IOMAP_ZERO)) {
        case IOMAP_WRITE:
                if (flags & IOMAP_DIRECT) {
                        /*
                         * Silently fall back to buffered I/O for stuffed files
                         * or if we've got a hole (see gfs2_file_direct_write).
                         */
                        if (iomap->type != IOMAP_MAPPED)
                                ret = -ENOTBLK;
                        goto out_unlock;
                }
                break;
        case IOMAP_ZERO:
                if (iomap->type == IOMAP_HOLE)
                        goto out_unlock;
                break;
        default:
                goto out_unlock;
        }

        ret = gfs2_iomap_begin_write(inode, pos, length, flags, iomap, &mp);

out_unlock:
        if (ret && gfs2_iomap_need_write_lock(flags))
                gfs2_write_unlock(inode);
        release_metapath(&mp);
out:
        trace_gfs2_iomap_end(ip, iomap, ret);
        return ret;
}

static int gfs2_iomap_end(struct inode *inode, loff_t pos, loff_t length,
                          ssize_t written, unsigned flags, struct iomap *iomap)
{
        struct gfs2_inode *ip = GFS2_I(inode);
        struct gfs2_sbd *sdp = GFS2_SB(inode);

        switch (flags & (IOMAP_WRITE | IOMAP_ZERO)) {
        case IOMAP_WRITE:
                if (flags & IOMAP_DIRECT)
                        return 0;
                break;
        case IOMAP_ZERO:
                if (iomap->type == IOMAP_HOLE)
                        return 0;
                break;
        default:
                return 0;
        }

        if (!gfs2_is_stuffed(ip))
                gfs2_ordered_add_inode(ip);

        if (inode == sdp->sd_rindex)
                adjust_fs_space(inode);

        gfs2_inplace_release(ip);

        if (ip->i_qadata && ip->i_qadata->qa_qd_num)
                gfs2_quota_unlock(ip);

        if (length != written && (iomap->flags & IOMAP_F_NEW)) {
                /* Deallocate blocks that were just allocated. */
                loff_t blockmask = i_blocksize(inode) - 1;
                loff_t end = (pos + length) & ~blockmask;

                pos = (pos + written + blockmask) & ~blockmask;
                if (pos < end) {
                        truncate_pagecache_range(inode, pos, end - 1);
                        punch_hole(ip, pos, end - pos);
                }
        }

        if (unlikely(!written))
                goto out_unlock;

        if (iomap->flags & IOMAP_F_SIZE_CHANGED)
                mark_inode_dirty(inode);
        set_bit(GLF_DIRTY, &ip->i_gl->gl_flags);

out_unlock:
        if (gfs2_iomap_need_write_lock(flags))
                gfs2_write_unlock(inode);
        return 0;
}

const struct iomap_ops gfs2_iomap_ops = {
        .iomap_begin = gfs2_iomap_begin,
        .iomap_end = gfs2_iomap_end,
};
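
/*
 * The ops table above ties GFS2 into the generic iomap machinery.  As a
 * rough sketch of how it is consumed (the real call sites live elsewhere,
 * e.g. in fs/gfs2/file.c), a buffered write ends up doing something like:
 *
 *      ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops);
 *
 * and FIEMAP and SEEK_HOLE/SEEK_DATA requests are served through
 * iomap_fiemap() and iomap_seek_hole()/iomap_seek_data() with the same
 * ops table.
 */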

/**
 * gfs2_block_map - Map one or more blocks of an inode to a disk block
 * @inode: The inode
 * @lblock: The logical block number
 * @bh_map: The bh to be mapped
 * @create: True if it's ok to alloc blocks to satisfy the request
 *
 * The size of the requested mapping is defined in bh_map->b_size.
 *
 * Clears buffer_mapped(bh_map) and leaves bh_map->b_size unchanged
 * when @lblock is not mapped.  Sets buffer_mapped(bh_map) and
 * bh_map->b_size to indicate the size of the mapping when @lblock and
 * successive blocks are mapped, up to the requested size.
 *
 * Sets buffer_boundary() if a read of metadata will be required
 * before the next block can be mapped. Sets buffer_new() if new
 * blocks were allocated.
 *
 * Returns: errno
 */

int gfs2_block_map(struct inode *inode, sector_t lblock,
                   struct buffer_head *bh_map, int create)
{
        struct gfs2_inode *ip = GFS2_I(inode);
        loff_t pos = (loff_t)lblock << inode->i_blkbits;
        loff_t length = bh_map->b_size;
        struct iomap iomap = { };
        int ret;

        clear_buffer_mapped(bh_map);
        clear_buffer_new(bh_map);
        clear_buffer_boundary(bh_map);
        trace_gfs2_bmap(ip, bh_map, lblock, create, 1);

        if (!create)
                ret = gfs2_iomap_get(inode, pos, length, &iomap);
        else
                ret = gfs2_iomap_alloc(inode, pos, length, &iomap);
        if (ret)
                goto out;

        if (iomap.length > bh_map->b_size) {
                iomap.length = bh_map->b_size;
                iomap.flags &= ~IOMAP_F_GFS2_BOUNDARY;
        }
        if (iomap.addr != IOMAP_NULL_ADDR)
                map_bh(bh_map, inode->i_sb, iomap.addr >> inode->i_blkbits);
        bh_map->b_size = iomap.length;
        if (iomap.flags & IOMAP_F_GFS2_BOUNDARY)
                set_buffer_boundary(bh_map);
        if (iomap.flags & IOMAP_F_NEW)
                set_buffer_new(bh_map);

out:
        trace_gfs2_bmap(ip, bh_map, lblock, create, ret);
        return ret;
}

int gfs2_get_extent(struct inode *inode, u64 lblock, u64 *dblock,
                    unsigned int *extlen)
{
        unsigned int blkbits = inode->i_blkbits;
        struct iomap iomap = { };
        unsigned int len;
        int ret;

        ret = gfs2_iomap_get(inode, lblock << blkbits, *extlen << blkbits,
                             &iomap);
        if (ret)
                return ret;
        if (iomap.type != IOMAP_MAPPED)
                return -EIO;
        *dblock = iomap.addr >> blkbits;
        len = iomap.length >> blkbits;
        if (len < *extlen)
                *extlen = len;
        return 0;
}

int gfs2_alloc_extent(struct inode *inode, u64 lblock, u64 *dblock,
                      unsigned int *extlen, bool *new)
{
        unsigned int blkbits = inode->i_blkbits;
        struct iomap iomap = { };
        unsigned int len;
        int ret;

        ret = gfs2_iomap_alloc(inode, lblock << blkbits, *extlen << blkbits,
                               &iomap);
        if (ret)
                return ret;
        if (iomap.type != IOMAP_MAPPED)
                return -EIO;
        *dblock = iomap.addr >> blkbits;
        len = iomap.length >> blkbits;
        if (len < *extlen)
                *extlen = len;
        *new = iomap.flags & IOMAP_F_NEW;
        return 0;
}

/*
 * NOTE: Never call gfs2_block_zero_range with an open transaction because it
 * uses iomap write to perform its actions, which begin their own transactions
 * (iomap_begin, page_prepare, etc.)
 */
static int gfs2_block_zero_range(struct inode *inode, loff_t from,
                                 unsigned int length)
{
        BUG_ON(current->journal_info);
        return iomap_zero_range(inode, from, length, NULL, &gfs2_iomap_ops);
}

#define GFS2_JTRUNC_REVOKES 8192

/**
 * gfs2_journaled_truncate - Wrapper for truncate_pagecache for jdata files
 * @inode: The inode being truncated
 * @oldsize: The original (larger) size
 * @newsize: The new smaller size
 *
 * With jdata files, we have to journal a revoke for each block which is
 * truncated. As a result, we need to split this into separate transactions
 * if the number of pages being truncated gets too large.
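 *
 * For example (illustrative figures, 4KiB blocks): max_chunk comes to
 * GFS2_JTRUNC_REVOKES * 4096 = 32MiB, so truncating a 100MiB jdata file
 * down to zero proceeds in four transactions, each dropping at most 32MiB
 * of page cache.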
1364 */
1365
1366static int gfs2_journaled_truncate(struct inode *inode, u64 oldsize, u64 newsize)
1367{
1368        struct gfs2_sbd *sdp = GFS2_SB(inode);
1369        u64 max_chunk = GFS2_JTRUNC_REVOKES * sdp->sd_vfs->s_blocksize;
1370        u64 chunk;
1371        int error;
1372
1373        while (oldsize != newsize) {
1374                struct gfs2_trans *tr;
1375                unsigned int offs;
1376
1377                chunk = oldsize - newsize;
1378                if (chunk > max_chunk)
1379                        chunk = max_chunk;
1380
1381                offs = oldsize & ~PAGE_MASK;
1382                if (offs && chunk > PAGE_SIZE)
1383                        chunk = offs + ((chunk - offs) & PAGE_MASK);
1384
1385                truncate_pagecache(inode, oldsize - chunk);
1386                oldsize -= chunk;
1387
1388                tr = current->journal_info;
1389                if (!test_bit(TR_TOUCHED, &tr->tr_flags))
1390                        continue;
1391
1392                gfs2_trans_end(sdp);
1393                error = gfs2_trans_begin(sdp, RES_DINODE, GFS2_JTRUNC_REVOKES);
1394                if (error)
1395                        return error;
1396        }
1397
1398        return 0;
1399}
1400
1401static int trunc_start(struct inode *inode, u64 newsize)
1402{
1403        struct gfs2_inode *ip = GFS2_I(inode);
1404        struct gfs2_sbd *sdp = GFS2_SB(inode);
1405        struct buffer_head *dibh = NULL;
1406        int journaled = gfs2_is_jdata(ip);
1407        u64 oldsize = inode->i_size;
1408        int error;
1409
1410        if (!gfs2_is_stuffed(ip)) {
1411                unsigned int blocksize = i_blocksize(inode);
1412                unsigned int offs = newsize & (blocksize - 1);
1413                if (offs) {
1414                        error = gfs2_block_zero_range(inode, newsize,
1415                                                      blocksize - offs);
1416                        if (error)
1417                                return error;
1418                }
1419        }
1420        if (journaled)
1421                error = gfs2_trans_begin(sdp, RES_DINODE + RES_JDATA, GFS2_JTRUNC_REVOKES);
1422        else
1423                error = gfs2_trans_begin(sdp, RES_DINODE, 0);
1424        if (error)
1425                return error;
1426
1427        error = gfs2_meta_inode_buffer(ip, &dibh);
1428        if (error)
1429                goto out;
1430
1431        gfs2_trans_add_meta(ip->i_gl, dibh);
1432
1433        if (gfs2_is_stuffed(ip))
1434                gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + newsize);
1435        else
1436                ip->i_diskflags |= GFS2_DIF_TRUNC_IN_PROG;
1437
1438        i_size_write(inode, newsize);
1439        ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
1440        gfs2_dinode_out(ip, dibh->b_data);
1441
1442        if (journaled)
1443                error = gfs2_journaled_truncate(inode, oldsize, newsize);
1444        else
1445                truncate_pagecache(inode, newsize);
1446
1447out:
1448        brelse(dibh);
1449        if (current->journal_info)
1450                gfs2_trans_end(sdp);
1451        return error;
1452}
1453
1454int gfs2_iomap_get(struct inode *inode, loff_t pos, loff_t length,
1455                   struct iomap *iomap)
1456{
1457        struct metapath mp = { .mp_aheight = 1, };
1458        int ret;
1459
1460        ret = __gfs2_iomap_get(inode, pos, length, 0, iomap, &mp);
1461        release_metapath(&mp);
1462        return ret;
1463}
1464
1465int gfs2_iomap_alloc(struct inode *inode, loff_t pos, loff_t length,
1466                     struct iomap *iomap)
1467{
1468        struct metapath mp = { .mp_aheight = 1, };
1469        int ret;
1470
1471        ret = __gfs2_iomap_get(inode, pos, length, IOMAP_WRITE, iomap, &mp);
1472        if (!ret && iomap->type == IOMAP_HOLE)
1473                ret = __gfs2_iomap_alloc(inode, iomap, &mp);
1474        release_metapath(&mp);
1475        return ret;
1476}
1477
1478/**
1479 * sweep_bh_for_rgrps - find an rgrp in a meta buffer and free blocks therein
1480 * @ip: inode
1481 * @rd_gh: holder of resource group glock
1482 * @bh: buffer head to sweep
1483 * @start: starting point in bh
1484 * @end: end point in bh
1485 * @meta: true if bh points to metadata (rather than data)
1486 * @btotal: place to keep count of total blocks freed
1487 *
1488 * We sweep a metadata buffer (provided by the metapath) for blocks we need to
1489 * free, and free them all. However, we do it one rgrp at a time. If this
1490 * block has references to multiple rgrps, we break it into individual
1491 * transactions. This allows other processes to use the rgrps while we're
1492 * focused on a single one, for better concurrency / performance.
1493 * At every transaction boundary, we rewrite the inode into the journal.
1494 * That way the bitmaps are kept consistent with the inode and we can recover
1495 * if we're interrupted by power outages.
1496 *
1497 * Returns: 0, or return code if an error occurred.
1498 *          *btotal has the total number of blocks freed
1499 */
1500static int sweep_bh_for_rgrps(struct gfs2_inode *ip, struct gfs2_holder *rd_gh,
1501                              struct buffer_head *bh, __be64 *start, __be64 *end,
1502                              bool meta, u32 *btotal)
1503{
1504        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1505        struct gfs2_rgrpd *rgd;
1506        struct gfs2_trans *tr;
1507        __be64 *p;
1508        int blks_outside_rgrp;
1509        u64 bn, bstart, isize_blks;
1510        s64 blen; /* needs to be s64 or gfs2_add_inode_blocks breaks */
1511        int ret = 0;
1512        bool buf_in_tr = false; /* buffer was added to transaction */
1513
1514more_rgrps:
1515        rgd = NULL;
1516        if (gfs2_holder_initialized(rd_gh)) {
1517                rgd = gfs2_glock2rgrp(rd_gh->gh_gl);
1518                gfs2_assert_withdraw(sdp,
1519                             gfs2_glock_is_locked_by_me(rd_gh->gh_gl));
1520        }
1521        blks_outside_rgrp = 0;
1522        bstart = 0;
1523        blen = 0;
1524
1525        for (p = start; p < end; p++) {
1526                if (!*p)
1527                        continue;
1528                bn = be64_to_cpu(*p);
1529
1530                if (rgd) {
1531                        if (!rgrp_contains_block(rgd, bn)) {
1532                                blks_outside_rgrp++;
1533                                continue;
1534                        }
1535                } else {
1536                        rgd = gfs2_blk2rgrpd(sdp, bn, true);
1537                        if (unlikely(!rgd)) {
1538                                ret = -EIO;
1539                                goto out;
1540                        }
1541                        ret = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE,
1542                                                 LM_FLAG_NODE_SCOPE, rd_gh);
1543                        if (ret)
1544                                goto out;
1545
1546                        /* Must be done with the rgrp glock held: */
1547                        if (gfs2_rs_active(&ip->i_res) &&
1548                            rgd == ip->i_res.rs_rgd)
1549                                gfs2_rs_deltree(&ip->i_res);
1550                }
1551
1552                /* The size of our transactions will be unknown until we
1553                   actually process all the metadata blocks that relate to
1554                   the rgrp. So we estimate. We know it can't be more than
1555                   the dinode's i_blocks and we don't want to exceed the
1556                   journal flush threshold, sd_log_thresh2. */
1557                if (current->journal_info == NULL) {
1558                        unsigned int jblocks_rqsted, revokes;
1559
1560                        jblocks_rqsted = rgd->rd_length + RES_DINODE +
1561                                RES_INDIRECT;
1562                        isize_blks = gfs2_get_inode_blocks(&ip->i_inode);
1563                        if (isize_blks > atomic_read(&sdp->sd_log_thresh2))
1564                                jblocks_rqsted +=
1565                                        atomic_read(&sdp->sd_log_thresh2);
1566                        else
1567                                jblocks_rqsted += isize_blks;
1568                        revokes = jblocks_rqsted;
1569                        if (meta)
1570                                revokes += end - start;
1571                        else if (ip->i_depth)
1572                                revokes += sdp->sd_inptrs;
1573                        ret = gfs2_trans_begin(sdp, jblocks_rqsted, revokes);
1574                        if (ret)
1575                                goto out_unlock;
1576                        down_write(&ip->i_rw_mutex);
1577                }
1578                /* check if we will exceed the transaction blocks requested */
1579                tr = current->journal_info;
1580                if (tr->tr_num_buf_new + RES_STATFS +
1581                    RES_QUOTA >= atomic_read(&sdp->sd_log_thresh2)) {
1582                        /* We set blks_outside_rgrp to ensure the loop will
1583                           be repeated for the same rgrp, but with a new
1584                           transaction. */
1585                        blks_outside_rgrp++;
1586                        /* This next part is tricky. If the buffer was added
1587                           to the transaction, we've already set some block
1588                           pointers to 0, so we better follow through and free
1589                           them, or we will introduce corruption (so break).
1590                           This may be impossible, or at least rare, but I
1591                           decided to cover the case regardless.
1592
1593                           If the buffer was not added to the transaction
1594                           (this call), doing so would exceed our transaction
1595                           size, so we need to end the transaction and start a
1596                           new one (so goto). */
1597
1598                        if (buf_in_tr)
1599                                break;
1600                        goto out_unlock;
1601                }
1602
1603                gfs2_trans_add_meta(ip->i_gl, bh);
1604                buf_in_tr = true;
1605                *p = 0;
1606                if (bstart + blen == bn) {
1607                        blen++;
1608                        continue;
1609                }
1610                if (bstart) {
1611                        __gfs2_free_blocks(ip, rgd, bstart, (u32)blen, meta);
1612                        (*btotal) += blen;
1613                        gfs2_add_inode_blocks(&ip->i_inode, -blen);
1614                }
1615                bstart = bn;
1616                blen = 1;
1617        }
1618        if (bstart) {
1619                __gfs2_free_blocks(ip, rgd, bstart, (u32)blen, meta);
1620                (*btotal) += blen;
1621                gfs2_add_inode_blocks(&ip->i_inode, -blen);
1622        }
1623out_unlock:
1624        if (!ret && blks_outside_rgrp) { /* If buffer still has non-zero blocks
1625                                            outside the rgrp we just processed,
1626                                            do it all over again. */
1627                if (current->journal_info) {
1628                        struct buffer_head *dibh;
1629
1630                        ret = gfs2_meta_inode_buffer(ip, &dibh);
1631                        if (ret)
1632                                goto out;
1633
1634                        /* Every transaction boundary, we rewrite the dinode
1635                           to keep its di_blocks current in case of failure. */
1636                        ip->i_inode.i_mtime = ip->i_inode.i_ctime =
1637                                current_time(&ip->i_inode);
1638                        gfs2_trans_add_meta(ip->i_gl, dibh);
1639                        gfs2_dinode_out(ip, dibh->b_data);
1640                        brelse(dibh);
1641                        up_write(&ip->i_rw_mutex);
1642                        gfs2_trans_end(sdp);
1643                        buf_in_tr = false;
1644                }
1645                gfs2_glock_dq_uninit(rd_gh);
1646                cond_resched();
1647                goto more_rgrps;
1648        }
1649out:
1650        return ret;
1651}
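
    /*
     * The bstart/blen pair above is a simple run-length coalescer: block
     * numbers are accumulated into contiguous runs so that each run is
     * freed with a single call.  The pattern in isolation (free_run() is
     * a stand-in for __gfs2_free_blocks() plus the accounting):
     *
     *        u64 bstart = 0;
     *        s64 blen = 0;
     *
     *        for each non-null block pointer bn:
     *                if (bstart + blen == bn)        (bn extends the run)
     *                        blen++;
     *                else {
     *                        if (bstart)
     *                                free_run(bstart, blen);
     *                        bstart = bn;            (start a new run)
     *                        blen = 1;
     *                }
     *        if (bstart)
     *                free_run(bstart, blen);         (flush the final run)
     */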
1652
1653static bool mp_eq_to_hgt(struct metapath *mp, __u16 *list, unsigned int h)
1654{
1655        if (memcmp(mp->mp_list, list, h * sizeof(mp->mp_list[0])))
1656                return false;
1657        return true;
1658}
1659
1660/**
1661 * find_nonnull_ptr - find a non-null pointer given a metapath and height
1662 * @sdp: The superblock
1663 * @mp: starting metapath
1664 * @h: desired height to search
1665 * @end_list: See punch_hole().
1666 * @end_aligned: See punch_hole().
1667 *
1668 * Assumes the metapath is valid (with buffers) out to height h.
1669 * Returns: true if a non-null pointer was found in the metapath buffer
1670 *          false if all remaining pointers are NULL in the buffer
1671 */
1672static bool find_nonnull_ptr(struct gfs2_sbd *sdp, struct metapath *mp,
1673                             unsigned int h,
1674                             __u16 *end_list, unsigned int end_aligned)
1675{
1676        struct buffer_head *bh = mp->mp_bh[h];
1677        __be64 *first, *ptr, *end;
1678
1679        first = metaptr1(h, mp);
1680        ptr = first + mp->mp_list[h];
1681        end = (__be64 *)(bh->b_data + bh->b_size);
1682        if (end_list && mp_eq_to_hgt(mp, end_list, h)) {
1683                bool keep_end = h < end_aligned;
1684                end = first + end_list[h] + keep_end;
1685        }
1686
1687        while (ptr < end) {
1688                if (*ptr) { /* if we have a non-null pointer */
1689                        mp->mp_list[h] = ptr - first;
1690                        h++;
1691                        if (h < GFS2_MAX_META_HEIGHT)
1692                                mp->mp_list[h] = 0;
1693                        return true;
1694                }
1695                ptr++;
1696        }
1697        return false;
1698}
1699
1700enum dealloc_states {
1701        DEALLOC_MP_FULL = 0,    /* Strip a metapath with all buffers read in */
1702        DEALLOC_MP_LOWER = 1,   /* Lower the metapath strip height */
1703        DEALLOC_FILL_MP = 2,    /* Fill in the metapath to the given height */
1704        DEALLOC_DONE = 3,       /* Process complete */
1705};
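
    /*
     * How punch_hole() below moves between these states, as a sketch
     * derived from its switch statement (the initial state is
     * DEALLOC_MP_FULL when the metapath is complete, DEALLOC_FILL_MP
     * otherwise):
     *
     *        FILL_MP  -> MP_FULL    buffers filled up to the strip height
     *        FILL_MP  -> MP_LOWER   no non-null pointer left at this height
     *        MP_FULL  -> MP_LOWER   buffer swept; not yet at the dinode
     *        MP_FULL  -> DONE       error, or the dinode itself was swept
     *        MP_LOWER -> FILL_MP    next non-null pointer found, or the
     *                               strip height lowered at the dinode
     */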
1706
1707static inline void
1708metapointer_range(struct metapath *mp, int height,
1709                  __u16 *start_list, unsigned int start_aligned,
1710                  __u16 *end_list, unsigned int end_aligned,
1711                  __be64 **start, __be64 **end)
1712{
1713        struct buffer_head *bh = mp->mp_bh[height];
1714        __be64 *first;
1715
1716        first = metaptr1(height, mp);
1717        *start = first;
1718        if (mp_eq_to_hgt(mp, start_list, height)) {
1719                bool keep_start = height < start_aligned;
1720                *start = first + start_list[height] + keep_start;
1721        }
1722        *end = (__be64 *)(bh->b_data + bh->b_size);
1723        if (end_list && mp_eq_to_hgt(mp, end_list, height)) {
1724                bool keep_end = height < end_aligned;
1725                *end = first + end_list[height] + keep_end;
1726        }
1727}
1728
1729static inline bool walk_done(struct gfs2_sbd *sdp,
1730                             struct metapath *mp, int height,
1731                             __u16 *end_list, unsigned int end_aligned)
1732{
1733        __u16 end;
1734
1735        if (end_list) {
1736                bool keep_end = height < end_aligned;
1737                if (!mp_eq_to_hgt(mp, end_list, height))
1738                        return false;
1739                end = end_list[height] + keep_end;
1740        } else
1741                end = (height > 0) ? sdp->sd_inptrs : sdp->sd_diptrs;
1742        return mp->mp_list[height] >= end;
1743}
1744
1745/**
1746 * punch_hole - deallocate blocks in a file
1747 * @ip: inode to truncate
1748 * @offset: the start of the hole
1749 * @length: the size of the hole (or 0 for truncate)
1750 *
1751 * Punch a hole into a file or truncate a file at a given position.  This
1752 * function operates in whole blocks (@offset and @length are rounded
1753 * accordingly); partially filled blocks must be cleared otherwise.
1754 *
1755 * This function works from the bottom up, and from the right to the left. In
1756 * other words, it strips off the highest layer (data) before stripping any of
1757 * the metadata. Doing it this way is best in case the operation is interrupted
1758 * by power failure, etc.  The dinode is rewritten in every transaction to
1759 * guarantee integrity.
1760 */
1761static int punch_hole(struct gfs2_inode *ip, u64 offset, u64 length)
1762{
1763        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1764        u64 maxsize = sdp->sd_heightsize[ip->i_height];
1765        struct metapath mp = {};
1766        struct buffer_head *dibh, *bh;
1767        struct gfs2_holder rd_gh;
1768        unsigned int bsize_shift = sdp->sd_sb.sb_bsize_shift;
1769        u64 lblock = (offset + (1 << bsize_shift) - 1) >> bsize_shift;
1770        __u16 start_list[GFS2_MAX_META_HEIGHT];
1771        __u16 __end_list[GFS2_MAX_META_HEIGHT], *end_list = NULL;
1772        unsigned int start_aligned, end_aligned = 0;
1773        unsigned int strip_h = ip->i_height - 1;
1774        u32 btotal = 0;
1775        int ret, state;
1776        int mp_h; /* metapath buffers are read in to this height */
1777        u64 prev_bnr = 0;
1778        __be64 *start, *end;
1779
1780        if (offset >= maxsize) {
1781                /*
1782                 * The starting point lies beyond the allocated metadata;
1783                 * there are no blocks to deallocate.
1784                 */
1785                return 0;
1786        }
1787
1788        /*
1789         * The start position of the hole is defined by lblock, start_list, and
1790         * start_aligned.  The end position of the hole is defined by lend,
1791         * end_list, and end_aligned.
1792         *
1793         * start_aligned and end_aligned define down to which height the start
1794         * and end positions are aligned to the metadata tree (i.e., the
1795         * position is a multiple of the metadata granularity at the height
1796         * above).  This determines at which heights additional meta pointers
1797         * need to be preserved for the remaining data.
1798         */
1799
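            /*
             * A concrete illustration (assuming 4KiB blocks, hence 509
             * pointers per indirect block, and a height-3 tree): a start
             * position whose metapath is { 1, 0, 0 } is aligned at every
             * height, so the pointers at those indices themselves may be
             * swept, and start_aligned is 0.  A start position of
             * { 1, 3, 0 } is only aligned below height 1; the height-0
             * pointer (index 1) still covers live data before the hole
             * and must be preserved, so start_aligned is 1.
             */
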
1800        if (length) {
1801                u64 end_offset = offset + length;
1802                u64 lend;
1803
1804                /*
1805                 * Clip the end at the maximum file size for the given height:
1806                 * that's how far the metadata goes; files bigger than that
1807                 * will have additional layers of indirection.
1808                 */
1809                if (end_offset > maxsize)
1810                        end_offset = maxsize;
1811                lend = end_offset >> bsize_shift;
1812
1813                if (lblock >= lend)
1814                        return 0;
1815
1816                find_metapath(sdp, lend, &mp, ip->i_height);
1817                end_list = __end_list;
1818                memcpy(end_list, mp.mp_list, sizeof(mp.mp_list));
1819
1820                for (mp_h = ip->i_height - 1; mp_h > 0; mp_h--) {
1821                        if (end_list[mp_h])
1822                                break;
1823                }
1824                end_aligned = mp_h;
1825        }
1826
1827        find_metapath(sdp, lblock, &mp, ip->i_height);
1828        memcpy(start_list, mp.mp_list, sizeof(start_list));
1829
1830        for (mp_h = ip->i_height - 1; mp_h > 0; mp_h--) {
1831                if (start_list[mp_h])
1832                        break;
1833        }
1834        start_aligned = mp_h;
1835
1836        ret = gfs2_meta_inode_buffer(ip, &dibh);
1837        if (ret)
1838                return ret;
1839
1840        mp.mp_bh[0] = dibh;
1841        ret = lookup_metapath(ip, &mp);
1842        if (ret)
1843                goto out_metapath;
1844
1845        /* issue read-ahead on metadata */
1846        for (mp_h = 0; mp_h < mp.mp_aheight - 1; mp_h++) {
1847                metapointer_range(&mp, mp_h, start_list, start_aligned,
1848                                  end_list, end_aligned, &start, &end);
1849                gfs2_metapath_ra(ip->i_gl, start, end);
1850        }
1851
1852        if (mp.mp_aheight == ip->i_height)
1853                state = DEALLOC_MP_FULL; /* We have a complete metapath */
1854        else
1855                state = DEALLOC_FILL_MP; /* deal with partial metapath */
1856
1857        ret = gfs2_rindex_update(sdp);
1858        if (ret)
1859                goto out_metapath;
1860
1861        ret = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE);
1862        if (ret)
1863                goto out_metapath;
1864        gfs2_holder_mark_uninitialized(&rd_gh);
1865
1866        mp_h = strip_h;
1867
1868        while (state != DEALLOC_DONE) {
1869                switch (state) {
1870                /* Truncate a full metapath at the given strip height.
1871                 * Note that strip_h == mp_h in order to be in this state. */
1872                case DEALLOC_MP_FULL:
1873                        bh = mp.mp_bh[mp_h];
1874                        gfs2_assert_withdraw(sdp, bh);
1875                        if (gfs2_assert_withdraw(sdp,
1876                                                 prev_bnr != bh->b_blocknr)) {
1877                                fs_emerg(sdp, "inode %llu, block:%llu, i_h:%u, "
1878                                         "s_h:%u, mp_h:%u\n",
1879                                       (unsigned long long)ip->i_no_addr,
1880                                       prev_bnr, ip->i_height, strip_h, mp_h);
1881                        }
1882                        prev_bnr = bh->b_blocknr;
1883
1884                        if (gfs2_metatype_check(sdp, bh,
1885                                                (mp_h ? GFS2_METATYPE_IN :
1886                                                        GFS2_METATYPE_DI))) {
1887                                ret = -EIO;
1888                                goto out;
1889                        }
1890
1891                        /*
1892                         * Below, passing end_aligned as 0 gives us the
1893                         * metapointer range excluding the end point: the end
1894                         * point is the first metapath we must not deallocate!
1895                         */
1896
1897                        metapointer_range(&mp, mp_h, start_list, start_aligned,
1898                                          end_list, 0 /* end_aligned */,
1899                                          &start, &end);
1900                        ret = sweep_bh_for_rgrps(ip, &rd_gh, mp.mp_bh[mp_h],
1901                                                 start, end,
1902                                                 mp_h != ip->i_height - 1,
1903                                                 &btotal);
1904
1905                        /* If we hit an error or have just swept the
1906                           dinode buffer, exit. */
1907                        if (ret || !mp_h) {
1908                                state = DEALLOC_DONE;
1909                                break;
1910                        }
1911                        state = DEALLOC_MP_LOWER;
1912                        break;
1913
1914                /* lower the metapath strip height */
1915                case DEALLOC_MP_LOWER:
1916                        /* We're done with the current buffer, so release it,
1917                           unless it's the dinode buffer. Then back up to the
1918                           previous pointer. */
1919                        if (mp_h) {
1920                                brelse(mp.mp_bh[mp_h]);
1921                                mp.mp_bh[mp_h] = NULL;
1922                        }
1923                        /* If we can't get any lower in height, we've stripped
1924                           off all we can. Next step is to back up and start
1925                           stripping the previous level of metadata. */
1926                        if (mp_h == 0) {
1927                                strip_h--;
1928                                memcpy(mp.mp_list, start_list, sizeof(start_list));
1929                                mp_h = strip_h;
1930                                state = DEALLOC_FILL_MP;
1931                                break;
1932                        }
1933                        mp.mp_list[mp_h] = 0;
1934                        mp_h--; /* search one metadata height down */
1935                        mp.mp_list[mp_h]++;
1936                        if (walk_done(sdp, &mp, mp_h, end_list, end_aligned))
1937                                break;
1938                        /* Here we've found a part of the metapath that is not
1939                         * allocated. We need to search at that height for the
1940                         * next non-null pointer. */
1941                        if (find_nonnull_ptr(sdp, &mp, mp_h, end_list, end_aligned)) {
1942                                state = DEALLOC_FILL_MP;
1943                                mp_h++;
1944                        }
1945                        /* No more non-null pointers at this height. Back up
1946                           to the previous height and try again. */
1947                        break; /* loop around in the same state */
1948
1949                /* Fill the metapath with buffers to the given height. */
1950                case DEALLOC_FILL_MP:
1951                        /* Fill the buffers out to the current height. */
1952                        ret = fillup_metapath(ip, &mp, mp_h);
1953                        if (ret < 0)
1954                                goto out;
1955
1956                        /* On the first pass, issue read-ahead on metadata. */
1957                        if (mp.mp_aheight > 1 && strip_h == ip->i_height - 1) {
1958                                unsigned int height = mp.mp_aheight - 1;
1959
1960                                /* No read-ahead for data blocks. */
1961                                if (mp.mp_aheight - 1 == strip_h)
1962                                        height--;
1963
1964                                for (; height >= mp.mp_aheight - ret; height--) {
1965                                        metapointer_range(&mp, height,
1966                                                          start_list, start_aligned,
1967                                                          end_list, end_aligned,
1968                                                          &start, &end);
1969                                        gfs2_metapath_ra(ip->i_gl, start, end);
1970                                }
1971                        }
1972
1973                        /* If buffers found for the entire strip height */
1974                        if (mp.mp_aheight - 1 == strip_h) {
1975                                state = DEALLOC_MP_FULL;
1976                                break;
1977                        }
1978                        if (mp.mp_aheight < ip->i_height) /* We have a partial height */
1979                                mp_h = mp.mp_aheight - 1;
1980
1981                        /* If we find a non-null block pointer, crawl a bit
1982                           higher up in the metapath and try again, otherwise
1983                           we need to look lower for a new starting point. */
1984                        if (find_nonnull_ptr(sdp, &mp, mp_h, end_list, end_aligned))
1985                                mp_h++;
1986                        else
1987                                state = DEALLOC_MP_LOWER;
1988                        break;
1989                }
1990        }
1991
1992        if (btotal) {
1993                if (current->journal_info == NULL) {
1994                        ret = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS +
1995                                               RES_QUOTA, 0);
1996                        if (ret)
1997                                goto out;
1998                        down_write(&ip->i_rw_mutex);
1999                }
2000                gfs2_statfs_change(sdp, 0, +btotal, 0);
2001                gfs2_quota_change(ip, -(s64)btotal, ip->i_inode.i_uid,
2002                                  ip->i_inode.i_gid);
2003                ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
2004                gfs2_trans_add_meta(ip->i_gl, dibh);
2005                gfs2_dinode_out(ip, dibh->b_data);
2006                up_write(&ip->i_rw_mutex);
2007                gfs2_trans_end(sdp);
2008        }
2009
2010out:
2011        if (gfs2_holder_initialized(&rd_gh))
2012                gfs2_glock_dq_uninit(&rd_gh);
2013        if (current->journal_info) {
2014                up_write(&ip->i_rw_mutex);
2015                gfs2_trans_end(sdp);
2016                cond_resched();
2017        }
2018        gfs2_quota_unhold(ip);
2019out_metapath:
2020        release_metapath(&mp);
2021        return ret;
2022}
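
    /*
     * Within this file, punch_hole() serves three callers, which differ
     * only in the (offset, length) pair passed in:
     *
     *        do_shrink():          punch_hole(ip, newsize, 0)      truncate the tail
     *        gfs2_file_dealloc():  punch_hole(ip, 0, 0)            free everything
     *        __gfs2_punch_hole():  punch_hole(ip, offset, length)  punch a real hole
     */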
2023
2024static int trunc_end(struct gfs2_inode *ip)
2025{
2026        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2027        struct buffer_head *dibh;
2028        int error;
2029
2030        error = gfs2_trans_begin(sdp, RES_DINODE, 0);
2031        if (error)
2032                return error;
2033
2034        down_write(&ip->i_rw_mutex);
2035
2036        error = gfs2_meta_inode_buffer(ip, &dibh);
2037        if (error)
2038                goto out;
2039
2040        if (!i_size_read(&ip->i_inode)) {
2041                ip->i_height = 0;
2042                ip->i_goal = ip->i_no_addr;
2043                gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
2044                gfs2_ordered_del_inode(ip);
2045        }
2046        ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
2047        ip->i_diskflags &= ~GFS2_DIF_TRUNC_IN_PROG;
2048
2049        gfs2_trans_add_meta(ip->i_gl, dibh);
2050        gfs2_dinode_out(ip, dibh->b_data);
2051        brelse(dibh);
2052
2053out:
2054        up_write(&ip->i_rw_mutex);
2055        gfs2_trans_end(sdp);
2056        return error;
2057}
2058
2059/**
2060 * do_shrink - make a file smaller
2061 * @inode: the inode
2062 * @newsize: the size to make the file
2063 *
2064 * Called with an exclusive lock on @inode. @newsize must
2065 * be equal to or smaller than the current inode size.
2066 *
2067 * Returns: errno
2068 */
2069
2070static int do_shrink(struct inode *inode, u64 newsize)
2071{
2072        struct gfs2_inode *ip = GFS2_I(inode);
2073        int error;
2074
2075        error = trunc_start(inode, newsize);
2076        if (error < 0)
2077                return error;
2078        if (gfs2_is_stuffed(ip))
2079                return 0;
2080
2081        error = punch_hole(ip, newsize, 0);
2082        if (error == 0)
2083                error = trunc_end(ip);
2084
2085        return error;
2086}
2087
2088void gfs2_trim_blocks(struct inode *inode)
2089{
2090        int ret;
2091
2092        ret = do_shrink(inode, inode->i_size);
2093        WARN_ON(ret != 0);
2094}
2095
2096/**
2097 * do_grow - Touch and update inode size
2098 * @inode: The inode
2099 * @size: The new size
2100 *
2101 * This function updates the timestamps on the inode and
2102 * may also increase the size of the inode. This function
2103 * must not be called with @size any smaller than the current
2104 * inode size.
2105 *
2106 * Although it is not strictly required to unstuff files here,
2107 * earlier versions of GFS2 have a bug in the stuffed file reading
2108 * code which will result in a buffer overrun if the size is larger
2109 * than the max stuffed file size. In order to prevent this from
2110 * occurring, such files are unstuffed, but in other cases we can
2111 * just update the inode size directly.
2112 *
2113 * Returns: 0 on success, or -ve on error
2114 */
2115
2116static int do_grow(struct inode *inode, u64 size)
2117{
2118        struct gfs2_inode *ip = GFS2_I(inode);
2119        struct gfs2_sbd *sdp = GFS2_SB(inode);
2120        struct gfs2_alloc_parms ap = { .target = 1, };
2121        struct buffer_head *dibh;
2122        int error;
2123        int unstuff = 0;
2124
2125        if (gfs2_is_stuffed(ip) && size > gfs2_max_stuffed_size(ip)) {
2126                error = gfs2_quota_lock_check(ip, &ap);
2127                if (error)
2128                        return error;
2129
2130                error = gfs2_inplace_reserve(ip, &ap);
2131                if (error)
2132                        goto do_grow_qunlock;
2133                unstuff = 1;
2134        }
2135
2136        error = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS + RES_RG_BIT +
2137                                 (unstuff &&
2138                                  gfs2_is_jdata(ip) ? RES_JDATA : 0) +
2139                                 (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF ?
2140                                  0 : RES_QUOTA), 0);
2141        if (error)
2142                goto do_grow_release;
2143
2144        if (unstuff) {
2145                error = gfs2_unstuff_dinode(ip);
2146                if (error)
2147                        goto do_end_trans;
2148        }
2149
2150        error = gfs2_meta_inode_buffer(ip, &dibh);
2151        if (error)
2152                goto do_end_trans;
2153
2154        truncate_setsize(inode, size);
2155        ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
2156        gfs2_trans_add_meta(ip->i_gl, dibh);
2157        gfs2_dinode_out(ip, dibh->b_data);
2158        brelse(dibh);
2159
2160do_end_trans:
2161        gfs2_trans_end(sdp);
2162do_grow_release:
2163        if (unstuff) {
2164                gfs2_inplace_release(ip);
2165do_grow_qunlock:
2166                gfs2_quota_unlock(ip);
2167        }
2168        return error;
2169}
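
    /*
     * Note the unwind structure above: the do_grow_qunlock label sits
     * inside the "if (unstuff)" block, so the in-place reservation and
     * the quota lock, which are only taken when the file needs
     * unstuffing, are also only released in that case.
     */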
2170
2171/**
2172 * gfs2_setattr_size - make a file a given size
2173 * @inode: the inode
2174 * @newsize: the size to make the file
2175 *
2176 * The file size can grow, shrink, or stay the same size. This
2177 * is called holding i_rwsem and an exclusive glock on the inode
2178 * in question.
2179 *
2180 * Returns: errno
2181 */
2182
2183int gfs2_setattr_size(struct inode *inode, u64 newsize)
2184{
2185        struct gfs2_inode *ip = GFS2_I(inode);
2186        int ret;
2187
2188        BUG_ON(!S_ISREG(inode->i_mode));
2189
2190        ret = inode_newsize_ok(inode, newsize);
2191        if (ret)
2192                return ret;
2193
2194        inode_dio_wait(inode);
2195
2196        ret = gfs2_qa_get(ip);
2197        if (ret)
2198                goto out;
2199
2200        if (newsize >= inode->i_size) {
2201                ret = do_grow(inode, newsize);
2202                goto out;
2203        }
2204
2205        ret = do_shrink(inode, newsize);
2206out:
2207        gfs2_rs_delete(ip, NULL);
2208        gfs2_qa_put(ip);
2209        return ret;
2210}
2211
2212int gfs2_truncatei_resume(struct gfs2_inode *ip)
2213{
2214        int error;
2215
2216        error = punch_hole(ip, i_size_read(&ip->i_inode), 0);
2216        if (!error)
2217                error = trunc_end(ip);
2218        return error;
2219}
2220
2221int gfs2_file_dealloc(struct gfs2_inode *ip)
2222{
2223        return punch_hole(ip, 0, 0);
2224}
2225
2226/**
2227 * gfs2_free_journal_extents - Free cached journal bmap info
2228 * @jd: The journal
2229 *
2230 */
2231
2232void gfs2_free_journal_extents(struct gfs2_jdesc *jd)
2233{
2234        struct gfs2_journal_extent *jext;
2235
2236        while (!list_empty(&jd->extent_list)) {
2237                jext = list_first_entry(&jd->extent_list, struct gfs2_journal_extent, list);
2238                list_del(&jext->list);
2239                kfree(jext);
2240        }
2241}
2242
2243/**
2244 * gfs2_add_jextent - Add or merge a new extent to extent cache
2245 * @jd: The journal descriptor
2246 * @lblock: The logical block at start of new extent
2247 * @dblock: The physical block at start of new extent
2248 * @blocks: Size of extent in fs blocks
2249 *
2250 * Returns: 0 on success or -ENOMEM
2251 */
2252
2253static int gfs2_add_jextent(struct gfs2_jdesc *jd, u64 lblock, u64 dblock, u64 blocks)
2254{
2255        struct gfs2_journal_extent *jext;
2256
2257        if (!list_empty(&jd->extent_list)) {
2258                jext = list_last_entry(&jd->extent_list, struct gfs2_journal_extent, list);
2259                if ((jext->dblock + jext->blocks) == dblock) {
2260                        jext->blocks += blocks;
2261                        return 0;
2262                }
2263        }
2264
2265        jext = kzalloc(sizeof(struct gfs2_journal_extent), GFP_NOFS);
2266        if (jext == NULL)
2267                return -ENOMEM;
2268        jext->dblock = dblock;
2269        jext->lblock = lblock;
2270        jext->blocks = blocks;
2271        list_add_tail(&jext->list, &jd->extent_list);
2272        jd->nr_extents++;
2273        return 0;
2274}
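
    /*
     * Merging example: if the last cached extent is { lblock = 0,
     * dblock = 5000, blocks = 8 } and gfs2_add_jextent() is called with
     * lblock = 8, dblock = 5008, blocks = 4, the new range continues the
     * tail extent on disk, so that extent simply grows to blocks = 12
     * instead of a new list entry being allocated.
     */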
2275
2276/**
2277 * gfs2_map_journal_extents - Cache journal bmap info
2278 * @sdp: The super block
2279 * @jd: The journal to map
2280 *
2281 * Create a reusable "extent" mapping from all logical
2282 * blocks to all physical blocks for the given journal.  This will save
2283 * us time when writing journal blocks.  Most journals will have only one
2284 * extent that maps all their logical blocks.  That's because mkfs.gfs2
2285 * arranges the journal blocks sequentially to maximize performance.
2286 * So a single extent, starting at the first block, can map the entire
2287 * file length.
2287 * However, gfs2_jadd can happen while file activity is happening, so
2288 * those journals may not be sequential.  Less likely is the case where
2289 * the users created their own journals by mounting the metafs and
2290 * laying it out.  But it's still possible.  These journals might have
2291 * several extents.
2292 *
2293 * Returns: 0 on success, or error on failure
2294 */
2295
2296int gfs2_map_journal_extents(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd)
2297{
2298        u64 lblock = 0;
2299        u64 lblock_stop;
2300        struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
2301        struct buffer_head bh;
2302        unsigned int shift = sdp->sd_sb.sb_bsize_shift;
2303        u64 size;
2304        int rc;
2305        ktime_t start, end;
2306
2307        start = ktime_get();
2308        lblock_stop = i_size_read(jd->jd_inode) >> shift;
2309        size = (lblock_stop - lblock) << shift;
2310        jd->nr_extents = 0;
2311        WARN_ON(!list_empty(&jd->extent_list));
2312
2313        do {
2314                bh.b_state = 0;
2315                bh.b_blocknr = 0;
2316                bh.b_size = size;
2317                rc = gfs2_block_map(jd->jd_inode, lblock, &bh, 0);
2318                if (rc || !buffer_mapped(&bh))
2319                        goto fail;
2320                rc = gfs2_add_jextent(jd, lblock, bh.b_blocknr, bh.b_size >> shift);
2321                if (rc)
2322                        goto fail;
2323                size -= bh.b_size;
2324                lblock += (bh.b_size >> ip->i_inode.i_blkbits);
2325        } while (size > 0);
2326
2327        end = ktime_get();
2328        fs_info(sdp, "journal %d mapped with %u extents in %lldms\n", jd->jd_jid,
2329                jd->nr_extents, ktime_ms_delta(end, start));
2330        return 0;
2331
2332fail:
2333        fs_warn(sdp, "error %d mapping journal %u at offset %llu (extent %u)\n",
2334                rc, jd->jd_jid,
2335                (unsigned long long)(i_size_read(jd->jd_inode) - size),
2336                jd->nr_extents);
2337        fs_warn(sdp, "bmap=%d lblock=%llu block=%llu, state=0x%08lx, size=%llu\n",
2338                rc, (unsigned long long)lblock, (unsigned long long)bh.b_blocknr,
2339                bh.b_state, (unsigned long long)bh.b_size);
2340        gfs2_free_journal_extents(jd);
2341        return rc;
2342}
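
    /*
     * Note the on-stack buffer_head trick in the loop above: b_size is
     * set to the number of bytes still to be mapped before each call,
     * and gfs2_block_map() replaces it with the size of the extent it
     * actually mapped (at most the requested size), so the loop advances
     * by whole extents rather than single blocks:
     *
     *        bh.b_size = size;                     (bytes left to map)
     *        gfs2_block_map(inode, lblock, &bh, 0);
     *        size -= bh.b_size;                    (b_size = extent size)
     *        lblock += bh.b_size >> blkbits;
     */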
2343
2344/**
2345 * gfs2_write_alloc_required - figure out if a write will require an allocation
2346 * @ip: the file being written to
2347 * @offset: the offset to write to
2348 * @len: the number of bytes being written
2349 *
2350 * Returns: 1 if an alloc is required, 0 otherwise
2351 */
2352
2353int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
2354                              unsigned int len)
2355{
2356        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2357        struct buffer_head bh;
2358        unsigned int shift;
2359        u64 lblock, lblock_stop, size;
2360        u64 end_of_file;
2361
2362        if (!len)
2363                return 0;
2364
2365        if (gfs2_is_stuffed(ip)) {
2366                if (offset + len > gfs2_max_stuffed_size(ip))
2367                        return 1;
2368                return 0;
2369        }
2370
2371        shift = sdp->sd_sb.sb_bsize_shift;
2372        BUG_ON(gfs2_is_dir(ip));
2373        end_of_file = (i_size_read(&ip->i_inode) + sdp->sd_sb.sb_bsize - 1) >> shift;
2374        lblock = offset >> shift;
2375        lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift;
2376        if (lblock_stop > end_of_file && ip != GFS2_I(sdp->sd_rindex))
2377                return 1;
2378
2379        size = (lblock_stop - lblock) << shift;
2380        do {
2381                bh.b_state = 0;
2382                bh.b_size = size;
2383                gfs2_block_map(&ip->i_inode, lblock, &bh, 0);
2384                if (!buffer_mapped(&bh))
2385                        return 1;
2386                size -= bh.b_size;
2387                lblock += (bh.b_size >> ip->i_inode.i_blkbits);
2388        } while (size > 0);
2389
2390        return 0;
2391}
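
    /*
     * For the stuffed case above: with 4KiB blocks, a stuffed inode holds
     * at most block-size minus dinode-header bytes of data (4096 - 232 =
     * 3864), so any write that would extend past that point reports that
     * an allocation (here, an unstuff) is required.
     */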
2392
2393static int stuffed_zero_range(struct inode *inode, loff_t offset, loff_t length)
2394{
2395        struct gfs2_inode *ip = GFS2_I(inode);
2396        struct buffer_head *dibh;
2397        int error;
2398
2399        if (offset >= inode->i_size)
2400                return 0;
2401        if (offset + length > inode->i_size)
2402                length = inode->i_size - offset;
2403
2404        error = gfs2_meta_inode_buffer(ip, &dibh);
2405        if (error)
2406                return error;
2407        gfs2_trans_add_meta(ip->i_gl, dibh);
2408        memset(dibh->b_data + sizeof(struct gfs2_dinode) + offset, 0,
2409               length);
2410        brelse(dibh);
2411        return 0;
2412}
2413
2414static int gfs2_journaled_truncate_range(struct inode *inode, loff_t offset,
2415                                         loff_t length)
2416{
2417        struct gfs2_sbd *sdp = GFS2_SB(inode);
2418        loff_t max_chunk = GFS2_JTRUNC_REVOKES * sdp->sd_vfs->s_blocksize;
2419        int error;
2420
2421        while (length) {
2422                struct gfs2_trans *tr;
2423                loff_t chunk;
2424                unsigned int offs;
2425
2426                chunk = length;
2427                if (chunk > max_chunk)
2428                        chunk = max_chunk;
2429
2430                offs = offset & ~PAGE_MASK;
2431                if (offs && chunk > PAGE_SIZE)
2432                        chunk = offs + ((chunk - offs) & PAGE_MASK);
2433
2434                truncate_pagecache_range(inode, offset, offset + chunk - 1);
2435                offset += chunk;
2436                length -= chunk;
2437
2438                tr = current->journal_info;
2439                if (!test_bit(TR_TOUCHED, &tr->tr_flags))
2440                        continue;
2441
2442                gfs2_trans_end(sdp);
2443                error = gfs2_trans_begin(sdp, RES_DINODE, GFS2_JTRUNC_REVOKES);
2444                if (error)
2445                        return error;
2446        }
2447        return 0;
2448}
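
    /*
     * The chunking above bounds how much is truncated per transaction:
     * each journaled block dropped from the page cache may require a
     * revoke, and each transaction in the loop reserves
     * GFS2_JTRUNC_REVOKES of them, so at most GFS2_JTRUNC_REVOKES *
     * blocksize bytes (32MiB with 4KiB blocks, assuming the usual value
     * of 8192 for GFS2_JTRUNC_REVOKES) are processed before the
     * transaction is ended and a new one begun.
     */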
2449
2450int __gfs2_punch_hole(struct file *file, loff_t offset, loff_t length)
2451{
2452        struct inode *inode = file_inode(file);
2453        struct gfs2_inode *ip = GFS2_I(inode);
2454        struct gfs2_sbd *sdp = GFS2_SB(inode);
2455        unsigned int blocksize = i_blocksize(inode);
2456        loff_t start, end;
2457        int error;
2458
2459        if (!gfs2_is_stuffed(ip)) {
2460                unsigned int start_off, end_len;
2461
2462                start_off = offset & (blocksize - 1);
2463                end_len = (offset + length) & (blocksize - 1);
2464                if (start_off) {
2465                        unsigned int len = length;
2466                        if (length > blocksize - start_off)
2467                                len = blocksize - start_off;
2468                        error = gfs2_block_zero_range(inode, offset, len);
2469                        if (error)
2470                                goto out;
2471                        if (start_off + length < blocksize)
2472                                end_len = 0;
2473                }
2474                if (end_len) {
2475                        error = gfs2_block_zero_range(inode,
2476                                offset + length - end_len, end_len);
2477                        if (error)
2478                                goto out;
2479                }
2480        }
2481
2482        start = round_down(offset, blocksize);
2483        end = round_up(offset + length, blocksize) - 1;
2484        error = filemap_write_and_wait_range(inode->i_mapping, start, end);
2485        if (error)
2486                return error;
2487
2488        if (gfs2_is_jdata(ip))
2489                error = gfs2_trans_begin(sdp, RES_DINODE + 2 * RES_JDATA,
2490                                         GFS2_JTRUNC_REVOKES);
2491        else
2492                error = gfs2_trans_begin(sdp, RES_DINODE, 0);
2493        if (error)
2494                return error;
2495
2496        if (gfs2_is_stuffed(ip)) {
2497                error = stuffed_zero_range(inode, offset, length);
2498                if (error)
2499                        goto out;
2500        }
2501
2502        if (gfs2_is_jdata(ip)) {
2503                BUG_ON(!current->journal_info);
2504                gfs2_journaled_truncate_range(inode, offset, length);
2505        } else
2506                truncate_pagecache_range(inode, offset, offset + length - 1);
2507
2508        file_update_time(file);
2509        mark_inode_dirty(inode);
2510
2511        if (current->journal_info)
2512                gfs2_trans_end(sdp);
2513
2514        if (!gfs2_is_stuffed(ip))
2515                error = punch_hole(ip, offset, length);
2516
2517out:
2518        if (current->journal_info)
2519                gfs2_trans_end(sdp);
2520        return error;
2521}
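
    /*
     * Example of the partial-block zeroing at the top of
     * __gfs2_punch_hole() (assuming 4KiB blocks): punching offset = 1000,
     * length = 2000 stays within one block, so start_off = 1000, all 2000
     * bytes are zeroed in place, and end_len is reset to 0 because
     * start_off + length < blocksize.  Punching offset = 1000, length =
     * 8000 zeroes bytes 1000..4095 of the first block and bytes 0..807 of
     * the last block (end_len = 9000 & 4095 = 808); punch_hole() then
     * frees the one whole block in between.
     */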
2522
2523static int gfs2_map_blocks(struct iomap_writepage_ctx *wpc, struct inode *inode,
2524                loff_t offset)
2525{
2526        int ret;
2527
2528        if (WARN_ON_ONCE(gfs2_is_stuffed(GFS2_I(inode))))
2529                return -EIO;
2530
2531        if (offset >= wpc->iomap.offset &&
2532            offset < wpc->iomap.offset + wpc->iomap.length)
2533                return 0;
2534
2535        memset(&wpc->iomap, 0, sizeof(wpc->iomap));
2536        ret = gfs2_iomap_get(inode, offset, INT_MAX, &wpc->iomap);
2537        return ret;
2538}
2539
2540const struct iomap_writeback_ops gfs2_writeback_ops = {
2541        .map_blocks             = gfs2_map_blocks,
2542};
2543