LXR linux/fs/xfs/xfs_bmap

   1/*
   2 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
   3 * Copyright (c) 2012 Red Hat, Inc.
   4 * All Rights Reserved.
   5 *
   6 * This program is free software; you can redistribute it and/or
   7 * modify it under the terms of the GNU General Public License as
   8 * published by the Free Software Foundation.
   9 *
  10 * This program is distributed in the hope that it would be useful,
  11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13 * GNU General Public License for more details.
  14 *
  15 * You should have received a copy of the GNU General Public License
  16 * along with this program; if not, write the Free Software Foundation,
  17 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  18 */
  19#include "xfs.h"
  20#include "xfs_fs.h"
  21#include "xfs_shared.h"
  22#include "xfs_format.h"
  23#include "xfs_log_format.h"
  24#include "xfs_trans_resv.h"
  25#include "xfs_bit.h"
  26#include "xfs_mount.h"
  27#include "xfs_da_format.h"
  28#include "xfs_defer.h"
  29#include "xfs_inode.h"
  30#include "xfs_btree.h"
  31#include "xfs_trans.h"
  32#include "xfs_extfree_item.h"
  33#include "xfs_alloc.h"
  34#include "xfs_bmap.h"
  35#include "xfs_bmap_util.h"
  36#include "xfs_bmap_btree.h"
  37#include "xfs_rtalloc.h"
  38#include "xfs_error.h"
  39#include "xfs_quota.h"
  40#include "xfs_trans_space.h"
  41#include "xfs_trace.h"
  42#include "xfs_icache.h"
  43#include "xfs_log.h"
  44#include "xfs_rmap_btree.h"
  45#include "xfs_iomap.h"
  46#include "xfs_reflink.h"
  47#include "xfs_refcount.h"
  48
  49/* Kernel only BMAP related definitions and functions */
  50
  51/*
  52 * Convert the given file system block to a disk block.  We have to treat it
  53 * differently based on whether the file is a real time file or not, because the
  54 * bmap code does.
  55 */
  56xfs_daddr_t
  57xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb)
  58{
  59        return (XFS_IS_REALTIME_INODE(ip) ? \
  60                 (xfs_daddr_t)XFS_FSB_TO_BB((ip)->i_mount, (fsb)) : \
  61                 XFS_FSB_TO_DADDR((ip)->i_mount, (fsb)));
  62}
  63
  64/*
  65 * Routine to zero an extent on disk allocated to the specific inode.
  66 *
  67 * The VFS functions take a linearised filesystem block offset, so we have to
  68 * convert the sparse xfs fsb to the right format first.
  69 * VFS types are real funky, too.
  70 */
  71int
  72xfs_zero_extent(
  73        struct xfs_inode *ip,
  74        xfs_fsblock_t   start_fsb,
  75        xfs_off_t       count_fsb)
  76{
  77        struct xfs_mount *mp = ip->i_mount;
  78        xfs_daddr_t     sector = xfs_fsb_to_db(ip, start_fsb);
  79        sector_t        block = XFS_BB_TO_FSBT(mp, sector);
  80
  81        return blkdev_issue_zeroout(xfs_find_bdev_for_inode(VFS_I(ip)),
  82                block << (mp->m_super->s_blocksize_bits - 9),
  83                count_fsb << (mp->m_super->s_blocksize_bits - 9),
  84                GFP_NOFS, true);
  85}
  86
  87int
  88xfs_bmap_rtalloc(
  89        struct xfs_bmalloca     *ap)    /* bmap alloc argument struct */
  90{
  91        xfs_alloctype_t atype = 0;      /* type for allocation routines */
  92        int             error;          /* error return value */
  93        xfs_mount_t     *mp;            /* mount point structure */
  94        xfs_extlen_t    prod = 0;       /* product factor for allocators */
  95        xfs_extlen_t    ralen = 0;      /* realtime allocation length */
  96        xfs_extlen_t    align;          /* minimum allocation alignment */
  97        xfs_rtblock_t   rtb;
  98
  99        mp = ap->ip->i_mount;
 100        align = xfs_get_extsz_hint(ap->ip);
 101        prod = align / mp->m_sb.sb_rextsize;
 102        error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev,
 103                                        align, 1, ap->eof, 0,
 104                                        ap->conv, &ap->offset, &ap->length);
 105        if (error)
 106                return error;
 107        ASSERT(ap->length);
 108        ASSERT(ap->length % mp->m_sb.sb_rextsize == 0);
 109
 110        /*
 111         * If the offset & length are not perfectly aligned
 112         * then kill prod, it will just get us in trouble.
 113         */
 114        if (do_mod(ap->offset, align) || ap->length % align)
 115                prod = 1;
 116        /*
 117         * Set ralen to be the actual requested length in rtextents.
 118         */
 119        ralen = ap->length / mp->m_sb.sb_rextsize;
 120        /*
 121         * If the old value was close enough to MAXEXTLEN that
 122         * we rounded up to it, cut it back so it's valid again.
 123         * Note that if it's a really large request (bigger than
 124         * MAXEXTLEN), we don't hear about that number, and can't
 125         * adjust the starting point to match it.
 126         */
 127        if (ralen * mp->m_sb.sb_rextsize >= MAXEXTLEN)
 128                ralen = MAXEXTLEN / mp->m_sb.sb_rextsize;
 129
 130        /*
 131         * Lock out modifications to both the RT bitmap and summary inodes
 132         */
 133        xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL|XFS_ILOCK_RTBITMAP);
 134        xfs_trans_ijoin(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL);
 135        xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL|XFS_ILOCK_RTSUM);
 136        xfs_trans_ijoin(ap->tp, mp->m_rsumip, XFS_ILOCK_EXCL);
 137
 138        /*
 139         * If it's an allocation to an empty file at offset 0,
 140         * pick an extent that will space things out in the rt area.
 141         */
 142        if (ap->eof && ap->offset == 0) {
 143                xfs_rtblock_t uninitialized_var(rtx); /* realtime extent no */
 144
 145                error = xfs_rtpick_extent(mp, ap->tp, ralen, &rtx);
 146                if (error)
 147                        return error;
 148                ap->blkno = rtx * mp->m_sb.sb_rextsize;
 149        } else {
 150                ap->blkno = 0;
 151        }
 152
 153        xfs_bmap_adjacent(ap);
 154
 155        /*
 156         * Realtime allocation, done through xfs_rtallocate_extent.
 157         */
 158        atype = ap->blkno == 0 ?  XFS_ALLOCTYPE_ANY_AG : XFS_ALLOCTYPE_NEAR_BNO;
 159        do_div(ap->blkno, mp->m_sb.sb_rextsize);
 160        rtb = ap->blkno;
 161        ap->length = ralen;
 162        if ((error = xfs_rtallocate_extent(ap->tp, ap->blkno, 1, ap->length,
 163                                &ralen, atype, ap->wasdel, prod, &rtb)))
 164                return error;
 165        if (rtb == NULLFSBLOCK && prod > 1 &&
 166            (error = xfs_rtallocate_extent(ap->tp, ap->blkno, 1,
 167                                           ap->length, &ralen, atype,
 168                                           ap->wasdel, 1, &rtb)))
 169                return error;
 170        ap->blkno = rtb;
 171        if (ap->blkno != NULLFSBLOCK) {
 172                ap->blkno *= mp->m_sb.sb_rextsize;
 173                ralen *= mp->m_sb.sb_rextsize;
 174                ap->length = ralen;
 175                ap->ip->i_d.di_nblocks += ralen;
 176                xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE);
 177                if (ap->wasdel)
 178                        ap->ip->i_delayed_blks -= ralen;
 179                /*
 180                 * Adjust the disk quota also. This was reserved
 181                 * earlier.
 182                 */
 183                xfs_trans_mod_dquot_byino(ap->tp, ap->ip,
 184                        ap->wasdel ? XFS_TRANS_DQ_DELRTBCOUNT :
 185                                        XFS_TRANS_DQ_RTBCOUNT, (long) ralen);
 186
 187                /* Zero the extent if we were asked to do so */
 188                if (ap->datatype & XFS_ALLOC_USERDATA_ZERO) {
 189                        error = xfs_zero_extent(ap->ip, ap->blkno, ap->length);
 190                        if (error)
 191                                return error;
 192                }
 193        } else {
 194                ap->length = 0;
 195        }
 196        return 0;
 197}
 198
 199/*
 200 * Check if the endoff is outside the last extent. If so the caller will grow
 201 * the allocation to a stripe unit boundary.  All offsets are considered outside
 202 * the end of file for an empty fork, so 1 is returned in *eof in that case.
 203 */
 204int
 205xfs_bmap_eof(
 206        struct xfs_inode        *ip,
 207        xfs_fileoff_t           endoff,
 208        int                     whichfork,
 209        int                     *eof)
 210{
 211        struct xfs_bmbt_irec    rec;
 212        int                     error;
 213
 214        error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, eof);
 215        if (error || *eof)
 216                return error;
 217
 218        *eof = endoff >= rec.br_startoff + rec.br_blockcount;
 219        return 0;
 220}
 221
 222/*
 223 * Extent tree block counting routines.
 224 */
 225
 226/*
 227 * Count leaf blocks given a range of extent records.
 228 */
 229STATIC void
 230xfs_bmap_count_leaves(
 231        xfs_ifork_t             *ifp,
 232        xfs_extnum_t            idx,
 233        int                     numrecs,
 234        int                     *count)
 235{
 236        int             b;
 237
 238        for (b = 0; b < numrecs; b++) {
 239                xfs_bmbt_rec_host_t *frp = xfs_iext_get_ext(ifp, idx + b);
 240                *count += xfs_bmbt_get_blockcount(frp);
 241        }
 242}
 243
 244/*
 245 * Count leaf blocks given a range of extent records originally
 246 * in btree format.
 247 */
 248STATIC void
 249xfs_bmap_disk_count_leaves(
 250        struct xfs_mount        *mp,
 251        struct xfs_btree_block  *block,
 252        int                     numrecs,
 253        int                     *count)
 254{
 255        int             b;
 256        xfs_bmbt_rec_t  *frp;
 257
 258        for (b = 1; b <= numrecs; b++) {
 259                frp = XFS_BMBT_REC_ADDR(mp, block, b);
 260                *count += xfs_bmbt_disk_get_blockcount(frp);
 261        }
 262}
 263
 264/*
 265 * Recursively walks each level of a btree
 266 * to count total fsblocks in use.
 267 */
 268STATIC int                                     /* error */
 269xfs_bmap_count_tree(
 270        xfs_mount_t     *mp,            /* file system mount point */
 271        xfs_trans_t     *tp,            /* transaction pointer */
 272        xfs_ifork_t     *ifp,           /* inode fork pointer */
 273        xfs_fsblock_t   blockno,        /* file system block number */
 274        int             levelin,        /* level in btree */
 275        int             *count)         /* Count of blocks */
 276{
 277        int                     error;
 278        xfs_buf_t               *bp, *nbp;
 279        int                     level = levelin;
 280        __be64                  *pp;
 281        xfs_fsblock_t           bno = blockno;
 282        xfs_fsblock_t           nextbno;
 283        struct xfs_btree_block  *block, *nextblock;
 284        int                     numrecs;
 285
 286        error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF,
 287                                                &xfs_bmbt_buf_ops);
 288        if (error)
 289                return error;
 290        *count += 1;
 291        block = XFS_BUF_TO_BLOCK(bp);
 292
 293        if (--level) {
 294                /* Not at node above leaves, count this level of nodes */
 295                nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
 296                while (nextbno != NULLFSBLOCK) {
 297                        error = xfs_btree_read_bufl(mp, tp, nextbno, 0, &nbp,
 298                                                XFS_BMAP_BTREE_REF,
 299                                                &xfs_bmbt_buf_ops);
 300                        if (error)
 301                                return error;
 302                        *count += 1;
 303                        nextblock = XFS_BUF_TO_BLOCK(nbp);
 304                        nextbno = be64_to_cpu(nextblock->bb_u.l.bb_rightsib);
 305                        xfs_trans_brelse(tp, nbp);
 306                }
 307
 308                /* Dive to the next level */
 309                pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
 310                bno = be64_to_cpu(*pp);
 311                if (unlikely((error =
 312                     xfs_bmap_count_tree(mp, tp, ifp, bno, level, count)) < 0)) {
 313                        xfs_trans_brelse(tp, bp);
 314                        XFS_ERROR_REPORT("xfs_bmap_count_tree(1)",
 315                                         XFS_ERRLEVEL_LOW, mp);
 316                        return -EFSCORRUPTED;
 317                }
 318                xfs_trans_brelse(tp, bp);
 319        } else {
 320                /* count all level 1 nodes and their leaves */
 321                for (;;) {
 322                        nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
 323                        numrecs = be16_to_cpu(block->bb_numrecs);
 324                        xfs_bmap_disk_count_leaves(mp, block, numrecs, count);
 325                        xfs_trans_brelse(tp, bp);
 326                        if (nextbno == NULLFSBLOCK)
 327                                break;
 328                        bno = nextbno;
 329                        error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
 330                                                XFS_BMAP_BTREE_REF,
 331                                                &xfs_bmbt_buf_ops);
 332                        if (error)
 333                                return error;
 334                        *count += 1;
 335                        block = XFS_BUF_TO_BLOCK(bp);
 336                }
 337        }
 338        return 0;
 339}
 340
 341/*
 342 * Count fsblocks of the given fork.
 343 */
 344static int                                      /* error */
 345xfs_bmap_count_blocks(
 346        xfs_trans_t             *tp,            /* transaction pointer */
 347        xfs_inode_t             *ip,            /* incore inode */
 348        int                     whichfork,      /* data or attr fork */
 349        int                     *count)         /* out: count of blocks */
 350{
 351        struct xfs_btree_block  *block; /* current btree block */
 352        xfs_fsblock_t           bno;    /* block # of "block" */
 353        xfs_ifork_t             *ifp;   /* fork structure */
 354        int                     level;  /* btree level, for checking */
 355        xfs_mount_t             *mp;    /* file system mount structure */
 356        __be64                  *pp;    /* pointer to block address */
 357
 358        bno = NULLFSBLOCK;
 359        mp = ip->i_mount;
 360        ifp = XFS_IFORK_PTR(ip, whichfork);
 361        if ( XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS ) {
 362                xfs_bmap_count_leaves(ifp, 0,
 363                        ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t),
 364                        count);
 365                return 0;
 366        }
 367
 368        /*
 369         * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out.
 370         */
 371        block = ifp->if_broot;
 372        level = be16_to_cpu(block->bb_level);
 373        ASSERT(level > 0);
 374        pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
 375        bno = be64_to_cpu(*pp);
 376        ASSERT(bno != NULLFSBLOCK);
 377        ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
 378        ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks);
 379
 380        if (unlikely(xfs_bmap_count_tree(mp, tp, ifp, bno, level, count) < 0)) {
 381                XFS_ERROR_REPORT("xfs_bmap_count_blocks(2)", XFS_ERRLEVEL_LOW,
 382                                 mp);
 383                return -EFSCORRUPTED;
 384        }
 385
 386        return 0;
 387}
 388
 389/*
 390 * returns 1 for success, 0 if we failed to map the extent.
 391 */
 392STATIC int
 393xfs_getbmapx_fix_eof_hole(
 394        xfs_inode_t             *ip,            /* xfs incore inode pointer */
 395        int                     whichfork,
 396        struct getbmapx         *out,           /* output structure */
 397        int                     prealloced,     /* this is a file with
 398                                                 * preallocated data space */
 399        __int64_t               end,            /* last block requested */
 400        xfs_fsblock_t           startblock,
 401        bool                    moretocome)
 402{
 403        __int64_t               fixlen;
 404        xfs_mount_t             *mp;            /* file system mount point */
 405        xfs_ifork_t             *ifp;           /* inode fork pointer */
 406        xfs_extnum_t            lastx;          /* last extent pointer */
 407        xfs_fileoff_t           fileblock;
 408
 409        if (startblock == HOLESTARTBLOCK) {
 410                mp = ip->i_mount;
 411                out->bmv_block = -1;
 412                fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, XFS_ISIZE(ip)));
 413                fixlen -= out->bmv_offset;
 414                if (prealloced && out->bmv_offset + out->bmv_length == end) {
 415                        /* Came to hole at EOF. Trim it. */
 416                        if (fixlen <= 0)
 417                                return 0;
 418                        out->bmv_length = fixlen;
 419                }
 420        } else {
 421                if (startblock == DELAYSTARTBLOCK)
 422                        out->bmv_block = -2;
 423                else
 424                        out->bmv_block = xfs_fsb_to_db(ip, startblock);
 425                fileblock = XFS_BB_TO_FSB(ip->i_mount, out->bmv_offset);
 426                ifp = XFS_IFORK_PTR(ip, whichfork);
 427                if (!moretocome &&
 428                    xfs_iext_bno_to_ext(ifp, fileblock, &lastx) &&
 429                   (lastx == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))-1))
 430                        out->bmv_oflags |= BMV_OF_LAST;
 431        }
 432
 433        return 1;
 434}
 435
 436/* Adjust the reported bmap around shared/unshared extent transitions. */
 437STATIC int
 438xfs_getbmap_adjust_shared(
 439        struct xfs_inode                *ip,
 440        int                             whichfork,
 441        struct xfs_bmbt_irec            *map,
 442        struct getbmapx                 *out,
 443        struct xfs_bmbt_irec            *next_map)
 444{
 445        struct xfs_mount                *mp = ip->i_mount;
 446        xfs_agnumber_t                  agno;
 447        xfs_agblock_t                   agbno;
 448        xfs_agblock_t                   ebno;
 449        xfs_extlen_t                    elen;
 450        xfs_extlen_t                    nlen;
 451        int                             error;
 452
 453        next_map->br_startblock = NULLFSBLOCK;
 454        next_map->br_startoff = NULLFILEOFF;
 455        next_map->br_blockcount = 0;
 456
 457        /* Only written data blocks can be shared. */
 458        if (!xfs_is_reflink_inode(ip) || whichfork != XFS_DATA_FORK ||
 459            map->br_startblock == DELAYSTARTBLOCK ||
 460            map->br_startblock == HOLESTARTBLOCK ||
 461            ISUNWRITTEN(map))
 462                return 0;
 463
 464        agno = XFS_FSB_TO_AGNO(mp, map->br_startblock);
 465        agbno = XFS_FSB_TO_AGBNO(mp, map->br_startblock);
 466        error = xfs_reflink_find_shared(mp, agno, agbno, map->br_blockcount,
 467                        &ebno, &elen, true);
 468        if (error)
 469                return error;
 470
 471        if (ebno == NULLAGBLOCK) {
 472                /* No shared blocks at all. */
 473                return 0;
 474        } else if (agbno == ebno) {
 475                /*
 476                 * Shared extent at (agbno, elen).  Shrink the reported
 477                 * extent length and prepare to move the start of map[i]
 478                 * to agbno+elen, with the aim of (re)formatting the new
 479                 * map[i] the next time through the inner loop.
 480                 */
 481                out->bmv_length = XFS_FSB_TO_BB(mp, elen);
 482                out->bmv_oflags |= BMV_OF_SHARED;
 483                if (elen != map->br_blockcount) {
 484                        *next_map = *map;
 485                        next_map->br_startblock += elen;
 486                        next_map->br_startoff += elen;
 487                        next_map->br_blockcount -= elen;
 488                }
 489                map->br_blockcount -= elen;
 490        } else {
 491                /*
 492                 * There's an unshared extent (agbno, ebno - agbno)
 493                 * followed by shared extent at (ebno, elen).  Shrink
 494                 * the reported extent length to cover only the unshared
 495                 * extent and prepare to move up the start of map[i] to
 496                 * ebno, with the aim of (re)formatting the new map[i]
 497                 * the next time through the inner loop.
 498                 */
 499                *next_map = *map;
 500                nlen = ebno - agbno;
 501                out->bmv_length = XFS_FSB_TO_BB(mp, nlen);
 502                next_map->br_startblock += nlen;
 503                next_map->br_startoff += nlen;
 504                next_map->br_blockcount -= nlen;
 505                map->br_blockcount -= nlen;
 506        }
 507
 508        return 0;
 509}
 510
 511/*
 512 * Get inode's extents as described in bmv, and format for output.
 513 * Calls formatter to fill the user's buffer until all extents
 514 * are mapped, until the passed-in bmv->bmv_count slots have
 515 * been filled, or until the formatter short-circuits the loop,
 516 * if it is tracking filled-in extents on its own.
 517 */
 518int                                             /* error code */
 519xfs_getbmap(
 520        xfs_inode_t             *ip,
 521        struct getbmapx         *bmv,           /* user bmap structure */
 522        xfs_bmap_format_t       formatter,      /* format to user */
 523        void                    *arg)           /* formatter arg */
 524{
 525        __int64_t               bmvend;         /* last block requested */
 526        int                     error = 0;      /* return value */
 527        __int64_t               fixlen;         /* length for -1 case */
 528        int                     i;              /* extent number */
 529        int                     lock;           /* lock state */
 530        xfs_bmbt_irec_t         *map;           /* buffer for user's data */
 531        xfs_mount_t             *mp;            /* file system mount point */
 532        int                     nex;            /* # of user extents can do */
 533        int                     nexleft;        /* # of user extents left */
 534        int                     subnex;         /* # of bmapi's can do */
 535        int                     nmap;           /* number of map entries */
 536        struct getbmapx         *out;           /* output structure */
 537        int                     whichfork;      /* data or attr fork */
 538        int                     prealloced;     /* this is a file with
 539                                                 * preallocated data space */
 540        int                     iflags;         /* interface flags */
 541        int                     bmapi_flags;    /* flags for xfs_bmapi */
 542        int                     cur_ext = 0;
 543        struct xfs_bmbt_irec    inject_map;
 544
 545        mp = ip->i_mount;
 546        iflags = bmv->bmv_iflags;
 547
 548#ifndef DEBUG
 549        /* Only allow CoW fork queries if we're debugging. */
 550        if (iflags & BMV_IF_COWFORK)
 551                return -EINVAL;
 552#endif
 553        if ((iflags & BMV_IF_ATTRFORK) && (iflags & BMV_IF_COWFORK))
 554                return -EINVAL;
 555
 556        if (iflags & BMV_IF_ATTRFORK)
 557                whichfork = XFS_ATTR_FORK;
 558        else if (iflags & BMV_IF_COWFORK)
 559                whichfork = XFS_COW_FORK;
 560        else
 561                whichfork = XFS_DATA_FORK;
 562
 563        switch (whichfork) {
 564        case XFS_ATTR_FORK:
 565                if (XFS_IFORK_Q(ip)) {
 566                        if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS &&
 567                            ip->i_d.di_aformat != XFS_DINODE_FMT_BTREE &&
 568                            ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)
 569                                return -EINVAL;
 570                } else if (unlikely(
 571                           ip->i_d.di_aformat != 0 &&
 572                           ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS)) {
 573                        XFS_ERROR_REPORT("xfs_getbmap", XFS_ERRLEVEL_LOW,
 574                                         ip->i_mount);
 575                        return -EFSCORRUPTED;
 576                }
 577
 578                prealloced = 0;
 579                fixlen = 1LL << 32;
 580                break;
 581        case XFS_COW_FORK:
 582                if (ip->i_cformat != XFS_DINODE_FMT_EXTENTS)
 583                        return -EINVAL;
 584
 585                if (xfs_get_cowextsz_hint(ip)) {
 586                        prealloced = 1;
 587                        fixlen = mp->m_super->s_maxbytes;
 588                } else {
 589                        prealloced = 0;
 590                        fixlen = XFS_ISIZE(ip);
 591                }
 592                break;
 593        default:
 594                if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS &&
 595                    ip->i_d.di_format != XFS_DINODE_FMT_BTREE &&
 596                    ip->i_d.di_format != XFS_DINODE_FMT_LOCAL)
 597                        return -EINVAL;
 598
 599                if (xfs_get_extsz_hint(ip) ||
 600                    ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC|XFS_DIFLAG_APPEND)){
 601                        prealloced = 1;
 602                        fixlen = mp->m_super->s_maxbytes;
 603                } else {
 604                        prealloced = 0;
 605                        fixlen = XFS_ISIZE(ip);
 606                }
 607                break;
 608        }
 609
 610        if (bmv->bmv_length == -1) {
 611                fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, fixlen));
 612                bmv->bmv_length =
 613                        max_t(__int64_t, fixlen - bmv->bmv_offset, 0);
 614        } else if (bmv->bmv_length == 0) {
 615                bmv->bmv_entries = 0;
 616                return 0;
 617        } else if (bmv->bmv_length < 0) {
 618                return -EINVAL;
 619        }
 620
 621        nex = bmv->bmv_count - 1;
 622        if (nex <= 0)
 623                return -EINVAL;
 624        bmvend = bmv->bmv_offset + bmv->bmv_length;
 625
 626
 627        if (bmv->bmv_count > ULONG_MAX / sizeof(struct getbmapx))
 628                return -ENOMEM;
 629        out = kmem_zalloc_large(bmv->bmv_count * sizeof(struct getbmapx), 0);
 630        if (!out)
 631                return -ENOMEM;
 632
 633        xfs_ilock(ip, XFS_IOLOCK_SHARED);
 634        switch (whichfork) {
 635        case XFS_DATA_FORK:
 636                if (!(iflags & BMV_IF_DELALLOC) &&
 637                    (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size)) {
 638                        error = filemap_write_and_wait(VFS_I(ip)->i_mapping);
 639                        if (error)
 640                                goto out_unlock_iolock;
 641
 642                        /*
 643                         * Even after flushing the inode, there can still be
 644                         * delalloc blocks on the inode beyond EOF due to
 645                         * speculative preallocation.  These are not removed
 646                         * until the release function is called or the inode
 647                         * is inactivated.  Hence we cannot assert here that
 648                         * ip->i_delayed_blks == 0.
 649                         */
 650                }
 651
 652                lock = xfs_ilock_data_map_shared(ip);
 653                break;
 654        case XFS_COW_FORK:
 655                lock = XFS_ILOCK_SHARED;
 656                xfs_ilock(ip, lock);
 657                break;
 658        case XFS_ATTR_FORK:
 659                lock = xfs_ilock_attr_map_shared(ip);
 660                break;
 661        }
 662
 663        /*
 664         * Don't let nex be bigger than the number of extents
 665         * we can have assuming alternating holes and real extents.
 666         */
 667        if (nex > XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1)
 668                nex = XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1;
 669
 670        bmapi_flags = xfs_bmapi_aflag(whichfork);
 671        if (!(iflags & BMV_IF_PREALLOC))
 672                bmapi_flags |= XFS_BMAPI_IGSTATE;
 673
 674        /*
 675         * Allocate enough space to handle "subnex" maps at a time.
 676         */
 677        error = -ENOMEM;
 678        subnex = 16;
 679        map = kmem_alloc(subnex * sizeof(*map), KM_MAYFAIL | KM_NOFS);
 680        if (!map)
 681                goto out_unlock_ilock;
 682
 683        bmv->bmv_entries = 0;
 684
 685        if (XFS_IFORK_NEXTENTS(ip, whichfork) == 0 &&
 686            (whichfork == XFS_ATTR_FORK || !(iflags & BMV_IF_DELALLOC))) {
 687                error = 0;
 688                goto out_free_map;
 689        }
 690
 691        nexleft = nex;
 692
 693        do {
 694                nmap = (nexleft > subnex) ? subnex : nexleft;
 695                error = xfs_bmapi_read(ip, XFS_BB_TO_FSBT(mp, bmv->bmv_offset),
 696                                       XFS_BB_TO_FSB(mp, bmv->bmv_length),
 697                                       map, &nmap, bmapi_flags);
 698                if (error)
 699                        goto out_free_map;
 700                ASSERT(nmap <= subnex);
 701
 702                for (i = 0; i < nmap && nexleft && bmv->bmv_length &&
 703                                cur_ext < bmv->bmv_count; i++) {
 704                        out[cur_ext].bmv_oflags = 0;
 705                        if (map[i].br_state == XFS_EXT_UNWRITTEN)
 706                                out[cur_ext].bmv_oflags |= BMV_OF_PREALLOC;
 707                        else if (map[i].br_startblock == DELAYSTARTBLOCK)
 708                                out[cur_ext].bmv_oflags |= BMV_OF_DELALLOC;
 709                        out[cur_ext].bmv_offset =
 710                                XFS_FSB_TO_BB(mp, map[i].br_startoff);
 711                        out[cur_ext].bmv_length =
 712                                XFS_FSB_TO_BB(mp, map[i].br_blockcount);
 713                        out[cur_ext].bmv_unused1 = 0;
 714                        out[cur_ext].bmv_unused2 = 0;
 715
 716                        /*
 717                         * delayed allocation extents that start beyond EOF can
 718                         * occur due to speculative EOF allocation when the
 719                         * delalloc extent is larger than the largest freespace
 720                         * extent at conversion time. These extents cannot be
 721                         * converted by data writeback, so can exist here even
 722                         * if we are not supposed to be finding delalloc
 723                         * extents.
 724                         */
 725                        if (map[i].br_startblock == DELAYSTARTBLOCK &&
 726                            map[i].br_startoff <= XFS_B_TO_FSB(mp, XFS_ISIZE(ip)))
 727                                ASSERT((iflags & BMV_IF_DELALLOC) != 0);
 728
 729                        if (map[i].br_startblock == HOLESTARTBLOCK &&
 730                            whichfork == XFS_ATTR_FORK) {
 731                                /* came to the end of attribute fork */
 732                                out[cur_ext].bmv_oflags |= BMV_OF_LAST;
 733                                goto out_free_map;
 734                        }
 735
 736                        /* Is this a shared block? */
 737                        error = xfs_getbmap_adjust_shared(ip, whichfork,
 738                                        &map[i], &out[cur_ext], &inject_map);
 739                        if (error)
 740                                goto out_free_map;
 741
 742                        if (!xfs_getbmapx_fix_eof_hole(ip, whichfork,
 743                                        &out[cur_ext], prealloced, bmvend,
 744                                        map[i].br_startblock,
 745                                        inject_map.br_startblock != NULLFSBLOCK))
 746                                goto out_free_map;
 747
 748                        bmv->bmv_offset =
 749                                out[cur_ext].bmv_offset +
 750                                out[cur_ext].bmv_length;
 751                        bmv->bmv_length =
 752                                max_t(__int64_t, 0, bmvend - bmv->bmv_offset);
 753
 754                        /*
 755                         * In case we don't want to return the hole,
 756                         * don't increase cur_ext so that we can reuse
 757                         * it in the next loop.
 758                         */
 759                        if ((iflags & BMV_IF_NO_HOLES) &&
 760                            map[i].br_startblock == HOLESTARTBLOCK) {
 761                                memset(&out[cur_ext], 0, sizeof(out[cur_ext]));
 762                                continue;
 763                        }
 764
 765                        if (inject_map.br_startblock != NULLFSBLOCK) {
 766                                map[i] = inject_map;
 767                                i--;
 768                        } else
 769                                nexleft--;
 770                        bmv->bmv_entries++;
 771                        cur_ext++;
 772                }
 773        } while (nmap && nexleft && bmv->bmv_length &&
 774                 cur_ext < bmv->bmv_count);
 775
 776 out_free_map:
 777        kmem_free(map);
 778 out_unlock_ilock:
 779        xfs_iunlock(ip, lock);
 780 out_unlock_iolock:
 781        xfs_iunlock(ip, XFS_IOLOCK_SHARED);
 782
 783        for (i = 0; i < cur_ext; i++) {
 784                int full = 0;   /* user array is full */
 785
 786                /* format results & advance arg */
 787                error = formatter(&arg, &out[i], &full);
 788                if (error || full)
 789                        break;
 790        }
 791
 792        kmem_free(out);
 793        return error;
 794}
 795
 796/*
 797 * dead simple method of punching delalyed allocation blocks from a range in
 798 * the inode. Walks a block at a time so will be slow, but is only executed in
 799 * rare error cases so the overhead is not critical. This will always punch out
 800 * both the start and end blocks, even if the ranges only partially overlap
 801 * them, so it is up to the caller to ensure that partial blocks are not
 802 * passed in.
 803 */
 804int
 805xfs_bmap_punch_delalloc_range(
 806        struct xfs_inode        *ip,
 807        xfs_fileoff_t           start_fsb,
 808        xfs_fileoff_t           length)
 809{
 810        xfs_fileoff_t           remaining = length;
 811        int                     error = 0;
 812
 813        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
 814
 815        do {
 816                int             done;
 817                xfs_bmbt_irec_t imap;
 818                int             nimaps = 1;
 819                xfs_fsblock_t   firstblock;
 820                struct xfs_defer_ops dfops;
 821
 822                /*
 823                 * Map the range first and check that it is a delalloc extent
 824                 * before trying to unmap the range. Otherwise we will be
 825                 * trying to remove a real extent (which requires a
 826                 * transaction) or a hole, which is probably a bad idea...
 827                 */
 828                error = xfs_bmapi_read(ip, start_fsb, 1, &imap, &nimaps,
 829                                       XFS_BMAPI_ENTIRE);
 830
 831                if (error) {
 832                        /* something screwed, just bail */
 833                        if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
 834                                xfs_alert(ip->i_mount,
 835                        "Failed delalloc mapping lookup ino %lld fsb %lld.",
 836                                                ip->i_ino, start_fsb);
 837                        }
 838                        break;
 839                }
 840                if (!nimaps) {
 841                        /* nothing there */
 842                        goto next_block;
 843                }
 844                if (imap.br_startblock != DELAYSTARTBLOCK) {
 845                        /* been converted, ignore */
 846                        goto next_block;
 847                }
 848                WARN_ON(imap.br_blockcount == 0);
 849
 850                /*
 851                 * Note: while we initialise the firstblock/dfops pair, they
 852                 * should never be used because blocks should never be
 853                 * allocated or freed for a delalloc extent and hence we need
 854                 * don't cancel or finish them after the xfs_bunmapi() call.
 855                 */
 856                xfs_defer_init(&dfops, &firstblock);
 857                error = xfs_bunmapi(NULL, ip, start_fsb, 1, 0, 1, &firstblock,
 858                                        &dfops, &done);
 859                if (error)
 860                        break;
 861
 862                ASSERT(!xfs_defer_has_unfinished_work(&dfops));
 863next_block:
 864                start_fsb++;
 865                remaining--;
 866        } while(remaining > 0);
 867
 868        return error;
 869}
 870
 871/*
 872 * Test whether it is appropriate to check an inode for and free post EOF
 873 * blocks. The 'force' parameter determines whether we should also consider
 874 * regular files that are marked preallocated or append-only.
 875 */
 876bool
 877xfs_can_free_eofblocks(struct xfs_inode *ip, bool force)
 878{
 879        /* prealloc/delalloc exists only on regular files */
 880        if (!S_ISREG(VFS_I(ip)->i_mode))
 881                return false;
 882
 883        /*
 884         * Zero sized files with no cached pages and delalloc blocks will not
 885         * have speculative prealloc/delalloc blocks to remove.
 886         */
 887        if (VFS_I(ip)->i_size == 0 &&
 888            VFS_I(ip)->i_mapping->nrpages == 0 &&
 889            ip->i_delayed_blks == 0)
 890                return false;
 891
 892        /* If we haven't read in the extent list, then don't do it now. */
 893        if (!(ip->i_df.if_flags & XFS_IFEXTENTS))
 894                return false;
 895
 896        /*
 897         * Do not free real preallocated or append-only files unless the file
 898         * has delalloc blocks and we are forced to remove them.
 899         */
 900        if (ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND))
 901                if (!force || ip->i_delayed_blks == 0)
 902                        return false;
 903
 904        return true;
 905}
 906
 907/*
 908 * This is called by xfs_inactive to free any blocks beyond eof
 909 * when the link count isn't zero and by xfs_dm_punch_hole() when
 910 * punching a hole to EOF.
 911 */
 912int
 913xfs_free_eofblocks(
 914        xfs_mount_t     *mp,
 915        xfs_inode_t     *ip,
 916        bool            need_iolock)
 917{
 918        xfs_trans_t     *tp;
 919        int             error;
 920        xfs_fileoff_t   end_fsb;
 921        xfs_fileoff_t   last_fsb;
 922        xfs_filblks_t   map_len;
 923        int             nimaps;
 924        xfs_bmbt_irec_t imap;
 925
 926        /*
 927         * Figure out if there are any blocks beyond the end
 928         * of the file.  If not, then there is nothing to do.
 929         */
 930        end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_ISIZE(ip));
 931        last_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
 932        if (last_fsb <= end_fsb)
 933                return 0;
 934        map_len = last_fsb - end_fsb;
 935
 936        nimaps = 1;
 937        xfs_ilock(ip, XFS_ILOCK_SHARED);
 938        error = xfs_bmapi_read(ip, end_fsb, map_len, &imap, &nimaps, 0);
 939        xfs_iunlock(ip, XFS_ILOCK_SHARED);
 940
 941        if (!error && (nimaps != 0) &&
 942            (imap.br_startblock != HOLESTARTBLOCK ||
 943             ip->i_delayed_blks)) {
 944                /*
 945                 * Attach the dquots to the inode up front.
 946                 */
 947                error = xfs_qm_dqattach(ip, 0);
 948                if (error)
 949                        return error;
 950
 951                /*
 952                 * There are blocks after the end of file.
 953                 * Free them up now by truncating the file to
 954                 * its current size.
 955                 */
 956                if (need_iolock) {
 957                        if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL))
 958                                return -EAGAIN;
 959                }
 960
 961                error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0,
 962                                &tp);
 963                if (error) {
 964                        ASSERT(XFS_FORCED_SHUTDOWN(mp));
 965                        if (need_iolock)
 966                                xfs_iunlock(ip, XFS_IOLOCK_EXCL);
 967                        return error;
 968                }
 969
 970                xfs_ilock(ip, XFS_ILOCK_EXCL);
 971                xfs_trans_ijoin(tp, ip, 0);
 972
 973                /*
 974                 * Do not update the on-disk file size.  If we update the
 975                 * on-disk file size and then the system crashes before the
 976                 * contents of the file are flushed to disk then the files
 977                 * may be full of holes (ie NULL files bug).
 978                 */
 979                error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK,
 980                                              XFS_ISIZE(ip));
 981                if (error) {
 982                        /*
 983                         * If we get an error at this point we simply don't
 984                         * bother truncating the file.
 985                         */
 986                        xfs_trans_cancel(tp);
 987                } else {
 988                        error = xfs_trans_commit(tp);
 989                        if (!error)
 990                                xfs_inode_clear_eofblocks_tag(ip);
 991                }
 992
 993                xfs_iunlock(ip, XFS_ILOCK_EXCL);
 994                if (need_iolock)
 995                        xfs_iunlock(ip, XFS_IOLOCK_EXCL);
 996        }
 997        return error;
 998}
 999
1000int

1001xfs_alloc_file_space(
1002        struct xfs_inode        *ip,
1003        xfs_off_t               offset,
1004        xfs_off_t               len,
1005        int                     alloc_type)
1006{
1007        xfs_mount_t             *mp = ip->i_mount;
1008        xfs_off_t               count;
1009        xfs_filblks_t           allocated_fsb;
1010        xfs_filblks_t           allocatesize_fsb;
1011        xfs_extlen_t            extsz, temp;
1012        xfs_fileoff_t           startoffset_fsb;
1013        xfs_fsblock_t           firstfsb;
1014        int                     nimaps;
1015        int                     quota_flag;
1016        int                     rt;
1017        xfs_trans_t             *tp;
1018        xfs_bmbt_irec_t         imaps[1], *imapp;
1019        struct xfs_defer_ops    dfops;
1020        uint                    qblocks, resblks, resrtextents;
1021        int                     error;
1022
1023        trace_xfs_alloc_file_space(ip);
1024
1025        if (XFS_FORCED_SHUTDOWN(mp))
1026                return -EIO;
1027
1028        error = xfs_qm_dqattach(ip, 0);
1029        if (error)
1030                return error;
1031
1032        if (len <= 0)
1033                return -EINVAL;
1034
1035        rt = XFS_IS_REALTIME_INODE(ip);
1036        extsz = xfs_get_extsz_hint(ip);
1037
1038        count = len;
1039        imapp = &imaps[0];
1040        nimaps = 1;
1041        startoffset_fsb = XFS_B_TO_FSBT(mp, offset);
1042        allocatesize_fsb = XFS_B_TO_FSB(mp, count);
1043
1044        /*
1045         * Allocate file space until done or until there is an error
1046         */
1047        while (allocatesize_fsb && !error) {
1048                xfs_fileoff_t   s, e;
1049
1050                /*
1051                 * Determine space reservations for data/realtime.
1052                 */
1053                if (unlikely(extsz)) {
1054                        s = startoffset_fsb;
1055                        do_div(s, extsz);
1056                        s *= extsz;
1057                        e = startoffset_fsb + allocatesize_fsb;
1058                        if ((temp = do_mod(startoffset_fsb, extsz)))
1059                                e += temp;
1060                        if ((temp = do_mod(e, extsz)))
1061                                e += extsz - temp;
1062                } else {
1063                        s = 0;
1064                        e = allocatesize_fsb;
1065                }
1066
1067                /*
1068                 * The transaction reservation is limited to a 32-bit block
1069                 * count, hence we need to limit the number of blocks we are
1070                 * trying to reserve to avoid an overflow. We can't allocate
1071                 * more than @nimaps extents, and an extent is limited on disk
1072                 * to MAXEXTLEN (21 bits), so use that to enforce the limit.
1073                 */
1074                resblks = min_t(xfs_fileoff_t, (e - s), (MAXEXTLEN * nimaps));
1075                if (unlikely(rt)) {
1076                        resrtextents = qblocks = resblks;
1077                        resrtextents /= mp->m_sb.sb_rextsize;
1078                        resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
1079                        quota_flag = XFS_QMOPT_RES_RTBLKS;
1080                } else {
1081                        resrtextents = 0;
1082                        resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, resblks);
1083                        quota_flag = XFS_QMOPT_RES_REGBLKS;
1084                }
1085
1086                /*
1087                 * Allocate and setup the transaction.
1088                 */
1089                error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks,
1090                                resrtextents, 0, &tp);
1091
1092                /*
1093                 * Check for running out of space
1094                 */
1095                if (error) {
1096                        /*
1097                         * Free the transaction structure.
1098                         */
1099                        ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp));
1100                        break;
1101                }
1102                xfs_ilock(ip, XFS_ILOCK_EXCL);
1103                error = xfs_trans_reserve_quota_nblks(tp, ip, qblocks,
1104                                                      0, quota_flag);
1105                if (error)
1106                        goto error1;
1107
1108                xfs_trans_ijoin(tp, ip, 0);
1109
1110                xfs_defer_init(&dfops, &firstfsb);
1111                error = xfs_bmapi_write(tp, ip, startoffset_fsb,
1112                                        allocatesize_fsb, alloc_type, &firstfsb,
1113                                        resblks, imapp, &nimaps, &dfops);
1114                if (error)
1115                        goto error0;
1116
1117                /*
1118                 * Complete the transaction
1119                 */
1120                error = xfs_defer_finish(&tp, &dfops, NULL);
1121                if (error)
1122                        goto error0;
1123
1124                error = xfs_trans_commit(tp);
1125                xfs_iunlock(ip, XFS_ILOCK_EXCL);
1126                if (error)
1127                        break;
1128
1129                allocated_fsb = imapp->br_blockcount;
1130
1131                if (nimaps == 0) {
1132                        error = -ENOSPC;
1133                        break;
1134                }
1135
1136                startoffset_fsb += allocated_fsb;
1137                allocatesize_fsb -= allocated_fsb;
1138        }
1139
1140        return error;
1141
1142error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
1143        xfs_defer_cancel(&dfops);
1144        xfs_trans_unreserve_quota_nblks(tp, ip, (long)qblocks, 0, quota_flag);
1145
1146error1: /* Just cancel transaction */
1147        xfs_trans_cancel(tp);
1148        xfs_iunlock(ip, XFS_ILOCK_EXCL);
1149        return error;
1150}
1151
1152static int
1153xfs_unmap_extent(
1154        struct xfs_inode        *ip,
1155        xfs_fileoff_t           startoffset_fsb,
1156        xfs_filblks_t           len_fsb,
1157        int                     *done)
1158{
1159        struct xfs_mount        *mp = ip->i_mount;
1160        struct xfs_trans        *tp;
1161        struct xfs_defer_ops    dfops;
1162        xfs_fsblock_t           firstfsb;
1163        uint                    resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
1164        int                     error;
1165
1166        error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp);
1167        if (error) {
1168                ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp));
1169                return error;
1170        }
1171
1172        xfs_ilock(ip, XFS_ILOCK_EXCL);
1173        error = xfs_trans_reserve_quota(tp, mp, ip->i_udquot, ip->i_gdquot,
1174                        ip->i_pdquot, resblks, 0, XFS_QMOPT_RES_REGBLKS);
1175        if (error)
1176                goto out_trans_cancel;
1177
1178        xfs_trans_ijoin(tp, ip, 0);
1179
1180        xfs_defer_init(&dfops, &firstfsb);
1181        error = xfs_bunmapi(tp, ip, startoffset_fsb, len_fsb, 0, 2, &firstfsb,
1182                        &dfops, done);
1183        if (error)
1184                goto out_bmap_cancel;
1185
1186        error = xfs_defer_finish(&tp, &dfops, ip);
1187        if (error)
1188                goto out_bmap_cancel;
1189
1190        error = xfs_trans_commit(tp);
1191out_unlock:
1192        xfs_iunlock(ip, XFS_ILOCK_EXCL);
1193        return error;
1194
1195out_bmap_cancel:
1196        xfs_defer_cancel(&dfops);
1197out_trans_cancel:
1198        xfs_trans_cancel(tp);
1199        goto out_unlock;
1200}
1201
1202static int
1203xfs_adjust_extent_unmap_boundaries(
1204        struct xfs_inode        *ip,
1205        xfs_fileoff_t           *startoffset_fsb,
1206        xfs_fileoff_t           *endoffset_fsb)
1207{
1208        struct xfs_mount        *mp = ip->i_mount;
1209        struct xfs_bmbt_irec    imap;
1210        int                     nimap, error;
1211        xfs_extlen_t            mod = 0;
1212
1213        nimap = 1;
1214        error = xfs_bmapi_read(ip, *startoffset_fsb, 1, &imap, &nimap, 0);
1215        if (error)
1216                return error;
1217
1218        if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
1219                xfs_daddr_t     block;
1220
1221                ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
1222                block = imap.br_startblock;
1223                mod = do_div(block, mp->m_sb.sb_rextsize);
1224                if (mod)
1225                        *startoffset_fsb += mp->m_sb.sb_rextsize - mod;
1226        }
1227
1228        nimap = 1;
1229        error = xfs_bmapi_read(ip, *endoffset_fsb - 1, 1, &imap, &nimap, 0);
1230        if (error)
1231                return error;
1232
1233        if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
1234                ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
1235                mod++;
1236                if (mod && mod != mp->m_sb.sb_rextsize)
1237                        *endoffset_fsb -= mod;
1238        }
1239
1240        return 0;
1241}
1242
1243static int
1244xfs_flush_unmap_range(
1245        struct xfs_inode        *ip,
1246        xfs_off_t               offset,
1247        xfs_off_t               len)
1248{
1249        struct xfs_mount        *mp = ip->i_mount;
1250        struct inode            *inode = VFS_I(ip);
1251        xfs_off_t               rounding, start, end;
1252        int                     error;
1253
1254        /* wait for the completion of any pending DIOs */
1255        inode_dio_wait(inode);
1256
1257        rounding = max_t(xfs_off_t, 1 << mp->m_sb.sb_blocklog, PAGE_SIZE);
1258        start = round_down(offset, rounding);
1259        end = round_up(offset + len, rounding) - 1;
1260
1261        error = filemap_write_and_wait_range(inode->i_mapping, start, end);
1262        if (error)
1263                return error;
1264        truncate_pagecache_range(inode, start, end);
1265        return 0;
1266}
1267
1268int
1269xfs_free_file_space(
1270        struct xfs_inode        *ip,
1271        xfs_off_t               offset,
1272        xfs_off_t               len)
1273{
1274        struct xfs_mount        *mp = ip->i_mount;
1275        xfs_fileoff_t           startoffset_fsb;
1276        xfs_fileoff_t           endoffset_fsb;
1277        int                     done = 0, error;
1278
1279        trace_xfs_free_file_space(ip);
1280
1281        error = xfs_qm_dqattach(ip, 0);
1282        if (error)
1283                return error;
1284
1285        if (len <= 0)   /* if nothing being freed */
1286                return 0;
1287
1288        error = xfs_flush_unmap_range(ip, offset, len);
1289        if (error)
1290                return error;
1291
1292        startoffset_fsb = XFS_B_TO_FSB(mp, offset);
1293        endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len);
1294
1295        /*
1296         * Need to zero the stuff we're not freeing, on disk.  If it's a RT file
1297         * and we can't use unwritten extents then we actually need to ensure
1298         * to zero the whole extent, otherwise we just need to take of block
1299         * boundaries, and xfs_bunmapi will handle the rest.
1300         */
1301        if (XFS_IS_REALTIME_INODE(ip) &&
1302            !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
1303                error = xfs_adjust_extent_unmap_boundaries(ip, &startoffset_fsb,
1304                                &endoffset_fsb);
1305                if (error)
1306                        return error;
1307        }
1308
1309        if (endoffset_fsb > startoffset_fsb) {
1310                while (!done) {
1311                        error = xfs_unmap_extent(ip, startoffset_fsb,
1312                                        endoffset_fsb - startoffset_fsb, &done);
1313                        if (error)
1314                                return error;
1315                }
1316        }
1317
1318        /*
1319         * Now that we've unmap all full blocks we'll have to zero out any
1320         * partial block at the beginning and/or end.  xfs_zero_range is
1321         * smart enough to skip any holes, including those we just created.
1322         */
1323        return xfs_zero_range(ip, offset, len, NULL);
1324}
1325
1326/*
1327 * Preallocate and zero a range of a file. This mechanism has the allocation
1328 * semantics of fallocate and in addition converts data in the range to zeroes.
1329 */
1330int
1331xfs_zero_file_space(
1332        struct xfs_inode        *ip,
1333        xfs_off_t               offset,
1334        xfs_off_t               len)
1335{
1336        struct xfs_mount        *mp = ip->i_mount;
1337        uint                    blksize;
1338        int                     error;
1339
1340        trace_xfs_zero_file_space(ip);
1341
1342        blksize = 1 << mp->m_sb.sb_blocklog;
1343
1344        /*
1345         * Punch a hole and prealloc the range. We use hole punch rather than
1346         * unwritten extent conversion for two reasons:
1347         *
1348         * 1.) Hole punch handles partial block zeroing for us.
1349         *
1350         * 2.) If prealloc returns ENOSPC, the file range is still zero-valued
1351         * by virtue of the hole punch.
1352         */
1353        error = xfs_free_file_space(ip, offset, len);
1354        if (error)
1355                goto out;
1356
1357        error = xfs_alloc_file_space(ip, round_down(offset, blksize),
1358                                     round_up(offset + len, blksize) -
1359                                     round_down(offset, blksize),
1360                                     XFS_BMAPI_PREALLOC);
1361out:
1362        return error;
1363
1364}
1365
1366/*
1367 * @next_fsb will keep track of the extent currently undergoing shift.
1368 * @stop_fsb will keep track of the extent at which we have to stop.
1369 * If we are shifting left, we will start with block (offset + len) and
1370 * shift each extent till last extent.
1371 * If we are shifting right, we will start with last extent inside file space
1372 * and continue until we reach the block corresponding to offset.
1373 */
1374static int
1375xfs_shift_file_space(
1376        struct xfs_inode        *ip,
1377        xfs_off_t               offset,
1378        xfs_off_t               len,
1379        enum shift_direction    direction)
1380{
1381        int                     done = 0;
1382        struct xfs_mount        *mp = ip->i_mount;
1383        struct xfs_trans        *tp;
1384        int                     error;
1385        struct xfs_defer_ops    dfops;
1386        xfs_fsblock_t           first_block;
1387        xfs_fileoff_t           stop_fsb;
1388        xfs_fileoff_t           next_fsb;
1389        xfs_fileoff_t           shift_fsb;
1390
1391        ASSERT(direction == SHIFT_LEFT || direction == SHIFT_RIGHT);
1392
1393        if (direction == SHIFT_LEFT) {
1394                next_fsb = XFS_B_TO_FSB(mp, offset + len);
1395                stop_fsb = XFS_B_TO_FSB(mp, VFS_I(ip)->i_size);
1396        } else {
1397                /*
1398                 * If right shift, delegate the work of initialization of
1399                 * next_fsb to xfs_bmap_shift_extent as it has ilock held.
1400                 */
1401                next_fsb = NULLFSBLOCK;
1402                stop_fsb = XFS_B_TO_FSB(mp, offset);
1403        }
1404
1405        shift_fsb = XFS_B_TO_FSB(mp, len);
1406
1407        /*
1408         * Trim eofblocks to avoid shifting uninitialized post-eof preallocation
1409         * into the accessible region of the file.
1410         */
1411        if (xfs_can_free_eofblocks(ip, true)) {
1412                error = xfs_free_eofblocks(mp, ip, false);
1413                if (error)
1414                        return error;
1415        }
1416
1417        /*
1418         * Writeback and invalidate cache for the remainder of the file as we're
1419         * about to shift down every extent from offset to EOF.
1420         */
1421        error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
1422                                             offset, -1);
1423        if (error)
1424                return error;
1425        error = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping,
1426                                        offset >> PAGE_SHIFT, -1);
1427        if (error)
1428                return error;
1429
1430        /*
1431         * The extent shiting code works on extent granularity. So, if
1432         * stop_fsb is not the starting block of extent, we need to split
1433         * the extent at stop_fsb.
1434         */
1435        if (direction == SHIFT_RIGHT) {
1436                error = xfs_bmap_split_extent(ip, stop_fsb);
1437                if (error)
1438                        return error;
1439        }
1440
1441        while (!error && !done) {
1442                /*
1443                 * We would need to reserve permanent block for transaction.
1444                 * This will come into picture when after shifting extent into
1445                 * hole we found that adjacent extents can be merged which
1446                 * may lead to freeing of a block during record update.
1447                 */
1448                error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write,
1449                                XFS_DIOSTRAT_SPACE_RES(mp, 0), 0, 0, &tp);
1450                if (error)
1451                        break;
1452
1453                xfs_ilock(ip, XFS_ILOCK_EXCL);
1454                error = xfs_trans_reserve_quota(tp, mp, ip->i_udquot,
1455                                ip->i_gdquot, ip->i_pdquot,
1456                                XFS_DIOSTRAT_SPACE_RES(mp, 0), 0,
1457                                XFS_QMOPT_RES_REGBLKS);
1458                if (error)
1459                        goto out_trans_cancel;
1460
1461                xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
1462
1463                xfs_defer_init(&dfops, &first_block);
1464
1465                /*
1466                 * We are using the write transaction in which max 2 bmbt
1467                 * updates are allowed
1468                 */
1469                error = xfs_bmap_shift_extents(tp, ip, &next_fsb, shift_fsb,
1470                                &done, stop_fsb, &first_block, &dfops,
1471                                direction, XFS_BMAP_MAX_SHIFT_EXTENTS);
1472                if (error)
1473                        goto out_bmap_cancel;
1474
1475                error = xfs_defer_finish(&tp, &dfops, NULL);
1476                if (error)
1477                        goto out_bmap_cancel;
1478
1479                error = xfs_trans_commit(tp);
1480        }
1481
1482        return error;
1483
1484out_bmap_cancel:
1485        xfs_defer_cancel(&dfops);
1486out_trans_cancel:
1487        xfs_trans_cancel(tp);
1488        return error;
1489}
1490
1491/*
1492 * xfs_collapse_file_space()
1493 *      This routine frees disk space and shift extent for the given file.
1494 *      The first thing we do is to free data blocks in the specified range
1495 *      by calling xfs_free_file_space(). It would also sync dirty data
1496 *      and invalidate page cache over the region on which collapse range
1497 *      is working. And Shift extent records to the left to cover a hole.
1498 * RETURNS:
1499 *      0 on success
1500 *      errno on error
1501 *
1502 */
1503int
1504xfs_collapse_file_space(
1505        struct xfs_inode        *ip,
1506        xfs_off_t               offset,
1507        xfs_off_t               len)
1508{
1509        int error;
1510
1511        ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
1512        trace_xfs_collapse_file_space(ip);
1513
1514        error = xfs_free_file_space(ip, offset, len);
1515        if (error)
1516                return error;
1517
1518        return xfs_shift_file_space(ip, offset, len, SHIFT_LEFT);
1519}
1520
1521/*
1522 * xfs_insert_file_space()
1523 *      This routine create hole space by shifting extents for the given file.
1524 *      The first thing we do is to sync dirty data and invalidate page cache
1525 *      over the region on which insert range is working. And split an extent
1526 *      to two extents at given offset by calling xfs_bmap_split_extent.
1527 *      And shift all extent records which are laying between [offset,
1528 *      last allocated extent] to the right to reserve hole range.
1529 * RETURNS:
1530 *      0 on success
1531 *      errno on error
1532 */
1533int
1534xfs_insert_file_space(
1535        struct xfs_inode        *ip,
1536        loff_t                  offset,
1537        loff_t                  len)
1538{
1539        ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
1540        trace_xfs_insert_file_space(ip);
1541
1542        return xfs_shift_file_space(ip, offset, len, SHIFT_RIGHT);
1543}
1544
1545/*
1546 * We need to check that the format of the data fork in the temporary inode is
1547 * valid for the target inode before doing the swap. This is not a problem with
1548 * attr1 because of the fixed fork offset, but attr2 has a dynamically sized
1549 * data fork depending on the space the attribute fork is taking so we can get
1550 * invalid formats on the target inode.
1551 *
1552 * E.g. target has space for 7 extents in extent format, temp inode only has
1553 * space for 6.  If we defragment down to 7 extents, then the tmp format is a
1554 * btree, but when swapped it needs to be in extent format. Hence we can't just
1555 * blindly swap data forks on attr2 filesystems.
1556 *
1557 * Note that we check the swap in both directions so that we don't end up with
1558 * a corrupt temporary inode, either.
1559 *
1560 * Note that fixing the way xfs_fsr sets up the attribute fork in the source
1561 * inode will prevent this situation from occurring, so all we do here is
1562 * reject and log the attempt. basically we are putting the responsibility on
1563 * userspace to get this right.
1564 */
1565static int
1566xfs_swap_extents_check_format(
1567        struct xfs_inode        *ip,    /* target inode */
1568        struct xfs_inode        *tip)   /* tmp inode */
1569{
1570
1571        /* Should never get a local format */
1572        if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL ||
1573            tip->i_d.di_format == XFS_DINODE_FMT_LOCAL)
1574                return -EINVAL;
1575
1576        /*
1577         * if the target inode has less extents that then temporary inode then
1578         * why did userspace call us?
1579         */
1580        if (ip->i_d.di_nextents < tip->i_d.di_nextents)
1581                return -EINVAL;
1582
1583        /*
1584         * If we have to use the (expensive) rmap swap method, we can
1585         * handle any number of extents and any format.
1586         */
1587        if (xfs_sb_version_hasrmapbt(&ip->i_mount->m_sb))
1588                return 0;
1589
1590        /*
1591         * if the target inode is in extent form and the temp inode is in btree
1592         * form then we will end up with the target inode in the wrong format
1593         * as we already know there are less extents in the temp inode.
1594         */
1595        if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
1596            tip->i_d.di_format == XFS_DINODE_FMT_BTREE)
1597                return -EINVAL;
1598
1599        /* Check temp in extent form to max in target */
1600        if (tip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
1601            XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) >
1602                        XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
1603                return -EINVAL;
1604
1605        /* Check target in extent form to max in temp */
1606        if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
1607            XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) >
1608                        XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
1609                return -EINVAL;
1610
1611        /*
1612         * If we are in a btree format, check that the temp root block will fit
1613         * in the target and that it has enough extents to be in btree format
1614         * in the target.
1615         *
1616         * Note that we have to be careful to allow btree->extent conversions
1617         * (a common defrag case) which will occur when the temp inode is in
1618         * extent format...
1619         */
1620        if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
1621                if (XFS_IFORK_BOFF(ip) &&
1622                    XFS_BMAP_BMDR_SPACE(tip->i_df.if_broot) > XFS_IFORK_BOFF(ip))
1623                        return -EINVAL;
1624                if (XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <=
1625                    XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
1626                        return -EINVAL;
1627        }
1628
1629        /* Reciprocal target->temp btree format checks */
1630        if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
1631                if (XFS_IFORK_BOFF(tip) &&
1632                    XFS_BMAP_BMDR_SPACE(ip->i_df.if_broot) > XFS_IFORK_BOFF(tip))
1633                        return -EINVAL;
1634                if (XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <=
1635                    XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
1636                        return -EINVAL;
1637        }
1638
1639        return 0;
1640}
1641
1642static int
1643xfs_swap_extent_flush(
1644        struct xfs_inode        *ip)
1645{
1646        int     error;
1647
1648        error = filemap_write_and_wait(VFS_I(ip)->i_mapping);
1649        if (error)
1650                return error;
1651        truncate_pagecache_range(VFS_I(ip), 0, -1);
1652
1653        /* Verify O_DIRECT for ftmp */
1654        if (VFS_I(ip)->i_mapping->nrpages)
1655                return -EINVAL;
1656        return 0;
1657}
1658
1659/*
1660 * Move extents from one file to another, when rmap is enabled.
1661 */
1662STATIC int
1663xfs_swap_extent_rmap(
1664        struct xfs_trans                **tpp,
1665        struct xfs_inode                *ip,
1666        struct xfs_inode                *tip)
1667{
1668        struct xfs_bmbt_irec            irec;
1669        struct xfs_bmbt_irec            uirec;
1670        struct xfs_bmbt_irec            tirec;
1671        xfs_fileoff_t                   offset_fsb;
1672        xfs_fileoff_t                   end_fsb;
1673        xfs_filblks_t                   count_fsb;
1674        xfs_fsblock_t                   firstfsb;
1675        struct xfs_defer_ops            dfops;
1676        int                             error;
1677        xfs_filblks_t                   ilen;
1678        xfs_filblks_t                   rlen;
1679        int                             nimaps;
1680        __uint64_t                      tip_flags2;
1681
1682        /*
1683         * If the source file has shared blocks, we must flag the donor
1684         * file as having shared blocks so that we get the shared-block
1685         * rmap functions when we go to fix up the rmaps.  The flags
1686         * will be switch for reals later.
1687         */
1688        tip_flags2 = tip->i_d.di_flags2;
1689        if (ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK)
1690                tip->i_d.di_flags2 |= XFS_DIFLAG2_REFLINK;
1691
1692        offset_fsb = 0;
1693        end_fsb = XFS_B_TO_FSB(ip->i_mount, i_size_read(VFS_I(ip)));
1694        count_fsb = (xfs_filblks_t)(end_fsb - offset_fsb);
1695
1696        while (count_fsb) {
1697                /* Read extent from the donor file */
1698                nimaps = 1;
1699                error = xfs_bmapi_read(tip, offset_fsb, count_fsb, &tirec,
1700                                &nimaps, 0);
1701                if (error)
1702                        goto out;
1703                ASSERT(nimaps == 1);
1704                ASSERT(tirec.br_startblock != DELAYSTARTBLOCK);
1705
1706                trace_xfs_swap_extent_rmap_remap(tip, &tirec);
1707                ilen = tirec.br_blockcount;
1708
1709                /* Unmap the old blocks in the source file. */
1710                while (tirec.br_blockcount) {
1711                        xfs_defer_init(&dfops, &firstfsb);
1712                        trace_xfs_swap_extent_rmap_remap_piece(tip, &tirec);
1713
1714                        /* Read extent from the source file */
1715                        nimaps = 1;
1716                        error = xfs_bmapi_read(ip, tirec.br_startoff,
1717                                        tirec.br_blockcount, &irec,
1718                                        &nimaps, 0);
1719                        if (error)
1720                                goto out_defer;
1721                        ASSERT(nimaps == 1);
1722                        ASSERT(tirec.br_startoff == irec.br_startoff);
1723                        trace_xfs_swap_extent_rmap_remap_piece(ip, &irec);
1724
1725                        /* Trim the extent. */
1726                        uirec = tirec;
1727                        uirec.br_blockcount = rlen = min_t(xfs_filblks_t,
1728                                        tirec.br_blockcount,
1729                                        irec.br_blockcount);
1730                        trace_xfs_swap_extent_rmap_remap_piece(tip, &uirec);
1731
1732                        /* Remove the mapping from the donor file. */
1733                        error = xfs_bmap_unmap_extent((*tpp)->t_mountp, &dfops,
1734                                        tip, &uirec);
1735                        if (error)
1736                                goto out_defer;
1737
1738                        /* Remove the mapping from the source file. */
1739                        error = xfs_bmap_unmap_extent((*tpp)->t_mountp, &dfops,
1740                                        ip, &irec);
1741                        if (error)
1742                                goto out_defer;
1743
1744                        /* Map the donor file's blocks into the source file. */
1745                        error = xfs_bmap_map_extent((*tpp)->t_mountp, &dfops,
1746                                        ip, &uirec);
1747                        if (error)
1748                                goto out_defer;
1749
1750                        /* Map the source file's blocks into the donor file. */
1751                        error = xfs_bmap_map_extent((*tpp)->t_mountp, &dfops,
1752                                        tip, &irec);
1753                        if (error)
1754                                goto out_defer;
1755
1756                        error = xfs_defer_finish(tpp, &dfops, ip);
1757                        if (error)
1758                                goto out_defer;
1759
1760                        tirec.br_startoff += rlen;
1761                        if (tirec.br_startblock != HOLESTARTBLOCK &&
1762                            tirec.br_startblock != DELAYSTARTBLOCK)
1763                                tirec.br_startblock += rlen;
1764                        tirec.br_blockcount -= rlen;
1765                }
1766
1767                /* Roll on... */
1768                count_fsb -= ilen;
1769                offset_fsb += ilen;
1770        }
1771
1772        tip->i_d.di_flags2 = tip_flags2;
1773        return 0;
1774
1775out_defer:
1776        xfs_defer_cancel(&dfops);
1777out:
1778        trace_xfs_swap_extent_rmap_error(ip, error, _RET_IP_);
1779        tip->i_d.di_flags2 = tip_flags2;
1780        return error;
1781}
1782
1783/* Swap the extents of two files by swapping data forks. */
1784STATIC int
1785xfs_swap_extent_forks(
1786        struct xfs_trans        *tp,
1787        struct xfs_inode        *ip,
1788        struct xfs_inode        *tip,
1789        int                     *src_log_flags,
1790        int                     *target_log_flags)
1791{
1792        struct xfs_ifork        tempifp, *ifp, *tifp;
1793        int                     aforkblks = 0;
1794        int                     taforkblks = 0;
1795        __uint64_t              tmp;
1796        int                     error;
1797
1798        /*
1799         * Count the number of extended attribute blocks
1800         */
1801        if ( ((XFS_IFORK_Q(ip) != 0) && (ip->i_d.di_anextents > 0)) &&
1802             (ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) {
1803                error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK,
1804                                &aforkblks);
1805                if (error)
1806                        return error;
1807        }
1808        if ( ((XFS_IFORK_Q(tip) != 0) && (tip->i_d.di_anextents > 0)) &&
1809             (tip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) {
1810                error = xfs_bmap_count_blocks(tp, tip, XFS_ATTR_FORK,
1811                                &taforkblks);
1812                if (error)
1813                        return error;
1814        }
1815
1816        /*
1817         * Before we've swapped the forks, lets set the owners of the forks
1818         * appropriately. We have to do this as we are demand paging the btree
1819         * buffers, and so the validation done on read will expect the owner
1820         * field to be correctly set. Once we change the owners, we can swap the
1821         * inode forks.
1822         */
1823        if (ip->i_d.di_version == 3 &&
1824            ip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
1825                (*target_log_flags) |= XFS_ILOG_DOWNER;
1826                error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK,
1827                                              tip->i_ino, NULL);
1828                if (error)
1829                        return error;
1830        }
1831
1832        if (tip->i_d.di_version == 3 &&
1833            tip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
1834                (*src_log_flags) |= XFS_ILOG_DOWNER;
1835                error = xfs_bmbt_change_owner(tp, tip, XFS_DATA_FORK,
1836                                              ip->i_ino, NULL);
1837                if (error)
1838                        return error;
1839        }
1840
1841        /*
1842         * Swap the data forks of the inodes
1843         */
1844        ifp = &ip->i_df;
1845        tifp = &tip->i_df;
1846        tempifp = *ifp;         /* struct copy */
1847        *ifp = *tifp;           /* struct copy */
1848        *tifp = tempifp;        /* struct copy */
1849
1850        /*
1851         * Fix the on-disk inode values
1852         */
1853        tmp = (__uint64_t)ip->i_d.di_nblocks;
1854        ip->i_d.di_nblocks = tip->i_d.di_nblocks - taforkblks + aforkblks;
1855        tip->i_d.di_nblocks = tmp + taforkblks - aforkblks;
1856
1857        tmp = (__uint64_t) ip->i_d.di_nextents;
1858        ip->i_d.di_nextents = tip->i_d.di_nextents;
1859        tip->i_d.di_nextents = tmp;
1860
1861        tmp = (__uint64_t) ip->i_d.di_format;
1862        ip->i_d.di_format = tip->i_d.di_format;
1863        tip->i_d.di_format = tmp;
1864
1865        /*
1866         * The extents in the source inode could still contain speculative
1867         * preallocation beyond EOF (e.g. the file is open but not modified
1868         * while defrag is in progress). In that case, we need to copy over the
1869         * number of delalloc blocks the data fork in the source inode is
1870         * tracking beyond EOF so that when the fork is truncated away when the
1871         * temporary inode is unlinked we don't underrun the i_delayed_blks
1872         * counter on that inode.
1873         */
1874        ASSERT(tip->i_delayed_blks == 0);
1875        tip->i_delayed_blks = ip->i_delayed_blks;
1876        ip->i_delayed_blks = 0;
1877
1878        switch (ip->i_d.di_format) {
1879        case XFS_DINODE_FMT_EXTENTS:
1880                /* If the extents fit in the inode, fix the
1881                 * pointer.  Otherwise it's already NULL or
1882                 * pointing to the extent.
1883                 */
1884                if (ip->i_d.di_nextents <= XFS_INLINE_EXTS) {
1885                        ifp->if_u1.if_extents =
1886                                ifp->if_u2.if_inline_ext;
1887                }
1888                (*src_log_flags) |= XFS_ILOG_DEXT;
1889                break;
1890        case XFS_DINODE_FMT_BTREE:
1891                ASSERT(ip->i_d.di_version < 3 ||
1892                       (*src_log_flags & XFS_ILOG_DOWNER));
1893                (*src_log_flags) |= XFS_ILOG_DBROOT;
1894                break;
1895        }
1896
1897        switch (tip->i_d.di_format) {
1898        case XFS_DINODE_FMT_EXTENTS:
1899                /* If the extents fit in the inode, fix the
1900                 * pointer.  Otherwise it's already NULL or
1901                 * pointing to the extent.
1902                 */
1903                if (tip->i_d.di_nextents <= XFS_INLINE_EXTS) {
1904                        tifp->if_u1.if_extents =
1905                                tifp->if_u2.if_inline_ext;
1906                }
1907                (*target_log_flags) |= XFS_ILOG_DEXT;
1908                break;
1909        case XFS_DINODE_FMT_BTREE:
1910                (*target_log_flags) |= XFS_ILOG_DBROOT;
1911                ASSERT(tip->i_d.di_version < 3 ||
1912                       (*target_log_flags & XFS_ILOG_DOWNER));
1913                break;
1914        }
1915
1916        return 0;
1917}
1918
1919int
1920xfs_swap_extents(
1921        struct xfs_inode        *ip,    /* target inode */
1922        struct xfs_inode        *tip,   /* tmp inode */
1923        struct xfs_swapext      *sxp)
1924{
1925        struct xfs_mount        *mp = ip->i_mount;
1926        struct xfs_trans        *tp;
1927        struct xfs_bstat        *sbp = &sxp->sx_stat;
1928        int                     src_log_flags, target_log_flags;
1929        int                     error = 0;
1930        int                     lock_flags;
1931        struct xfs_ifork        *cowfp;
1932        __uint64_t              f;
1933        int                     resblks;
1934
1935        /*
1936         * Lock the inodes against other IO, page faults and truncate to
1937         * begin with.  Then we can ensure the inodes are flushed and have no
1938         * page cache safely. Once we have done this we can take the ilocks and
1939         * do the rest of the checks.
1940         */
1941        lock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
1942        xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL);
1943        xfs_lock_two_inodes(ip, tip, XFS_MMAPLOCK_EXCL);
1944
1945        /* Verify that both files have the same format */
1946        if ((VFS_I(ip)->i_mode & S_IFMT) != (VFS_I(tip)->i_mode & S_IFMT)) {
1947                error = -EINVAL;
1948                goto out_unlock;
1949        }
1950
1951        /* Verify both files are either real-time or non-realtime */
1952        if (XFS_IS_REALTIME_INODE(ip) != XFS_IS_REALTIME_INODE(tip)) {
1953                error = -EINVAL;
1954                goto out_unlock;
1955        }
1956
1957        error = xfs_swap_extent_flush(ip);
1958        if (error)
1959                goto out_unlock;
1960        error = xfs_swap_extent_flush(tip);
1961        if (error)
1962                goto out_unlock;
1963
1964        /*
1965         * Extent "swapping" with rmap requires a permanent reservation and
1966         * a block reservation because it's really just a remap operation
1967         * performed with log redo items!
1968         */
1969        if (xfs_sb_version_hasrmapbt(&mp->m_sb)) {
1970                /*
1971                 * Conceptually this shouldn't affect the shape of either
1972                 * bmbt, but since we atomically move extents one by one,
1973                 * we reserve enough space to rebuild both trees.
1974                 */
1975                resblks = XFS_SWAP_RMAP_SPACE_RES(mp,
1976                                XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK),
1977                                XFS_DATA_FORK) +
1978                          XFS_SWAP_RMAP_SPACE_RES(mp,
1979                                XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK),
1980                                XFS_DATA_FORK);
1981                error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks,
1982                                0, 0, &tp);
1983        } else
1984                error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0,
1985                                0, 0, &tp);
1986        if (error)
1987                goto out_unlock;
1988
1989        /*
1990         * Lock and join the inodes to the tansaction so that transaction commit
1991         * or cancel will unlock the inodes from this point onwards.
1992         */
1993        xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
1994        lock_flags |= XFS_ILOCK_EXCL;
1995        xfs_trans_ijoin(tp, ip, 0);
1996        xfs_trans_ijoin(tp, tip, 0);
1997
1998
1999        /* Verify all data are being swapped */
2000        if (sxp->sx_offset != 0 ||

2001            sxp->sx_length != ip->i_d.di_size ||
2002            sxp->sx_length != tip->i_d.di_size) {
2003                error = -EFAULT;
2004                goto out_trans_cancel;
2005        }
2006
2007        trace_xfs_swap_extent_before(ip, 0);
2008        trace_xfs_swap_extent_before(tip, 1);
2009
2010        /* check inode formats now that data is flushed */
2011        error = xfs_swap_extents_check_format(ip, tip);
2012        if (error) {
2013                xfs_notice(mp,
2014                    "%s: inode 0x%llx format is incompatible for exchanging.",
2015                                __func__, ip->i_ino);
2016                goto out_trans_cancel;
2017        }
2018
2019        /*
2020         * Compare the current change & modify times with that
2021         * passed in.  If they differ, we abort this swap.
2022         * This is the mechanism used to ensure the calling
2023         * process that the file was not changed out from
2024         * under it.
2025         */
2026        if ((sbp->bs_ctime.tv_sec != VFS_I(ip)->i_ctime.tv_sec) ||
2027            (sbp->bs_ctime.tv_nsec != VFS_I(ip)->i_ctime.tv_nsec) ||
2028            (sbp->bs_mtime.tv_sec != VFS_I(ip)->i_mtime.tv_sec) ||
2029            (sbp->bs_mtime.tv_nsec != VFS_I(ip)->i_mtime.tv_nsec)) {
2030                error = -EBUSY;
2031                goto out_trans_cancel;
2032        }
2033
2034        /*
2035         * Note the trickiness in setting the log flags - we set the owner log
2036         * flag on the opposite inode (i.e. the inode we are setting the new
2037         * owner to be) because once we swap the forks and log that, log
2038         * recovery is going to see the fork as owned by the swapped inode,
2039         * not the pre-swapped inodes.
2040         */
2041        src_log_flags = XFS_ILOG_CORE;
2042        target_log_flags = XFS_ILOG_CORE;
2043
2044        if (xfs_sb_version_hasrmapbt(&mp->m_sb))
2045                error = xfs_swap_extent_rmap(&tp, ip, tip);
2046        else
2047                error = xfs_swap_extent_forks(tp, ip, tip, &src_log_flags,
2048                                &target_log_flags);
2049        if (error)
2050                goto out_trans_cancel;
2051
2052        /* Do we have to swap reflink flags? */
2053        if ((ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK) ^
2054            (tip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK)) {
2055                f = ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK;
2056                ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK;
2057                ip->i_d.di_flags2 |= tip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK;
2058                tip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK;
2059                tip->i_d.di_flags2 |= f & XFS_DIFLAG2_REFLINK;
2060                cowfp = ip->i_cowfp;
2061                ip->i_cowfp = tip->i_cowfp;
2062                tip->i_cowfp = cowfp;
2063                xfs_inode_set_cowblocks_tag(ip);
2064                xfs_inode_set_cowblocks_tag(tip);
2065        }
2066
2067        xfs_trans_log_inode(tp, ip,  src_log_flags);
2068        xfs_trans_log_inode(tp, tip, target_log_flags);
2069
2070        /*
2071         * If this is a synchronous mount, make sure that the
2072         * transaction goes to disk before returning to the user.
2073         */
2074        if (mp->m_flags & XFS_MOUNT_WSYNC)
2075                xfs_trans_set_sync(tp);
2076
2077        error = xfs_trans_commit(tp);
2078
2079        trace_xfs_swap_extent_after(ip, 0);
2080        trace_xfs_swap_extent_after(tip, 1);
2081
2082        xfs_iunlock(ip, lock_flags);
2083        xfs_iunlock(tip, lock_flags);
2084        return error;
2085
2086out_trans_cancel:
2087        xfs_trans_cancel(tp);
2088
2089out_unlock:
2090        xfs_iunlock(ip, lock_flags);
2091        xfs_iunlock(tip, lock_flags);
2092        return error;
2093}
2094