linux/fs/xfs/xfs_inode.c
<<
>>
Prefs
   1/*
   2 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
   3 * All Rights Reserved.
   4 *
   5 * This program is free software; you can redistribute it and/or
   6 * modify it under the terms of the GNU General Public License as
   7 * published by the Free Software Foundation.
   8 *
   9 * This program is distributed in the hope that it would be useful,
  10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12 * GNU General Public License for more details.
  13 *
  14 * You should have received a copy of the GNU General Public License
  15 * along with this program; if not, write the Free Software Foundation,
  16 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  17 */
  18#include <linux/log2.h>
  19
  20#include "xfs.h"
  21#include "xfs_fs.h"
  22#include "xfs_types.h"
  23#include "xfs_bit.h"
  24#include "xfs_log.h"
  25#include "xfs_inum.h"
  26#include "xfs_trans.h"
  27#include "xfs_trans_priv.h"
  28#include "xfs_sb.h"
  29#include "xfs_ag.h"
  30#include "xfs_mount.h"
  31#include "xfs_bmap_btree.h"
  32#include "xfs_alloc_btree.h"
  33#include "xfs_ialloc_btree.h"
  34#include "xfs_attr_sf.h"
  35#include "xfs_dinode.h"
  36#include "xfs_inode.h"
  37#include "xfs_buf_item.h"
  38#include "xfs_inode_item.h"
  39#include "xfs_btree.h"
  40#include "xfs_alloc.h"
  41#include "xfs_ialloc.h"
  42#include "xfs_bmap.h"
  43#include "xfs_error.h"
  44#include "xfs_utils.h"
  45#include "xfs_quota.h"
  46#include "xfs_filestream.h"
  47#include "xfs_vnodeops.h"
  48#include "xfs_trace.h"
  49
  50kmem_zone_t *xfs_ifork_zone;
  51kmem_zone_t *xfs_inode_zone;
  52
  53/*
  54 * Used in xfs_itruncate_extents().  This is the maximum number of extents
  55 * freed from a file in a single transaction.
  56 */
  57#define XFS_ITRUNC_MAX_EXTENTS  2
  58
  59STATIC int xfs_iflush_int(xfs_inode_t *, xfs_buf_t *);
  60STATIC int xfs_iformat_local(xfs_inode_t *, xfs_dinode_t *, int, int);
  61STATIC int xfs_iformat_extents(xfs_inode_t *, xfs_dinode_t *, int);
  62STATIC int xfs_iformat_btree(xfs_inode_t *, xfs_dinode_t *, int);
  63
  64#ifdef DEBUG
  65/*
  66 * Make sure that the extents in the given memory buffer
  67 * are valid.
  68 */
  69STATIC void
  70xfs_validate_extents(
  71        xfs_ifork_t             *ifp,
  72        int                     nrecs,
  73        xfs_exntfmt_t           fmt)
  74{
  75        xfs_bmbt_irec_t         irec;
  76        xfs_bmbt_rec_host_t     rec;
  77        int                     i;
  78
  79        for (i = 0; i < nrecs; i++) {
  80                xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
  81                rec.l0 = get_unaligned(&ep->l0);
  82                rec.l1 = get_unaligned(&ep->l1);
  83                xfs_bmbt_get_all(&rec, &irec);
  84                if (fmt == XFS_EXTFMT_NOSTATE)
  85                        ASSERT(irec.br_state == XFS_EXT_NORM);
  86        }
  87}
  88#else /* DEBUG */
  89#define xfs_validate_extents(ifp, nrecs, fmt)
  90#endif /* DEBUG */
  91
  92/*
  93 * Check that none of the inode's in the buffer have a next
  94 * unlinked field of 0.
  95 */
  96#if defined(DEBUG)
  97void
  98xfs_inobp_check(
  99        xfs_mount_t     *mp,
 100        xfs_buf_t       *bp)
 101{
 102        int             i;
 103        int             j;
 104        xfs_dinode_t    *dip;
 105
 106        j = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog;
 107
 108        for (i = 0; i < j; i++) {
 109                dip = (xfs_dinode_t *)xfs_buf_offset(bp,
 110                                        i * mp->m_sb.sb_inodesize);
 111                if (!dip->di_next_unlinked)  {
 112                        xfs_alert(mp,
 113        "Detected bogus zero next_unlinked field in incore inode buffer 0x%p.",
 114                                bp);
 115                        ASSERT(dip->di_next_unlinked);
 116                }
 117        }
 118}
 119#endif
 120
 121/*
 122 * Find the buffer associated with the given inode map
 123 * We do basic validation checks on the buffer once it has been
 124 * retrieved from disk.
 125 */
 126STATIC int
 127xfs_imap_to_bp(
 128        xfs_mount_t     *mp,
 129        xfs_trans_t     *tp,
 130        struct xfs_imap *imap,
 131        xfs_buf_t       **bpp,
 132        uint            buf_flags,
 133        uint            iget_flags)
 134{
 135        int             error;
 136        int             i;
 137        int             ni;
 138        xfs_buf_t       *bp;
 139
 140        error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno,
 141                                   (int)imap->im_len, buf_flags, &bp);
 142        if (error) {
 143                if (error != EAGAIN) {
 144                        xfs_warn(mp,
 145                                "%s: xfs_trans_read_buf() returned error %d.",
 146                                __func__, error);
 147                } else {
 148                        ASSERT(buf_flags & XBF_TRYLOCK);
 149                }
 150                return error;
 151        }
 152
 153        /*
 154         * Validate the magic number and version of every inode in the buffer
 155         * (if DEBUG kernel) or the first inode in the buffer, otherwise.
 156         */
 157#ifdef DEBUG
 158        ni = BBTOB(imap->im_len) >> mp->m_sb.sb_inodelog;
 159#else   /* usual case */
 160        ni = 1;
 161#endif
 162
 163        for (i = 0; i < ni; i++) {
 164                int             di_ok;
 165                xfs_dinode_t    *dip;
 166
 167                dip = (xfs_dinode_t *)xfs_buf_offset(bp,
 168                                        (i << mp->m_sb.sb_inodelog));
 169                di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) &&
 170                            XFS_DINODE_GOOD_VERSION(dip->di_version);
 171                if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
 172                                                XFS_ERRTAG_ITOBP_INOTOBP,
 173                                                XFS_RANDOM_ITOBP_INOTOBP))) {
 174                        if (iget_flags & XFS_IGET_UNTRUSTED) {
 175                                xfs_trans_brelse(tp, bp);
 176                                return XFS_ERROR(EINVAL);
 177                        }
 178                        XFS_CORRUPTION_ERROR("xfs_imap_to_bp",
 179                                                XFS_ERRLEVEL_HIGH, mp, dip);
 180#ifdef DEBUG
 181                        xfs_emerg(mp,
 182                                "bad inode magic/vsn daddr %lld #%d (magic=%x)",
 183                                (unsigned long long)imap->im_blkno, i,
 184                                be16_to_cpu(dip->di_magic));
 185                        ASSERT(0);
 186#endif
 187                        xfs_trans_brelse(tp, bp);
 188                        return XFS_ERROR(EFSCORRUPTED);
 189                }
 190        }
 191
 192        xfs_inobp_check(mp, bp);
 193
 194        /*
 195         * Mark the buffer as an inode buffer now that it looks good
 196         */
 197        XFS_BUF_SET_VTYPE(bp, B_FS_INO);
 198
 199        *bpp = bp;
 200        return 0;
 201}
 202
 203/*
 204 * This routine is called to map an inode number within a file
 205 * system to the buffer containing the on-disk version of the
 206 * inode.  It returns a pointer to the buffer containing the
 207 * on-disk inode in the bpp parameter, and in the dip parameter
 208 * it returns a pointer to the on-disk inode within that buffer.
 209 *
 210 * If a non-zero error is returned, then the contents of bpp and
 211 * dipp are undefined.
 212 *
 213 * Use xfs_imap() to determine the size and location of the
 214 * buffer to read from disk.
 215 */
 216int
 217xfs_inotobp(
 218        xfs_mount_t     *mp,
 219        xfs_trans_t     *tp,
 220        xfs_ino_t       ino,
 221        xfs_dinode_t    **dipp,
 222        xfs_buf_t       **bpp,
 223        int             *offset,
 224        uint            imap_flags)
 225{
 226        struct xfs_imap imap;
 227        xfs_buf_t       *bp;
 228        int             error;
 229
 230        imap.im_blkno = 0;
 231        error = xfs_imap(mp, tp, ino, &imap, imap_flags);
 232        if (error)
 233                return error;
 234
 235        error = xfs_imap_to_bp(mp, tp, &imap, &bp, XBF_LOCK, imap_flags);
 236        if (error)
 237                return error;
 238
 239        *dipp = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset);
 240        *bpp = bp;
 241        *offset = imap.im_boffset;
 242        return 0;
 243}
 244
 245
 246/*
 247 * This routine is called to map an inode to the buffer containing
 248 * the on-disk version of the inode.  It returns a pointer to the
 249 * buffer containing the on-disk inode in the bpp parameter, and in
 250 * the dip parameter it returns a pointer to the on-disk inode within
 251 * that buffer.
 252 *
 253 * If a non-zero error is returned, then the contents of bpp and
 254 * dipp are undefined.
 255 *
 256 * The inode is expected to already been mapped to its buffer and read
 257 * in once, thus we can use the mapping information stored in the inode
 258 * rather than calling xfs_imap().  This allows us to avoid the overhead
 259 * of looking at the inode btree for small block file systems
 260 * (see xfs_imap()).
 261 */
 262int
 263xfs_itobp(
 264        xfs_mount_t     *mp,
 265        xfs_trans_t     *tp,
 266        xfs_inode_t     *ip,
 267        xfs_dinode_t    **dipp,
 268        xfs_buf_t       **bpp,
 269        uint            buf_flags)
 270{
 271        xfs_buf_t       *bp;
 272        int             error;
 273
 274        ASSERT(ip->i_imap.im_blkno != 0);
 275
 276        error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp, buf_flags, 0);
 277        if (error)
 278                return error;
 279
 280        if (!bp) {
 281                ASSERT(buf_flags & XBF_TRYLOCK);
 282                ASSERT(tp == NULL);
 283                *bpp = NULL;
 284                return EAGAIN;
 285        }
 286
 287        *dipp = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset);
 288        *bpp = bp;
 289        return 0;
 290}
 291
 292/*
 293 * Move inode type and inode format specific information from the
 294 * on-disk inode to the in-core inode.  For fifos, devs, and sockets
 295 * this means set if_rdev to the proper value.  For files, directories,
 296 * and symlinks this means to bring in the in-line data or extent
 297 * pointers.  For a file in B-tree format, only the root is immediately
 298 * brought in-core.  The rest will be in-lined in if_extents when it
 299 * is first referenced (see xfs_iread_extents()).
 300 */
 301STATIC int
 302xfs_iformat(
 303        xfs_inode_t             *ip,
 304        xfs_dinode_t            *dip)
 305{
 306        xfs_attr_shortform_t    *atp;
 307        int                     size;
 308        int                     error;
 309        xfs_fsize_t             di_size;
 310        ip->i_df.if_ext_max =
 311                XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
 312        error = 0;
 313
 314        if (unlikely(be32_to_cpu(dip->di_nextents) +
 315                     be16_to_cpu(dip->di_anextents) >
 316                     be64_to_cpu(dip->di_nblocks))) {
 317                xfs_warn(ip->i_mount,
 318                        "corrupt dinode %Lu, extent total = %d, nblocks = %Lu.",
 319                        (unsigned long long)ip->i_ino,
 320                        (int)(be32_to_cpu(dip->di_nextents) +
 321                              be16_to_cpu(dip->di_anextents)),
 322                        (unsigned long long)
 323                                be64_to_cpu(dip->di_nblocks));
 324                XFS_CORRUPTION_ERROR("xfs_iformat(1)", XFS_ERRLEVEL_LOW,
 325                                     ip->i_mount, dip);
 326                return XFS_ERROR(EFSCORRUPTED);
 327        }
 328
 329        if (unlikely(dip->di_forkoff > ip->i_mount->m_sb.sb_inodesize)) {
 330                xfs_warn(ip->i_mount, "corrupt dinode %Lu, forkoff = 0x%x.",
 331                        (unsigned long long)ip->i_ino,
 332                        dip->di_forkoff);
 333                XFS_CORRUPTION_ERROR("xfs_iformat(2)", XFS_ERRLEVEL_LOW,
 334                                     ip->i_mount, dip);
 335                return XFS_ERROR(EFSCORRUPTED);
 336        }
 337
 338        if (unlikely((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) &&
 339                     !ip->i_mount->m_rtdev_targp)) {
 340                xfs_warn(ip->i_mount,
 341                        "corrupt dinode %Lu, has realtime flag set.",
 342                        ip->i_ino);
 343                XFS_CORRUPTION_ERROR("xfs_iformat(realtime)",
 344                                     XFS_ERRLEVEL_LOW, ip->i_mount, dip);
 345                return XFS_ERROR(EFSCORRUPTED);
 346        }
 347
 348        switch (ip->i_d.di_mode & S_IFMT) {
 349        case S_IFIFO:
 350        case S_IFCHR:
 351        case S_IFBLK:
 352        case S_IFSOCK:
 353                if (unlikely(dip->di_format != XFS_DINODE_FMT_DEV)) {
 354                        XFS_CORRUPTION_ERROR("xfs_iformat(3)", XFS_ERRLEVEL_LOW,
 355                                              ip->i_mount, dip);
 356                        return XFS_ERROR(EFSCORRUPTED);
 357                }
 358                ip->i_d.di_size = 0;
 359                ip->i_size = 0;
 360                ip->i_df.if_u2.if_rdev = xfs_dinode_get_rdev(dip);
 361                break;
 362
 363        case S_IFREG:
 364        case S_IFLNK:
 365        case S_IFDIR:
 366                switch (dip->di_format) {
 367                case XFS_DINODE_FMT_LOCAL:
 368                        /*
 369                         * no local regular files yet
 370                         */
 371                        if (unlikely(S_ISREG(be16_to_cpu(dip->di_mode)))) {
 372                                xfs_warn(ip->i_mount,
 373                        "corrupt inode %Lu (local format for regular file).",
 374                                        (unsigned long long) ip->i_ino);
 375                                XFS_CORRUPTION_ERROR("xfs_iformat(4)",
 376                                                     XFS_ERRLEVEL_LOW,
 377                                                     ip->i_mount, dip);
 378                                return XFS_ERROR(EFSCORRUPTED);
 379                        }
 380
 381                        di_size = be64_to_cpu(dip->di_size);
 382                        if (unlikely(di_size > XFS_DFORK_DSIZE(dip, ip->i_mount))) {
 383                                xfs_warn(ip->i_mount,
 384                        "corrupt inode %Lu (bad size %Ld for local inode).",
 385                                        (unsigned long long) ip->i_ino,
 386                                        (long long) di_size);
 387                                XFS_CORRUPTION_ERROR("xfs_iformat(5)",
 388                                                     XFS_ERRLEVEL_LOW,
 389                                                     ip->i_mount, dip);
 390                                return XFS_ERROR(EFSCORRUPTED);
 391                        }
 392
 393                        size = (int)di_size;
 394                        error = xfs_iformat_local(ip, dip, XFS_DATA_FORK, size);
 395                        break;
 396                case XFS_DINODE_FMT_EXTENTS:
 397                        error = xfs_iformat_extents(ip, dip, XFS_DATA_FORK);
 398                        break;
 399                case XFS_DINODE_FMT_BTREE:
 400                        error = xfs_iformat_btree(ip, dip, XFS_DATA_FORK);
 401                        break;
 402                default:
 403                        XFS_ERROR_REPORT("xfs_iformat(6)", XFS_ERRLEVEL_LOW,
 404                                         ip->i_mount);
 405                        return XFS_ERROR(EFSCORRUPTED);
 406                }
 407                break;
 408
 409        default:
 410                XFS_ERROR_REPORT("xfs_iformat(7)", XFS_ERRLEVEL_LOW, ip->i_mount);
 411                return XFS_ERROR(EFSCORRUPTED);
 412        }
 413        if (error) {
 414                return error;
 415        }
 416        if (!XFS_DFORK_Q(dip))
 417                return 0;
 418        ASSERT(ip->i_afp == NULL);
 419        ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP | KM_NOFS);
 420        ip->i_afp->if_ext_max =
 421                XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
 422        switch (dip->di_aformat) {
 423        case XFS_DINODE_FMT_LOCAL:
 424                atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip);
 425                size = be16_to_cpu(atp->hdr.totsize);
 426
 427                if (unlikely(size < sizeof(struct xfs_attr_sf_hdr))) {
 428                        xfs_warn(ip->i_mount,
 429                                "corrupt inode %Lu (bad attr fork size %Ld).",
 430                                (unsigned long long) ip->i_ino,
 431                                (long long) size);
 432                        XFS_CORRUPTION_ERROR("xfs_iformat(8)",
 433                                             XFS_ERRLEVEL_LOW,
 434                                             ip->i_mount, dip);
 435                        return XFS_ERROR(EFSCORRUPTED);
 436                }
 437
 438                error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK, size);
 439                break;
 440        case XFS_DINODE_FMT_EXTENTS:
 441                error = xfs_iformat_extents(ip, dip, XFS_ATTR_FORK);
 442                break;
 443        case XFS_DINODE_FMT_BTREE:
 444                error = xfs_iformat_btree(ip, dip, XFS_ATTR_FORK);
 445                break;
 446        default:
 447                error = XFS_ERROR(EFSCORRUPTED);
 448                break;
 449        }
 450        if (error) {
 451                kmem_zone_free(xfs_ifork_zone, ip->i_afp);
 452                ip->i_afp = NULL;
 453                xfs_idestroy_fork(ip, XFS_DATA_FORK);
 454        }
 455        return error;
 456}
 457
 458/*
 459 * The file is in-lined in the on-disk inode.
 460 * If it fits into if_inline_data, then copy
 461 * it there, otherwise allocate a buffer for it
 462 * and copy the data there.  Either way, set
 463 * if_data to point at the data.
 464 * If we allocate a buffer for the data, make
 465 * sure that its size is a multiple of 4 and
 466 * record the real size in i_real_bytes.
 467 */
 468STATIC int
 469xfs_iformat_local(
 470        xfs_inode_t     *ip,
 471        xfs_dinode_t    *dip,
 472        int             whichfork,
 473        int             size)
 474{
 475        xfs_ifork_t     *ifp;
 476        int             real_size;
 477
 478        /*
 479         * If the size is unreasonable, then something
 480         * is wrong and we just bail out rather than crash in
 481         * kmem_alloc() or memcpy() below.
 482         */
 483        if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
 484                xfs_warn(ip->i_mount,
 485        "corrupt inode %Lu (bad size %d for local fork, size = %d).",
 486                        (unsigned long long) ip->i_ino, size,
 487                        XFS_DFORK_SIZE(dip, ip->i_mount, whichfork));
 488                XFS_CORRUPTION_ERROR("xfs_iformat_local", XFS_ERRLEVEL_LOW,
 489                                     ip->i_mount, dip);
 490                return XFS_ERROR(EFSCORRUPTED);
 491        }
 492        ifp = XFS_IFORK_PTR(ip, whichfork);
 493        real_size = 0;
 494        if (size == 0)
 495                ifp->if_u1.if_data = NULL;
 496        else if (size <= sizeof(ifp->if_u2.if_inline_data))
 497                ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
 498        else {
 499                real_size = roundup(size, 4);
 500                ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP | KM_NOFS);
 501        }
 502        ifp->if_bytes = size;
 503        ifp->if_real_bytes = real_size;
 504        if (size)
 505                memcpy(ifp->if_u1.if_data, XFS_DFORK_PTR(dip, whichfork), size);
 506        ifp->if_flags &= ~XFS_IFEXTENTS;
 507        ifp->if_flags |= XFS_IFINLINE;
 508        return 0;
 509}
 510
 511/*
 512 * The file consists of a set of extents all
 513 * of which fit into the on-disk inode.
 514 * If there are few enough extents to fit into
 515 * the if_inline_ext, then copy them there.
 516 * Otherwise allocate a buffer for them and copy
 517 * them into it.  Either way, set if_extents
 518 * to point at the extents.
 519 */
 520STATIC int
 521xfs_iformat_extents(
 522        xfs_inode_t     *ip,
 523        xfs_dinode_t    *dip,
 524        int             whichfork)
 525{
 526        xfs_bmbt_rec_t  *dp;
 527        xfs_ifork_t     *ifp;
 528        int             nex;
 529        int             size;
 530        int             i;
 531
 532        ifp = XFS_IFORK_PTR(ip, whichfork);
 533        nex = XFS_DFORK_NEXTENTS(dip, whichfork);
 534        size = nex * (uint)sizeof(xfs_bmbt_rec_t);
 535
 536        /*
 537         * If the number of extents is unreasonable, then something
 538         * is wrong and we just bail out rather than crash in
 539         * kmem_alloc() or memcpy() below.
 540         */
 541        if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
 542                xfs_warn(ip->i_mount, "corrupt inode %Lu ((a)extents = %d).",
 543                        (unsigned long long) ip->i_ino, nex);
 544                XFS_CORRUPTION_ERROR("xfs_iformat_extents(1)", XFS_ERRLEVEL_LOW,
 545                                     ip->i_mount, dip);
 546                return XFS_ERROR(EFSCORRUPTED);
 547        }
 548
 549        ifp->if_real_bytes = 0;
 550        if (nex == 0)
 551                ifp->if_u1.if_extents = NULL;
 552        else if (nex <= XFS_INLINE_EXTS)
 553                ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
 554        else
 555                xfs_iext_add(ifp, 0, nex);
 556
 557        ifp->if_bytes = size;
 558        if (size) {
 559                dp = (xfs_bmbt_rec_t *) XFS_DFORK_PTR(dip, whichfork);
 560                xfs_validate_extents(ifp, nex, XFS_EXTFMT_INODE(ip));
 561                for (i = 0; i < nex; i++, dp++) {
 562                        xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
 563                        ep->l0 = get_unaligned_be64(&dp->l0);
 564                        ep->l1 = get_unaligned_be64(&dp->l1);
 565                }
 566                XFS_BMAP_TRACE_EXLIST(ip, nex, whichfork);
 567                if (whichfork != XFS_DATA_FORK ||
 568                        XFS_EXTFMT_INODE(ip) == XFS_EXTFMT_NOSTATE)
 569                                if (unlikely(xfs_check_nostate_extents(
 570                                    ifp, 0, nex))) {
 571                                        XFS_ERROR_REPORT("xfs_iformat_extents(2)",
 572                                                         XFS_ERRLEVEL_LOW,
 573                                                         ip->i_mount);
 574                                        return XFS_ERROR(EFSCORRUPTED);
 575                                }
 576        }
 577        ifp->if_flags |= XFS_IFEXTENTS;
 578        return 0;
 579}
 580
 581/*
 582 * The file has too many extents to fit into
 583 * the inode, so they are in B-tree format.
 584 * Allocate a buffer for the root of the B-tree
 585 * and copy the root into it.  The i_extents
 586 * field will remain NULL until all of the
 587 * extents are read in (when they are needed).
 588 */
 589STATIC int
 590xfs_iformat_btree(
 591        xfs_inode_t             *ip,
 592        xfs_dinode_t            *dip,
 593        int                     whichfork)
 594{
 595        xfs_bmdr_block_t        *dfp;
 596        xfs_ifork_t             *ifp;
 597        /* REFERENCED */
 598        int                     nrecs;
 599        int                     size;
 600
 601        ifp = XFS_IFORK_PTR(ip, whichfork);
 602        dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork);
 603        size = XFS_BMAP_BROOT_SPACE(dfp);
 604        nrecs = be16_to_cpu(dfp->bb_numrecs);
 605
 606        /*
 607         * blow out if -- fork has less extents than can fit in
 608         * fork (fork shouldn't be a btree format), root btree
 609         * block has more records than can fit into the fork,
 610         * or the number of extents is greater than the number of
 611         * blocks.
 612         */
 613        if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max
 614            || XFS_BMDR_SPACE_CALC(nrecs) >
 615                        XFS_DFORK_SIZE(dip, ip->i_mount, whichfork)
 616            || XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) {
 617                xfs_warn(ip->i_mount, "corrupt inode %Lu (btree).",
 618                        (unsigned long long) ip->i_ino);
 619                XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW,
 620                                 ip->i_mount, dip);
 621                return XFS_ERROR(EFSCORRUPTED);
 622        }
 623
 624        ifp->if_broot_bytes = size;
 625        ifp->if_broot = kmem_alloc(size, KM_SLEEP | KM_NOFS);
 626        ASSERT(ifp->if_broot != NULL);
 627        /*
 628         * Copy and convert from the on-disk structure
 629         * to the in-memory structure.
 630         */
 631        xfs_bmdr_to_bmbt(ip->i_mount, dfp,
 632                         XFS_DFORK_SIZE(dip, ip->i_mount, whichfork),
 633                         ifp->if_broot, size);
 634        ifp->if_flags &= ~XFS_IFEXTENTS;
 635        ifp->if_flags |= XFS_IFBROOT;
 636
 637        return 0;
 638}
 639
 640STATIC void
 641xfs_dinode_from_disk(
 642        xfs_icdinode_t          *to,
 643        xfs_dinode_t            *from)
 644{
 645        to->di_magic = be16_to_cpu(from->di_magic);
 646        to->di_mode = be16_to_cpu(from->di_mode);
 647        to->di_version = from ->di_version;
 648        to->di_format = from->di_format;
 649        to->di_onlink = be16_to_cpu(from->di_onlink);
 650        to->di_uid = be32_to_cpu(from->di_uid);
 651        to->di_gid = be32_to_cpu(from->di_gid);
 652        to->di_nlink = be32_to_cpu(from->di_nlink);
 653        to->di_projid_lo = be16_to_cpu(from->di_projid_lo);
 654        to->di_projid_hi = be16_to_cpu(from->di_projid_hi);
 655        memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
 656        to->di_flushiter = be16_to_cpu(from->di_flushiter);
 657        to->di_atime.t_sec = be32_to_cpu(from->di_atime.t_sec);
 658        to->di_atime.t_nsec = be32_to_cpu(from->di_atime.t_nsec);
 659        to->di_mtime.t_sec = be32_to_cpu(from->di_mtime.t_sec);
 660        to->di_mtime.t_nsec = be32_to_cpu(from->di_mtime.t_nsec);
 661        to->di_ctime.t_sec = be32_to_cpu(from->di_ctime.t_sec);
 662        to->di_ctime.t_nsec = be32_to_cpu(from->di_ctime.t_nsec);
 663        to->di_size = be64_to_cpu(from->di_size);
 664        to->di_nblocks = be64_to_cpu(from->di_nblocks);
 665        to->di_extsize = be32_to_cpu(from->di_extsize);
 666        to->di_nextents = be32_to_cpu(from->di_nextents);
 667        to->di_anextents = be16_to_cpu(from->di_anextents);
 668        to->di_forkoff = from->di_forkoff;
 669        to->di_aformat  = from->di_aformat;
 670        to->di_dmevmask = be32_to_cpu(from->di_dmevmask);
 671        to->di_dmstate  = be16_to_cpu(from->di_dmstate);
 672        to->di_flags    = be16_to_cpu(from->di_flags);
 673        to->di_gen      = be32_to_cpu(from->di_gen);
 674}
 675
 676void
 677xfs_dinode_to_disk(
 678        xfs_dinode_t            *to,
 679        xfs_icdinode_t          *from)
 680{
 681        to->di_magic = cpu_to_be16(from->di_magic);
 682        to->di_mode = cpu_to_be16(from->di_mode);
 683        to->di_version = from ->di_version;
 684        to->di_format = from->di_format;
 685        to->di_onlink = cpu_to_be16(from->di_onlink);
 686        to->di_uid = cpu_to_be32(from->di_uid);
 687        to->di_gid = cpu_to_be32(from->di_gid);
 688        to->di_nlink = cpu_to_be32(from->di_nlink);
 689        to->di_projid_lo = cpu_to_be16(from->di_projid_lo);
 690        to->di_projid_hi = cpu_to_be16(from->di_projid_hi);
 691        memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
 692        to->di_flushiter = cpu_to_be16(from->di_flushiter);
 693        to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec);
 694        to->di_atime.t_nsec = cpu_to_be32(from->di_atime.t_nsec);
 695        to->di_mtime.t_sec = cpu_to_be32(from->di_mtime.t_sec);
 696        to->di_mtime.t_nsec = cpu_to_be32(from->di_mtime.t_nsec);
 697        to->di_ctime.t_sec = cpu_to_be32(from->di_ctime.t_sec);
 698        to->di_ctime.t_nsec = cpu_to_be32(from->di_ctime.t_nsec);
 699        to->di_size = cpu_to_be64(from->di_size);
 700        to->di_nblocks = cpu_to_be64(from->di_nblocks);
 701        to->di_extsize = cpu_to_be32(from->di_extsize);
 702        to->di_nextents = cpu_to_be32(from->di_nextents);
 703        to->di_anextents = cpu_to_be16(from->di_anextents);
 704        to->di_forkoff = from->di_forkoff;
 705        to->di_aformat = from->di_aformat;
 706        to->di_dmevmask = cpu_to_be32(from->di_dmevmask);
 707        to->di_dmstate = cpu_to_be16(from->di_dmstate);
 708        to->di_flags = cpu_to_be16(from->di_flags);
 709        to->di_gen = cpu_to_be32(from->di_gen);
 710}
 711
 712STATIC uint
 713_xfs_dic2xflags(
 714        __uint16_t              di_flags)
 715{
 716        uint                    flags = 0;
 717
 718        if (di_flags & XFS_DIFLAG_ANY) {
 719                if (di_flags & XFS_DIFLAG_REALTIME)
 720                        flags |= XFS_XFLAG_REALTIME;
 721                if (di_flags & XFS_DIFLAG_PREALLOC)
 722                        flags |= XFS_XFLAG_PREALLOC;
 723                if (di_flags & XFS_DIFLAG_IMMUTABLE)
 724                        flags |= XFS_XFLAG_IMMUTABLE;
 725                if (di_flags & XFS_DIFLAG_APPEND)
 726                        flags |= XFS_XFLAG_APPEND;
 727                if (di_flags & XFS_DIFLAG_SYNC)
 728                        flags |= XFS_XFLAG_SYNC;
 729                if (di_flags & XFS_DIFLAG_NOATIME)
 730                        flags |= XFS_XFLAG_NOATIME;
 731                if (di_flags & XFS_DIFLAG_NODUMP)
 732                        flags |= XFS_XFLAG_NODUMP;
 733                if (di_flags & XFS_DIFLAG_RTINHERIT)
 734                        flags |= XFS_XFLAG_RTINHERIT;
 735                if (di_flags & XFS_DIFLAG_PROJINHERIT)
 736                        flags |= XFS_XFLAG_PROJINHERIT;
 737                if (di_flags & XFS_DIFLAG_NOSYMLINKS)
 738                        flags |= XFS_XFLAG_NOSYMLINKS;
 739                if (di_flags & XFS_DIFLAG_EXTSIZE)
 740                        flags |= XFS_XFLAG_EXTSIZE;
 741                if (di_flags & XFS_DIFLAG_EXTSZINHERIT)
 742                        flags |= XFS_XFLAG_EXTSZINHERIT;
 743                if (di_flags & XFS_DIFLAG_NODEFRAG)
 744                        flags |= XFS_XFLAG_NODEFRAG;
 745                if (di_flags & XFS_DIFLAG_FILESTREAM)
 746                        flags |= XFS_XFLAG_FILESTREAM;
 747        }
 748
 749        return flags;
 750}
 751
 752uint
 753xfs_ip2xflags(
 754        xfs_inode_t             *ip)
 755{
 756        xfs_icdinode_t          *dic = &ip->i_d;
 757
 758        return _xfs_dic2xflags(dic->di_flags) |
 759                                (XFS_IFORK_Q(ip) ? XFS_XFLAG_HASATTR : 0);
 760}
 761
 762uint
 763xfs_dic2xflags(
 764        xfs_dinode_t            *dip)
 765{
 766        return _xfs_dic2xflags(be16_to_cpu(dip->di_flags)) |
 767                                (XFS_DFORK_Q(dip) ? XFS_XFLAG_HASATTR : 0);
 768}
 769
 770/*
 771 * Read the disk inode attributes into the in-core inode structure.
 772 */
 773int
 774xfs_iread(
 775        xfs_mount_t     *mp,
 776        xfs_trans_t     *tp,
 777        xfs_inode_t     *ip,
 778        uint            iget_flags)
 779{
 780        xfs_buf_t       *bp;
 781        xfs_dinode_t    *dip;
 782        int             error;
 783
 784        /*
 785         * Fill in the location information in the in-core inode.
 786         */
 787        error = xfs_imap(mp, tp, ip->i_ino, &ip->i_imap, iget_flags);
 788        if (error)
 789                return error;
 790
 791        /*
 792         * Get pointers to the on-disk inode and the buffer containing it.
 793         */
 794        error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp,
 795                               XBF_LOCK, iget_flags);
 796        if (error)
 797                return error;
 798        dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset);
 799
 800        /*
 801         * If we got something that isn't an inode it means someone
 802         * (nfs or dmi) has a stale handle.
 803         */
 804        if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC)) {
 805#ifdef DEBUG
 806                xfs_alert(mp,
 807                        "%s: dip->di_magic (0x%x) != XFS_DINODE_MAGIC (0x%x)",
 808                        __func__, be16_to_cpu(dip->di_magic), XFS_DINODE_MAGIC);
 809#endif /* DEBUG */
 810                error = XFS_ERROR(EINVAL);
 811                goto out_brelse;
 812        }
 813
 814        /*
 815         * If the on-disk inode is already linked to a directory
 816         * entry, copy all of the inode into the in-core inode.
 817         * xfs_iformat() handles copying in the inode format
 818         * specific information.
 819         * Otherwise, just get the truly permanent information.
 820         */
 821        if (dip->di_mode) {
 822                xfs_dinode_from_disk(&ip->i_d, dip);
 823                error = xfs_iformat(ip, dip);
 824                if (error)  {
 825#ifdef DEBUG
 826                        xfs_alert(mp, "%s: xfs_iformat() returned error %d",
 827                                __func__, error);
 828#endif /* DEBUG */
 829                        goto out_brelse;
 830                }
 831        } else {
 832                ip->i_d.di_magic = be16_to_cpu(dip->di_magic);
 833                ip->i_d.di_version = dip->di_version;
 834                ip->i_d.di_gen = be32_to_cpu(dip->di_gen);
 835                ip->i_d.di_flushiter = be16_to_cpu(dip->di_flushiter);
 836                /*
 837                 * Make sure to pull in the mode here as well in
 838                 * case the inode is released without being used.
 839                 * This ensures that xfs_inactive() will see that
 840                 * the inode is already free and not try to mess
 841                 * with the uninitialized part of it.
 842                 */
 843                ip->i_d.di_mode = 0;
 844                /*
 845                 * Initialize the per-fork minima and maxima for a new
 846                 * inode here.  xfs_iformat will do it for old inodes.
 847                 */
 848                ip->i_df.if_ext_max =
 849                        XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
 850        }
 851
 852        /*
 853         * The inode format changed when we moved the link count and
 854         * made it 32 bits long.  If this is an old format inode,
 855         * convert it in memory to look like a new one.  If it gets
 856         * flushed to disk we will convert back before flushing or
 857         * logging it.  We zero out the new projid field and the old link
 858         * count field.  We'll handle clearing the pad field (the remains
 859         * of the old uuid field) when we actually convert the inode to
 860         * the new format. We don't change the version number so that we
 861         * can distinguish this from a real new format inode.
 862         */
 863        if (ip->i_d.di_version == 1) {
 864                ip->i_d.di_nlink = ip->i_d.di_onlink;
 865                ip->i_d.di_onlink = 0;
 866                xfs_set_projid(ip, 0);
 867        }
 868
 869        ip->i_delayed_blks = 0;
 870        ip->i_size = ip->i_d.di_size;
 871
 872        /*
 873         * Mark the buffer containing the inode as something to keep
 874         * around for a while.  This helps to keep recently accessed
 875         * meta-data in-core longer.
 876         */
 877        xfs_buf_set_ref(bp, XFS_INO_REF);
 878
 879        /*
 880         * Use xfs_trans_brelse() to release the buffer containing the
 881         * on-disk inode, because it was acquired with xfs_trans_read_buf()
 882         * in xfs_itobp() above.  If tp is NULL, this is just a normal
 883         * brelse().  If we're within a transaction, then xfs_trans_brelse()
 884         * will only release the buffer if it is not dirty within the
 885         * transaction.  It will be OK to release the buffer in this case,
 886         * because inodes on disk are never destroyed and we will be
 887         * locking the new in-core inode before putting it in the hash
 888         * table where other processes can find it.  Thus we don't have
 889         * to worry about the inode being changed just because we released
 890         * the buffer.
 891         */
 892 out_brelse:
 893        xfs_trans_brelse(tp, bp);
 894        return error;
 895}
 896
 897/*
 898 * Read in extents from a btree-format inode.
 899 * Allocate and fill in if_extents.  Real work is done in xfs_bmap.c.
 900 */
 901int
 902xfs_iread_extents(
 903        xfs_trans_t     *tp,
 904        xfs_inode_t     *ip,
 905        int             whichfork)
 906{
 907        int             error;
 908        xfs_ifork_t     *ifp;
 909        xfs_extnum_t    nextents;
 910
 911        if (unlikely(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) {
 912                XFS_ERROR_REPORT("xfs_iread_extents", XFS_ERRLEVEL_LOW,
 913                                 ip->i_mount);
 914                return XFS_ERROR(EFSCORRUPTED);
 915        }
 916        nextents = XFS_IFORK_NEXTENTS(ip, whichfork);
 917        ifp = XFS_IFORK_PTR(ip, whichfork);
 918
 919        /*
 920         * We know that the size is valid (it's checked in iformat_btree)
 921         */
 922        ifp->if_bytes = ifp->if_real_bytes = 0;
 923        ifp->if_flags |= XFS_IFEXTENTS;
 924        xfs_iext_add(ifp, 0, nextents);
 925        error = xfs_bmap_read_extents(tp, ip, whichfork);
 926        if (error) {
 927                xfs_iext_destroy(ifp);
 928                ifp->if_flags &= ~XFS_IFEXTENTS;
 929                return error;
 930        }
 931        xfs_validate_extents(ifp, nextents, XFS_EXTFMT_INODE(ip));
 932        return 0;
 933}
 934
 935/*
 936 * Allocate an inode on disk and return a copy of its in-core version.
 937 * The in-core inode is locked exclusively.  Set mode, nlink, and rdev
 938 * appropriately within the inode.  The uid and gid for the inode are
 939 * set according to the contents of the given cred structure.
 940 *
 941 * Use xfs_dialloc() to allocate the on-disk inode. If xfs_dialloc()
 942 * has a free inode available, call xfs_iget()
 943 * to obtain the in-core version of the allocated inode.  Finally,
 944 * fill in the inode and log its initial contents.  In this case,
 945 * ialloc_context would be set to NULL and call_again set to false.
 946 *
 947 * If xfs_dialloc() does not have an available inode,
 948 * it will replenish its supply by doing an allocation. Since we can
 949 * only do one allocation within a transaction without deadlocks, we
 950 * must commit the current transaction before returning the inode itself.
 951 * In this case, therefore, we will set call_again to true and return.
 952 * The caller should then commit the current transaction, start a new
 953 * transaction, and call xfs_ialloc() again to actually get the inode.
 954 *
 955 * To ensure that some other process does not grab the inode that
 956 * was allocated during the first call to xfs_ialloc(), this routine
 957 * also returns the [locked] bp pointing to the head of the freelist
 958 * as ialloc_context.  The caller should hold this buffer across
 959 * the commit and pass it back into this routine on the second call.
 960 *
 961 * If we are allocating quota inodes, we do not have a parent inode
 962 * to attach to or associate with (i.e. pip == NULL) because they
 963 * are not linked into the directory structure - they are attached
 964 * directly to the superblock - and so have no parent.
 965 */
 966int
 967xfs_ialloc(
 968        xfs_trans_t     *tp,
 969        xfs_inode_t     *pip,
 970        mode_t          mode,
 971        xfs_nlink_t     nlink,
 972        xfs_dev_t       rdev,
 973        prid_t          prid,
 974        int             okalloc,
 975        xfs_buf_t       **ialloc_context,
 976        boolean_t       *call_again,
 977        xfs_inode_t     **ipp)
 978{
 979        xfs_ino_t       ino;
 980        xfs_inode_t     *ip;
 981        uint            flags;
 982        int             error;
 983        timespec_t      tv;
 984        int             filestreams = 0;
 985
 986        /*
 987         * Call the space management code to pick
 988         * the on-disk inode to be allocated.
 989         */
 990        error = xfs_dialloc(tp, pip ? pip->i_ino : 0, mode, okalloc,
 991                            ialloc_context, call_again, &ino);
 992        if (error)
 993                return error;
 994        if (*call_again || ino == NULLFSINO) {
 995                *ipp = NULL;
 996                return 0;
 997        }
 998        ASSERT(*ialloc_context == NULL);
 999
1000        /*
1001         * Get the in-core inode with the lock held exclusively.
1002         * This is because we're setting fields here we need
1003         * to prevent others from looking at until we're done.
1004         */
1005        error = xfs_iget(tp->t_mountp, tp, ino, XFS_IGET_CREATE,
1006                         XFS_ILOCK_EXCL, &ip);
1007        if (error)
1008                return error;
1009        ASSERT(ip != NULL);
1010
1011        ip->i_d.di_mode = (__uint16_t)mode;
1012        ip->i_d.di_onlink = 0;
1013        ip->i_d.di_nlink = nlink;
1014        ASSERT(ip->i_d.di_nlink == nlink);
1015        ip->i_d.di_uid = current_fsuid();
1016        ip->i_d.di_gid = current_fsgid();
1017        xfs_set_projid(ip, prid);
1018        memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
1019
1020        /*
1021         * If the superblock version is up to where we support new format
1022         * inodes and this is currently an old format inode, then change
1023         * the inode version number now.  This way we only do the conversion
1024         * here rather than here and in the flush/logging code.
1025         */
1026        if (xfs_sb_version_hasnlink(&tp->t_mountp->m_sb) &&
1027            ip->i_d.di_version == 1) {
1028                ip->i_d.di_version = 2;
1029                /*
1030                 * We've already zeroed the old link count, the projid field,
1031                 * and the pad field.
1032                 */
1033        }
1034
1035        /*
1036         * Project ids won't be stored on disk if we are using a version 1 inode.
1037         */
1038        if ((prid != 0) && (ip->i_d.di_version == 1))
1039                xfs_bump_ino_vers2(tp, ip);
1040
1041        if (pip && XFS_INHERIT_GID(pip)) {
1042                ip->i_d.di_gid = pip->i_d.di_gid;
1043                if ((pip->i_d.di_mode & S_ISGID) && S_ISDIR(mode)) {
1044                        ip->i_d.di_mode |= S_ISGID;
1045                }
1046        }
1047
1048        /*
1049         * If the group ID of the new file does not match the effective group
1050         * ID or one of the supplementary group IDs, the S_ISGID bit is cleared
1051         * (and only if the irix_sgid_inherit compatibility variable is set).
1052         */
1053        if ((irix_sgid_inherit) &&
1054            (ip->i_d.di_mode & S_ISGID) &&
1055            (!in_group_p((gid_t)ip->i_d.di_gid))) {
1056                ip->i_d.di_mode &= ~S_ISGID;
1057        }
1058
1059        ip->i_d.di_size = 0;
1060        ip->i_size = 0;
1061        ip->i_d.di_nextents = 0;
1062        ASSERT(ip->i_d.di_nblocks == 0);
1063
1064        nanotime(&tv);
1065        ip->i_d.di_mtime.t_sec = (__int32_t)tv.tv_sec;
1066        ip->i_d.di_mtime.t_nsec = (__int32_t)tv.tv_nsec;
1067        ip->i_d.di_atime = ip->i_d.di_mtime;
1068        ip->i_d.di_ctime = ip->i_d.di_mtime;
1069
1070        /*
1071         * di_gen will have been taken care of in xfs_iread.
1072         */
1073        ip->i_d.di_extsize = 0;
1074        ip->i_d.di_dmevmask = 0;
1075        ip->i_d.di_dmstate = 0;
1076        ip->i_d.di_flags = 0;
1077        flags = XFS_ILOG_CORE;
1078        switch (mode & S_IFMT) {
1079        case S_IFIFO:
1080        case S_IFCHR:
1081        case S_IFBLK:
1082        case S_IFSOCK:
1083                ip->i_d.di_format = XFS_DINODE_FMT_DEV;
1084                ip->i_df.if_u2.if_rdev = rdev;
1085                ip->i_df.if_flags = 0;
1086                flags |= XFS_ILOG_DEV;
1087                break;
1088        case S_IFREG:
1089                /*
1090                 * we can't set up filestreams until after the VFS inode
1091                 * is set up properly.
1092                 */
1093                if (pip && xfs_inode_is_filestream(pip))
1094                        filestreams = 1;
1095                /* fall through */
1096        case S_IFDIR:
1097                if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) {
1098                        uint    di_flags = 0;
1099
1100                        if (S_ISDIR(mode)) {
1101                                if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT)
1102                                        di_flags |= XFS_DIFLAG_RTINHERIT;
1103                                if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) {
1104                                        di_flags |= XFS_DIFLAG_EXTSZINHERIT;
1105                                        ip->i_d.di_extsize = pip->i_d.di_extsize;
1106                                }
1107                        } else if (S_ISREG(mode)) {
1108                                if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT)
1109                                        di_flags |= XFS_DIFLAG_REALTIME;
1110                                if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) {
1111                                        di_flags |= XFS_DIFLAG_EXTSIZE;
1112                                        ip->i_d.di_extsize = pip->i_d.di_extsize;
1113                                }
1114                        }
1115                        if ((pip->i_d.di_flags & XFS_DIFLAG_NOATIME) &&
1116                            xfs_inherit_noatime)
1117                                di_flags |= XFS_DIFLAG_NOATIME;
1118                        if ((pip->i_d.di_flags & XFS_DIFLAG_NODUMP) &&
1119                            xfs_inherit_nodump)
1120                                di_flags |= XFS_DIFLAG_NODUMP;
1121                        if ((pip->i_d.di_flags & XFS_DIFLAG_SYNC) &&
1122                            xfs_inherit_sync)
1123                                di_flags |= XFS_DIFLAG_SYNC;
1124                        if ((pip->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) &&
1125                            xfs_inherit_nosymlinks)
1126                                di_flags |= XFS_DIFLAG_NOSYMLINKS;
1127                        if (pip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
1128                                di_flags |= XFS_DIFLAG_PROJINHERIT;
1129                        if ((pip->i_d.di_flags & XFS_DIFLAG_NODEFRAG) &&
1130                            xfs_inherit_nodefrag)
1131                                di_flags |= XFS_DIFLAG_NODEFRAG;
1132                        if (pip->i_d.di_flags & XFS_DIFLAG_FILESTREAM)
1133                                di_flags |= XFS_DIFLAG_FILESTREAM;
1134                        ip->i_d.di_flags |= di_flags;
1135                }
1136                /* FALLTHROUGH */
1137        case S_IFLNK:
1138                ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS;
1139                ip->i_df.if_flags = XFS_IFEXTENTS;
1140                ip->i_df.if_bytes = ip->i_df.if_real_bytes = 0;
1141                ip->i_df.if_u1.if_extents = NULL;
1142                break;
1143        default:
1144                ASSERT(0);
1145        }
1146        /*
1147         * Attribute fork settings for new inode.
1148         */
1149        ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
1150        ip->i_d.di_anextents = 0;
1151
1152        /*
1153         * Log the new values stuffed into the inode.
1154         */
1155        xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL);
1156        xfs_trans_log_inode(tp, ip, flags);
1157
1158        /* now that we have an i_mode we can setup inode ops and unlock */
1159        xfs_setup_inode(ip);
1160
1161        /* now we have set up the vfs inode we can associate the filestream */
1162        if (filestreams) {
1163                error = xfs_filestream_associate(pip, ip);
1164                if (error < 0)
1165                        return -error;
1166                if (!error)
1167                        xfs_iflags_set(ip, XFS_IFILESTREAM);
1168        }
1169
1170        *ipp = ip;
1171        return 0;
1172}
1173
/*
 * Debug-only sanity check: verify that no blocks are allocated to the
 * file beyond the given size.  Files with fixed size extents and real
 * time files are not checked, and neither is anything that is not a
 * regular file.  In non-DEBUG builds the check compiles away to nothing.
 */
#ifdef DEBUG
STATIC void
xfs_isize_check(
        struct xfs_inode        *ip,
        xfs_fsize_t             isize)
{
        struct xfs_mount        *mp = ip->i_mount;
        xfs_bmbt_irec_t         maps[2];
        int                     nmaps = 2;
        xfs_fileoff_t           eof_fsb;
        xfs_fileoff_t           max_fsb;

        if (!S_ISREG(ip->i_d.di_mode) ||
            XFS_IS_REALTIME_INODE(ip) ||
            (ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE))
                return;

        /* Map everything from the block after isize to the maximum offset. */
        eof_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize);
        max_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp));

        /*
         * bmapi may return an error, for instance because the filesystem
         * is shutting down; there is nothing to verify in that case.
         */
        if (xfs_bmapi(NULL, ip, eof_fsb, max_fsb - eof_fsb,
                      XFS_BMAPI_ENTIRE, NULL, 0, maps, &nmaps, NULL))
                return;

        /* The whole range must come back as one single hole. */
        ASSERT(nmaps == 1);
        ASSERT(maps[0].br_startblock == HOLESTARTBLOCK);
}
#else   /* DEBUG */
#define xfs_isize_check(ip, isize)
#endif  /* DEBUG */
1219
1220/*
1221 * Free up the underlying blocks past new_size.  The new size must be smaller
1222 * than the current size.  This routine can be used both for the attribute and
1223 * data fork, and does not modify the inode size, which is left to the caller.
1224 *
1225 * The transaction passed to this routine must have made a permanent log
1226 * reservation of at least XFS_ITRUNCATE_LOG_RES.  This routine may commit the
1227 * given transaction and start new ones, so make sure everything involved in
1228 * the transaction is tidy before calling here.  Some transaction will be
1229 * returned to the caller to be committed.  The incoming transaction must
1230 * already include the inode, and both inode locks must be held exclusively.
1231 * The inode must also be "held" within the transaction.  On return the inode
1232 * will be "held" within the returned transaction.  This routine does NOT
1233 * require any disk space to be reserved for it within the transaction.
1234 *
1235 * If we get an error, we must return with the inode locked and linked into the
1236 * current transaction. This keeps things simple for the higher level code,
1237 * because it always knows that the inode is locked and held in the transaction
1238 * that returns to it whether errors occur or not.  We don't mark the inode
1239 * dirty on error so that transactions can be easily aborted if possible.
1240 */
1241int
1242xfs_itruncate_extents(
1243        struct xfs_trans        **tpp,
1244        struct xfs_inode        *ip,
1245        int                     whichfork,
1246        xfs_fsize_t             new_size)
1247{
1248        struct xfs_mount        *mp = ip->i_mount;
1249        struct xfs_trans        *tp = *tpp;
1250        struct xfs_trans        *ntp;
1251        xfs_bmap_free_t         free_list;
1252        xfs_fsblock_t           first_block;
1253        xfs_fileoff_t           first_unmap_block;
1254        xfs_fileoff_t           last_block;
1255        xfs_filblks_t           unmap_len;
1256        int                     committed;
1257        int                     error = 0;
1258        int                     done = 0;
1259
1260        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
1261        ASSERT(new_size <= ip->i_size);
1262        ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
1263        ASSERT(ip->i_itemp != NULL);
1264        ASSERT(ip->i_itemp->ili_lock_flags == 0);
1265        ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
1266
1267        /*
1268         * Since it is possible for space to become allocated beyond
1269         * the end of the file (in a crash where the space is allocated
1270         * but the inode size is not yet updated), simply remove any
1271         * blocks which show up between the new EOF and the maximum
1272         * possible file size.  If the first block to be removed is
1273         * beyond the maximum file size (ie it is the same as last_block),
1274         * then there is nothing to do.
1275         */
1276        first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size);
1277        last_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp));
1278        if (first_unmap_block == last_block)
1279                return 0;
1280
1281        ASSERT(first_unmap_block < last_block);
1282        unmap_len = last_block - first_unmap_block + 1;
1283        while (!done) {
1284                xfs_bmap_init(&free_list, &first_block);
1285                error = xfs_bunmapi(tp, ip,
1286                                    first_unmap_block, unmap_len,
1287                                    xfs_bmapi_aflag(whichfork),
1288                                    XFS_ITRUNC_MAX_EXTENTS,
1289                                    &first_block, &free_list,
1290                                    &done);
1291                if (error)
1292                        goto out_bmap_cancel;
1293
1294                /*
1295                 * Duplicate the transaction that has the permanent
1296                 * reservation and commit the old transaction.
1297                 */
1298                error = xfs_bmap_finish(&tp, &free_list, &committed);
1299                if (committed)
1300                        xfs_trans_ijoin(tp, ip);
1301                if (error)
1302                        goto out_bmap_cancel;
1303
1304                if (committed) {
1305                        /*
1306                         * Mark the inode dirty so it will be logged and
1307                         * moved forward in the log as part of every commit.
1308                         */
1309                        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1310                }
1311
1312                ntp = xfs_trans_dup(tp);
1313                error = xfs_trans_commit(tp, 0);
1314                tp = ntp;
1315
1316                xfs_trans_ijoin(tp, ip);
1317
1318                if (error)
1319                        goto out;
1320
1321                /*
1322                 * Transaction commit worked ok so we can drop the extra ticket
1323                 * reference that we gained in xfs_trans_dup()
1324                 */
1325                xfs_log_ticket_put(tp->t_ticket);
1326                error = xfs_trans_reserve(tp, 0,
1327                                        XFS_ITRUNCATE_LOG_RES(mp), 0,
1328                                        XFS_TRANS_PERM_LOG_RES,
1329                                        XFS_ITRUNCATE_LOG_COUNT);
1330                if (error)
1331                        goto out;
1332        }
1333
1334out:
1335        *tpp = tp;
1336        return error;
1337out_bmap_cancel:
1338        /*
1339         * If the bunmapi call encounters an error, return to the caller where
1340         * the transaction can be properly aborted.  We just need to make sure
1341         * we're not holding any resources that we were not when we came in.
1342         */
1343        xfs_bmap_cancel(&free_list);
1344        goto out;
1345}
1346
1347int
1348xfs_itruncate_data(
1349        struct xfs_trans        **tpp,
1350        struct xfs_inode        *ip,
1351        xfs_fsize_t             new_size)
1352{
1353        int                     error;
1354
1355        trace_xfs_itruncate_data_start(ip, new_size);
1356
1357        /*
1358         * The first thing we do is set the size to new_size permanently on
1359         * disk.  This way we don't have to worry about anyone ever being able
1360         * to look at the data being freed even in the face of a crash.
1361         * What we're getting around here is the case where we free a block, it
1362         * is allocated to another file, it is written to, and then we crash.
1363         * If the new data gets written to the file but the log buffers
1364         * containing the free and reallocation don't, then we'd end up with
1365         * garbage in the blocks being freed.  As long as we make the new_size
1366         * permanent before actually freeing any blocks it doesn't matter if
1367         * they get written to.
1368         */
1369        if (ip->i_d.di_nextents > 0) {
1370                /*
1371                 * If we are not changing the file size then do not update
1372                 * the on-disk file size - we may be called from
1373                 * xfs_inactive_free_eofblocks().  If we update the on-disk
1374                 * file size and then the system crashes before the contents
1375                 * of the file are flushed to disk then the files may be
1376                 * full of holes (ie NULL files bug).
1377                 */
1378                if (ip->i_size != new_size) {
1379                        ip->i_d.di_size = new_size;
1380                        ip->i_size = new_size;
1381                        xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE);
1382                }
1383        }
1384
1385        error = xfs_itruncate_extents(tpp, ip, XFS_DATA_FORK, new_size);
1386        if (error)
1387                return error;
1388
1389        /*
1390         * If we are not changing the file size then do not update the on-disk
1391         * file size - we may be called from xfs_inactive_free_eofblocks().
1392         * If we update the on-disk file size and then the system crashes
1393         * before the contents of the file are flushed to disk then the files
1394         * may be full of holes (ie NULL files bug).
1395         */
1396        xfs_isize_check(ip, new_size);
1397        if (ip->i_size != new_size) {
1398                ip->i_d.di_size = new_size;
1399                ip->i_size = new_size;
1400        }
1401
1402        ASSERT(new_size != 0 || ip->i_delayed_blks == 0);
1403        ASSERT(new_size != 0 || ip->i_d.di_nextents == 0);
1404
1405        /*
1406         * Always re-log the inode so that our permanent transaction can keep
1407         * on rolling it forward in the log.
1408         */
1409        xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE);
1410
1411        trace_xfs_itruncate_data_end(ip, new_size);
1412        return 0;
1413}
1414
1415/*
1416 * This is called when the inode's link count goes to 0.
1417 * We place the on-disk inode on a list in the AGI.  It
1418 * will be pulled from this list when the inode is freed.
1419 */
1420int
1421xfs_iunlink(
1422        xfs_trans_t     *tp,
1423        xfs_inode_t     *ip)
1424{
1425        xfs_mount_t     *mp;
1426        xfs_agi_t       *agi;
1427        xfs_dinode_t    *dip;
1428        xfs_buf_t       *agibp;
1429        xfs_buf_t       *ibp;
1430        xfs_agino_t     agino;
1431        short           bucket_index;
1432        int             offset;
1433        int             error;
1434
1435        ASSERT(ip->i_d.di_nlink == 0);
1436        ASSERT(ip->i_d.di_mode != 0);
1437
1438        mp = tp->t_mountp;
1439
1440        /*
1441         * Get the agi buffer first.  It ensures lock ordering
1442         * on the list.
1443         */
1444        error = xfs_read_agi(mp, tp, XFS_INO_TO_AGNO(mp, ip->i_ino), &agibp);
1445        if (error)
1446                return error;
1447        agi = XFS_BUF_TO_AGI(agibp);
1448
1449        /*
1450         * Get the index into the agi hash table for the
1451         * list this inode will go on.
1452         */
1453        agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
1454        ASSERT(agino != 0);
1455        bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
1456        ASSERT(agi->agi_unlinked[bucket_index]);
1457        ASSERT(be32_to_cpu(agi->agi_unlinked[bucket_index]) != agino);
1458
1459        if (agi->agi_unlinked[bucket_index] != cpu_to_be32(NULLAGINO)) {
1460                /*
1461                 * There is already another inode in the bucket we need
1462                 * to add ourselves to.  Add us at the front of the list.
1463                 * Here we put the head pointer into our next pointer,
1464                 * and then we fall through to point the head at us.
1465                 */
1466                error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK);
1467                if (error)
1468                        return error;
1469
1470                ASSERT(dip->di_next_unlinked == cpu_to_be32(NULLAGINO));
1471                dip->di_next_unlinked = agi->agi_unlinked[bucket_index];
1472                offset = ip->i_imap.im_boffset +
1473                        offsetof(xfs_dinode_t, di_next_unlinked);
1474                xfs_trans_inode_buf(tp, ibp);
1475                xfs_trans_log_buf(tp, ibp, offset,
1476                                  (offset + sizeof(xfs_agino_t) - 1));
1477                xfs_inobp_check(mp, ibp);
1478        }
1479
1480        /*
1481         * Point the bucket head pointer at the inode being inserted.
1482         */
1483        ASSERT(agino != 0);
1484        agi->agi_unlinked[bucket_index] = cpu_to_be32(agino);
1485        offset = offsetof(xfs_agi_t, agi_unlinked) +
1486                (sizeof(xfs_agino_t) * bucket_index);
1487        xfs_trans_log_buf(tp, agibp, offset,
1488                          (offset + sizeof(xfs_agino_t) - 1));
1489        return 0;
1490}
1491
1492/*
1493 * Pull the on-disk inode from the AGI unlinked list.
1494 */
1495STATIC int
1496xfs_iunlink_remove(
1497        xfs_trans_t     *tp,
1498        xfs_inode_t     *ip)
1499{
1500        xfs_ino_t       next_ino;
1501        xfs_mount_t     *mp;
1502        xfs_agi_t       *agi;
1503        xfs_dinode_t    *dip;
1504        xfs_buf_t       *agibp;
1505        xfs_buf_t       *ibp;
1506        xfs_agnumber_t  agno;
1507        xfs_agino_t     agino;
1508        xfs_agino_t     next_agino;
1509        xfs_buf_t       *last_ibp;
1510        xfs_dinode_t    *last_dip = NULL;
1511        short           bucket_index;
1512        int             offset, last_offset = 0;
1513        int             error;
1514
1515        mp = tp->t_mountp;
1516        agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
1517
1518        /*
1519         * Get the agi buffer first.  It ensures lock ordering
1520         * on the list.
1521         */
1522        error = xfs_read_agi(mp, tp, agno, &agibp);
1523        if (error)
1524                return error;
1525
1526        agi = XFS_BUF_TO_AGI(agibp);
1527
1528        /*
1529         * Get the index into the agi hash table for the
1530         * list this inode will go on.
1531         */
1532        agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
1533        ASSERT(agino != 0);
1534        bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
1535        ASSERT(agi->agi_unlinked[bucket_index] != cpu_to_be32(NULLAGINO));
1536        ASSERT(agi->agi_unlinked[bucket_index]);
1537
1538        if (be32_to_cpu(agi->agi_unlinked[bucket_index]) == agino) {
1539                /*
1540                 * We're at the head of the list.  Get the inode's
1541                 * on-disk buffer to see if there is anyone after us
1542                 * on the list.  Only modify our next pointer if it
1543                 * is not already NULLAGINO.  This saves us the overhead
1544                 * of dealing with the buffer when there is no need to
1545                 * change it.
1546                 */
1547                error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK);
1548                if (error) {
1549                        xfs_warn(mp, "%s: xfs_itobp() returned error %d.",
1550                                __func__, error);
1551                        return error;
1552                }
1553                next_agino = be32_to_cpu(dip->di_next_unlinked);
1554                ASSERT(next_agino != 0);
1555                if (next_agino != NULLAGINO) {
1556                        dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
1557                        offset = ip->i_imap.im_boffset +
1558                                offsetof(xfs_dinode_t, di_next_unlinked);
1559                        xfs_trans_inode_buf(tp, ibp);
1560                        xfs_trans_log_buf(tp, ibp, offset,
1561                                          (offset + sizeof(xfs_agino_t) - 1));
1562                        xfs_inobp_check(mp, ibp);
1563                } else {
1564                        xfs_trans_brelse(tp, ibp);
1565                }
1566                /*
1567                 * Point the bucket head pointer at the next inode.
1568                 */
1569                ASSERT(next_agino != 0);
1570                ASSERT(next_agino != agino);
1571                agi->agi_unlinked[bucket_index] = cpu_to_be32(next_agino);
1572                offset = offsetof(xfs_agi_t, agi_unlinked) +
1573                        (sizeof(xfs_agino_t) * bucket_index);
1574                xfs_trans_log_buf(tp, agibp, offset,
1575                                  (offset + sizeof(xfs_agino_t) - 1));
1576        } else {
1577                /*
1578                 * We need to search the list for the inode being freed.
1579                 */
1580                next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
1581                last_ibp = NULL;
1582                while (next_agino != agino) {
1583                        /*
1584                         * If the last inode wasn't the one pointing to
1585                         * us, then release its buffer since we're not
1586                         * going to do anything with it.
1587                         */
1588                        if (last_ibp != NULL) {
1589                                xfs_trans_brelse(tp, last_ibp);
1590                        }
1591                        next_ino = XFS_AGINO_TO_INO(mp, agno, next_agino);
1592                        error = xfs_inotobp(mp, tp, next_ino, &last_dip,
1593                                            &last_ibp, &last_offset, 0);
1594                        if (error) {
1595                                xfs_warn(mp,
1596                                        "%s: xfs_inotobp() returned error %d.",
1597                                        __func__, error);
1598                                return error;
1599                        }
1600                        next_agino = be32_to_cpu(last_dip->di_next_unlinked);
1601                        ASSERT(next_agino != NULLAGINO);
1602                        ASSERT(next_agino != 0);
1603                }
1604                /*
1605                 * Now last_ibp points to the buffer previous to us on
1606                 * the unlinked list.  Pull us from the list.
1607                 */
1608                error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK);
1609                if (error) {
1610                        xfs_warn(mp, "%s: xfs_itobp(2) returned error %d.",
1611                                __func__, error);
1612                        return error;
1613                }
1614                next_agino = be32_to_cpu(dip->di_next_unlinked);
1615                ASSERT(next_agino != 0);
1616                ASSERT(next_agino != agino);
1617                if (next_agino != NULLAGINO) {
1618                        dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
1619                        offset = ip->i_imap.im_boffset +
1620                                offsetof(xfs_dinode_t, di_next_unlinked);
1621                        xfs_trans_inode_buf(tp, ibp);
1622                        xfs_trans_log_buf(tp, ibp, offset,
1623                                          (offset + sizeof(xfs_agino_t) - 1));
1624                        xfs_inobp_check(mp, ibp);
1625                } else {
1626                        xfs_trans_brelse(tp, ibp);
1627                }
1628                /*
1629                 * Point the previous inode on the list to the next inode.
1630                 */
1631                last_dip->di_next_unlinked = cpu_to_be32(next_agino);
1632                ASSERT(next_agino != 0);
1633                offset = last_offset + offsetof(xfs_dinode_t, di_next_unlinked);
1634                xfs_trans_inode_buf(tp, last_ibp);
1635                xfs_trans_log_buf(tp, last_ibp, offset,
1636                                  (offset + sizeof(xfs_agino_t) - 1));
1637                xfs_inobp_check(mp, last_ibp);
1638        }
1639        return 0;
1640}
1641
1642/*
1643 * A big issue when freeing the inode cluster is is that we _cannot_ skip any
1644 * inodes that are in memory - they all must be marked stale and attached to
1645 * the cluster buffer.
1646 */
1647STATIC void
1648xfs_ifree_cluster(
1649        xfs_inode_t     *free_ip,
1650        xfs_trans_t     *tp,
1651        xfs_ino_t       inum)
1652{
1653        xfs_mount_t             *mp = free_ip->i_mount;
1654        int                     blks_per_cluster;
1655        int                     nbufs;
1656        int                     ninodes;
1657        int                     i, j;
1658        xfs_daddr_t             blkno;
1659        xfs_buf_t               *bp;
1660        xfs_inode_t             *ip;
1661        xfs_inode_log_item_t    *iip;
1662        xfs_log_item_t          *lip;
1663        struct xfs_perag        *pag;
1664
1665        pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum));
1666        if (mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) {
1667                blks_per_cluster = 1;
1668                ninodes = mp->m_sb.sb_inopblock;
1669                nbufs = XFS_IALLOC_BLOCKS(mp);
1670        } else {
1671                blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) /
1672                                        mp->m_sb.sb_blocksize;
1673                ninodes = blks_per_cluster * mp->m_sb.sb_inopblock;
1674                nbufs = XFS_IALLOC_BLOCKS(mp) / blks_per_cluster;
1675        }
1676
1677        for (j = 0; j < nbufs; j++, inum += ninodes) {
1678                blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum),
1679                                         XFS_INO_TO_AGBNO(mp, inum));
1680
1681                /*
1682                 * We obtain and lock the backing buffer first in the process
1683                 * here, as we have to ensure that any dirty inode that we
1684                 * can't get the flush lock on is attached to the buffer.
1685                 * If we scan the in-memory inodes first, then buffer IO can
1686                 * complete before we get a lock on it, and hence we may fail
1687                 * to mark all the active inodes on the buffer stale.
1688                 */
1689                bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
1690                                        mp->m_bsize * blks_per_cluster,
1691                                        XBF_LOCK);
1692
1693                /*
1694                 * Walk the inodes already attached to the buffer and mark them
1695                 * stale. These will all have the flush locks held, so an
1696                 * in-memory inode walk can't lock them. By marking them all
1697                 * stale first, we will not attempt to lock them in the loop
1698                 * below as the XFS_ISTALE flag will be set.
1699                 */
1700                lip = bp->b_fspriv;
1701                while (lip) {
1702                        if (lip->li_type == XFS_LI_INODE) {
1703                                iip = (xfs_inode_log_item_t *)lip;
1704                                ASSERT(iip->ili_logged == 1);
1705                                lip->li_cb = xfs_istale_done;
1706                                xfs_trans_ail_copy_lsn(mp->m_ail,
1707                                                        &iip->ili_flush_lsn,
1708                                                        &iip->ili_item.li_lsn);
1709                                xfs_iflags_set(iip->ili_inode, XFS_ISTALE);
1710                        }
1711                        lip = lip->li_bio_list;
1712                }
1713
1714
1715                /*
1716                 * For each inode in memory attempt to add it to the inode
1717                 * buffer and set it up for being staled on buffer IO
1718                 * completion.  This is safe as we've locked out tail pushing
1719                 * and flushing by locking the buffer.
1720                 *
1721                 * We have already marked every inode that was part of a
1722                 * transaction stale above, which means there is no point in
1723                 * even trying to lock them.
1724                 */
1725                for (i = 0; i < ninodes; i++) {
1726retry:
1727                        rcu_read_lock();
1728                        ip = radix_tree_lookup(&pag->pag_ici_root,
1729                                        XFS_INO_TO_AGINO(mp, (inum + i)));
1730
1731                        /* Inode not in memory, nothing to do */
1732                        if (!ip) {
1733                                rcu_read_unlock();
1734                                continue;
1735                        }
1736
1737                        /*
1738                         * because this is an RCU protected lookup, we could
1739                         * find a recently freed or even reallocated inode
1740                         * during the lookup. We need to check under the
1741                         * i_flags_lock for a valid inode here. Skip it if it
1742                         * is not valid, the wrong inode or stale.
1743                         */
1744                        spin_lock(&ip->i_flags_lock);
1745                        if (ip->i_ino != inum + i ||
1746                            __xfs_iflags_test(ip, XFS_ISTALE)) {
1747                                spin_unlock(&ip->i_flags_lock);
1748                                rcu_read_unlock();
1749                                continue;
1750                        }
1751                        spin_unlock(&ip->i_flags_lock);
1752
1753                        /*
1754                         * Don't try to lock/unlock the current inode, but we
1755                         * _cannot_ skip the other inodes that we did not find
1756                         * in the list attached to the buffer and are not
1757                         * already marked stale. If we can't lock it, back off
1758                         * and retry.
1759                         */
1760                        if (ip != free_ip &&
1761                            !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
1762                                rcu_read_unlock();
1763                                delay(1);
1764                                goto retry;
1765                        }
1766                        rcu_read_unlock();
1767
1768                        xfs_iflock(ip);
1769                        xfs_iflags_set(ip, XFS_ISTALE);
1770
1771                        /*
1772                         * we don't need to attach clean inodes or those only
1773                         * with unlogged changes (which we throw away, anyway).
1774                         */
1775                        iip = ip->i_itemp;
1776                        if (!iip || xfs_inode_clean(ip)) {
1777                                ASSERT(ip != free_ip);
1778                                ip->i_update_core = 0;
1779                                xfs_ifunlock(ip);
1780                                xfs_iunlock(ip, XFS_ILOCK_EXCL);
1781                                continue;
1782                        }
1783
1784                        iip->ili_last_fields = iip->ili_format.ilf_fields;
1785                        iip->ili_format.ilf_fields = 0;
1786                        iip->ili_logged = 1;
1787                        xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
1788                                                &iip->ili_item.li_lsn);
1789
1790                        xfs_buf_attach_iodone(bp, xfs_istale_done,
1791                                                  &iip->ili_item);
1792
1793                        if (ip != free_ip)
1794                                xfs_iunlock(ip, XFS_ILOCK_EXCL);
1795                }
1796
1797                xfs_trans_stale_inode_buf(tp, bp);
1798                xfs_trans_binval(tp, bp);
1799        }
1800
1801        xfs_perag_put(pag);
1802}
1803
1804/*
1805 * This is called to return an inode to the inode free list.
1806 * The inode should already be truncated to 0 length and have
1807 * no pages associated with it.  This routine also assumes that
1808 * the inode is already a part of the transaction.
1809 *
1810 * The on-disk copy of the inode will have been added to the list
1811 * of unlinked inodes in the AGI. We need to remove the inode from
1812 * that list atomically with respect to freeing it here.
1813 */
1814int
1815xfs_ifree(
1816        xfs_trans_t     *tp,
1817        xfs_inode_t     *ip,
1818        xfs_bmap_free_t *flist)
1819{
1820        int                     error;
1821        int                     delete;
1822        xfs_ino_t               first_ino;
1823        xfs_dinode_t            *dip;
1824        xfs_buf_t               *ibp;
1825
1826        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
1827        ASSERT(ip->i_d.di_nlink == 0);
1828        ASSERT(ip->i_d.di_nextents == 0);
1829        ASSERT(ip->i_d.di_anextents == 0);
1830        ASSERT((ip->i_d.di_size == 0 && ip->i_size == 0) ||
1831               (!S_ISREG(ip->i_d.di_mode)));
1832        ASSERT(ip->i_d.di_nblocks == 0);
1833
1834        /*
1835         * Pull the on-disk inode from the AGI unlinked list.
1836         */
1837        error = xfs_iunlink_remove(tp, ip);
1838        if (error != 0) {
1839                return error;
1840        }
1841
1842        error = xfs_difree(tp, ip->i_ino, flist, &delete, &first_ino);
1843        if (error != 0) {
1844                return error;
1845        }
1846        ip->i_d.di_mode = 0;            /* mark incore inode as free */
1847        ip->i_d.di_flags = 0;
1848        ip->i_d.di_dmevmask = 0;
1849        ip->i_d.di_forkoff = 0;         /* mark the attr fork not in use */
1850        ip->i_df.if_ext_max =
1851                XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
1852        ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS;
1853        ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
1854        /*
1855         * Bump the generation count so no one will be confused
1856         * by reincarnations of this inode.
1857         */
1858        ip->i_d.di_gen++;
1859
1860        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1861
1862        error = xfs_itobp(ip->i_mount, tp, ip, &dip, &ibp, XBF_LOCK);
1863        if (error)
1864                return error;
1865
1866        /*
1867        * Clear the on-disk di_mode. This is to prevent xfs_bulkstat
1868        * from picking up this inode when it is reclaimed (its incore state
1869        * initialzed but not flushed to disk yet). The in-core di_mode is
1870        * already cleared  and a corresponding transaction logged.
1871        * The hack here just synchronizes the in-core to on-disk
1872        * di_mode value in advance before the actual inode sync to disk.
1873        * This is OK because the inode is already unlinked and would never
1874        * change its di_mode again for this inode generation.
1875        * This is a temporary hack that would require a proper fix
1876        * in the future.
1877        */
1878        dip->di_mode = 0;
1879
1880        if (delete) {
1881                xfs_ifree_cluster(ip, tp, first_ino);
1882        }
1883
1884        return 0;
1885}
1886
1887/*
1888 * Reallocate the space for if_broot based on the number of records
1889 * being added or deleted as indicated in rec_diff.  Move the records
1890 * and pointers in if_broot to fit the new size.  When shrinking this
1891 * will eliminate holes between the records and pointers created by
1892 * the caller.  When growing this will create holes to be filled in
1893 * by the caller.
1894 *
1895 * The caller must not request to add more records than would fit in
1896 * the on-disk inode root.  If the if_broot is currently NULL, then
1897 * if we adding records one will be allocated.  The caller must also
1898 * not request that the number of records go below zero, although
1899 * it can go to zero.
1900 *
1901 * ip -- the inode whose if_broot area is changing
1902 * ext_diff -- the change in the number of records, positive or negative,
1903 *       requested for the if_broot array.
1904 */
1905void
1906xfs_iroot_realloc(
1907        xfs_inode_t             *ip,
1908        int                     rec_diff,
1909        int                     whichfork)
1910{
1911        struct xfs_mount        *mp = ip->i_mount;
1912        int                     cur_max;
1913        xfs_ifork_t             *ifp;
1914        struct xfs_btree_block  *new_broot;
1915        int                     new_max;
1916        size_t                  new_size;
1917        char                    *np;
1918        char                    *op;
1919
1920        /*
1921         * Handle the degenerate case quietly.
1922         */
1923        if (rec_diff == 0) {
1924                return;
1925        }
1926
1927        ifp = XFS_IFORK_PTR(ip, whichfork);
1928        if (rec_diff > 0) {
1929                /*
1930                 * If there wasn't any memory allocated before, just
1931                 * allocate it now and get out.
1932                 */
1933                if (ifp->if_broot_bytes == 0) {
1934                        new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(rec_diff);
1935                        ifp->if_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS);
1936                        ifp->if_broot_bytes = (int)new_size;
1937                        return;
1938                }
1939
1940                /*
1941                 * If there is already an existing if_broot, then we need
1942                 * to realloc() it and shift the pointers to their new
1943                 * location.  The records don't change location because
1944                 * they are kept butted up against the btree block header.
1945                 */
1946                cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0);
1947                new_max = cur_max + rec_diff;
1948                new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max);
1949                ifp->if_broot = kmem_realloc(ifp->if_broot, new_size,
1950                                (size_t)XFS_BMAP_BROOT_SPACE_CALC(cur_max), /* old size */
1951                                KM_SLEEP | KM_NOFS);
1952                op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
1953                                                     ifp->if_broot_bytes);
1954                np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
1955                                                     (int)new_size);
1956                ifp->if_broot_bytes = (int)new_size;
1957                ASSERT(ifp->if_broot_bytes <=
1958                        XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ);
1959                memmove(np, op, cur_max * (uint)sizeof(xfs_dfsbno_t));
1960                return;
1961        }
1962
1963        /*
1964         * rec_diff is less than 0.  In this case, we are shrinking the
1965         * if_broot buffer.  It must already exist.  If we go to zero
1966         * records, just get rid of the root and clear the status bit.
1967         */
1968        ASSERT((ifp->if_broot != NULL) && (ifp->if_broot_bytes > 0));
1969        cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0);
1970        new_max = cur_max + rec_diff;
1971        ASSERT(new_max >= 0);
1972        if (new_max > 0)
1973                new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max);
1974        else
1975                new_size = 0;
1976        if (new_size > 0) {
1977                new_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS);
1978                /*
1979                 * First copy over the btree block header.
1980                 */
1981                memcpy(new_broot, ifp->if_broot, XFS_BTREE_LBLOCK_LEN);
1982        } else {
1983                new_broot = NULL;
1984                ifp->if_flags &= ~XFS_IFBROOT;
1985        }
1986
1987        /*
1988         * Only copy the records and pointers if there are any.
1989         */
1990        if (new_max > 0) {
1991                /*
1992                 * First copy the records.
1993                 */
1994                op = (char *)XFS_BMBT_REC_ADDR(mp, ifp->if_broot, 1);
1995                np = (char *)XFS_BMBT_REC_ADDR(mp, new_broot, 1);
1996                memcpy(np, op, new_max * (uint)sizeof(xfs_bmbt_rec_t));
1997
1998                /*
1999                 * Then copy the pointers.
2000                 */
2001                op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
2002                                                     ifp->if_broot_bytes);
2003                np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, new_broot, 1,
2004                                                     (int)new_size);
2005                memcpy(np, op, new_max * (uint)sizeof(xfs_dfsbno_t));
2006        }
2007        kmem_free(ifp->if_broot);
2008        ifp->if_broot = new_broot;
2009        ifp->if_broot_bytes = (int)new_size;
2010        ASSERT(ifp->if_broot_bytes <=
2011                XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ);
2012        return;
2013}
2014
2015
2016/*
2017 * This is called when the amount of space needed for if_data
2018 * is increased or decreased.  The change in size is indicated by
2019 * the number of bytes that need to be added or deleted in the
2020 * byte_diff parameter.
2021 *
2022 * If the amount of space needed has decreased below the size of the
2023 * inline buffer, then switch to using the inline buffer.  Otherwise,
2024 * use kmem_realloc() or kmem_alloc() to adjust the size of the buffer
2025 * to what is needed.
2026 *
2027 * ip -- the inode whose if_data area is changing
2028 * byte_diff -- the change in the number of bytes, positive or negative,
2029 *       requested for the if_data array.
2030 */
2031void
2032xfs_idata_realloc(
2033        xfs_inode_t     *ip,
2034        int             byte_diff,
2035        int             whichfork)
2036{
2037        xfs_ifork_t     *ifp;
2038        int             new_size;
2039        int             real_size;
2040
2041        if (byte_diff == 0) {
2042                return;
2043        }
2044
2045        ifp = XFS_IFORK_PTR(ip, whichfork);
2046        new_size = (int)ifp->if_bytes + byte_diff;
2047        ASSERT(new_size >= 0);
2048
2049        if (new_size == 0) {
2050                if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
2051                        kmem_free(ifp->if_u1.if_data);
2052                }
2053                ifp->if_u1.if_data = NULL;
2054                real_size = 0;
2055        } else if (new_size <= sizeof(ifp->if_u2.if_inline_data)) {
2056                /*
2057                 * If the valid extents/data can fit in if_inline_ext/data,
2058                 * copy them from the malloc'd vector and free it.
2059                 */
2060                if (ifp->if_u1.if_data == NULL) {
2061                        ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
2062                } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
2063                        ASSERT(ifp->if_real_bytes != 0);
2064                        memcpy(ifp->if_u2.if_inline_data, ifp->if_u1.if_data,
2065                              new_size);
2066                        kmem_free(ifp->if_u1.if_data);
2067                        ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
2068                }
2069                real_size = 0;
2070        } else {
2071                /*
2072                 * Stuck with malloc/realloc.
2073                 * For inline data, the underlying buffer must be
2074                 * a multiple of 4 bytes in size so that it can be
2075                 * logged and stay on word boundaries.  We enforce
2076                 * that here.
2077                 */
2078                real_size = roundup(new_size, 4);
2079                if (ifp->if_u1.if_data == NULL) {
2080                        ASSERT(ifp->if_real_bytes == 0);
2081                        ifp->if_u1.if_data = kmem_alloc(real_size,
2082                                                        KM_SLEEP | KM_NOFS);
2083                } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
2084                        /*
2085                         * Only do the realloc if the underlying size
2086                         * is really changing.
2087                         */
2088                        if (ifp->if_real_bytes != real_size) {
2089                                ifp->if_u1.if_data =
2090                                        kmem_realloc(ifp->if_u1.if_data,
2091                                                        real_size,
2092                                                        ifp->if_real_bytes,
2093                                                        KM_SLEEP | KM_NOFS);
2094                        }
2095                } else {
2096                        ASSERT(ifp->if_real_bytes == 0);
2097                        ifp->if_u1.if_data = kmem_alloc(real_size,
2098                                                        KM_SLEEP | KM_NOFS);
2099                        memcpy(ifp->if_u1.if_data, ifp->if_u2.if_inline_data,
2100                                ifp->if_bytes);
2101                }
2102        }
2103        ifp->if_real_bytes = real_size;
2104        ifp->if_bytes = new_size;
2105        ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork));
2106}
2107
2108void
2109xfs_idestroy_fork(
2110        xfs_inode_t     *ip,
2111        int             whichfork)
2112{
2113        xfs_ifork_t     *ifp;
2114
2115        ifp = XFS_IFORK_PTR(ip, whichfork);
2116        if (ifp->if_broot != NULL) {
2117                kmem_free(ifp->if_broot);
2118                ifp->if_broot = NULL;
2119        }
2120
2121        /*
2122         * If the format is local, then we can't have an extents
2123         * array so just look for an inline data array.  If we're
2124         * not local then we may or may not have an extents list,
2125         * so check and free it up if we do.
2126         */
2127        if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
2128                if ((ifp->if_u1.if_data != ifp->if_u2.if_inline_data) &&
2129                    (ifp->if_u1.if_data != NULL)) {
2130                        ASSERT(ifp->if_real_bytes != 0);
2131                        kmem_free(ifp->if_u1.if_data);
2132                        ifp->if_u1.if_data = NULL;
2133                        ifp->if_real_bytes = 0;
2134                }
2135        } else if ((ifp->if_flags & XFS_IFEXTENTS) &&
2136                   ((ifp->if_flags & XFS_IFEXTIREC) ||
2137                    ((ifp->if_u1.if_extents != NULL) &&
2138                     (ifp->if_u1.if_extents != ifp->if_u2.if_inline_ext)))) {
2139                ASSERT(ifp->if_real_bytes != 0);
2140                xfs_iext_destroy(ifp);
2141        }
2142        ASSERT(ifp->if_u1.if_extents == NULL ||
2143               ifp->if_u1.if_extents == ifp->if_u2.if_inline_ext);
2144        ASSERT(ifp->if_real_bytes == 0);
2145        if (whichfork == XFS_ATTR_FORK) {
2146                kmem_zone_free(xfs_ifork_zone, ip->i_afp);
2147                ip->i_afp = NULL;
2148        }
2149}
2150
2151/*
2152 * This is called to unpin an inode.  The caller must have the inode locked
2153 * in at least shared mode so that the buffer cannot be subsequently pinned
2154 * once someone is waiting for it to be unpinned.
2155 */
2156static void
2157xfs_iunpin_nowait(
2158        struct xfs_inode        *ip)
2159{
2160        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
2161
2162        trace_xfs_inode_unpin_nowait(ip, _RET_IP_);
2163
2164        /* Give the log a push to start the unpinning I/O */
2165        xfs_log_force_lsn(ip->i_mount, ip->i_itemp->ili_last_lsn, 0);
2166
2167}
2168
2169void
2170xfs_iunpin_wait(
2171        struct xfs_inode        *ip)
2172{
2173        if (xfs_ipincount(ip)) {
2174                xfs_iunpin_nowait(ip);
2175                wait_event(ip->i_ipin_wait, (xfs_ipincount(ip) == 0));
2176        }
2177}
2178
2179/*
2180 * xfs_iextents_copy()
2181 *
2182 * This is called to copy the REAL extents (as opposed to the delayed
2183 * allocation extents) from the inode into the given buffer.  It
2184 * returns the number of bytes copied into the buffer.
2185 *
2186 * If there are no delayed allocation extents, then we can just
2187 * memcpy() the extents into the buffer.  Otherwise, we need to
2188 * examine each extent in turn and skip those which are delayed.
2189 */
2190int
2191xfs_iextents_copy(
2192        xfs_inode_t             *ip,
2193        xfs_bmbt_rec_t          *dp,
2194        int                     whichfork)
2195{
2196        int                     copied;
2197        int                     i;
2198        xfs_ifork_t             *ifp;
2199        int                     nrecs;
2200        xfs_fsblock_t           start_block;
2201
2202        ifp = XFS_IFORK_PTR(ip, whichfork);
2203        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
2204        ASSERT(ifp->if_bytes > 0);
2205
2206        nrecs = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
2207        XFS_BMAP_TRACE_EXLIST(ip, nrecs, whichfork);
2208        ASSERT(nrecs > 0);
2209
2210        /*
2211         * There are some delayed allocation extents in the
2212         * inode, so copy the extents one at a time and skip
2213         * the delayed ones.  There must be at least one
2214         * non-delayed extent.
2215         */
2216        copied = 0;
2217        for (i = 0; i < nrecs; i++) {
2218                xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
2219                start_block = xfs_bmbt_get_startblock(ep);
2220                if (isnullstartblock(start_block)) {
2221                        /*
2222                         * It's a delayed allocation extent, so skip it.
2223                         */
2224                        continue;
2225                }
2226
2227                /* Translate to on disk format */
2228                put_unaligned(cpu_to_be64(ep->l0), &dp->l0);
2229                put_unaligned(cpu_to_be64(ep->l1), &dp->l1);
2230                dp++;
2231                copied++;
2232        }
2233        ASSERT(copied != 0);
2234        xfs_validate_extents(ifp, copied, XFS_EXTFMT_INODE(ip));
2235
2236        return (copied * (uint)sizeof(xfs_bmbt_rec_t));
2237}
2238
2239/*
2240 * Each of the following cases stores data into the same region
2241 * of the on-disk inode, so only one of them can be valid at
2242 * any given time. While it is possible to have conflicting formats
2243 * and log flags, e.g. having XFS_ILOG_?DATA set when the fork is
2244 * in EXTENTS format, this can only happen when the fork has
2245 * changed formats after being modified but before being flushed.
2246 * In these cases, the format always takes precedence, because the
2247 * format indicates the current state of the fork.
2248 */
2249/*ARGSUSED*/
2250STATIC void
2251xfs_iflush_fork(
2252        xfs_inode_t             *ip,
2253        xfs_dinode_t            *dip,
2254        xfs_inode_log_item_t    *iip,
2255        int                     whichfork,
2256        xfs_buf_t               *bp)
2257{
2258        char                    *cp;
2259        xfs_ifork_t             *ifp;
2260        xfs_mount_t             *mp;
2261#ifdef XFS_TRANS_DEBUG
2262        int                     first;
2263#endif
2264        static const short      brootflag[2] =
2265                { XFS_ILOG_DBROOT, XFS_ILOG_ABROOT };
2266        static const short      dataflag[2] =
2267                { XFS_ILOG_DDATA, XFS_ILOG_ADATA };
2268        static const short      extflag[2] =
2269                { XFS_ILOG_DEXT, XFS_ILOG_AEXT };
2270
2271        if (!iip)
2272                return;
2273        ifp = XFS_IFORK_PTR(ip, whichfork);
2274        /*
2275         * This can happen if we gave up in iformat in an error path,
2276         * for the attribute fork.
2277         */
2278        if (!ifp) {
2279                ASSERT(whichfork == XFS_ATTR_FORK);
2280                return;
2281        }
2282        cp = XFS_DFORK_PTR(dip, whichfork);
2283        mp = ip->i_mount;
2284        switch (XFS_IFORK_FORMAT(ip, whichfork)) {
2285        case XFS_DINODE_FMT_LOCAL:
2286                if ((iip->ili_format.ilf_fields & dataflag[whichfork]) &&
2287                    (ifp->if_bytes > 0)) {
2288                        ASSERT(ifp->if_u1.if_data != NULL);
2289                        ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork));
2290                        memcpy(cp, ifp->if_u1.if_data, ifp->if_bytes);
2291                }
2292                break;
2293
2294        case XFS_DINODE_FMT_EXTENTS:
2295                ASSERT((ifp->if_flags & XFS_IFEXTENTS) ||
2296                       !(iip->ili_format.ilf_fields & extflag[whichfork]));
2297                if ((iip->ili_format.ilf_fields & extflag[whichfork]) &&
2298                    (ifp->if_bytes > 0)) {
2299                        ASSERT(xfs_iext_get_ext(ifp, 0));
2300                        ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) > 0);
2301                        (void)xfs_iextents_copy(ip, (xfs_bmbt_rec_t *)cp,
2302                                whichfork);
2303                }
2304                break;
2305
2306        case XFS_DINODE_FMT_BTREE:
2307                if ((iip->ili_format.ilf_fields & brootflag[whichfork]) &&
2308                    (ifp->if_broot_bytes > 0)) {
2309                        ASSERT(ifp->if_broot != NULL);
2310                        ASSERT(ifp->if_broot_bytes <=
2311                               (XFS_IFORK_SIZE(ip, whichfork) +
2312                                XFS_BROOT_SIZE_ADJ));
2313                        xfs_bmbt_to_bmdr(mp, ifp->if_broot, ifp->if_broot_bytes,
2314                                (xfs_bmdr_block_t *)cp,
2315                                XFS_DFORK_SIZE(dip, mp, whichfork));
2316                }
2317                break;
2318
2319        case XFS_DINODE_FMT_DEV:
2320                if (iip->ili_format.ilf_fields & XFS_ILOG_DEV) {
2321                        ASSERT(whichfork == XFS_DATA_FORK);
2322                        xfs_dinode_put_rdev(dip, ip->i_df.if_u2.if_rdev);
2323                }
2324                break;
2325
2326        case XFS_DINODE_FMT_UUID:
2327                if (iip->ili_format.ilf_fields & XFS_ILOG_UUID) {
2328                        ASSERT(whichfork == XFS_DATA_FORK);
2329                        memcpy(XFS_DFORK_DPTR(dip),
2330                               &ip->i_df.if_u2.if_uuid,
2331                               sizeof(uuid_t));
2332                }
2333                break;
2334
2335        default:
2336                ASSERT(0);
2337                break;
2338        }
2339}
2340
2341STATIC int
2342xfs_iflush_cluster(
2343        xfs_inode_t     *ip,
2344        xfs_buf_t       *bp)
2345{
2346        xfs_mount_t             *mp = ip->i_mount;
2347        struct xfs_perag        *pag;
2348        unsigned long           first_index, mask;
2349        unsigned long           inodes_per_cluster;
2350        int                     ilist_size;
2351        xfs_inode_t             **ilist;
2352        xfs_inode_t             *iq;
2353        int                     nr_found;
2354        int                     clcount = 0;
2355        int                     bufwasdelwri;
2356        int                     i;
2357
2358        pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
2359
2360        inodes_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog;
2361        ilist_size = inodes_per_cluster * sizeof(xfs_inode_t *);
2362        ilist = kmem_alloc(ilist_size, KM_MAYFAIL|KM_NOFS);
2363        if (!ilist)
2364                goto out_put;
2365
2366        mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1);
2367        first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask;
2368        rcu_read_lock();
2369        /* really need a gang lookup range call here */
2370        nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist,
2371                                        first_index, inodes_per_cluster);
2372        if (nr_found == 0)
2373                goto out_free;
2374
2375        for (i = 0; i < nr_found; i++) {
2376                iq = ilist[i];
2377                if (iq == ip)
2378                        continue;
2379
2380                /*
2381                 * because this is an RCU protected lookup, we could find a
2382                 * recently freed or even reallocated inode during the lookup.
2383                 * We need to check under the i_flags_lock for a valid inode
2384                 * here. Skip it if it is not valid or the wrong inode.
2385                 */
2386                spin_lock(&ip->i_flags_lock);
2387                if (!ip->i_ino ||
2388                    (XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) {
2389                        spin_unlock(&ip->i_flags_lock);
2390                        continue;
2391                }
2392                spin_unlock(&ip->i_flags_lock);
2393
2394                /*
2395                 * Do an un-protected check to see if the inode is dirty and
2396                 * is a candidate for flushing.  These checks will be repeated
2397                 * later after the appropriate locks are acquired.
2398                 */
2399                if (xfs_inode_clean(iq) && xfs_ipincount(iq) == 0)
2400                        continue;
2401
2402                /*
2403                 * Try to get locks.  If any are unavailable or it is pinned,
2404                 * then this inode cannot be flushed and is skipped.
2405                 */
2406
2407                if (!xfs_ilock_nowait(iq, XFS_ILOCK_SHARED))
2408                        continue;
2409                if (!xfs_iflock_nowait(iq)) {
2410                        xfs_iunlock(iq, XFS_ILOCK_SHARED);
2411                        continue;
2412                }
2413                if (xfs_ipincount(iq)) {
2414                        xfs_ifunlock(iq);
2415                        xfs_iunlock(iq, XFS_ILOCK_SHARED);
2416                        continue;
2417                }
2418
2419                /*
2420                 * arriving here means that this inode can be flushed.  First
2421                 * re-check that it's dirty before flushing.
2422                 */
2423                if (!xfs_inode_clean(iq)) {
2424                        int     error;
2425                        error = xfs_iflush_int(iq, bp);
2426                        if (error) {
2427                                xfs_iunlock(iq, XFS_ILOCK_SHARED);
2428                                goto cluster_corrupt_out;
2429                        }
2430                        clcount++;
2431                } else {
2432                        xfs_ifunlock(iq);
2433                }
2434                xfs_iunlock(iq, XFS_ILOCK_SHARED);
2435        }
2436
2437        if (clcount) {
2438                XFS_STATS_INC(xs_icluster_flushcnt);
2439                XFS_STATS_ADD(xs_icluster_flushinode, clcount);
2440        }
2441
2442out_free:
2443        rcu_read_unlock();
2444        kmem_free(ilist);
2445out_put:
2446        xfs_perag_put(pag);
2447        return 0;
2448
2449
2450cluster_corrupt_out:
2451        /*
2452         * Corruption detected in the clustering loop.  Invalidate the
2453         * inode buffer and shut down the filesystem.
2454         */
2455        rcu_read_unlock();
2456        /*
2457         * Clean up the buffer.  If it was B_DELWRI, just release it --
2458         * brelse can handle it with no problems.  If not, shut down the
2459         * filesystem before releasing the buffer.
2460         */
2461        bufwasdelwri = XFS_BUF_ISDELAYWRITE(bp);
2462        if (bufwasdelwri)
2463                xfs_buf_relse(bp);
2464
2465        xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
2466
2467        if (!bufwasdelwri) {
2468                /*
2469                 * Just like incore_relse: if we have b_iodone functions,
2470                 * mark the buffer as an error and call them.  Otherwise
2471                 * mark it as stale and brelse.
2472                 */
2473                if (bp->b_iodone) {
2474                        XFS_BUF_UNDONE(bp);
2475                        XFS_BUF_STALE(bp);
2476                        xfs_buf_ioerror(bp, EIO);
2477                        xfs_buf_ioend(bp, 0);
2478                } else {
2479                        XFS_BUF_STALE(bp);
2480                        xfs_buf_relse(bp);
2481                }
2482        }
2483
2484        /*
2485         * Unlocks the flush lock
2486         */
2487        xfs_iflush_abort(iq);
2488        kmem_free(ilist);
2489        xfs_perag_put(pag);
2490        return XFS_ERROR(EFSCORRUPTED);
2491}
2492
2493/*
2494 * xfs_iflush() will write a modified inode's changes out to the
2495 * inode's on disk home.  The caller must have the inode lock held
2496 * in at least shared mode and the inode flush completion must be
2497 * active as well.  The inode lock will still be held upon return from
2498 * the call and the caller is free to unlock it.
2499 * The inode flush will be completed when the inode reaches the disk.
2500 * The flags indicate how the inode's buffer should be written out.
2501 */
2502int
2503xfs_iflush(
2504        xfs_inode_t             *ip,
2505        uint                    flags)
2506{
2507        xfs_inode_log_item_t    *iip;
2508        xfs_buf_t               *bp;
2509        xfs_dinode_t            *dip;
2510        xfs_mount_t             *mp;
2511        int                     error;
2512
2513        XFS_STATS_INC(xs_iflush_count);
2514
2515        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
2516        ASSERT(!completion_done(&ip->i_flush));
2517        ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
2518               ip->i_d.di_nextents > ip->i_df.if_ext_max);
2519
2520        iip = ip->i_itemp;
2521        mp = ip->i_mount;
2522
2523        /*
2524         * We can't flush the inode until it is unpinned, so wait for it if we
2525         * are allowed to block.  We know no one new can pin it, because we are
2526         * holding the inode lock shared and you need to hold it exclusively to
2527         * pin the inode.
2528         *
2529         * If we are not allowed to block, force the log out asynchronously so
2530         * that when we come back the inode will be unpinned. If other inodes
2531         * in the same cluster are dirty, they will probably write the inode
2532         * out for us if they occur after the log force completes.
2533         */
2534        if (!(flags & SYNC_WAIT) && xfs_ipincount(ip)) {
2535                xfs_iunpin_nowait(ip);
2536                xfs_ifunlock(ip);
2537                return EAGAIN;
2538        }
2539        xfs_iunpin_wait(ip);
2540
2541        /*
2542         * For stale inodes we cannot rely on the backing buffer remaining
2543         * stale in cache for the remaining life of the stale inode and so
2544         * xfs_itobp() below may give us a buffer that no longer contains
2545         * inodes below. We have to check this after ensuring the inode is
2546         * unpinned so that it is safe to reclaim the stale inode after the
2547         * flush call.
2548         */
2549        if (xfs_iflags_test(ip, XFS_ISTALE)) {
2550                xfs_ifunlock(ip);
2551                return 0;
2552        }
2553
2554        /*
2555         * This may have been unpinned because the filesystem is shutting
2556         * down forcibly. If that's the case we must not write this inode
2557         * to disk, because the log record didn't make it to disk!
2558         */
2559        if (XFS_FORCED_SHUTDOWN(mp)) {
2560                ip->i_update_core = 0;
2561                if (iip)
2562                        iip->ili_format.ilf_fields = 0;
2563                xfs_ifunlock(ip);
2564                return XFS_ERROR(EIO);
2565        }
2566
2567        /*
2568         * Get the buffer containing the on-disk inode.
2569         */
2570        error = xfs_itobp(mp, NULL, ip, &dip, &bp,
2571                                (flags & SYNC_TRYLOCK) ? XBF_TRYLOCK : XBF_LOCK);
2572        if (error || !bp) {
2573                xfs_ifunlock(ip);
2574                return error;
2575        }
2576
2577        /*
2578         * First flush out the inode that xfs_iflush was called with.
2579         */
2580        error = xfs_iflush_int(ip, bp);
2581        if (error)
2582                goto corrupt_out;
2583
2584        /*
2585         * If the buffer is pinned then push on the log now so we won't
2586         * get stuck waiting in the write for too long.
2587         */
2588        if (xfs_buf_ispinned(bp))
2589                xfs_log_force(mp, 0);
2590
2591        /*
2592         * inode clustering:
2593         * see if other inodes can be gathered into this write
2594         */
2595        error = xfs_iflush_cluster(ip, bp);
2596        if (error)
2597                goto cluster_corrupt_out;
2598
2599        if (flags & SYNC_WAIT)
2600                error = xfs_bwrite(mp, bp);
2601        else
2602                xfs_bdwrite(mp, bp);
2603        return error;
2604
2605corrupt_out:
2606        xfs_buf_relse(bp);
2607        xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
2608cluster_corrupt_out:
2609        /*
2610         * Unlocks the flush lock
2611         */
2612        xfs_iflush_abort(ip);
2613        return XFS_ERROR(EFSCORRUPTED);
2614}
2615
2616
2617STATIC int
2618xfs_iflush_int(
2619        xfs_inode_t             *ip,
2620        xfs_buf_t               *bp)
2621{
2622        xfs_inode_log_item_t    *iip;
2623        xfs_dinode_t            *dip;
2624        xfs_mount_t             *mp;
2625#ifdef XFS_TRANS_DEBUG
2626        int                     first;
2627#endif
2628
2629        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
2630        ASSERT(!completion_done(&ip->i_flush));
2631        ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
2632               ip->i_d.di_nextents > ip->i_df.if_ext_max);
2633
2634        iip = ip->i_itemp;
2635        mp = ip->i_mount;
2636
2637        /* set *dip = inode's place in the buffer */
2638        dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset);
2639
2640        /*
2641         * Clear i_update_core before copying out the data.
2642         * This is for coordination with our timestamp updates
2643         * that don't hold the inode lock. They will always
2644         * update the timestamps BEFORE setting i_update_core,
2645         * so if we clear i_update_core after they set it we
2646         * are guaranteed to see their updates to the timestamps.
2647         * I believe that this depends on strongly ordered memory
2648         * semantics, but we have that.  We use the SYNCHRONIZE
2649         * macro to make sure that the compiler does not reorder
2650         * the i_update_core access below the data copy below.
2651         */
2652        ip->i_update_core = 0;
2653        SYNCHRONIZE();
2654
2655        /*
2656         * Make sure to get the latest timestamps from the Linux inode.
2657         */
2658        xfs_synchronize_times(ip);
2659
2660        if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC),
2661                               mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) {
2662                xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
2663                        "%s: Bad inode %Lu magic number 0x%x, ptr 0x%p",
2664                        __func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip);
2665                goto corrupt_out;
2666        }
2667        if (XFS_TEST_ERROR(ip->i_d.di_magic != XFS_DINODE_MAGIC,
2668                                mp, XFS_ERRTAG_IFLUSH_2, XFS_RANDOM_IFLUSH_2)) {
2669                xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
2670                        "%s: Bad inode %Lu, ptr 0x%p, magic number 0x%x",
2671                        __func__, ip->i_ino, ip, ip->i_d.di_magic);
2672                goto corrupt_out;
2673        }
2674        if (S_ISREG(ip->i_d.di_mode)) {
2675                if (XFS_TEST_ERROR(
2676                    (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) &&
2677                    (ip->i_d.di_format != XFS_DINODE_FMT_BTREE),
2678                    mp, XFS_ERRTAG_IFLUSH_3, XFS_RANDOM_IFLUSH_3)) {
2679                        xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
2680                                "%s: Bad regular inode %Lu, ptr 0x%p",
2681                                __func__, ip->i_ino, ip);
2682                        goto corrupt_out;
2683                }
2684        } else if (S_ISDIR(ip->i_d.di_mode)) {
2685                if (XFS_TEST_ERROR(
2686                    (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) &&
2687                    (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) &&
2688                    (ip->i_d.di_format != XFS_DINODE_FMT_LOCAL),
2689                    mp, XFS_ERRTAG_IFLUSH_4, XFS_RANDOM_IFLUSH_4)) {
2690                        xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
2691                                "%s: Bad directory inode %Lu, ptr 0x%p",
2692                                __func__, ip->i_ino, ip);
2693                        goto corrupt_out;
2694                }
2695        }
2696        if (XFS_TEST_ERROR(ip->i_d.di_nextents + ip->i_d.di_anextents >
2697                                ip->i_d.di_nblocks, mp, XFS_ERRTAG_IFLUSH_5,
2698                                XFS_RANDOM_IFLUSH_5)) {
2699                xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
2700                        "%s: detected corrupt incore inode %Lu, "
2701                        "total extents = %d, nblocks = %Ld, ptr 0x%p",
2702                        __func__, ip->i_ino,
2703                        ip->i_d.di_nextents + ip->i_d.di_anextents,
2704                        ip->i_d.di_nblocks, ip);
2705                goto corrupt_out;
2706        }
2707        if (XFS_TEST_ERROR(ip->i_d.di_forkoff > mp->m_sb.sb_inodesize,
2708                                mp, XFS_ERRTAG_IFLUSH_6, XFS_RANDOM_IFLUSH_6)) {
2709                xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
2710                        "%s: bad inode %Lu, forkoff 0x%x, ptr 0x%p",
2711                        __func__, ip->i_ino, ip->i_d.di_forkoff, ip);
2712                goto corrupt_out;
2713        }
2714        /*
2715         * bump the flush iteration count, used to detect flushes which
2716         * postdate a log record during recovery.
2717         */
2718
2719        ip->i_d.di_flushiter++;
2720
2721        /*
2722         * Copy the dirty parts of the inode into the on-disk
2723         * inode.  We always copy out the core of the inode,
2724         * because if the inode is dirty at all the core must
2725         * be.
2726         */
2727        xfs_dinode_to_disk(dip, &ip->i_d);
2728
2729        /* Wrap, we never let the log put out DI_MAX_FLUSH */
2730        if (ip->i_d.di_flushiter == DI_MAX_FLUSH)
2731                ip->i_d.di_flushiter = 0;
2732
2733        /*
2734         * If this is really an old format inode and the superblock version
2735         * has not been updated to support only new format inodes, then
2736         * convert back to the old inode format.  If the superblock version
2737         * has been updated, then make the conversion permanent.
2738         */
2739        ASSERT(ip->i_d.di_version == 1 || xfs_sb_version_hasnlink(&mp->m_sb));
2740        if (ip->i_d.di_version == 1) {
2741                if (!xfs_sb_version_hasnlink(&mp->m_sb)) {
2742                        /*
2743                         * Convert it back.
2744                         */
2745                        ASSERT(ip->i_d.di_nlink <= XFS_MAXLINK_1);
2746                        dip->di_onlink = cpu_to_be16(ip->i_d.di_nlink);
2747                } else {
2748                        /*
2749                         * The superblock version has already been bumped,
2750                         * so just make the conversion to the new inode
2751                         * format permanent.
2752                         */
2753                        ip->i_d.di_version = 2;
2754                        dip->di_version = 2;
2755                        ip->i_d.di_onlink = 0;
2756                        dip->di_onlink = 0;
2757                        memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
2758                        memset(&(dip->di_pad[0]), 0,
2759                              sizeof(dip->di_pad));
2760                        ASSERT(xfs_get_projid(ip) == 0);
2761                }
2762        }
2763
2764        xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK, bp);
2765        if (XFS_IFORK_Q(ip))
2766                xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK, bp);
2767        xfs_inobp_check(mp, bp);
2768
2769        /*
2770         * We've recorded everything logged in the inode, so we'd
2771         * like to clear the ilf_fields bits so we don't log and
2772         * flush things unnecessarily.  However, we can't stop
2773         * logging all this information until the data we've copied
2774         * into the disk buffer is written to disk.  If we did we might
2775         * overwrite the copy of the inode in the log with all the
2776         * data after re-logging only part of it, and in the face of
2777         * a crash we wouldn't have all the data we need to recover.
2778         *
2779         * What we do is move the bits to the ili_last_fields field.
2780         * When logging the inode, these bits are moved back to the
2781         * ilf_fields field.  In the xfs_iflush_done() routine we
2782         * clear ili_last_fields, since we know that the information
2783         * those bits represent is permanently on disk.  As long as
2784         * the flush completes before the inode is logged again, then
2785         * both ilf_fields and ili_last_fields will be cleared.
2786         *
2787         * We can play with the ilf_fields bits here, because the inode
2788         * lock must be held exclusively in order to set bits there
2789         * and the flush lock protects the ili_last_fields bits.
2790         * Set ili_logged so the flush done
2791         * routine can tell whether or not to look in the AIL.
2792         * Also, store the current LSN of the inode so that we can tell
2793         * whether the item has moved in the AIL from xfs_iflush_done().
2794         * In order to read the lsn we need the AIL lock, because
2795         * it is a 64 bit value that cannot be read atomically.
2796         */
2797        if (iip != NULL && iip->ili_format.ilf_fields != 0) {
2798                iip->ili_last_fields = iip->ili_format.ilf_fields;
2799                iip->ili_format.ilf_fields = 0;
2800                iip->ili_logged = 1;
2801
2802                xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
2803                                        &iip->ili_item.li_lsn);
2804
2805                /*
2806                 * Attach the function xfs_iflush_done to the inode's
2807                 * buffer.  This will remove the inode from the AIL
2808                 * and unlock the inode's flush lock when the inode is
2809                 * completely written to disk.
2810                 */
2811                xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item);
2812
2813                ASSERT(bp->b_fspriv != NULL);
2814                ASSERT(bp->b_iodone != NULL);
2815        } else {
2816                /*
2817                 * We're flushing an inode which is not in the AIL and has
2818                 * not been logged but has i_update_core set.  For this
2819                 * case we can use a B_DELWRI flush and immediately drop
2820                 * the inode flush lock because we can avoid the whole
2821                 * AIL state thing.  It's OK to drop the flush lock now,
2822                 * because we've already locked the buffer and to do anything
2823                 * you really need both.
2824                 */
2825                if (iip != NULL) {
2826                        ASSERT(iip->ili_logged == 0);
2827                        ASSERT(iip->ili_last_fields == 0);
2828                        ASSERT((iip->ili_item.li_flags & XFS_LI_IN_AIL) == 0);
2829                }
2830                xfs_ifunlock(ip);
2831        }
2832
2833        return 0;
2834
2835corrupt_out:
2836        return XFS_ERROR(EFSCORRUPTED);
2837}
2838
2839/*
2840 * Return a pointer to the extent record at file index idx.
2841 */
2842xfs_bmbt_rec_host_t *
2843xfs_iext_get_ext(
2844        xfs_ifork_t     *ifp,           /* inode fork pointer */
2845        xfs_extnum_t    idx)            /* index of target extent */
2846{
2847        ASSERT(idx >= 0);
2848        ASSERT(idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t));
2849
2850        if ((ifp->if_flags & XFS_IFEXTIREC) && (idx == 0)) {
2851                return ifp->if_u1.if_ext_irec->er_extbuf;
2852        } else if (ifp->if_flags & XFS_IFEXTIREC) {
2853                xfs_ext_irec_t  *erp;           /* irec pointer */
2854                int             erp_idx = 0;    /* irec index */
2855                xfs_extnum_t    page_idx = idx; /* ext index in target list */
2856
2857                erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 0);
2858                return &erp->er_extbuf[page_idx];
2859        } else if (ifp->if_bytes) {
2860                return &ifp->if_u1.if_extents[idx];
2861        } else {
2862                return NULL;
2863        }
2864}
2865
2866/*
2867 * Insert new item(s) into the extent records for incore inode
2868 * fork 'ifp'.  'count' new items are inserted at index 'idx'.
2869 */
2870void
2871xfs_iext_insert(
2872        xfs_inode_t     *ip,            /* incore inode pointer */
2873        xfs_extnum_t    idx,            /* starting index of new items */
2874        xfs_extnum_t    count,          /* number of inserted items */
2875        xfs_bmbt_irec_t *new,           /* items to insert */
2876        int             state)          /* type of extent conversion */
2877{
2878        xfs_ifork_t     *ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df;
2879        xfs_extnum_t    i;              /* extent record index */
2880
2881        trace_xfs_iext_insert(ip, idx, new, state, _RET_IP_);
2882
2883        ASSERT(ifp->if_flags & XFS_IFEXTENTS);
2884        xfs_iext_add(ifp, idx, count);
2885        for (i = idx; i < idx + count; i++, new++)
2886                xfs_bmbt_set_all(xfs_iext_get_ext(ifp, i), new);
2887}
2888
2889/*
2890 * This is called when the amount of space required for incore file
2891 * extents needs to be increased. The ext_diff parameter stores the
2892 * number of new extents being added and the idx parameter contains
2893 * the extent index where the new extents will be added. If the new
2894 * extents are being appended, then we just need to (re)allocate and
2895 * initialize the space. Otherwise, if the new extents are being
2896 * inserted into the middle of the existing entries, a bit more work
2897 * is required to make room for the new extents to be inserted. The
2898 * caller is responsible for filling in the new extent entries upon
2899 * return.
2900 */
2901void
2902xfs_iext_add(
2903        xfs_ifork_t     *ifp,           /* inode fork pointer */
2904        xfs_extnum_t    idx,            /* index to begin adding exts */
2905        int             ext_diff)       /* number of extents to add */
2906{
2907        int             byte_diff;      /* new bytes being added */
2908        int             new_size;       /* size of extents after adding */
2909        xfs_extnum_t    nextents;       /* number of extents in file */
2910
2911        nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
2912        ASSERT((idx >= 0) && (idx <= nextents));
2913        byte_diff = ext_diff * sizeof(xfs_bmbt_rec_t);
2914        new_size = ifp->if_bytes + byte_diff;
2915        /*
2916         * If the new number of extents (nextents + ext_diff)
2917         * fits inside the inode, then continue to use the inline
2918         * extent buffer.
2919         */
2920        if (nextents + ext_diff <= XFS_INLINE_EXTS) {
2921                if (idx < nextents) {
2922                        memmove(&ifp->if_u2.if_inline_ext[idx + ext_diff],
2923                                &ifp->if_u2.if_inline_ext[idx],
2924                                (nextents - idx) * sizeof(xfs_bmbt_rec_t));
2925                        memset(&ifp->if_u2.if_inline_ext[idx], 0, byte_diff);
2926                }
2927                ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
2928                ifp->if_real_bytes = 0;
2929        }
2930        /*
2931         * Otherwise use a linear (direct) extent list.
2932         * If the extents are currently inside the inode,
2933         * xfs_iext_realloc_direct will switch us from
2934         * inline to direct extent allocation mode.
2935         */
2936        else if (nextents + ext_diff <= XFS_LINEAR_EXTS) {
2937                xfs_iext_realloc_direct(ifp, new_size);
2938                if (idx < nextents) {
2939                        memmove(&ifp->if_u1.if_extents[idx + ext_diff],
2940                                &ifp->if_u1.if_extents[idx],
2941                                (nextents - idx) * sizeof(xfs_bmbt_rec_t));
2942                        memset(&ifp->if_u1.if_extents[idx], 0, byte_diff);
2943                }
2944        }
2945        /* Indirection array */
2946        else {
2947                xfs_ext_irec_t  *erp;
2948                int             erp_idx = 0;
2949                int             page_idx = idx;
2950
2951                ASSERT(nextents + ext_diff > XFS_LINEAR_EXTS);
2952                if (ifp->if_flags & XFS_IFEXTIREC) {
2953                        erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 1);
2954                } else {
2955                        xfs_iext_irec_init(ifp);
2956                        ASSERT(ifp->if_flags & XFS_IFEXTIREC);
2957                        erp = ifp->if_u1.if_ext_irec;
2958                }
2959                /* Extents fit in target extent page */
2960                if (erp && erp->er_extcount + ext_diff <= XFS_LINEAR_EXTS) {
2961                        if (page_idx < erp->er_extcount) {
2962                                memmove(&erp->er_extbuf[page_idx + ext_diff],
2963                                        &erp->er_extbuf[page_idx],
2964                                        (erp->er_extcount - page_idx) *
2965                                        sizeof(xfs_bmbt_rec_t));
2966                                memset(&erp->er_extbuf[page_idx], 0, byte_diff);
2967                        }
2968                        erp->er_extcount += ext_diff;
2969                        xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff);
2970                }
2971                /* Insert a new extent page */
2972                else if (erp) {
2973                        xfs_iext_add_indirect_multi(ifp,
2974                                erp_idx, page_idx, ext_diff);
2975                }
2976                /*
2977                 * If extent(s) are being appended to the last page in
2978                 * the indirection array and the new extent(s) don't fit
2979                 * in the page, then erp is NULL and erp_idx is set to
2980                 * the next index needed in the indirection array.
2981                 */
2982                else {
2983                        int     count = ext_diff;
2984
2985                        while (count) {
2986                                erp = xfs_iext_irec_new(ifp, erp_idx);
2987                                erp->er_extcount = count;
2988                                count -= MIN(count, (int)XFS_LINEAR_EXTS);
2989                                if (count) {
2990                                        erp_idx++;
2991                                }
2992                        }
2993                }
2994        }
2995        ifp->if_bytes = new_size;
2996}
2997
2998/*
2999 * This is called when incore extents are being added to the indirection
3000 * array and the new extents do not fit in the target extent list. The
3001 * erp_idx parameter contains the irec index for the target extent list
3002 * in the indirection array, and the idx parameter contains the extent
3003 * index within the list. The number of extents being added is stored
3004 * in the count parameter.
3005 *
3006 *    |-------|   |-------|
3007 *    |       |   |       |    idx - number of extents before idx
3008 *    |  idx  |   | count |
3009 *    |       |   |       |    count - number of extents being inserted at idx
3010 *    |-------|   |-------|
3011 *    | count |   | nex2  |    nex2 - number of extents after idx + count
3012 *    |-------|   |-------|
3013 */
3014void
3015xfs_iext_add_indirect_multi(
3016        xfs_ifork_t     *ifp,                   /* inode fork pointer */
3017        int             erp_idx,                /* target extent irec index */
3018        xfs_extnum_t    idx,                    /* index within target list */
3019        int             count)                  /* new extents being added */
3020{
3021        int             byte_diff;              /* new bytes being added */
3022        xfs_ext_irec_t  *erp;                   /* pointer to irec entry */
3023        xfs_extnum_t    ext_diff;               /* number of extents to add */
3024        xfs_extnum_t    ext_cnt;                /* new extents still needed */
3025        xfs_extnum_t    nex2;                   /* extents after idx + count */
3026        xfs_bmbt_rec_t  *nex2_ep = NULL;        /* temp list for nex2 extents */
3027        int             nlists;                 /* number of irec's (lists) */
3028
3029        ASSERT(ifp->if_flags & XFS_IFEXTIREC);
3030        erp = &ifp->if_u1.if_ext_irec[erp_idx];
3031        nex2 = erp->er_extcount - idx;
3032        nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
3033
3034        /*
3035         * Save second part of target extent list
3036         * (all extents past */
3037        if (nex2) {
3038                byte_diff = nex2 * sizeof(xfs_bmbt_rec_t);
3039                nex2_ep = (xfs_bmbt_rec_t *) kmem_alloc(byte_diff, KM_NOFS);
3040                memmove(nex2_ep, &erp->er_extbuf[idx], byte_diff);
3041                erp->er_extcount -= nex2;
3042                xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -nex2);
3043                memset(&erp->er_extbuf[idx], 0, byte_diff);
3044        }
3045
3046        /*
3047         * Add the new extents to the end of the target
3048         * list, then allocate new irec record(s) and
3049         * extent buffer(s) as needed to store the rest
3050         * of the new extents.
3051         */
3052        ext_cnt = count;
3053        ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS - erp->er_extcount);
3054        if (ext_diff) {
3055                erp->er_extcount += ext_diff;
3056                xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff);
3057                ext_cnt -= ext_diff;
3058        }
3059        while (ext_cnt) {
3060                erp_idx++;
3061                erp = xfs_iext_irec_new(ifp, erp_idx);
3062                ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS);
3063                erp->er_extcount = ext_diff;
3064                xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff);
3065                ext_cnt -= ext_diff;
3066        }
3067
3068        /* Add nex2 extents back to indirection array */
3069        if (nex2) {
3070                xfs_extnum_t    ext_avail;
3071                int             i;
3072
3073                byte_diff = nex2 * sizeof(xfs_bmbt_rec_t);
3074                ext_avail = XFS_LINEAR_EXTS - erp->er_extcount;
3075                i = 0;
3076                /*
3077                 * If nex2 extents fit in the current page, append
3078                 * nex2_ep after the new extents.
3079                 */
3080                if (nex2 <= ext_avail) {
3081                        i = erp->er_extcount;
3082                }
3083                /*
3084                 * Otherwise, check if space is available in the
3085                 * next page.
3086                 */
3087                else if ((erp_idx < nlists - 1) &&
3088                         (nex2 <= (ext_avail = XFS_LINEAR_EXTS -
3089                          ifp->if_u1.if_ext_irec[erp_idx+1].er_extcount))) {
3090                        erp_idx++;
3091                        erp++;
3092                        /* Create a hole for nex2 extents */
3093                        memmove(&erp->er_extbuf[nex2], erp->er_extbuf,
3094                                erp->er_extcount * sizeof(xfs_bmbt_rec_t));
3095                }
3096                /*
3097                 * Final choice, create a new extent page for
3098                 * nex2 extents.
3099                 */
3100                else {
3101                        erp_idx++;
3102                        erp = xfs_iext_irec_new(ifp, erp_idx);
3103                }
3104                memmove(&erp->er_extbuf[i], nex2_ep, byte_diff);
3105                kmem_free(nex2_ep);
3106                erp->er_extcount += nex2;
3107                xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, nex2);
3108        }
3109}
3110
3111/*
3112 * This is called when the amount of space required for incore file
3113 * extents needs to be decreased. The ext_diff parameter stores the
3114 * number of extents to be removed and the idx parameter contains
3115 * the extent index where the extents will be removed from.
3116 *
3117 * If the amount of space needed has decreased below the linear
3118 * limit, XFS_IEXT_BUFSZ, then switch to using the contiguous
3119 * extent array.  Otherwise, use kmem_realloc() to adjust the
3120 * size to what is needed.
3121 */
3122void
3123xfs_iext_remove(
3124        xfs_inode_t     *ip,            /* incore inode pointer */
3125        xfs_extnum_t    idx,            /* index to begin removing exts */
3126        int             ext_diff,       /* number of extents to remove */
3127        int             state)          /* type of extent conversion */
3128{
3129        xfs_ifork_t     *ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df;
3130        xfs_extnum_t    nextents;       /* number of extents in file */
3131        int             new_size;       /* size of extents after removal */
3132
3133        trace_xfs_iext_remove(ip, idx, state, _RET_IP_);
3134
3135        ASSERT(ext_diff > 0);
3136        nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
3137        new_size = (nextents - ext_diff) * sizeof(xfs_bmbt_rec_t);
3138
3139        if (new_size == 0) {
3140                xfs_iext_destroy(ifp);
3141        } else if (ifp->if_flags & XFS_IFEXTIREC) {
3142                xfs_iext_remove_indirect(ifp, idx, ext_diff);
3143        } else if (ifp->if_real_bytes) {
3144                xfs_iext_remove_direct(ifp, idx, ext_diff);
3145        } else {
3146                xfs_iext_remove_inline(ifp, idx, ext_diff);
3147        }
3148        ifp->if_bytes = new_size;
3149}
3150
3151/*
3152 * This removes ext_diff extents from the inline buffer, beginning
3153 * at extent index idx.
3154 */
3155void
3156xfs_iext_remove_inline(
3157        xfs_ifork_t     *ifp,           /* inode fork pointer */
3158        xfs_extnum_t    idx,            /* index to begin removing exts */
3159        int             ext_diff)       /* number of extents to remove */
3160{
3161        int             nextents;       /* number of extents in file */
3162
3163        ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
3164        ASSERT(idx < XFS_INLINE_EXTS);
3165        nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
3166        ASSERT(((nextents - ext_diff) > 0) &&
3167                (nextents - ext_diff) < XFS_INLINE_EXTS);
3168
3169        if (idx + ext_diff < nextents) {
3170                memmove(&ifp->if_u2.if_inline_ext[idx],
3171                        &ifp->if_u2.if_inline_ext[idx + ext_diff],
3172                        (nextents - (idx + ext_diff)) *
3173                         sizeof(xfs_bmbt_rec_t));
3174                memset(&ifp->if_u2.if_inline_ext[nextents - ext_diff],
3175                        0, ext_diff * sizeof(xfs_bmbt_rec_t));
3176        } else {
3177                memset(&ifp->if_u2.if_inline_ext[idx], 0,
3178                        ext_diff * sizeof(xfs_bmbt_rec_t));
3179        }
3180}
3181
3182/*
3183 * This removes ext_diff extents from a linear (direct) extent list,
3184 * beginning at extent index idx. If the extents are being removed
3185 * from the end of the list (ie. truncate) then we just need to re-
3186 * allocate the list to remove the extra space. Otherwise, if the
3187 * extents are being removed from the middle of the existing extent
3188 * entries, then we first need to move the extent records beginning
3189 * at idx + ext_diff up in the list to overwrite the records being
3190 * removed, then remove the extra space via kmem_realloc.
3191 */
3192void
3193xfs_iext_remove_direct(
3194        xfs_ifork_t     *ifp,           /* inode fork pointer */
3195        xfs_extnum_t    idx,            /* index to begin removing exts */
3196        int             ext_diff)       /* number of extents to remove */
3197{
3198        xfs_extnum_t    nextents;       /* number of extents in file */
3199        int             new_size;       /* size of extents after removal */
3200
3201        ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
3202        new_size = ifp->if_bytes -
3203                (ext_diff * sizeof(xfs_bmbt_rec_t));
3204        nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
3205
3206        if (new_size == 0) {
3207                xfs_iext_destroy(ifp);
3208                return;
3209        }
3210        /* Move extents up in the list (if needed) */
3211        if (idx + ext_diff < nextents) {
3212                memmove(&ifp->if_u1.if_extents[idx],
3213                        &ifp->if_u1.if_extents[idx + ext_diff],
3214                        (nextents - (idx + ext_diff)) *
3215                         sizeof(xfs_bmbt_rec_t));
3216        }
3217        memset(&ifp->if_u1.if_extents[nextents - ext_diff],
3218                0, ext_diff * sizeof(xfs_bmbt_rec_t));
3219        /*
3220         * Reallocate the direct extent list. If the extents
3221         * will fit inside the inode then xfs_iext_realloc_direct
3222         * will switch from direct to inline extent allocation
3223         * mode for us.
3224         */
3225        xfs_iext_realloc_direct(ifp, new_size);
3226        ifp->if_bytes = new_size;
3227}
3228
3229/*
3230 * This is called when incore extents are being removed from the
3231 * indirection array and the extents being removed span multiple extent
3232 * buffers. The idx parameter contains the file extent index where we
3233 * want to begin removing extents, and the count parameter contains
3234 * how many extents need to be removed.
3235 *
3236 *    |-------|   |-------|
3237 *    | nex1  |   |       |    nex1 - number of extents before idx
3238 *    |-------|   | count |
3239 *    |       |   |       |    count - number of extents being removed at idx
3240 *    | count |   |-------|
3241 *    |       |   | nex2  |    nex2 - number of extents after idx + count
3242 *    |-------|   |-------|
3243 */
3244void
3245xfs_iext_remove_indirect(
3246        xfs_ifork_t     *ifp,           /* inode fork pointer */
3247        xfs_extnum_t    idx,            /* index to begin removing extents */
3248        int             count)          /* number of extents to remove */
3249{
3250        xfs_ext_irec_t  *erp;           /* indirection array pointer */
3251        int             erp_idx = 0;    /* indirection array index */
3252        xfs_extnum_t    ext_cnt;        /* extents left to remove */
3253        xfs_extnum_t    ext_diff;       /* extents to remove in current list */
3254        xfs_extnum_t    nex1;           /* number of extents before idx */
3255        xfs_extnum_t    nex2;           /* extents after idx + count */
3256        int             page_idx = idx; /* index in target extent list */
3257
3258        ASSERT(ifp->if_flags & XFS_IFEXTIREC);
3259        erp = xfs_iext_idx_to_irec(ifp,  &page_idx, &erp_idx, 0);
3260        ASSERT(erp != NULL);
3261        nex1 = page_idx;
3262        ext_cnt = count;
3263        while (ext_cnt) {
3264                nex2 = MAX((erp->er_extcount - (nex1 + ext_cnt)), 0);
3265                ext_diff = MIN(ext_cnt, (erp->er_extcount - nex1));
3266                /*
3267                 * Check for deletion of entire list;
3268                 * xfs_iext_irec_remove() updates extent offsets.
3269                 */
3270                if (ext_diff == erp->er_extcount) {
3271                        xfs_iext_irec_remove(ifp, erp_idx);
3272                        ext_cnt -= ext_diff;
3273                        nex1 = 0;
3274                        if (ext_cnt) {
3275                                ASSERT(erp_idx < ifp->if_real_bytes /
3276                                        XFS_IEXT_BUFSZ);
3277                                erp = &ifp->if_u1.if_ext_irec[erp_idx];
3278                                nex1 = 0;
3279                                continue;
3280                        } else {
3281                                break;
3282                        }
3283                }
3284                /* Move extents up (if needed) */
3285                if (nex2) {
3286                        memmove(&erp->er_extbuf[nex1],
3287                                &erp->er_extbuf[nex1 + ext_diff],
3288                                nex2 * sizeof(xfs_bmbt_rec_t));
3289                }
3290                /* Zero out rest of page */
3291                memset(&erp->er_extbuf[nex1 + nex2], 0, (XFS_IEXT_BUFSZ -
3292                        ((nex1 + nex2) * sizeof(xfs_bmbt_rec_t))));
3293                /* Update remaining counters */
3294                erp->er_extcount -= ext_diff;
3295                xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -ext_diff);
3296                ext_cnt -= ext_diff;
3297                nex1 = 0;
3298                erp_idx++;
3299                erp++;
3300        }
3301        ifp->if_bytes -= count * sizeof(xfs_bmbt_rec_t);
3302        xfs_iext_irec_compact(ifp);
3303}
3304
3305/*
3306 * Create, destroy, or resize a linear (direct) block of extents.
3307 */
3308void
3309xfs_iext_realloc_direct(
3310        xfs_ifork_t     *ifp,           /* inode fork pointer */
3311        int             new_size)       /* new size of extents */
3312{
3313        int             rnew_size;      /* real new size of extents */
3314
3315        rnew_size = new_size;
3316
3317        ASSERT(!(ifp->if_flags & XFS_IFEXTIREC) ||
3318                ((new_size >= 0) && (new_size <= XFS_IEXT_BUFSZ) &&
3319                 (new_size != ifp->if_real_bytes)));
3320
3321        /* Free extent records */
3322        if (new_size == 0) {
3323                xfs_iext_destroy(ifp);
3324        }
3325        /* Resize direct extent list and zero any new bytes */
3326        else if (ifp->if_real_bytes) {
3327                /* Check if extents will fit inside the inode */
3328                if (new_size <= XFS_INLINE_EXTS * sizeof(xfs_bmbt_rec_t)) {
3329                        xfs_iext_direct_to_inline(ifp, new_size /
3330                                (uint)sizeof(xfs_bmbt_rec_t));
3331                        ifp->if_bytes = new_size;
3332                        return;
3333                }
3334                if (!is_power_of_2(new_size)){
3335                        rnew_size = roundup_pow_of_two(new_size);
3336                }
3337                if (rnew_size != ifp->if_real_bytes) {
3338                        ifp->if_u1.if_extents =
3339                                kmem_realloc(ifp->if_u1.if_extents,
3340                                                rnew_size,
3341                                                ifp->if_real_bytes, KM_NOFS);
3342                }
3343                if (rnew_size > ifp->if_real_bytes) {
3344                        memset(&ifp->if_u1.if_extents[ifp->if_bytes /
3345                                (uint)sizeof(xfs_bmbt_rec_t)], 0,
3346                                rnew_size - ifp->if_real_bytes);
3347                }
3348        }
3349        /*
3350         * Switch from the inline extent buffer to a direct
3351         * extent list. Be sure to include the inline extent
3352         * bytes in new_size.
3353         */
3354        else {
3355                new_size += ifp->if_bytes;
3356                if (!is_power_of_2(new_size)) {
3357                        rnew_size = roundup_pow_of_two(new_size);
3358                }
3359                xfs_iext_inline_to_direct(ifp, rnew_size);
3360        }
3361        ifp->if_real_bytes = rnew_size;
3362        ifp->if_bytes = new_size;
3363}
3364
3365/*
3366 * Switch from linear (direct) extent records to inline buffer.
3367 */
3368void
3369xfs_iext_direct_to_inline(
3370        xfs_ifork_t     *ifp,           /* inode fork pointer */
3371        xfs_extnum_t    nextents)       /* number of extents in file */
3372{
3373        ASSERT(ifp->if_flags & XFS_IFEXTENTS);
3374        ASSERT(nextents <= XFS_INLINE_EXTS);
3375        /*
3376         * The inline buffer was zeroed when we switched
3377         * from inline to direct extent allocation mode,
3378         * so we don't need to clear it here.
3379         */
3380        memcpy(ifp->if_u2.if_inline_ext, ifp->if_u1.if_extents,
3381                nextents * sizeof(xfs_bmbt_rec_t));
3382        kmem_free(ifp->if_u1.if_extents);
3383        ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
3384        ifp->if_real_bytes = 0;
3385}
3386
3387/*
3388 * Switch from inline buffer to linear (direct) extent records.
3389 * new_size should already be rounded up to the next power of 2
3390 * by the caller (when appropriate), so use new_size as it is.
3391 * However, since new_size may be rounded up, we can't update
3392 * if_bytes here. It is the caller's responsibility to update
3393 * if_bytes upon return.
3394 */
3395void
3396xfs_iext_inline_to_direct(
3397        xfs_ifork_t     *ifp,           /* inode fork pointer */
3398        int             new_size)       /* number of extents in file */
3399{
3400        ifp->if_u1.if_extents = kmem_alloc(new_size, KM_NOFS);
3401        memset(ifp->if_u1.if_extents, 0, new_size);
3402        if (ifp->if_bytes) {
3403                memcpy(ifp->if_u1.if_extents, ifp->if_u2.if_inline_ext,
3404                        ifp->if_bytes);
3405                memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS *
3406                        sizeof(xfs_bmbt_rec_t));
3407        }
3408        ifp->if_real_bytes = new_size;
3409}
3410
3411/*
3412 * Resize an extent indirection array to new_size bytes.
3413 */
3414STATIC void
3415xfs_iext_realloc_indirect(
3416        xfs_ifork_t     *ifp,           /* inode fork pointer */
3417        int             new_size)       /* new indirection array size */
3418{
3419        int             nlists;         /* number of irec's (ex lists) */
3420        int             size;           /* current indirection array size */
3421
3422        ASSERT(ifp->if_flags & XFS_IFEXTIREC);
3423        nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
3424        size = nlists * sizeof(xfs_ext_irec_t);
3425        ASSERT(ifp->if_real_bytes);
3426        ASSERT((new_size >= 0) && (new_size != size));
3427        if (new_size == 0) {
3428                xfs_iext_destroy(ifp);
3429        } else {
3430                ifp->if_u1.if_ext_irec = (xfs_ext_irec_t *)
3431                        kmem_realloc(ifp->if_u1.if_ext_irec,
3432                                new_size, size, KM_NOFS);
3433        }
3434}
3435
3436/*
3437 * Switch from indirection array to linear (direct) extent allocations.
3438 */
3439STATIC void
3440xfs_iext_indirect_to_direct(
3441         xfs_ifork_t    *ifp)           /* inode fork pointer */
3442{
3443        xfs_bmbt_rec_host_t *ep;        /* extent record pointer */
3444        xfs_extnum_t    nextents;       /* number of extents in file */
3445        int             size;           /* size of file extents */
3446
3447        ASSERT(ifp->if_flags & XFS_IFEXTIREC);
3448        nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
3449        ASSERT(nextents <= XFS_LINEAR_EXTS);
3450        size = nextents * sizeof(xfs_bmbt_rec_t);
3451
3452        xfs_iext_irec_compact_pages(ifp);
3453        ASSERT(ifp->if_real_bytes == XFS_IEXT_BUFSZ);
3454
3455        ep = ifp->if_u1.if_ext_irec->er_extbuf;
3456        kmem_free(ifp->if_u1.if_ext_irec);
3457        ifp->if_flags &= ~XFS_IFEXTIREC;
3458        ifp->if_u1.if_extents = ep;
3459        ifp->if_bytes = size;
3460        if (nextents < XFS_LINEAR_EXTS) {
3461                xfs_iext_realloc_direct(ifp, size);
3462        }
3463}
3464
3465/*
3466 * Free incore file extents.
3467 */
3468void
3469xfs_iext_destroy(
3470        xfs_ifork_t     *ifp)           /* inode fork pointer */
3471{
3472        if (ifp->if_flags & XFS_IFEXTIREC) {
3473                int     erp_idx;
3474                int     nlists;
3475
3476                nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
3477                for (erp_idx = nlists - 1; erp_idx >= 0 ; erp_idx--) {
3478                        xfs_iext_irec_remove(ifp, erp_idx);
3479                }
3480                ifp->if_flags &= ~XFS_IFEXTIREC;
3481        } else if (ifp->if_real_bytes) {
3482                kmem_free(ifp->if_u1.if_extents);
3483        } else if (ifp->if_bytes) {
3484                memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS *
3485                        sizeof(xfs_bmbt_rec_t));
3486        }
3487        ifp->if_u1.if_extents = NULL;
3488        ifp->if_real_bytes = 0;
3489        ifp->if_bytes = 0;
3490}
3491
3492/*
3493 * Return a pointer to the extent record for file system block bno.
3494 */
3495xfs_bmbt_rec_host_t *                   /* pointer to found extent record */
3496xfs_iext_bno_to_ext(
3497        xfs_ifork_t     *ifp,           /* inode fork pointer */
3498        xfs_fileoff_t   bno,            /* block number to search for */
3499        xfs_extnum_t    *idxp)          /* index of target extent */
3500{
3501        xfs_bmbt_rec_host_t *base;      /* pointer to first extent */
3502        xfs_filblks_t   blockcount = 0; /* number of blocks in extent */
3503        xfs_bmbt_rec_host_t *ep = NULL; /* pointer to target extent */
3504        xfs_ext_irec_t  *erp = NULL;    /* indirection array pointer */
3505        int             high;           /* upper boundary in search */
3506        xfs_extnum_t    idx = 0;        /* index of target extent */
3507        int             low;            /* lower boundary in search */
3508        xfs_extnum_t    nextents;       /* number of file extents */
3509        xfs_fileoff_t   startoff = 0;   /* start offset of extent */
3510
3511        nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
3512        if (nextents == 0) {
3513                *idxp = 0;
3514                return NULL;
3515        }
3516        low = 0;
3517        if (ifp->if_flags & XFS_IFEXTIREC) {
3518                /* Find target extent list */
3519                int     erp_idx = 0;
3520                erp = xfs_iext_bno_to_irec(ifp, bno, &erp_idx);
3521                base = erp->er_extbuf;
3522                high = erp->er_extcount - 1;
3523        } else {
3524                base = ifp->if_u1.if_extents;
3525                high = nextents - 1;
3526        }
3527        /* Binary search extent records */
3528        while (low <= high) {
3529                idx = (low + high) >> 1;
3530                ep = base + idx;
3531                startoff = xfs_bmbt_get_startoff(ep);
3532                blockcount = xfs_bmbt_get_blockcount(ep);
3533                if (bno < startoff) {
3534                        high = idx - 1;
3535                } else if (bno >= startoff + blockcount) {
3536                        low = idx + 1;
3537                } else {
3538                        /* Convert back to file-based extent index */
3539                        if (ifp->if_flags & XFS_IFEXTIREC) {
3540                                idx += erp->er_extoff;
3541                        }
3542                        *idxp = idx;
3543                        return ep;
3544                }
3545        }
3546        /* Convert back to file-based extent index */
3547        if (ifp->if_flags & XFS_IFEXTIREC) {
3548                idx += erp->er_extoff;
3549        }
3550        if (bno >= startoff + blockcount) {
3551                if (++idx == nextents) {
3552                        ep = NULL;
3553                } else {
3554                        ep = xfs_iext_get_ext(ifp, idx);
3555                }
3556        }
3557        *idxp = idx;
3558        return ep;
3559}
3560
3561/*
3562 * Return a pointer to the indirection array entry containing the
3563 * extent record for filesystem block bno. Store the index of the
3564 * target irec in *erp_idxp.
3565 */
3566xfs_ext_irec_t *                        /* pointer to found extent record */
3567xfs_iext_bno_to_irec(
3568        xfs_ifork_t     *ifp,           /* inode fork pointer */
3569        xfs_fileoff_t   bno,            /* block number to search for */
3570        int             *erp_idxp)      /* irec index of target ext list */
3571{
3572        xfs_ext_irec_t  *erp = NULL;    /* indirection array pointer */
3573        xfs_ext_irec_t  *erp_next;      /* next indirection array entry */
3574        int             erp_idx;        /* indirection array index */
3575        int             nlists;         /* number of extent irec's (lists) */
3576        int             high;           /* binary search upper limit */
3577        int             low;            /* binary search lower limit */
3578
3579        ASSERT(ifp->if_flags & XFS_IFEXTIREC);
3580        nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
3581        erp_idx = 0;
3582        low = 0;
3583        high = nlists - 1;
3584        while (low <= high) {
3585                erp_idx = (low + high) >> 1;
3586                erp = &ifp->if_u1.if_ext_irec[erp_idx];
3587                erp_next = erp_idx < nlists - 1 ? erp + 1 : NULL;
3588                if (bno < xfs_bmbt_get_startoff(erp->er_extbuf)) {
3589                        high = erp_idx - 1;
3590                } else if (erp_next && bno >=
3591                           xfs_bmbt_get_startoff(erp_next->er_extbuf)) {
3592                        low = erp_idx + 1;
3593                } else {
3594                        break;
3595                }
3596        }
3597        *erp_idxp = erp_idx;
3598        return erp;
3599}
3600
3601/*
3602 * Return a pointer to the indirection array entry containing the
3603 * extent record at file extent index *idxp. Store the index of the
3604 * target irec in *erp_idxp and store the page index of the target
3605 * extent record in *idxp.
3606 */
3607xfs_ext_irec_t *
3608xfs_iext_idx_to_irec(
3609        xfs_ifork_t     *ifp,           /* inode fork pointer */
3610        xfs_extnum_t    *idxp,          /* extent index (file -> page) */
3611        int             *erp_idxp,      /* pointer to target irec */
3612        int             realloc)        /* new bytes were just added */
3613{
3614        xfs_ext_irec_t  *prev;          /* pointer to previous irec */
3615        xfs_ext_irec_t  *erp = NULL;    /* pointer to current irec */
3616        int             erp_idx;        /* indirection array index */
3617        int             nlists;         /* number of irec's (ex lists) */
3618        int             high;           /* binary search upper limit */
3619        int             low;            /* binary search lower limit */
3620        xfs_extnum_t    page_idx = *idxp; /* extent index in target list */
3621
3622        ASSERT(ifp->if_flags & XFS_IFEXTIREC);
3623        ASSERT(page_idx >= 0);
3624        ASSERT(page_idx <= ifp->if_bytes / sizeof(xfs_bmbt_rec_t));
3625        ASSERT(page_idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t) || realloc);
3626
3627        nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
3628        erp_idx = 0;
3629        low = 0;
3630        high = nlists - 1;
3631
3632        /* Binary search extent irec's */
3633        while (low <= high) {
3634                erp_idx = (low + high) >> 1;
3635                erp = &ifp->if_u1.if_ext_irec[erp_idx];
3636                prev = erp_idx > 0 ? erp - 1 : NULL;
3637                if (page_idx < erp->er_extoff || (page_idx == erp->er_extoff &&
3638                     realloc && prev && prev->er_extcount < XFS_LINEAR_EXTS)) {
3639                        high = erp_idx - 1;
3640                } else if (page_idx > erp->er_extoff + erp->er_extcount ||
3641                           (page_idx == erp->er_extoff + erp->er_extcount &&
3642                            !realloc)) {
3643                        low = erp_idx + 1;
3644                } else if (page_idx == erp->er_extoff + erp->er_extcount &&
3645                           erp->er_extcount == XFS_LINEAR_EXTS) {
3646                        ASSERT(realloc);
3647                        page_idx = 0;
3648                        erp_idx++;
3649                        erp = erp_idx < nlists ? erp + 1 : NULL;
3650                        break;
3651                } else {
3652                        page_idx -= erp->er_extoff;
3653                        break;
3654                }
3655        }
3656        *idxp = page_idx;
3657        *erp_idxp = erp_idx;
3658        return(erp);
3659}
3660
3661/*
3662 * Allocate and initialize an indirection array once the space needed
3663 * for incore extents increases above XFS_IEXT_BUFSZ.
3664 */
3665void
3666xfs_iext_irec_init(
3667        xfs_ifork_t     *ifp)           /* inode fork pointer */
3668{
3669        xfs_ext_irec_t  *erp;           /* indirection array pointer */
3670        xfs_extnum_t    nextents;       /* number of extents in file */
3671
3672        ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
3673        nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
3674        ASSERT(nextents <= XFS_LINEAR_EXTS);
3675
3676        erp = kmem_alloc(sizeof(xfs_ext_irec_t), KM_NOFS);
3677
3678        if (nextents == 0) {
3679                ifp->if_u1.if_extents = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS);
3680        } else if (!ifp->if_real_bytes) {
3681                xfs_iext_inline_to_direct(ifp, XFS_IEXT_BUFSZ);
3682        } else if (ifp->if_real_bytes < XFS_IEXT_BUFSZ) {
3683                xfs_iext_realloc_direct(ifp, XFS_IEXT_BUFSZ);
3684        }
3685        erp->er_extbuf = ifp->if_u1.if_extents;
3686        erp->er_extcount = nextents;
3687        erp->er_extoff = 0;
3688
3689        ifp->if_flags |= XFS_IFEXTIREC;
3690        ifp->if_real_bytes = XFS_IEXT_BUFSZ;
3691        ifp->if_bytes = nextents * sizeof(xfs_bmbt_rec_t);
3692        ifp->if_u1.if_ext_irec = erp;
3693
3694        return;
3695}
3696
3697/*
3698 * Allocate and initialize a new entry in the indirection array.
3699 */
3700xfs_ext_irec_t *
3701xfs_iext_irec_new(
3702        xfs_ifork_t     *ifp,           /* inode fork pointer */
3703        int             erp_idx)        /* index for new irec */
3704{
3705        xfs_ext_irec_t  *erp;           /* indirection array pointer */
3706        int             i;              /* loop counter */
3707        int             nlists;         /* number of irec's (ex lists) */
3708
3709        ASSERT(ifp->if_flags & XFS_IFEXTIREC);
3710        nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
3711
3712        /* Resize indirection array */
3713        xfs_iext_realloc_indirect(ifp, ++nlists *
3714                                  sizeof(xfs_ext_irec_t));
3715        /*
3716         * Move records down in the array so the
3717         * new page can use erp_idx.
3718         */
3719        erp = ifp->if_u1.if_ext_irec;
3720        for (i = nlists - 1; i > erp_idx; i--) {
3721                memmove(&erp[i], &erp[i-1], sizeof(xfs_ext_irec_t));
3722        }
3723        ASSERT(i == erp_idx);
3724
3725        /* Initialize new extent record */
3726        erp = ifp->if_u1.if_ext_irec;
3727        erp[erp_idx].er_extbuf = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS);
3728        ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ;
3729        memset(erp[erp_idx].er_extbuf, 0, XFS_IEXT_BUFSZ);
3730        erp[erp_idx].er_extcount = 0;
3731        erp[erp_idx].er_extoff = erp_idx > 0 ?
3732                erp[erp_idx-1].er_extoff + erp[erp_idx-1].er_extcount : 0;
3733        return (&erp[erp_idx]);
3734}
3735
3736/*
3737 * Remove a record from the indirection array.
3738 */
3739void
3740xfs_iext_irec_remove(
3741        xfs_ifork_t     *ifp,           /* inode fork pointer */
3742        int             erp_idx)        /* irec index to remove */
3743{
3744        xfs_ext_irec_t  *erp;           /* indirection array pointer */
3745        int             i;              /* loop counter */
3746        int             nlists;         /* number of irec's (ex lists) */
3747
3748        ASSERT(ifp->if_flags & XFS_IFEXTIREC);
3749        nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
3750        erp = &ifp->if_u1.if_ext_irec[erp_idx];
3751        if (erp->er_extbuf) {
3752                xfs_iext_irec_update_extoffs(ifp, erp_idx + 1,
3753                        -erp->er_extcount);
3754                kmem_free(erp->er_extbuf);
3755        }
3756        /* Compact extent records */
3757        erp = ifp->if_u1.if_ext_irec;
3758        for (i = erp_idx; i < nlists - 1; i++) {
3759                memmove(&erp[i], &erp[i+1], sizeof(xfs_ext_irec_t));
3760        }
3761        /*
3762         * Manually free the last extent record from the indirection
3763         * array.  A call to xfs_iext_realloc_indirect() with a size
3764         * of zero would result in a call to xfs_iext_destroy() which
3765         * would in turn call this function again, creating a nasty
3766         * infinite loop.
3767         */
3768        if (--nlists) {
3769                xfs_iext_realloc_indirect(ifp,
3770                        nlists * sizeof(xfs_ext_irec_t));
3771        } else {
3772                kmem_free(ifp->if_u1.if_ext_irec);
3773        }
3774        ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ;
3775}
3776
3777/*
3778 * This is called to clean up large amounts of unused memory allocated
3779 * by the indirection array.  Before compacting anything though, verify
3780 * that the indirection array is still needed and switch back to the
3781 * linear extent list (or even the inline buffer) if possible.  The
3782 * compaction policy is as follows:
3783 *
3784 *    Full Compaction: Extents fit into a single page (or inline buffer)
3785 * Partial Compaction: Extents occupy less than 50% of allocated space
3786 *      No Compaction: Extents occupy at least 50% of allocated space
3787 */
3788void
3789xfs_iext_irec_compact(
3790        xfs_ifork_t     *ifp)           /* inode fork pointer */
3791{
3792        xfs_extnum_t    nextents;       /* number of extents in file */
3793        int             nlists;         /* number of irec's (ex lists) */
3794
3795        ASSERT(ifp->if_flags & XFS_IFEXTIREC);
3796        nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
3797        nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
3798
3799        if (nextents == 0) {
3800                xfs_iext_destroy(ifp);
3801        } else if (nextents <= XFS_INLINE_EXTS) {
3802                xfs_iext_indirect_to_direct(ifp);
3803                xfs_iext_direct_to_inline(ifp, nextents);
3804        } else if (nextents <= XFS_LINEAR_EXTS) {
3805                xfs_iext_indirect_to_direct(ifp);
3806        } else if (nextents < (nlists * XFS_LINEAR_EXTS) >> 1) {
3807                xfs_iext_irec_compact_pages(ifp);
3808        }
3809}
3810
3811/*
3812 * Combine extents from neighboring extent pages.
3813 */
3814void
3815xfs_iext_irec_compact_pages(
3816        xfs_ifork_t     *ifp)           /* inode fork pointer */
3817{
3818        xfs_ext_irec_t  *erp, *erp_next;/* pointers to irec entries */
3819        int             erp_idx = 0;    /* indirection array index */
3820        int             nlists;         /* number of irec's (ex lists) */
3821
3822        ASSERT(ifp->if_flags & XFS_IFEXTIREC);
3823        nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
3824        while (erp_idx < nlists - 1) {
3825                erp = &ifp->if_u1.if_ext_irec[erp_idx];
3826                erp_next = erp + 1;
3827                if (erp_next->er_extcount <=
3828                    (XFS_LINEAR_EXTS - erp->er_extcount)) {
3829                        memcpy(&erp->er_extbuf[erp->er_extcount],
3830                                erp_next->er_extbuf, erp_next->er_extcount *
3831                                sizeof(xfs_bmbt_rec_t));
3832                        erp->er_extcount += erp_next->er_extcount;
3833                        /*
3834                         * Free page before removing extent record
3835                         * so er_extoffs don't get modified in
3836                         * xfs_iext_irec_remove.
3837                         */
3838                        kmem_free(erp_next->er_extbuf);
3839                        erp_next->er_extbuf = NULL;
3840                        xfs_iext_irec_remove(ifp, erp_idx + 1);
3841                        nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
3842                } else {
3843                        erp_idx++;
3844                }
3845        }
3846}
3847
3848/*
3849 * This is called to update the er_extoff field in the indirection
3850 * array when extents have been added or removed from one of the
3851 * extent lists. erp_idx contains the irec index to begin updating
3852 * at and ext_diff contains the number of extents that were added
3853 * or removed.
3854 */
3855void
3856xfs_iext_irec_update_extoffs(
3857        xfs_ifork_t     *ifp,           /* inode fork pointer */
3858        int             erp_idx,        /* irec index to update */
3859        int             ext_diff)       /* number of new extents */
3860{
3861        int             i;              /* loop counter */
3862        int             nlists;         /* number of irec's (ex lists */
3863
3864        ASSERT(ifp->if_flags & XFS_IFEXTIREC);
3865        nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
3866        for (i = erp_idx; i < nlists; i++) {
3867                ifp->if_u1.if_ext_irec[i].er_extoff += ext_diff;
3868        }
3869}
3870