linux/fs/xfs/xfs_vnodeops.c
/*
 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
 * All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write the Free Software Foundation,
 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */

#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_types.h"
#include "xfs_bit.h"
#include "xfs_log.h"
#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_dir2.h"
#include "xfs_dmapi.h"
#include "xfs_mount.h"
#include "xfs_da_btree.h"
#include "xfs_bmap_btree.h"
#include "xfs_alloc_btree.h"
#include "xfs_ialloc_btree.h"
#include "xfs_dir2_sf.h"
#include "xfs_attr_sf.h"
#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_inode_item.h"
#include "xfs_itable.h"
#include "xfs_btree.h"
#include "xfs_ialloc.h"
#include "xfs_alloc.h"
#include "xfs_bmap.h"
#include "xfs_attr.h"
#include "xfs_rw.h"
#include "xfs_error.h"
#include "xfs_quota.h"
#include "xfs_utils.h"
#include "xfs_rtalloc.h"
#include "xfs_refcache.h"
#include "xfs_trans_space.h"
#include "xfs_log_priv.h"
#include "xfs_filestream.h"
#include "xfs_vnodeops.h"

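/*
 * xfs_open
 *
 * Called when an inode is first opened.  For a directory that already
 * has extents allocated, kick off readahead of block 0, since a
 * readdir is the most likely next operation.
 */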
int
xfs_open(
        xfs_inode_t     *ip)
{
        int             mode;

        if (XFS_FORCED_SHUTDOWN(ip->i_mount))
                return XFS_ERROR(EIO);

        /*
         * If it's a directory with any blocks, read-ahead block 0
         * as we're almost certain to have the next operation be a read there.
         */
        if (S_ISDIR(ip->i_d.di_mode) && ip->i_d.di_nextents > 0) {
                mode = xfs_ilock_map_shared(ip);
                if (ip->i_d.di_nextents > 0)
                        (void)xfs_da_reada_buf(NULL, ip, 0, XFS_DATA_FORK);
                xfs_iunlock(ip, mode);
        }
        return 0;
}

/*
 * xfs_getattr - fill in the fields of a bhv_vattr_t from the in-core
 * inode.  Only the attributes requested in vap->va_mask are gathered,
 * so cheap callers (e.g. a size-only stat) return early.
 */
int
xfs_getattr(
        xfs_inode_t     *ip,
        bhv_vattr_t     *vap,
        int             flags)
{
        bhv_vnode_t     *vp = XFS_ITOV(ip);
        xfs_mount_t     *mp = ip->i_mount;

        vn_trace_entry(ip, __FUNCTION__, (inst_t *)__return_address);

        if (XFS_FORCED_SHUTDOWN(mp))
                return XFS_ERROR(EIO);

        if (!(flags & ATTR_LAZY))
                xfs_ilock(ip, XFS_ILOCK_SHARED);

        vap->va_size = XFS_ISIZE(ip);
        if (vap->va_mask == XFS_AT_SIZE)
                goto all_done;

        vap->va_nblocks =
                XFS_FSB_TO_BB(mp, ip->i_d.di_nblocks + ip->i_delayed_blks);
        vap->va_nodeid = ip->i_ino;
#if XFS_BIG_INUMS
        vap->va_nodeid += mp->m_inoadd;
#endif
        vap->va_nlink = ip->i_d.di_nlink;

        /*
         * Quick exit for non-stat callers
         */
        if ((vap->va_mask &
            ~(XFS_AT_SIZE|XFS_AT_FSID|XFS_AT_NODEID|
              XFS_AT_NLINK|XFS_AT_BLKSIZE)) == 0)
                goto all_done;

        /*
         * Copy from in-core inode.
         */
        vap->va_mode = ip->i_d.di_mode;
        vap->va_uid = ip->i_d.di_uid;
        vap->va_gid = ip->i_d.di_gid;
        vap->va_projid = ip->i_d.di_projid;

        /*
         * Check vnode type block/char vs. everything else.
         */
        switch (ip->i_d.di_mode & S_IFMT) {
        case S_IFBLK:
        case S_IFCHR:
                vap->va_rdev = ip->i_df.if_u2.if_rdev;
                vap->va_blocksize = BLKDEV_IOSIZE;
                break;
        default:
                vap->va_rdev = 0;

                if (!(ip->i_d.di_flags & XFS_DIFLAG_REALTIME)) {
                        vap->va_blocksize = xfs_preferred_iosize(mp);
                } else {

                        /*
                         * If the file blocks are being allocated from a
                         * realtime partition, then return the inode's
                         * realtime extent size or the realtime volume's
                         * extent size.
                         */
                        vap->va_blocksize =
                                xfs_get_extsz_hint(ip) << mp->m_sb.sb_blocklog;
                }
                break;
        }

        vn_atime_to_timespec(vp, &vap->va_atime);
        vap->va_mtime.tv_sec = ip->i_d.di_mtime.t_sec;
        vap->va_mtime.tv_nsec = ip->i_d.di_mtime.t_nsec;
        vap->va_ctime.tv_sec = ip->i_d.di_ctime.t_sec;
        vap->va_ctime.tv_nsec = ip->i_d.di_ctime.t_nsec;

        /*
         * Exit for stat callers.  See if any of the rest of the fields
         * to be filled in are needed.
         */
        if ((vap->va_mask &
             (XFS_AT_XFLAGS|XFS_AT_EXTSIZE|XFS_AT_NEXTENTS|XFS_AT_ANEXTENTS|
              XFS_AT_GENCOUNT|XFS_AT_VCODE)) == 0)
                goto all_done;

        /*
         * Convert di_flags to xflags.
         */
        vap->va_xflags = xfs_ip2xflags(ip);

        /*
         * Exit for inode revalidate.  See if any of the rest of
         * the fields to be filled in are needed.
         */
        if ((vap->va_mask &
             (XFS_AT_EXTSIZE|XFS_AT_NEXTENTS|XFS_AT_ANEXTENTS|
              XFS_AT_GENCOUNT|XFS_AT_VCODE)) == 0)
                goto all_done;

        vap->va_extsize = ip->i_d.di_extsize << mp->m_sb.sb_blocklog;
        vap->va_nextents =
                (ip->i_df.if_flags & XFS_IFEXTENTS) ?
                        ip->i_df.if_bytes / sizeof(xfs_bmbt_rec_t) :
                        ip->i_d.di_nextents;
        if (ip->i_afp)
                vap->va_anextents =
                        (ip->i_afp->if_flags & XFS_IFEXTENTS) ?
                                ip->i_afp->if_bytes / sizeof(xfs_bmbt_rec_t) :
                                 ip->i_d.di_anextents;
        else
                vap->va_anextents = 0;
        vap->va_gen = ip->i_d.di_gen;

 all_done:
        if (!(flags & ATTR_LAZY))
                xfs_iunlock(ip, XFS_ILOCK_SHARED);
        return 0;
}
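
/*
 * Example (hypothetical caller, for illustration only): because of the
 * early exits above, a size-only query does minimal work:
 *
 *         bhv_vattr_t     va;
 *
 *         va.va_mask = XFS_AT_SIZE;
 *         error = xfs_getattr(ip, &va, 0);
 *
 * With va_mask == XFS_AT_SIZE, xfs_getattr() returns as soon as
 * va_size has been filled in.
 */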


/*
 * xfs_setattr - change attributes of an inode: size, ownership, mode,
 * timestamps, extent size and flags.  Permission checks and quota
 * reservations are done up front; size changes run under their own
 * truncate transaction.
 */
int
xfs_setattr(
        xfs_inode_t             *ip,
        bhv_vattr_t             *vap,
        int                     flags,
        cred_t                  *credp)
{
        bhv_vnode_t             *vp = XFS_ITOV(ip);
        xfs_mount_t             *mp = ip->i_mount;
        xfs_trans_t             *tp;
        int                     mask;
        int                     code;
        uint                    lock_flags;
        uint                    commit_flags=0;
        uid_t                   uid=0, iuid=0;
        gid_t                   gid=0, igid=0;
        int                     timeflags = 0;
        xfs_prid_t              projid=0, iprojid=0;
        int                     mandlock_before, mandlock_after;
        struct xfs_dquot        *udqp, *gdqp, *olddquot1, *olddquot2;
        int                     file_owner;
        int                     need_iolock = 1;

        vn_trace_entry(ip, __FUNCTION__, (inst_t *)__return_address);

        if (mp->m_flags & XFS_MOUNT_RDONLY)
                return XFS_ERROR(EROFS);

        /*
         * Cannot set certain attributes.
         */
        mask = vap->va_mask;
        if (mask & XFS_AT_NOSET) {
                return XFS_ERROR(EINVAL);
        }

        if (XFS_FORCED_SHUTDOWN(mp))
                return XFS_ERROR(EIO);

        /*
         * Timestamps do not need to be logged and hence do not
         * need to be done within a transaction.
         */
        if (mask & XFS_AT_UPDTIMES) {
                ASSERT((mask & ~XFS_AT_UPDTIMES) == 0);
                timeflags = ((mask & XFS_AT_UPDATIME) ? XFS_ICHGTIME_ACC : 0) |
                            ((mask & XFS_AT_UPDCTIME) ? XFS_ICHGTIME_CHG : 0) |
                            ((mask & XFS_AT_UPDMTIME) ? XFS_ICHGTIME_MOD : 0);
                xfs_ichgtime(ip, timeflags);
                return 0;
        }

        olddquot1 = olddquot2 = NULL;
        udqp = gdqp = NULL;

        /*
         * If disk quotas is on, we make sure that the dquots do exist on disk,
         * before we start any other transactions. Trying to do this later
         * is messy. We don't care to take a readlock to look at the ids
         * in inode here, because we can't hold it across the trans_reserve.
         * If the IDs do change before we take the ilock, we're covered
         * because the i_*dquot fields will get updated anyway.
         */
        if (XFS_IS_QUOTA_ON(mp) &&
            (mask & (XFS_AT_UID|XFS_AT_GID|XFS_AT_PROJID))) {
                uint    qflags = 0;

                if ((mask & XFS_AT_UID) && XFS_IS_UQUOTA_ON(mp)) {
                        uid = vap->va_uid;
                        qflags |= XFS_QMOPT_UQUOTA;
                } else {
                        uid = ip->i_d.di_uid;
                }
                if ((mask & XFS_AT_GID) && XFS_IS_GQUOTA_ON(mp)) {
                        gid = vap->va_gid;
                        qflags |= XFS_QMOPT_GQUOTA;
                }  else {
                        gid = ip->i_d.di_gid;
                }
                if ((mask & XFS_AT_PROJID) && XFS_IS_PQUOTA_ON(mp)) {
                        projid = vap->va_projid;
                        qflags |= XFS_QMOPT_PQUOTA;
                }  else {
                        projid = ip->i_d.di_projid;
                }
                /*
                 * We take a reference when we initialize udqp and gdqp,
                 * so it is important that we never blindly double trip on
                 * the same variable. See xfs_create() for an example.
                 */
                ASSERT(udqp == NULL);
                ASSERT(gdqp == NULL);
                code = XFS_QM_DQVOPALLOC(mp, ip, uid, gid, projid, qflags,
                                         &udqp, &gdqp);
                if (code)
                        return code;
        }

        /*
         * For the other attributes, we acquire the inode lock and
         * first do an error checking pass.
         */
        tp = NULL;
        lock_flags = XFS_ILOCK_EXCL;
        if (flags & ATTR_NOLOCK)
                need_iolock = 0;
        if (!(mask & XFS_AT_SIZE)) {
                if ((mask != (XFS_AT_CTIME|XFS_AT_ATIME|XFS_AT_MTIME)) ||
                    (mp->m_flags & XFS_MOUNT_WSYNC)) {
                        tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
                        commit_flags = 0;
                        if ((code = xfs_trans_reserve(tp, 0,
                                                     XFS_ICHANGE_LOG_RES(mp), 0,
                                                     0, 0))) {
                                lock_flags = 0;
                                goto error_return;
                        }
                }
        } else {
                if (DM_EVENT_ENABLED(ip, DM_EVENT_TRUNCATE) &&
                    !(flags & ATTR_DMI)) {
                        int dmflags = AT_DELAY_FLAG(flags) | DM_SEM_FLAG_WR;
                        code = XFS_SEND_DATA(mp, DM_EVENT_TRUNCATE, vp,
                                vap->va_size, 0, dmflags, NULL);
                        if (code) {
                                lock_flags = 0;
                                goto error_return;
                        }
                }
                if (need_iolock)
                        lock_flags |= XFS_IOLOCK_EXCL;
        }

        xfs_ilock(ip, lock_flags);

        /* boolean: are we the file owner? */
        file_owner = (current_fsuid(credp) == ip->i_d.di_uid);

        /*
         * Change various properties of a file.
         * Only the owner or users with CAP_FOWNER
         * capability may do these things.
         */
        if (mask &
            (XFS_AT_MODE|XFS_AT_XFLAGS|XFS_AT_EXTSIZE|XFS_AT_UID|
             XFS_AT_GID|XFS_AT_PROJID)) {
                /*
                 * CAP_FOWNER overrides the following restrictions:
                 *
                 * The user ID of the calling process must be equal
                 * to the file owner ID, except in cases where the
                 * CAP_FSETID capability is applicable.
                 */
                if (!file_owner && !capable(CAP_FOWNER)) {
                        code = XFS_ERROR(EPERM);
                        goto error_return;
                }

                /*
                 * CAP_FSETID overrides the following restrictions:
                 *
                 * The effective user ID of the calling process shall match
                 * the file owner when setting the set-user-ID and
                 * set-group-ID bits on that file.
                 *
                 * The effective group ID or one of the supplementary group
                 * IDs of the calling process shall match the group owner of
                 * the file when setting the set-group-ID bit on that file
                 */
                if (mask & XFS_AT_MODE) {
                        mode_t m = 0;

                        if ((vap->va_mode & S_ISUID) && !file_owner)
                                m |= S_ISUID;
                        if ((vap->va_mode & S_ISGID) &&
                            !in_group_p((gid_t)ip->i_d.di_gid))
                                m |= S_ISGID;
#if 0
                        /* Linux allows this, Irix doesn't. */
                        if ((vap->va_mode & S_ISVTX) && !VN_ISDIR(vp))
                                m |= S_ISVTX;
#endif
                        if (m && !capable(CAP_FSETID))
                                vap->va_mode &= ~m;
                }
        }

        /*
         * Change file ownership.  Must be the owner or privileged.
         * If the system was configured with the "restricted_chown"
         * option, the owner is not permitted to give away the file,
         * and can change the group id only to a group of which he
         * or she is a member.
         */
        if (mask & (XFS_AT_UID|XFS_AT_GID|XFS_AT_PROJID)) {
                /*
                 * These IDs could have changed since we last looked at them.
                 * But, we're assured that if the ownership did change
                 * while we didn't have the inode locked, inode's dquot(s)
                 * would have changed also.
                 */
                iuid = ip->i_d.di_uid;
                iprojid = ip->i_d.di_projid;
                igid = ip->i_d.di_gid;
                gid = (mask & XFS_AT_GID) ? vap->va_gid : igid;
                uid = (mask & XFS_AT_UID) ? vap->va_uid : iuid;
                projid = (mask & XFS_AT_PROJID) ? (xfs_prid_t)vap->va_projid :
                         iprojid;

                /*
                 * CAP_CHOWN overrides the following restrictions:
                 *
                 * If _POSIX_CHOWN_RESTRICTED is defined, this capability
                 * shall override the restriction that a process cannot
                 * change the user ID of a file it owns and the restriction
                 * that the group ID supplied to the chown() function
                 * shall be equal to either the group ID or one of the
                 * supplementary group IDs of the calling process.
                 */
                if (restricted_chown &&
                    (iuid != uid || (igid != gid &&
                                     !in_group_p((gid_t)gid))) &&
                    !capable(CAP_CHOWN)) {
                        code = XFS_ERROR(EPERM);
                        goto error_return;
                }
                /*
                 * Do a quota reservation only if uid/projid/gid is actually
                 * going to change.
                 */
                if ((XFS_IS_UQUOTA_ON(mp) && iuid != uid) ||
                    (XFS_IS_PQUOTA_ON(mp) && iprojid != projid) ||
                    (XFS_IS_GQUOTA_ON(mp) && igid != gid)) {
                        ASSERT(tp);
                        code = XFS_QM_DQVOPCHOWNRESV(mp, tp, ip, udqp, gdqp,
                                                capable(CAP_FOWNER) ?
                                                XFS_QMOPT_FORCE_RES : 0);
                        if (code)       /* out of quota */
                                goto error_return;
                }
        }

        /*
         * Truncate file.  Must have write permission and not be a directory.
         */
        if (mask & XFS_AT_SIZE) {
                /* Short circuit the truncate case for zero length files */
                if ((vap->va_size == 0) &&
                   (ip->i_size == 0) && (ip->i_d.di_nextents == 0)) {
                        xfs_iunlock(ip, XFS_ILOCK_EXCL);
                        lock_flags &= ~XFS_ILOCK_EXCL;
                        if (mask & XFS_AT_CTIME)
                                xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
                        code = 0;
                        goto error_return;
                }

                if (VN_ISDIR(vp)) {
                        code = XFS_ERROR(EISDIR);
                        goto error_return;
                } else if (!VN_ISREG(vp)) {
                        code = XFS_ERROR(EINVAL);
                        goto error_return;
                }
                /*
                 * Make sure that the dquots are attached to the inode.
                 */
                if ((code = XFS_QM_DQATTACH(mp, ip, XFS_QMOPT_ILOCKED)))
                        goto error_return;
        }

        /*
         * Change file access or modified times.
         */
        if (mask & (XFS_AT_ATIME|XFS_AT_MTIME)) {
                if (!file_owner) {
                        if ((flags & ATTR_UTIME) &&
                            !capable(CAP_FOWNER)) {
                                code = XFS_ERROR(EPERM);
                                goto error_return;
                        }
                }
        }

        /*
         * Change extent size or realtime flag.
         */
        if (mask & (XFS_AT_EXTSIZE|XFS_AT_XFLAGS)) {
                /*
                 * Can't change extent size if any extents are allocated.
                 */
                if (ip->i_d.di_nextents && (mask & XFS_AT_EXTSIZE) &&
                    ((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) !=
                     vap->va_extsize) ) {
                        code = XFS_ERROR(EINVAL);       /* EFBIG? */
                        goto error_return;
                }

                /*
                 * Can't change realtime flag if any extents are allocated.
                 */
                if ((ip->i_d.di_nextents || ip->i_delayed_blks) &&
                    (mask & XFS_AT_XFLAGS) &&
                    (ip->i_d.di_flags & XFS_DIFLAG_REALTIME) !=
                    (vap->va_xflags & XFS_XFLAG_REALTIME)) {
                        code = XFS_ERROR(EINVAL);       /* EFBIG? */
                        goto error_return;
                }
                /*
                 * Extent size must be a multiple of the appropriate block
                 * size, if set at all.
                 */
                if ((mask & XFS_AT_EXTSIZE) && vap->va_extsize != 0) {
                        xfs_extlen_t    size;

                        if ((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) ||
                            ((mask & XFS_AT_XFLAGS) &&
                            (vap->va_xflags & XFS_XFLAG_REALTIME))) {
                                size = mp->m_sb.sb_rextsize <<
                                       mp->m_sb.sb_blocklog;
                        } else {
                                size = mp->m_sb.sb_blocksize;
                        }
                        if (vap->va_extsize % size) {
                                code = XFS_ERROR(EINVAL);
                                goto error_return;
                        }
                }
                /*
                 * If realtime flag is set then must have realtime data.
                 */
                if ((mask & XFS_AT_XFLAGS) &&
                    (vap->va_xflags & XFS_XFLAG_REALTIME)) {
                        if ((mp->m_sb.sb_rblocks == 0) ||
                            (mp->m_sb.sb_rextsize == 0) ||
                            (ip->i_d.di_extsize % mp->m_sb.sb_rextsize)) {
                                code = XFS_ERROR(EINVAL);
                                goto error_return;
                        }
                }

                /*
                 * Can't modify an immutable/append-only file unless
                 * we have appropriate permission.
                 */
                if ((mask & XFS_AT_XFLAGS) &&
                    (ip->i_d.di_flags &
                                (XFS_DIFLAG_IMMUTABLE|XFS_DIFLAG_APPEND) ||
                     (vap->va_xflags &
                                (XFS_XFLAG_IMMUTABLE | XFS_XFLAG_APPEND))) &&
                    !capable(CAP_LINUX_IMMUTABLE)) {
                        code = XFS_ERROR(EPERM);
                        goto error_return;
                }
        }

        /*
         * Now we can make the changes.  Before we join the inode
         * to the transaction, if XFS_AT_SIZE is set then take care of
         * the part of the truncation that must be done without the
         * inode lock.  This needs to be done before joining the inode
         * to the transaction, because the inode cannot be unlocked
         * once it is a part of the transaction.
         */
        if (mask & XFS_AT_SIZE) {
                code = 0;
                if ((vap->va_size > ip->i_size) &&
                    (flags & ATTR_NOSIZETOK) == 0) {
                        code = xfs_igrow_start(ip, vap->va_size, credp);
                }
                xfs_iunlock(ip, XFS_ILOCK_EXCL);

                /*
                 * We are going to log the inode size change in this
                 * transaction so any previous writes that are beyond the on
                 * disk EOF and the new EOF that have not been written out need
                 * to be written here. If we do not write the data out, we
                 * expose ourselves to the null files problem.
                 *
                 * Only flush from the on disk size to the smaller of the in
                 * memory file size or the new size as that's the range we
                 * really care about here and prevents waiting for other data
                 * not within the range we care about here.
                 */
                if (!code &&
                    (ip->i_size != ip->i_d.di_size) &&
                    (vap->va_size > ip->i_d.di_size)) {
                        code = xfs_flush_pages(ip,
                                        ip->i_d.di_size, vap->va_size,
                                        XFS_B_ASYNC, FI_NONE);
                }

                /* wait for all I/O to complete */
                vn_iowait(ip);

                if (!code)
                        code = xfs_itruncate_data(ip, vap->va_size);
                if (code) {
                        ASSERT(tp == NULL);
                        lock_flags &= ~XFS_ILOCK_EXCL;
                        ASSERT(lock_flags == XFS_IOLOCK_EXCL);
                        goto error_return;
                }
                tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE);
                if ((code = xfs_trans_reserve(tp, 0,
                                             XFS_ITRUNCATE_LOG_RES(mp), 0,
                                             XFS_TRANS_PERM_LOG_RES,
                                             XFS_ITRUNCATE_LOG_COUNT))) {
                        xfs_trans_cancel(tp, 0);
                        if (need_iolock)
                                xfs_iunlock(ip, XFS_IOLOCK_EXCL);
                        return code;
                }
                commit_flags = XFS_TRANS_RELEASE_LOG_RES;
                xfs_ilock(ip, XFS_ILOCK_EXCL);
        }

        if (tp) {
                xfs_trans_ijoin(tp, ip, lock_flags);
                xfs_trans_ihold(tp, ip);
        }

        /* determine whether mandatory locking mode changes */
        mandlock_before = MANDLOCK(vp, ip->i_d.di_mode);

        /*
         * Truncate file.  Must have write permission and not be a directory.
         */
        if (mask & XFS_AT_SIZE) {
                if (vap->va_size > ip->i_size) {
                        xfs_igrow_finish(tp, ip, vap->va_size,
                            !(flags & ATTR_DMI));
                } else if ((vap->va_size <= ip->i_size) ||
                           ((vap->va_size == 0) && ip->i_d.di_nextents)) {
                        /*
                         * signal a sync transaction unless
                         * we're truncating an already unlinked
                         * file on a wsync filesystem
                         */
                        code = xfs_itruncate_finish(&tp, ip,
                                            (xfs_fsize_t)vap->va_size,
                                            XFS_DATA_FORK,
                                            ((ip->i_d.di_nlink != 0 ||
                                              !(mp->m_flags & XFS_MOUNT_WSYNC))
                                             ? 1 : 0));
                        if (code)
                                goto abort_return;
                        /*
                         * Truncated "down", so we're removing references
                         * to old data here - if we now delay flushing for
                         * a long time, we expose ourselves unduly to the
                         * notorious NULL files problem.  So, we mark this
                         * vnode and flush it when the file is closed, and
                         * do not wait the usual (long) time for writeout.
                         */
                        xfs_iflags_set(ip, XFS_ITRUNCATED);
                }
                /*
                 * Have to do this even if the file's size doesn't change.
                 */
                timeflags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
        }

        /*
         * Change file access modes.
         */
        if (mask & XFS_AT_MODE) {
                ip->i_d.di_mode &= S_IFMT;
                ip->i_d.di_mode |= vap->va_mode & ~S_IFMT;

                xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
                timeflags |= XFS_ICHGTIME_CHG;
        }

        /*
         * Change file ownership.  Must be the owner or privileged.
         * If the system was configured with the "restricted_chown"
         * option, the owner is not permitted to give away the file,
         * and can change the group id only to a group of which he
         * or she is a member.
         */
        if (mask & (XFS_AT_UID|XFS_AT_GID|XFS_AT_PROJID)) {
                /*
                 * CAP_FSETID overrides the following restrictions:
                 *
                 * The set-user-ID and set-group-ID bits of a file will be
                 * cleared upon successful return from chown()
                 */
                if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) &&
                    !capable(CAP_FSETID)) {
                        ip->i_d.di_mode &= ~(S_ISUID|S_ISGID);
                }

                /*
                 * Change the ownerships and register quota modifications
                 * in the transaction.
                 */
                if (iuid != uid) {
                        if (XFS_IS_UQUOTA_ON(mp)) {
                                ASSERT(mask & XFS_AT_UID);
                                ASSERT(udqp);
                                olddquot1 = XFS_QM_DQVOPCHOWN(mp, tp, ip,
                                                        &ip->i_udquot, udqp);
                        }
                        ip->i_d.di_uid = uid;
                }
                if (igid != gid) {
                        if (XFS_IS_GQUOTA_ON(mp)) {
                                ASSERT(!XFS_IS_PQUOTA_ON(mp));
                                ASSERT(mask & XFS_AT_GID);
                                ASSERT(gdqp);
                                olddquot2 = XFS_QM_DQVOPCHOWN(mp, tp, ip,
                                                        &ip->i_gdquot, gdqp);
                        }
                        ip->i_d.di_gid = gid;
                }
                if (iprojid != projid) {
                        if (XFS_IS_PQUOTA_ON(mp)) {
                                ASSERT(!XFS_IS_GQUOTA_ON(mp));
                                ASSERT(mask & XFS_AT_PROJID);
                                ASSERT(gdqp);
                                olddquot2 = XFS_QM_DQVOPCHOWN(mp, tp, ip,
                                                        &ip->i_gdquot, gdqp);
                        }
                        ip->i_d.di_projid = projid;
                        /*
                         * We may have to rev the inode as well as
                         * the superblock version number since projids didn't
                         * exist before DINODE_VERSION_2 and SB_VERSION_NLINK.
                         */
                        if (ip->i_d.di_version == XFS_DINODE_VERSION_1)
                                xfs_bump_ino_vers2(tp, ip);
                }

                xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
                timeflags |= XFS_ICHGTIME_CHG;
        }


        /*
         * Change file access or modified times.
         */
        if (mask & (XFS_AT_ATIME|XFS_AT_MTIME)) {
                if (mask & XFS_AT_ATIME) {
                        ip->i_d.di_atime.t_sec = vap->va_atime.tv_sec;
                        ip->i_d.di_atime.t_nsec = vap->va_atime.tv_nsec;
                        ip->i_update_core = 1;
                        timeflags &= ~XFS_ICHGTIME_ACC;
                }
                if (mask & XFS_AT_MTIME) {
                        ip->i_d.di_mtime.t_sec = vap->va_mtime.tv_sec;
                        ip->i_d.di_mtime.t_nsec = vap->va_mtime.tv_nsec;
                        timeflags &= ~XFS_ICHGTIME_MOD;
                        timeflags |= XFS_ICHGTIME_CHG;
                }
                if (tp && (flags & ATTR_UTIME))
                        xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
        }

        /*
         * Change XFS-added attributes.
         */
        if (mask & (XFS_AT_EXTSIZE|XFS_AT_XFLAGS)) {
                if (mask & XFS_AT_EXTSIZE) {
                        /*
                         * Converting bytes to fs blocks.
                         */
                        ip->i_d.di_extsize = vap->va_extsize >>
                                mp->m_sb.sb_blocklog;
                }
                if (mask & XFS_AT_XFLAGS) {
                        uint    di_flags;

                        /* can't set PREALLOC this way, just preserve it */
                        di_flags = (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC);
                        if (vap->va_xflags & XFS_XFLAG_IMMUTABLE)
                                di_flags |= XFS_DIFLAG_IMMUTABLE;
                        if (vap->va_xflags & XFS_XFLAG_APPEND)
                                di_flags |= XFS_DIFLAG_APPEND;
                        if (vap->va_xflags & XFS_XFLAG_SYNC)
                                di_flags |= XFS_DIFLAG_SYNC;
                        if (vap->va_xflags & XFS_XFLAG_NOATIME)
                                di_flags |= XFS_DIFLAG_NOATIME;
                        if (vap->va_xflags & XFS_XFLAG_NODUMP)
                                di_flags |= XFS_DIFLAG_NODUMP;
                        if (vap->va_xflags & XFS_XFLAG_PROJINHERIT)
                                di_flags |= XFS_DIFLAG_PROJINHERIT;
                        if (vap->va_xflags & XFS_XFLAG_NODEFRAG)
                                di_flags |= XFS_DIFLAG_NODEFRAG;
                        if (vap->va_xflags & XFS_XFLAG_FILESTREAM)
                                di_flags |= XFS_DIFLAG_FILESTREAM;
                        if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) {
                                if (vap->va_xflags & XFS_XFLAG_RTINHERIT)
                                        di_flags |= XFS_DIFLAG_RTINHERIT;
                                if (vap->va_xflags & XFS_XFLAG_NOSYMLINKS)
                                        di_flags |= XFS_DIFLAG_NOSYMLINKS;
                                if (vap->va_xflags & XFS_XFLAG_EXTSZINHERIT)
                                        di_flags |= XFS_DIFLAG_EXTSZINHERIT;
                        } else if ((ip->i_d.di_mode & S_IFMT) == S_IFREG) {
                                if (vap->va_xflags & XFS_XFLAG_REALTIME) {
                                        di_flags |= XFS_DIFLAG_REALTIME;
                                        ip->i_iocore.io_flags |= XFS_IOCORE_RT;
                                } else {
                                        ip->i_iocore.io_flags &= ~XFS_IOCORE_RT;
                                }
                                if (vap->va_xflags & XFS_XFLAG_EXTSIZE)
                                        di_flags |= XFS_DIFLAG_EXTSIZE;
                        }
                        ip->i_d.di_flags = di_flags;
                }
                xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
                timeflags |= XFS_ICHGTIME_CHG;
        }

        /*
         * Change file inode change time only if XFS_AT_CTIME set
         * AND we have been called by a DMI function.
         */

        if ( (flags & ATTR_DMI) && (mask & XFS_AT_CTIME) ) {
                ip->i_d.di_ctime.t_sec = vap->va_ctime.tv_sec;
                ip->i_d.di_ctime.t_nsec = vap->va_ctime.tv_nsec;
                ip->i_update_core = 1;
                timeflags &= ~XFS_ICHGTIME_CHG;
        }

        /*
         * Send out timestamp changes that need to be set to the
         * current time.  Not done when called by a DMI function.
         */
        if (timeflags && !(flags & ATTR_DMI))
                xfs_ichgtime(ip, timeflags);

        XFS_STATS_INC(xs_ig_attrchg);

        /*
         * If this is a synchronous mount, make sure that the
         * transaction goes to disk before returning to the user.
         * This is slightly sub-optimal in that truncates require
         * two sync transactions instead of one for wsync filesystems.
         * One for the truncate and one for the timestamps since we
         * don't want to change the timestamps unless we're sure the
         * truncate worked.  Truncates are less than 1% of the laddis
         * mix so this probably isn't worth the trouble to optimize.
         */
        code = 0;
        if (tp) {
                if (mp->m_flags & XFS_MOUNT_WSYNC)
                        xfs_trans_set_sync(tp);

                code = xfs_trans_commit(tp, commit_flags);
        }

        /*
         * If the (regular) file's mandatory locking mode changed, then
         * notify the vnode.  We do this under the inode lock to prevent
         * racing calls to vop_vnode_change.
         */
        mandlock_after = MANDLOCK(vp, ip->i_d.di_mode);

        xfs_iunlock(ip, lock_flags);

        /*
         * Release any dquot(s) the inode had kept before chown.
         */
        XFS_QM_DQRELE(mp, olddquot1);
        XFS_QM_DQRELE(mp, olddquot2);
        XFS_QM_DQRELE(mp, udqp);
        XFS_QM_DQRELE(mp, gdqp);

        if (code) {
                return code;
        }

        if (DM_EVENT_ENABLED(ip, DM_EVENT_ATTRIBUTE) &&
            !(flags & ATTR_DMI)) {
                (void) XFS_SEND_NAMESP(mp, DM_EVENT_ATTRIBUTE, vp, DM_RIGHT_NULL,
                                        NULL, DM_RIGHT_NULL, NULL, NULL,
                                        0, 0, AT_DELAY_FLAG(flags));
        }
        return 0;

 abort_return:
        commit_flags |= XFS_TRANS_ABORT;
        /* FALLTHROUGH */
 error_return:
        XFS_QM_DQRELE(mp, udqp);
        XFS_QM_DQRELE(mp, gdqp);
        if (tp) {
                xfs_trans_cancel(tp, commit_flags);
        }
        if (lock_flags != 0) {
                xfs_iunlock(ip, lock_flags);
        }
        return code;
}
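
/*
 * Example (hypothetical caller, for illustration only): truncating a
 * file to 4096 bytes through this interface would look like:
 *
 *         bhv_vattr_t     va;
 *
 *         va.va_mask = XFS_AT_SIZE;
 *         va.va_size = 4096;
 *         error = xfs_setattr(ip, &va, 0, credp);
 *
 * The XFS_AT_SIZE branch above then flushes and truncates the data in
 * its own permanent-log-reservation transaction.
 */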


/*
 * xfs_access
 * Null conversion from vnode mode bits to inode mode bits, as in efs.
 */
int
xfs_access(
        xfs_inode_t     *ip,
        int             mode,
        cred_t          *credp)
{
        int             error;

        vn_trace_entry(ip, __FUNCTION__, (inst_t *)__return_address);

        xfs_ilock(ip, XFS_ILOCK_SHARED);
        error = xfs_iaccess(ip, mode, credp);
        xfs_iunlock(ip, XFS_ILOCK_SHARED);
        return error;
}


/*
 * The maximum pathlen is 1024 bytes. Since the minimum file system
 * blocksize is 512 bytes, we can get a max of 2 extents back from
 * bmapi.
 */
#define SYMLINK_MAPS 2

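/*
 * xfs_readlink_bmap - read the target of a symlink whose value is held
 * in extents rather than inline in the inode: map the blocks with
 * xfs_bmapi(), then copy each buffer into 'link'.
 */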
STATIC int
xfs_readlink_bmap(
        xfs_inode_t     *ip,
        char            *link)
{
        xfs_mount_t     *mp = ip->i_mount;
        int             pathlen = ip->i_d.di_size;
        int             nmaps = SYMLINK_MAPS;
        xfs_bmbt_irec_t mval[SYMLINK_MAPS];
        xfs_daddr_t     d;
        int             byte_cnt;
        int             offset = 0;
        int             n;
        xfs_buf_t       *bp;
        int             error = 0;

        error = xfs_bmapi(NULL, ip, 0, XFS_B_TO_FSB(mp, pathlen), 0, NULL, 0,
                        mval, &nmaps, NULL, NULL);
        if (error)
                goto out;

        for (n = 0; n < nmaps; n++) {
                d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
                byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);

                bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0);
                error = XFS_BUF_GETERROR(bp);
                if (error) {
                        xfs_ioerror_alert("xfs_readlink",
                                  ip->i_mount, bp, XFS_BUF_ADDR(bp));
                        xfs_buf_relse(bp);
                        goto out;
                }
                if (pathlen < byte_cnt)
                        byte_cnt = pathlen;
                pathlen -= byte_cnt;

                /*
                 * Copy each extent after the previous one; copying
                 * every chunk to link[0] would corrupt a target that
                 * spans two extents.
                 */
                memcpy(link + offset, XFS_BUF_PTR(bp), byte_cnt);
                offset += byte_cnt;
                xfs_buf_relse(bp);
        }

        link[ip->i_d.di_size] = '\0';
        error = 0;

 out:
        return error;
}

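/*
 * xfs_readlink - copy the symlink target into 'link'.  Short targets
 * are stored inline in the inode's data fork; longer ones are read
 * from disk via xfs_readlink_bmap() above.
 */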
int
xfs_readlink(
        xfs_inode_t     *ip,
        char            *link)
{
        xfs_mount_t     *mp = ip->i_mount;
        int             pathlen;
        int             error = 0;

        vn_trace_entry(ip, __FUNCTION__, (inst_t *)__return_address);

        if (XFS_FORCED_SHUTDOWN(mp))
                return XFS_ERROR(EIO);

        xfs_ilock(ip, XFS_ILOCK_SHARED);

        ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFLNK);
        ASSERT(ip->i_d.di_size <= MAXPATHLEN);

        pathlen = ip->i_d.di_size;
        if (!pathlen)
                goto out;

        if (ip->i_df.if_flags & XFS_IFINLINE) {
                memcpy(link, ip->i_df.if_u1.if_data, pathlen);
                link[pathlen] = '\0';
        } else {
                error = xfs_readlink_bmap(ip, link);
        }

 out:
        xfs_iunlock(ip, XFS_ILOCK_SHARED);
        return error;
}

/*
 * xfs_fsync
 *
 * This is called to sync the inode and its data out to disk.
 * We need to hold the I/O lock while flushing the data, and
 * the inode lock while flushing the inode.  The inode lock CANNOT
 * be held while flushing the data, so acquire after we're done
 * with that.
 */
int
xfs_fsync(
        xfs_inode_t     *ip,
        int             flag,
        xfs_off_t       start,
        xfs_off_t       stop)
{
        xfs_trans_t     *tp;
        int             error;
        int             log_flushed = 0, changed = 1;

        vn_trace_entry(ip, __FUNCTION__, (inst_t *)__return_address);

        ASSERT(start >= 0 && stop >= -1);

        if (XFS_FORCED_SHUTDOWN(ip->i_mount))
                return XFS_ERROR(EIO);

        if (flag & FSYNC_DATA)
                filemap_fdatawait(vn_to_inode(XFS_ITOV(ip))->i_mapping);

        /*
         * We always need to make sure that the required inode state
         * is safe on disk.  The vnode might be clean but we still
         * might need to force the log because of committed
         * transactions that haven't hit the disk yet.
         * Likewise, there could be unflushed non-transactional
         * changes to the inode core that have to go to disk.
         *
         * The following code depends on one assumption:  that
         * any transaction that changes an inode logs the core
         * because it has to change some field in the inode core
         * (typically nextents or nblocks).  That assumption
         * implies that any transactions against an inode will
         * catch any non-transactional updates.  If inode-altering
         * transactions exist that violate this assumption, the
         * code breaks.  Right now, it figures that if the involved
         * update_* field is clear and the inode is unpinned, the
         * inode is clean.  Either it's been flushed or it's been
         * committed and the commit has hit the disk unpinning the inode.
         * (Note that xfs_inode_item_format() called at commit clears
         * the update_* fields.)
         */
        xfs_ilock(ip, XFS_ILOCK_SHARED);

        /* If we are flushing data then we care about update_size
         * being set, otherwise we care about update_core
         */
        if ((flag & FSYNC_DATA) ?
                        (ip->i_update_size == 0) :
                        (ip->i_update_core == 0)) {
                /*
                 * Timestamps/size haven't changed since last inode
                 * flush or inode transaction commit.  That means
                 * either nothing got written or a transaction
                 * committed which caught the updates.  If the
                 * latter happened and the transaction hasn't
                 * hit the disk yet, the inode will still be
                 * pinned.  If it is, force the log.
                 */

                xfs_iunlock(ip, XFS_ILOCK_SHARED);

                if (xfs_ipincount(ip)) {
                        _xfs_log_force(ip->i_mount, (xfs_lsn_t)0,
                                      XFS_LOG_FORCE |
                                      ((flag & FSYNC_WAIT)
                                       ? XFS_LOG_SYNC : 0),
                                      &log_flushed);
                } else {
                        /*
                         * If the inode is not pinned and nothing
                         * has changed we don't need to flush the
                         * cache.
                         */
                        changed = 0;
                }
                error = 0;
        } else  {
                /*
                 * Kick off a transaction to log the inode
                 * core to get the updates.  Make it
                 * sync if FSYNC_WAIT is passed in (which
                 * is done by everybody but specfs).  The
                 * sync transaction will also force the log.
                 */
                xfs_iunlock(ip, XFS_ILOCK_SHARED);
                tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_FSYNC_TS);
                if ((error = xfs_trans_reserve(tp, 0,
                                XFS_FSYNC_TS_LOG_RES(ip->i_mount),
                                0, 0, 0)))  {
                        xfs_trans_cancel(tp, 0);
                        return error;
                }
                xfs_ilock(ip, XFS_ILOCK_EXCL);

                /*
                 * Note - it's possible that we might have pushed
                 * ourselves out of the way during trans_reserve
                 * which would flush the inode.  But there's no
                 * guarantee that the inode buffer has actually
                 * gone out yet (it's delwri).  Plus the buffer
                 * could be pinned anyway if it's part of an
                 * inode in another recent transaction.  So we
                 * play it safe and fire off the transaction anyway.
                 */
                xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
                xfs_trans_ihold(tp, ip);
                xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
                if (flag & FSYNC_WAIT)
                        xfs_trans_set_sync(tp);
                error = _xfs_trans_commit(tp, 0, &log_flushed);

                xfs_iunlock(ip, XFS_ILOCK_EXCL);
        }

        if ((ip->i_mount->m_flags & XFS_MOUNT_BARRIER) && changed) {
                /*
                 * If the log write didn't issue an ordered tag we need
                 * to flush the disk cache for the data device now.
                 */
                if (!log_flushed)
                        xfs_blkdev_issue_flush(ip->i_mount->m_ddev_targp);

                /*
                 * If this inode is on the RT dev we need to flush that
                 * cache as well.
                 */
                if (ip->i_d.di_flags & XFS_DIFLAG_REALTIME)
                        xfs_blkdev_issue_flush(ip->i_mount->m_rtdev_targp);
        }

        return error;
}
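
/*
 * Example (hypothetical caller, for illustration only): a whole-file,
 * synchronous fsync would be issued as
 *
 *         error = xfs_fsync(ip, FSYNC_WAIT, (xfs_off_t)0, (xfs_off_t)-1);
 *
 * where stop == -1 denotes end-of-file, per the start/stop assert at
 * the top of the function.
 */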

/*
 * This is called by xfs_inactive to free any blocks beyond eof
 * when the link count isn't zero and by xfs_dm_punch_hole() when
 * punching a hole to EOF.
 */
int
xfs_free_eofblocks(
        xfs_mount_t     *mp,
        xfs_inode_t     *ip,
        int             flags)
{
        xfs_trans_t     *tp;
        int             error;
        xfs_fileoff_t   end_fsb;
        xfs_fileoff_t   last_fsb;
        xfs_filblks_t   map_len;
        int             nimaps;
        xfs_bmbt_irec_t imap;
        int             use_iolock = (flags & XFS_FREE_EOF_LOCK);

        /*
         * Figure out if there are any blocks beyond the end
         * of the file.  If not, then there is nothing to do.
         */
        end_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)ip->i_size));
        last_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp));
        map_len = last_fsb - end_fsb;
        if (map_len <= 0)
                return 0;

        nimaps = 1;
        xfs_ilock(ip, XFS_ILOCK_SHARED);
        error = XFS_BMAPI(mp, NULL, &ip->i_iocore, end_fsb, map_len, 0,
                          NULL, 0, &imap, &nimaps, NULL, NULL);
        xfs_iunlock(ip, XFS_ILOCK_SHARED);

        if (!error && (nimaps != 0) &&
            (imap.br_startblock != HOLESTARTBLOCK ||
             ip->i_delayed_blks)) {
                /*
                 * Attach the dquots to the inode up front.
                 */
                if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
                        return error;

                /*
                 * There are blocks after the end of file.
                 * Free them up now by truncating the file to
                 * its current size.
                 */
                tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);

                /*
                 * Do the xfs_itruncate_start() call before
                 * reserving any log space because
                 * itruncate_start will call into the buffer
                 * cache and we can't do that within a
                 * transaction.
                 */
                if (use_iolock)
                        xfs_ilock(ip, XFS_IOLOCK_EXCL);
                error = xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE,
                                    ip->i_size);
                if (error) {
                        xfs_trans_cancel(tp, 0);
                        if (use_iolock)
                                xfs_iunlock(ip, XFS_IOLOCK_EXCL);
                        return error;
                }

                error = xfs_trans_reserve(tp, 0,
                                          XFS_ITRUNCATE_LOG_RES(mp),
                                          0, XFS_TRANS_PERM_LOG_RES,
                                          XFS_ITRUNCATE_LOG_COUNT);
                if (error) {
                        ASSERT(XFS_FORCED_SHUTDOWN(mp));
                        xfs_trans_cancel(tp, 0);
                        xfs_iunlock(ip, XFS_IOLOCK_EXCL);
                        return error;
                }

                xfs_ilock(ip, XFS_ILOCK_EXCL);
                xfs_trans_ijoin(tp, ip,
                                XFS_IOLOCK_EXCL |
                                XFS_ILOCK_EXCL);
                xfs_trans_ihold(tp, ip);

                error = xfs_itruncate_finish(&tp, ip,
                                             ip->i_size,
                                             XFS_DATA_FORK,
                                             0);
                /*
                 * If we get an error at this point we
                 * simply don't bother truncating the file.
                 */
                if (error) {
                        xfs_trans_cancel(tp,
                                         (XFS_TRANS_RELEASE_LOG_RES |
                                          XFS_TRANS_ABORT));
                } else {
                        error = xfs_trans_commit(tp,
                                                XFS_TRANS_RELEASE_LOG_RES);
                }
                xfs_iunlock(ip, (use_iolock ? (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)
                                            : XFS_ILOCK_EXCL));
        }
        return error;
}
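
/*
 * Example (hypothetical caller, for illustration only): the
 * xfs_inactive path mentioned above would invoke this as
 *
 *         error = xfs_free_eofblocks(mp, ip, XFS_FREE_EOF_LOCK);
 *
 * passing XFS_FREE_EOF_LOCK when the iolock is not already held, so
 * that the truncate above acquires and releases it itself.
 */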
1267
1268/*
1269 * Free a symlink that has blocks associated with it.
1270 */
1271STATIC int
1272xfs_inactive_symlink_rmt(
1273        xfs_inode_t     *ip,
1274        xfs_trans_t     **tpp)
1275{
1276        xfs_buf_t       *bp;
1277        int             committed;
1278        int             done;
1279        int             error;
1280        xfs_fsblock_t   first_block;
1281        xfs_bmap_free_t free_list;
1282        int             i;
1283        xfs_mount_t     *mp;
1284        xfs_bmbt_irec_t mval[SYMLINK_MAPS];
1285        int             nmaps;
1286        xfs_trans_t     *ntp;
1287        int             size;
1288        xfs_trans_t     *tp;
1289
1290        tp = *tpp;
1291        mp = ip->i_mount;
1292        ASSERT(ip->i_d.di_size > XFS_IFORK_DSIZE(ip));
1293        /*
1294         * We're freeing a symlink that has some
1295         * blocks allocated to it.  Free the
1296         * blocks here.  We know that we've got
1297         * either 1 or 2 extents and that we can
1298         * free them all in one bunmapi call.
1299         */
1300        ASSERT(ip->i_d.di_nextents > 0 && ip->i_d.di_nextents <= 2);
1301        if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
1302                        XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT))) {
1303                ASSERT(XFS_FORCED_SHUTDOWN(mp));
1304                xfs_trans_cancel(tp, 0);
1305                *tpp = NULL;
1306                return error;
1307        }
1308        /*
1309         * Lock the inode, fix the size, and join it to the transaction.
1310         * Hold it so in the normal path, we still have it locked for
1311         * the second transaction.  In the error paths we need it
1312         * held so the cancel won't rele it, see below.
1313         */
1314        xfs_ilock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1315        size = (int)ip->i_d.di_size;
1316        ip->i_d.di_size = 0;
1317        xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1318        xfs_trans_ihold(tp, ip);
1319        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1320        /*
1321         * Find the block(s) so we can inval and unmap them.
1322         */
1323        done = 0;
1324        XFS_BMAP_INIT(&free_list, &first_block);
1325        nmaps = ARRAY_SIZE(mval);
1326        if ((error = xfs_bmapi(tp, ip, 0, XFS_B_TO_FSB(mp, size),
1327                        XFS_BMAPI_METADATA, &first_block, 0, mval, &nmaps,
1328                        &free_list, NULL)))
1329                goto error0;
1330        /*
1331         * Invalidate the block(s).
1332         */
1333        for (i = 0; i < nmaps; i++) {
1334                bp = xfs_trans_get_buf(tp, mp->m_ddev_targp,
1335                        XFS_FSB_TO_DADDR(mp, mval[i].br_startblock),
1336                        XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0);
1337                xfs_trans_binval(tp, bp);
1338        }
1339        /*
1340         * Unmap the dead block(s) to the free_list.
1341         */
1342        if ((error = xfs_bunmapi(tp, ip, 0, size, XFS_BMAPI_METADATA, nmaps,
1343                        &first_block, &free_list, NULL, &done)))
1344                goto error1;
1345        ASSERT(done);
1346        /*
1347         * Commit the first transaction.  This logs the EFI and the inode.
1348         */
1349        if ((error = xfs_bmap_finish(&tp, &free_list, &committed)))
1350                goto error1;
1351        /*
1352         * The transaction must have been committed, since there were
1353         * actually extents freed by xfs_bunmapi.  See xfs_bmap_finish.
1354         * The new tp has the extent freeing and EFDs.
1355         */
1356        ASSERT(committed);
1357        /*
1358         * The first xact was committed, so add the inode to the new one.
1359         * Mark it dirty so it will be logged and moved forward in the log as
1360         * part of every commit.
1361         */
1362        xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1363        xfs_trans_ihold(tp, ip);
1364        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1365        /*
1366         * Get a new, empty transaction to return to our caller.
1367         */
1368        ntp = xfs_trans_dup(tp);
1369        /*
1370         * Commit the transaction containing extent freeing and EFDs.
1371         * If we get an error on the commit here or on the reserve below,
1372         * we need to unlock the inode since the new transaction doesn't
1373         * have the inode attached.
1374         */
1375        error = xfs_trans_commit(tp, 0);
1376        tp = ntp;
1377        if (error) {
1378                ASSERT(XFS_FORCED_SHUTDOWN(mp));
1379                goto error0;
1380        }
1381        /*
1382         * Remove the memory for extent descriptions (just bookkeeping).
1383         */
1384        if (ip->i_df.if_bytes)
1385                xfs_idata_realloc(ip, -ip->i_df.if_bytes, XFS_DATA_FORK);
1386        ASSERT(ip->i_df.if_bytes == 0);
1387        /*
1388         * Put an itruncate log reservation in the new transaction
1389         * for our caller.
1390         */
1391        if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
1392                        XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT))) {
1393                ASSERT(XFS_FORCED_SHUTDOWN(mp));
1394                goto error0;
1395        }
1396        /*
1397         * Return with the inode locked but not joined to the transaction.
1398         */
1399        *tpp = tp;
1400        return 0;
1401
1402 error1:
1403        xfs_bmap_cancel(&free_list);
1404 error0:
1405        /*
1406         * We have to come here with the inode locked and either
1407         * (held and joined to the transaction) or (not in the
1408         * transaction at all).  If the inode isn't held then the
1409         * cancel would iput it, but that's wrong since this is
1410         * inactive and the vnode ref count is already 0.
1411         * Cancel won't do anything to the inode if it is held, but
1412         * it still needs to stay locked until the cancel is done,
1413         * if it was joined to the transaction.
1414         */
1415        xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
1416        xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1417        *tpp = NULL;
1418        return error;
1419
1420}
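
/*
 * An aside on the pattern above: xfs_inactive_symlink_rmt() "rolls" its
 * permanent log reservation by committing the current transaction and
 * continuing in a duplicate of it.  A minimal sketch of that sequence,
 * under the same itruncate reservation used above (illustrative only,
 * not compiled; the helper name is hypothetical):
 */
#if 0
STATIC int
xfs_example_trans_roll(
        xfs_mount_t     *mp,
        xfs_trans_t     **tpp)
{
        xfs_trans_t     *ntp;
        int             error;

        ntp = xfs_trans_dup(*tpp);      /* duplicate carries the perm log res */
        error = xfs_trans_commit(*tpp, 0);
        *tpp = ntp;
        if (error)
                return error;           /* caller cancels ntp and unlocks */
        /* regrant log space in the duplicate before the next stage */
        return xfs_trans_reserve(ntp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
                        XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT);
}
#endif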
1421
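/*
 * Free a symlink whose target is short enough to live in the inode's
 * data fork itself (local format), so there are no extents to unmap.
 */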
1422STATIC int
1423xfs_inactive_symlink_local(
1424        xfs_inode_t     *ip,
1425        xfs_trans_t     **tpp)
1426{
1427        int             error;
1428
1429        ASSERT(ip->i_d.di_size <= XFS_IFORK_DSIZE(ip));
1430        /*
1431         * We're freeing a symlink that fits entirely inside
1432         * the inode.  Just free the memory used
1433         * to hold the old symlink.
1434         */
1435        error = xfs_trans_reserve(*tpp, 0,
1436                                  XFS_ITRUNCATE_LOG_RES(ip->i_mount),
1437                                  0, XFS_TRANS_PERM_LOG_RES,
1438                                  XFS_ITRUNCATE_LOG_COUNT);
1439
1440        if (error) {
1441                xfs_trans_cancel(*tpp, 0);
1442                *tpp = NULL;
1443                return error;
1444        }
1445        xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1446
1447        /*
1448         * Zero length symlinks _can_ exist.
1449         */
1450        if (ip->i_df.if_bytes > 0) {
1451                xfs_idata_realloc(ip,
1452                                  -(ip->i_df.if_bytes),
1453                                  XFS_DATA_FORK);
1454                ASSERT(ip->i_df.if_bytes == 0);
1455        }
1456        return 0;
1457}
1458
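/*
 * Tear down the attribute fork of an inode being inactivated.
 * xfs_attr_inactive() runs its own transactions, so the caller's
 * transaction is committed first and a fresh one with an ifree log
 * reservation is handed back in *tpp on success.
 */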
1459STATIC int
1460xfs_inactive_attrs(
1461        xfs_inode_t     *ip,
1462        xfs_trans_t     **tpp)
1463{
1464        xfs_trans_t     *tp;
1465        int             error;
1466        xfs_mount_t     *mp;
1467
1468        ASSERT(ismrlocked(&ip->i_iolock, MR_UPDATE));
1469        tp = *tpp;
1470        mp = ip->i_mount;
1471        ASSERT(ip->i_d.di_forkoff != 0);
1472        xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1473        xfs_iunlock(ip, XFS_ILOCK_EXCL);
1474
1475        error = xfs_attr_inactive(ip);
1476        if (error) {
1477                *tpp = NULL;
1478                xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1479                return error; /* goto out */
1480        }
1481
1482        tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
1483        error = xfs_trans_reserve(tp, 0,
1484                                  XFS_IFREE_LOG_RES(mp),
1485                                  0, XFS_TRANS_PERM_LOG_RES,
1486                                  XFS_INACTIVE_LOG_COUNT);
1487        if (error) {
1488                ASSERT(XFS_FORCED_SHUTDOWN(mp));
1489                xfs_trans_cancel(tp, 0);
1490                *tpp = NULL;
1491                xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1492                return error;
1493        }
1494
1495        xfs_ilock(ip, XFS_ILOCK_EXCL);
1496        xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1497        xfs_trans_ihold(tp, ip);
1498        xfs_idestroy_fork(ip, XFS_ATTR_FORK);
1499
1500        ASSERT(ip->i_d.di_anextents == 0);
1501
1502        *tpp = tp;
1503        return 0;
1504}
1505
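/*
 * xfs_release is called from the VFS release path when a regular file
 * is closed.  It is a convenient point to free speculatively
 * preallocated blocks beyond EOF and to kick off early writeout after
 * a truncate, rather than waiting for final inactivation.
 */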
1506int
1507xfs_release(
1508        xfs_inode_t     *ip)
1509{
1510        bhv_vnode_t     *vp = XFS_ITOV(ip);
1511        xfs_mount_t     *mp = ip->i_mount;
1512        int             error;
1513
1514        if (!VN_ISREG(vp) || (ip->i_d.di_mode == 0))
1515                return 0;
1516
1517        /* If this is a read-only mount, don't do this (would generate I/O) */
1518        if (mp->m_flags & XFS_MOUNT_RDONLY)
1519                return 0;
1520
1521        if (!XFS_FORCED_SHUTDOWN(mp)) {
1522                int truncated;
1523
1524                /*
1525                 * If we are using filestreams, and we have an unlinked
1526                 * file that we are processing the last close on, then nothing
1527                 * will be able to reopen and write to this file. Purge this
1528                 * inode from the filestreams cache so that it doesn't delay
1529                 * teardown of the inode.
1530                 */
1531                if ((ip->i_d.di_nlink == 0) && xfs_inode_is_filestream(ip))
1532                        xfs_filestream_deassociate(ip);
1533
1534                /*
1535                 * If we previously truncated this file and removed old data
1536                 * in the process, we want to initiate "early" writeout on
1537                 * the last close.  This is an attempt to combat the notorious
1538                 * NULL files problem which is particularly noticeable from a
1539                 * truncate down, buffered (re-)write (delalloc), followed by
1540                 * a crash.  What we are effectively doing here is
1541                 * significantly reducing the time window where we'd otherwise
1542                 * be exposed to that problem.
1543                 */
1544                truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED);
1545                if (truncated && VN_DIRTY(vp) && ip->i_delayed_blks > 0)
1546                        xfs_flush_pages(ip, 0, -1, XFS_B_ASYNC, FI_NONE);
1547        }
1548
1549#ifdef HAVE_REFCACHE
1550        /* If we are in the NFS reference cache then don't do this now */
1551        if (ip->i_refcache)
1552                return 0;
1553#endif
1554
1555        if (ip->i_d.di_nlink != 0) {
1556                if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
1557                     ((ip->i_size > 0) || (VN_CACHED(vp) > 0 ||
1558                       ip->i_delayed_blks > 0)) &&
1559                     (ip->i_df.if_flags & XFS_IFEXTENTS))  &&
1560                    (!(ip->i_d.di_flags &
1561                                (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) {
1562                        error = xfs_free_eofblocks(mp, ip, XFS_FREE_EOF_LOCK);
1563                        if (error)
1564                                return error;
1565                        /* Update Linux inode block count after free above */
1566                        vn_to_inode(vp)->i_blocks = XFS_FSB_TO_BB(mp,
1567                                ip->i_d.di_nblocks + ip->i_delayed_blks);
1568                }
1569        }
1570
1571        return 0;
1572}
1573
1574/*
1575 * xfs_inactive
1576 *
1577 * This is called when the vnode reference count for the vnode
1578 * goes to zero.  If the file has been unlinked, then it must
1579 * now be truncated.  Also, we clear all of the read-ahead state
1580 * kept for the inode here since the file is now closed.
1581 */
1582int
1583xfs_inactive(
1584        xfs_inode_t     *ip)
1585{
1586        bhv_vnode_t     *vp = XFS_ITOV(ip);
1587        xfs_bmap_free_t free_list;
1588        xfs_fsblock_t   first_block;
1589        int             committed;
1590        xfs_trans_t     *tp;
1591        xfs_mount_t     *mp;
1592        int             error;
1593        int             truncate;
1594
1595        vn_trace_entry(ip, __FUNCTION__, (inst_t *)__return_address);
1596
1597        /*
1598         * If the inode is already free, then there can be nothing
1599         * to clean up here.
1600         */
1601        if (ip->i_d.di_mode == 0 || VN_BAD(vp)) {
1602                ASSERT(ip->i_df.if_real_bytes == 0);
1603                ASSERT(ip->i_df.if_broot_bytes == 0);
1604                return VN_INACTIVE_CACHE;
1605        }
1606
1607        /*
1608         * Only do a truncate if it's a regular file with
1609         * some actual space in it.  It's OK to look at the
1610         * inode's fields without the lock because we're the
1611         * only one with a reference to the inode.
1612         */
1613        truncate = ((ip->i_d.di_nlink == 0) &&
1614            ((ip->i_d.di_size != 0) || (ip->i_size != 0) ||
1615             (ip->i_d.di_nextents > 0) || (ip->i_delayed_blks > 0)) &&
1616            ((ip->i_d.di_mode & S_IFMT) == S_IFREG));
1617
1618        mp = ip->i_mount;
1619
1620        if (ip->i_d.di_nlink == 0 && DM_EVENT_ENABLED(ip, DM_EVENT_DESTROY)) {
1621                (void) XFS_SEND_DESTROY(mp, vp, DM_RIGHT_NULL);
1622        }
1623
1624        error = 0;
1625
1626        /* If this is a read-only mount, don't do this (would generate I/O) */
1627        if (mp->m_flags & XFS_MOUNT_RDONLY)
1628                goto out;
1629
1630        if (ip->i_d.di_nlink != 0) {
1631                if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
1632                     ((ip->i_size > 0) || (VN_CACHED(vp) > 0 ||
1633                       ip->i_delayed_blks > 0)) &&
1634                      (ip->i_df.if_flags & XFS_IFEXTENTS) &&
1635                     (!(ip->i_d.di_flags &
1636                                (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) ||
1637                      (ip->i_delayed_blks != 0)))) {
1638                        error = xfs_free_eofblocks(mp, ip, XFS_FREE_EOF_LOCK);
1639                        if (error)
1640                                return VN_INACTIVE_CACHE;
1641                        /* Update Linux inode block count after free above */
1642                        vn_to_inode(vp)->i_blocks = XFS_FSB_TO_BB(mp,
1643                                ip->i_d.di_nblocks + ip->i_delayed_blks);
1644                }
1645                goto out;
1646        }
1647
1648        ASSERT(ip->i_d.di_nlink == 0);
1649
1650        if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
1651                return VN_INACTIVE_CACHE;
1652
1653        tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
1654        if (truncate) {
1655                /*
1656                 * Do the xfs_itruncate_start() call before
1657                 * reserving any log space because itruncate_start
1658                 * will call into the buffer cache and we can't
1659                 * do that within a transaction.
1660                 */
1661                xfs_ilock(ip, XFS_IOLOCK_EXCL);
1662
1663                error = xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE, 0);
1664                if (error) {
1665                        xfs_trans_cancel(tp, 0);
1666                        xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1667                        return VN_INACTIVE_CACHE;
1668                }
1669
1670                error = xfs_trans_reserve(tp, 0,
1671                                          XFS_ITRUNCATE_LOG_RES(mp),
1672                                          0, XFS_TRANS_PERM_LOG_RES,
1673                                          XFS_ITRUNCATE_LOG_COUNT);
1674                if (error) {
1675                        /* Don't call itruncate_cleanup */
1676                        ASSERT(XFS_FORCED_SHUTDOWN(mp));
1677                        xfs_trans_cancel(tp, 0);
1678                        xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1679                        return VN_INACTIVE_CACHE;
1680                }
1681
1682                xfs_ilock(ip, XFS_ILOCK_EXCL);
1683                xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1684                xfs_trans_ihold(tp, ip);
1685
1686                /*
1687                 * Normally we have to run xfs_itruncate_finish sync.
1688                 * But if the filesystem is wsync and we're in the
1689                 * inactive path, we know that nlink == 0 and that the
1690                 * transaction that made nlink == 0 is permanently
1691                 * committed, since xfs_remove runs synchronously.
1692                 */
1693                error = xfs_itruncate_finish(&tp, ip, 0, XFS_DATA_FORK,
1694                                (!(mp->m_flags & XFS_MOUNT_WSYNC) ? 1 : 0));
1695
1696                if (error) {
1697                        xfs_trans_cancel(tp,
1698                                XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
1699                        xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1700                        return VN_INACTIVE_CACHE;
1701                }
1702        } else if ((ip->i_d.di_mode & S_IFMT) == S_IFLNK) {
1703
1704                /*
1705                 * If we get an error while cleaning up a
1706                 * symlink we bail out.
1707                 */
1708                error = (ip->i_d.di_size > XFS_IFORK_DSIZE(ip)) ?
1709                        xfs_inactive_symlink_rmt(ip, &tp) :
1710                        xfs_inactive_symlink_local(ip, &tp);
1711
1712                if (error) {
1713                        ASSERT(tp == NULL);
1714                        return VN_INACTIVE_CACHE;
1715                }
1716
1717                xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1718                xfs_trans_ihold(tp, ip);
1719        } else {
1720                error = xfs_trans_reserve(tp, 0,
1721                                          XFS_IFREE_LOG_RES(mp),
1722                                          0, XFS_TRANS_PERM_LOG_RES,
1723                                          XFS_INACTIVE_LOG_COUNT);
1724                if (error) {
1725                        ASSERT(XFS_FORCED_SHUTDOWN(mp));
1726                        xfs_trans_cancel(tp, 0);
1727                        return VN_INACTIVE_CACHE;
1728                }
1729
1730                xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1731                xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1732                xfs_trans_ihold(tp, ip);
1733        }
1734
1735        /*
1736         * If there are attributes associated with the file
1737         * then blow them away now.  The code calls a routine
1738         * that recursively deconstructs the attribute fork.
1739         * We need to just commit the current transaction
1740         * because we can't use it for xfs_attr_inactive().
1741         */
1742        if (ip->i_d.di_anextents > 0) {
1743                error = xfs_inactive_attrs(ip, &tp);
1744                /*
1745                 * If we got an error, the transaction is already
1746                 * cancelled, and the inode is unlocked. Just get out.
1747                 */
1748                 if (error)
1749                         return VN_INACTIVE_CACHE;
1750        } else if (ip->i_afp) {
1751                xfs_idestroy_fork(ip, XFS_ATTR_FORK);
1752        }
1753
1754        /*
1755         * Free the inode.
1756         */
1757        XFS_BMAP_INIT(&free_list, &first_block);
1758        error = xfs_ifree(tp, ip, &free_list);
1759        if (error) {
1760                /*
1761                 * If we fail to free the inode, shut down.  The cancel
1762                 * might do that, we need to make sure.  Otherwise the
1763                 * inode might be lost for a long time or forever.
1764                 */
1765                if (!XFS_FORCED_SHUTDOWN(mp)) {
1766                        cmn_err(CE_NOTE,
1767                "xfs_inactive:  xfs_ifree() returned an error = %d on %s",
1768                                error, mp->m_fsname);
1769                        xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
1770                }
1771                xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
1772        } else {
1773                /*
1774                 * Credit the quota account(s). The inode is gone.
1775                 */
1776                XFS_TRANS_MOD_DQUOT_BYINO(mp, tp, ip, XFS_TRANS_DQ_ICOUNT, -1);
1777
1778                /*
1779                 * Just ignore errors at this point.  There is
1780                 * nothing we can do except to try to keep going.
1781                 */
1782                (void) xfs_bmap_finish(&tp, &free_list, &committed);
1783                (void) xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1784        }
1785        /*
1786         * Release the dquots held by inode, if any.
1787         */
1788        XFS_QM_DQDETACH(mp, ip);
1789
1790        xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1791
1792 out:
1793        return VN_INACTIVE_CACHE;
1794}
1795
1796
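/*
 * Look up the name in dentry within directory dp and, on success,
 * return the referenced vnode in *vpp.  The directory ilock is held
 * shared (map-shared) only for the duration of the lookup.
 */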
1797int
1798xfs_lookup(
1799        xfs_inode_t             *dp,
1800        bhv_vname_t             *dentry,
1801        bhv_vnode_t             **vpp)
1802{
1803        xfs_inode_t             *ip;
1804        xfs_ino_t               e_inum;
1805        int                     error;
1806        uint                    lock_mode;
1807
1808        vn_trace_entry(dp, __FUNCTION__, (inst_t *)__return_address);
1809
1810        if (XFS_FORCED_SHUTDOWN(dp->i_mount))
1811                return XFS_ERROR(EIO);
1812
1813        lock_mode = xfs_ilock_map_shared(dp);
1814        error = xfs_dir_lookup_int(dp, lock_mode, dentry, &e_inum, &ip);
1815        if (!error) {
1816                *vpp = XFS_ITOV(ip);
1817                ITRACE(ip);
1818        }
1819        xfs_iunlock_map_shared(dp, lock_mode);
1820        return error;
1821}
1822
1823int
1824xfs_create(
1825        xfs_inode_t             *dp,
1826        bhv_vname_t             *dentry,
1827        mode_t                  mode,
1828        xfs_dev_t               rdev,
1829        bhv_vnode_t             **vpp,
1830        cred_t                  *credp)
1831{
1832        char                    *name = VNAME(dentry);
1833        xfs_mount_t             *mp = dp->i_mount;
1834        bhv_vnode_t             *dir_vp = XFS_ITOV(dp);
1835        xfs_inode_t             *ip;
1836        bhv_vnode_t             *vp = NULL;
1837        xfs_trans_t             *tp;
1838        int                     error;
1839        xfs_bmap_free_t         free_list;
1840        xfs_fsblock_t           first_block;
1841        boolean_t               unlock_dp_on_error = B_FALSE;
1842        int                     dm_event_sent = 0;
1843        uint                    cancel_flags;
1844        int                     committed;
1845        xfs_prid_t              prid;
1846        struct xfs_dquot        *udqp, *gdqp;
1847        uint                    resblks;
1848        int                     namelen;
1849
1850        ASSERT(!*vpp);
1851        vn_trace_entry(dp, __FUNCTION__, (inst_t *)__return_address);
1852
1853        namelen = VNAMELEN(dentry);
1854
1855        if (DM_EVENT_ENABLED(dp, DM_EVENT_CREATE)) {
1856                error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE,
1857                                dir_vp, DM_RIGHT_NULL, NULL,
1858                                DM_RIGHT_NULL, name, NULL,
1859                                mode, 0, 0);
1860
1861                if (error)
1862                        return error;
1863                dm_event_sent = 1;
1864        }
1865
1866        if (XFS_FORCED_SHUTDOWN(mp))
1867                return XFS_ERROR(EIO);
1868
1869        /* Return through std_return after this point. */
1870
1871        udqp = gdqp = NULL;
1872        if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
1873                prid = dp->i_d.di_projid;
1874        else
1875                prid = (xfs_prid_t)dfltprid;
1876
1877        /*
1878         * Make sure that we have allocated dquot(s) on disk.
1879         */
1880        error = XFS_QM_DQVOPALLOC(mp, dp,
1881                        current_fsuid(credp), current_fsgid(credp), prid,
1882                        XFS_QMOPT_QUOTALL|XFS_QMOPT_INHERIT, &udqp, &gdqp);
1883        if (error)
1884                goto std_return;
1885
1886        ip = NULL;
1887
1888        tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE);
1889        cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
1890        resblks = XFS_CREATE_SPACE_RES(mp, namelen);
1891        /*
1892         * Initially assume that the file does not exist and
1893         * reserve the resources for that case.  If that is not
1894         * the case we'll drop the one we have and get a more
1895         * appropriate transaction later.
1896         */
1897        error = xfs_trans_reserve(tp, resblks, XFS_CREATE_LOG_RES(mp), 0,
1898                        XFS_TRANS_PERM_LOG_RES, XFS_CREATE_LOG_COUNT);
1899        if (error == ENOSPC) {
1900                resblks = 0;
1901                error = xfs_trans_reserve(tp, 0, XFS_CREATE_LOG_RES(mp), 0,
1902                                XFS_TRANS_PERM_LOG_RES, XFS_CREATE_LOG_COUNT);
1903        }
1904        if (error) {
1905                cancel_flags = 0;
1906                goto error_return;
1907        }
1908
1909        xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
1910        unlock_dp_on_error = B_TRUE;
1911
1912        XFS_BMAP_INIT(&free_list, &first_block);
1913
1914        ASSERT(ip == NULL);
1915
1916        /*
1917         * Reserve disk quota and the inode.
1918         */
1919        error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0);
1920        if (error)
1921                goto error_return;
1922
1923        if (resblks == 0 && (error = xfs_dir_canenter(tp, dp, name, namelen)))
1924                goto error_return;
1925        error = xfs_dir_ialloc(&tp, dp, mode, 1,
1926                        rdev, credp, prid, resblks > 0,
1927                        &ip, &committed);
1928        if (error) {
1929                if (error == ENOSPC)
1930                        goto error_return;
1931                goto abort_return;
1932        }
1933        ITRACE(ip);
1934
1935        /*
1936         * At this point, we've gotten a newly allocated inode.
1937         * It is locked (and joined to the transaction).
1938         */
1939
1940        ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE));
1941
1942        /*
1943         * Now we join the directory inode to the transaction.  We do not do it
1944         * earlier because xfs_dir_ialloc might commit the previous transaction
1945         * (and release all the locks).  An error from here on will result in
1946         * the transaction cancel unlocking dp so don't do it explicitly in the
1947         * error path.
1948         */
1949        VN_HOLD(dir_vp);
1950        xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
1951        unlock_dp_on_error = B_FALSE;
1952
1953        error = xfs_dir_createname(tp, dp, name, namelen, ip->i_ino,
1954                                        &first_block, &free_list, resblks ?
1955                                        resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
1956        if (error) {
1957                ASSERT(error != ENOSPC);
1958                goto abort_return;
1959        }
1960        xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
1961        xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
1962
1963        /*
1964         * If this is a synchronous mount, make sure that the
1965         * create transaction goes to disk before returning to
1966         * the user.
1967         */
1968        if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
1969                xfs_trans_set_sync(tp);
1970        }
1971
1972        dp->i_gen++;
1973
1974        /*
1975         * Attach the dquot(s) to the inodes and modify them incore.
1976         * The ids of the inode couldn't have changed since the new
1977         * inode has been locked ever since it was created.
1978         */
1979        XFS_QM_DQVOPCREATE(mp, tp, ip, udqp, gdqp);
1980
1981        /*
1982         * xfs_trans_commit normally decrements the vnode ref count
1983         * when it unlocks the inode. Since we want to return the
1984         * vnode to the caller, we bump the vnode ref count now.
1985         */
1986        IHOLD(ip);
1987        vp = XFS_ITOV(ip);
1988
1989        error = xfs_bmap_finish(&tp, &free_list, &committed);
1990        if (error) {
1991                xfs_bmap_cancel(&free_list);
1992                goto abort_rele;
1993        }
1994
1995        error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1996        if (error) {
1997                IRELE(ip);
1998                tp = NULL;
1999                goto error_return;
2000        }
2001
2002        XFS_QM_DQRELE(mp, udqp);
2003        XFS_QM_DQRELE(mp, gdqp);
2004
2005        *vpp = vp;
2006
2007        /* Fallthrough to std_return with error = 0 */
2008
2009std_return:
2010        if ((*vpp || (error != 0 && dm_event_sent != 0)) &&
2011            DM_EVENT_ENABLED(dp, DM_EVENT_POSTCREATE)) {
2012                (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE,
2013                        dir_vp, DM_RIGHT_NULL,
2014                        *vpp ? vp : NULL,
2015                        DM_RIGHT_NULL, name, NULL,
2016                        mode, error, 0);
2017        }
2018        return error;
2019
2020 abort_return:
2021        cancel_flags |= XFS_TRANS_ABORT;
2022        /* FALLTHROUGH */
2023
2024 error_return:
2025        if (tp != NULL)
2026                xfs_trans_cancel(tp, cancel_flags);
2027
2028        XFS_QM_DQRELE(mp, udqp);
2029        XFS_QM_DQRELE(mp, gdqp);
2030
2031        if (unlock_dp_on_error)
2032                xfs_iunlock(dp, XFS_ILOCK_EXCL);
2033
2034        goto std_return;
2035
2036 abort_rele:
2037        /*
2038         * Wait until after the current transaction is aborted to
2039         * release the inode.  This prevents recursive transactions
2040         * and deadlocks from xfs_inactive.
2041         */
2042        cancel_flags |= XFS_TRANS_ABORT;
2043        xfs_trans_cancel(tp, cancel_flags);
2044        IRELE(ip);
2045
2046        XFS_QM_DQRELE(mp, udqp);
2047        XFS_QM_DQRELE(mp, gdqp);
2048
2049        goto std_return;
2050}
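
/*
 * A note on the error labels in xfs_create() above: abort_return
 * cancels the transaction with XFS_TRANS_ABORT set because something
 * has been logged by then; error_return cancels with whatever flags
 * are in cancel_flags (0 before the reservation succeeds); and
 * abort_rele defers the IRELE on the new inode until after the cancel
 * so that we cannot recurse into xfs_inactive from inside the abort.
 */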
2051
2052#ifdef DEBUG
2053/*
2054 * Some counters to see if (and how often) we are hitting some deadlock
2055 * prevention code paths.
2056 */
2057
2058int xfs_rm_locks;
2059int xfs_rm_lock_delays;
2060int xfs_rm_attempts;
2061#endif
2062
2063/*
2064 * The following routine will lock the inodes associated with the
2065 * directory and the named entry in the directory. The locks are
2066 * acquired in increasing inode number.
2067 *
2068 * If the entry is "..", then only the directory is locked. The
2069 * vnode ref count will still include that from the .. entry in
2070 * this case.
2071 *
2072 * There is a deadlock we need to worry about. If the locked directory is
2073 * in the AIL, it might be blocking up the log. The next inode we lock
2074 * could already be locked by another thread waiting for log space (e.g.
2075 * a permanent log reservation with a long running transaction; see
2076 * xfs_itruncate_finish). To solve this, we must check whether the
2077 * directory is in the AIL and use lock_nowait. If we can't get the lock,
2078 * we need to drop the inode lock on the directory and try again.
2079 * xfs_iunlock will potentially push the tail if we were holding up the log.
2080 */
2081STATIC int
2082xfs_lock_dir_and_entry(
2083        xfs_inode_t     *dp,
2084        xfs_inode_t     *ip)    /* inode of entry 'name' */
2085{
2086        int             attempts;
2087        xfs_ino_t       e_inum;
2088        xfs_inode_t     *ips[2];
2089        xfs_log_item_t  *lp;
2090
2091#ifdef DEBUG
2092        xfs_rm_locks++;
2093#endif
2094        attempts = 0;
2095
2096again:
2097        xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
2098
2099        e_inum = ip->i_ino;
2100
2101        ITRACE(ip);
2102
2103        /*
2104         * We want to lock in increasing inum. Since we've already
2105         * acquired the lock on the directory, we may need to release
2106         * it if the inum of the entry turns out to be less.
2107         */
2108        if (e_inum > dp->i_ino) {
2109                /*
2110                 * We are already in the right order, so just
2111                 * lock on the inode of the entry.
2112                 * We need to use nowait if dp is in the AIL.
2113                 */
2114
2115                lp = (xfs_log_item_t *)dp->i_itemp;
2116                if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
2117                        if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
2118                                attempts++;
2119#ifdef DEBUG
2120                                xfs_rm_attempts++;
2121#endif
2122
2123                                /*
2124                                 * Unlock dp and try again.
2125                                 * xfs_iunlock will try to push the tail
2126                                 * if the inode is in the AIL.
2127                                 */
2128
2129                                xfs_iunlock(dp, XFS_ILOCK_EXCL);
2130
2131                                if ((attempts % 5) == 0) {
2132                                        delay(1); /* Don't just spin the CPU */
2133#ifdef DEBUG
2134                                        xfs_rm_lock_delays++;
2135#endif
2136                                }
2137                                goto again;
2138                        }
2139                } else {
2140                        xfs_ilock(ip, XFS_ILOCK_EXCL);
2141                }
2142        } else if (e_inum < dp->i_ino) {
2143                xfs_iunlock(dp, XFS_ILOCK_EXCL);
2144
2145                ips[0] = ip;
2146                ips[1] = dp;
2147                xfs_lock_inodes(ips, 2, 0, XFS_ILOCK_EXCL);
2148        }
2149        /*
2150         * else e_inum == dp->i_ino: this can happen if we're asked to
2151         * lock /x/.., where the entry ".." is also the parent directory.
2152         */
2153
2154        return 0;
2155}
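
/*
 * A worked example of the ordering rule above, with hypothetical inode
 * numbers: removing entry "foo" (inode 200) from directory "bar"
 * (inode 500).  Here e_inum (200) < dp->i_ino (500), so we drop the
 * directory lock and relock both inodes in increasing-inum order via
 * xfs_lock_inodes() with ips[0] = ip, ips[1] = dp.  Had the entry's
 * inum been the larger one, keeping the directory lock and taking the
 * entry lock second would already be the correct order.
 */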
2156
2157#ifdef DEBUG
2158int xfs_locked_n;
2159int xfs_small_retries;
2160int xfs_middle_retries;
2161int xfs_lots_retries;
2162int xfs_lock_delays;
2163#endif
2164
2165/*
2166 * Bump the subclass so xfs_lock_inodes() acquires each lock with
2167 * a different lockdep subclass value.
2168 */
2169static inline int
2170xfs_lock_inumorder(int lock_mode, int subclass)
2171{
2172        if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))
2173                lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_IOLOCK_SHIFT;
2174        if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))
2175                lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_ILOCK_SHIFT;
2176
2177        return lock_mode;
2178}
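
/*
 * Example: when xfs_lock_inodes() below locks ips[0..2] with
 * XFS_ILOCK_EXCL, it passes subclasses XFS_LOCK_INUMORDER + 0, + 1
 * and + 2, shifted into the ILOCK bits of lock_mode, so lockdep can
 * tell the nested acquisitions apart instead of reporting recursive
 * locking on a single lock class.
 */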
2179
2180/*
2181 * The following routine will lock n inodes in exclusive mode.
2182 * We assume the caller calls us with the inodes in i_ino order.
2183 *
2184 * We need to detect deadlock where an inode that we lock
2185 * is in the AIL and we start waiting for another inode that is locked
2186 * by a thread in a long running transaction (such as truncate). This can
2187 * result in deadlock since the long running trans might need to wait
2188 * for the inode we just locked in order to push the tail and free space
2189 * in the log.
2190 */
2191void
2192xfs_lock_inodes(
2193        xfs_inode_t     **ips,
2194        int             inodes,
2195        int             first_locked,
2196        uint            lock_mode)
2197{
2198        int             attempts = 0, i, j, try_lock;
2199        xfs_log_item_t  *lp;
2200
2201        ASSERT(ips && (inodes >= 2)); /* we need at least two */
2202
2203        if (first_locked) {
2204                try_lock = 1;
2205                i = 1;
2206        } else {
2207                try_lock = 0;
2208                i = 0;
2209        }
2210
2211again:
2212        for (; i < inodes; i++) {
2213                ASSERT(ips[i]);
2214
2215                if (i && (ips[i] == ips[i-1]))  /* Already locked */
2216                        continue;
2217
2218                /*
2219                 * If try_lock is not set yet, make sure all locked inodes
2220                 * are not in the AIL.
2221                 * If any are, set try_lock to be used later.
2222                 */
2223
2224                if (!try_lock) {
2225                        for (j = (i - 1); j >= 0 && !try_lock; j--) {
2226                                lp = (xfs_log_item_t *)ips[j]->i_itemp;
2227                                if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
2228                                        try_lock++;
2229                                }
2230                        }
2231                }
2232
2233                /*
2234                 * If any of the previous locks we have locked is in the AIL,
2235                 * we must TRY to get the second and subsequent locks. If
2236                 * we can't get any, we must release all we have
2237                 * and try again.
2238                 */
2239
2240                if (try_lock) {
2241                        /*
2242                         * try_lock means we have an inode locked
2243                         * that is in the AIL; it must be 0 if
2244                         * i is 0.
2245                         */
2246                        ASSERT(i != 0);
2247                        if (!xfs_ilock_nowait(ips[i], xfs_lock_inumorder(lock_mode, i))) {
2248                                attempts++;
2249
2250                                /*
2251                                 * Unlock all previous guys and try again.
2252                                 * xfs_iunlock will try to push the tail
2253                                 * if the inode is in the AIL.
2254                                 */
2255
2256                                for (j = i - 1; j >= 0; j--) {
2257
2258                                        /*
2259                                         * Check to see if we've already
2260                                         * unlocked this one.
2261                                         * Not the first one going back,
2262                                         * and the inode ptr is the same.
2263                                         */
2264                                        if ((j != (i - 1)) && ips[j] ==
2265                                                                ips[j+1])
2266                                                continue;
2267
2268                                        xfs_iunlock(ips[j], lock_mode);
2269                                }
2270
2271                                if ((attempts % 5) == 0) {
2272                                        delay(1); /* Don't just spin the CPU */
2273#ifdef DEBUG
2274                                        xfs_lock_delays++;
2275#endif
2276                                }
2277                                i = 0;
2278                                try_lock = 0;
2279                                goto again;
2280                        }
2281                } else {
2282                        xfs_ilock(ips[i], xfs_lock_inumorder(lock_mode, i));
2283                }
2284        }
2285
2286#ifdef DEBUG
2287        if (attempts) {
2288                if (attempts < 5) xfs_small_retries++;
2289                else if (attempts < 100) xfs_middle_retries++;
2290                else xfs_lots_retries++;
2291        } else {
2292                xfs_locked_n++;
2293        }
2294#endif
2295}
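
/*
 * A typical caller sorts the inodes by inode number before calling
 * xfs_lock_inodes(); see xfs_link() below, which the sketch mirrors.
 * A minimal sketch for two already-referenced inodes (illustrative
 * only, not compiled):
 */
#if 0
        xfs_inode_t     *ips[2];

        if (sip->i_ino < tdp->i_ino) {
                ips[0] = sip;
                ips[1] = tdp;
        } else {
                ips[0] = tdp;
                ips[1] = sip;
        }
        xfs_lock_inodes(ips, 2, 0, XFS_ILOCK_EXCL);     /* none locked yet */
#endif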
2296
2297#ifdef  DEBUG
2298#define REMOVE_DEBUG_TRACE(x)   {remove_which_error_return = (x);}
2299int remove_which_error_return = 0;
2300#else /* ! DEBUG */
2301#define REMOVE_DEBUG_TRACE(x)
2302#endif  /* ! DEBUG */
2303
2304int
2305xfs_remove(
2306        xfs_inode_t             *dp,
2307        bhv_vname_t             *dentry)
2308{
2309        bhv_vnode_t             *dir_vp = XFS_ITOV(dp);
2310        char                    *name = VNAME(dentry);
2311        xfs_mount_t             *mp = dp->i_mount;
2312        xfs_inode_t             *ip;
2313        xfs_trans_t             *tp = NULL;
2314        int                     error = 0;
2315        xfs_bmap_free_t         free_list;
2316        xfs_fsblock_t           first_block;
2317        int                     cancel_flags;
2318        int                     committed;
2319        int                     dm_di_mode = 0;
2320        int                     link_zero;
2321        uint                    resblks;
2322        int                     namelen;
2323
2324        vn_trace_entry(dp, __FUNCTION__, (inst_t *)__return_address);
2325
2326        if (XFS_FORCED_SHUTDOWN(mp))
2327                return XFS_ERROR(EIO);
2328
2329        namelen = VNAMELEN(dentry);
2330
2331        if (!xfs_get_dir_entry(dentry, &ip)) {
2332                dm_di_mode = ip->i_d.di_mode;
2333                IRELE(ip);
2334        }
2335
2336        if (DM_EVENT_ENABLED(dp, DM_EVENT_REMOVE)) {
2337                error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE, dir_vp,
2338                                        DM_RIGHT_NULL, NULL, DM_RIGHT_NULL,
2339                                        name, NULL, dm_di_mode, 0, 0);
2340                if (error)
2341                        return error;
2342        }
2343
2344        /* From this point on, return through std_return */
2345        ip = NULL;
2346
2347        /*
2348         * We need to get a reference to ip before we get our log
2349         * reservation. The reason for this is that we cannot call
2350         * xfs_iget for an inode for which we do not have a reference
2351         * once we've acquired a log reservation. This is because the
2352         * inode we are trying to get might be in xfs_inactive going
2353         * for a log reservation. Since we'll have to wait for the
2354         * inactive code to complete before returning from xfs_iget,
2355         * we need to make sure that we don't have log space reserved
2356         * when we call xfs_iget.  Instead we get an unlocked reference
2357         * to the inode before getting our log reservation.
2358         */
2359        error = xfs_get_dir_entry(dentry, &ip);
2360        if (error) {
2361                REMOVE_DEBUG_TRACE(__LINE__);
2362                goto std_return;
2363        }
2364
2365        dm_di_mode = ip->i_d.di_mode;
2366
2367        vn_trace_entry(ip, __FUNCTION__, (inst_t *)__return_address);
2368
2369        ITRACE(ip);
2370
2371        error = XFS_QM_DQATTACH(mp, dp, 0);
2372        if (!error && dp != ip)
2373                error = XFS_QM_DQATTACH(mp, ip, 0);
2374        if (error) {
2375                REMOVE_DEBUG_TRACE(__LINE__);
2376                IRELE(ip);
2377                goto std_return;
2378        }
2379
2380        tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE);
2381        cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
2382        /*
2383         * We try to get the real space reservation first,
2384         * allowing for directory btree deletion(s) implying
2385         * possible bmap insert(s).  If we can't get the space
2386         * reservation then we use 0 instead, and avoid the bmap
2387         * btree insert(s) in the directory code by, if the bmap
2388         * insert tries to happen, instead trimming the LAST
2389         * block from the directory.
2390         */
2391        resblks = XFS_REMOVE_SPACE_RES(mp);
2392        error = xfs_trans_reserve(tp, resblks, XFS_REMOVE_LOG_RES(mp), 0,
2393                        XFS_TRANS_PERM_LOG_RES, XFS_REMOVE_LOG_COUNT);
2394        if (error == ENOSPC) {
2395                resblks = 0;
2396                error = xfs_trans_reserve(tp, 0, XFS_REMOVE_LOG_RES(mp), 0,
2397                                XFS_TRANS_PERM_LOG_RES, XFS_REMOVE_LOG_COUNT);
2398        }
2399        if (error) {
2400                ASSERT(error != ENOSPC);
2401                REMOVE_DEBUG_TRACE(__LINE__);
2402                xfs_trans_cancel(tp, 0);
2403                IRELE(ip);
2404                return error;
2405        }
2406
2407        error = xfs_lock_dir_and_entry(dp, ip);
2408        if (error) {
2409                REMOVE_DEBUG_TRACE(__LINE__);
2410                xfs_trans_cancel(tp, cancel_flags);
2411                IRELE(ip);
2412                goto std_return;
2413        }
2414
2415        /*
2416         * At this point, we've gotten both the directory and the entry
2417         * inodes locked.
2418         */
2419        xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
2420        if (dp != ip) {
2421                /*
2422                 * Increment vnode ref count only in this case since
2423                 * there's an extra vnode reference in the case where
2424                 * dp == ip.
2425                 */
2426                IHOLD(dp);
2427                xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
2428        }
2429
2430        /*
2431         * Entry must exist since we did a lookup in xfs_lock_dir_and_entry.
2432         */
2433        XFS_BMAP_INIT(&free_list, &first_block);
2434        error = xfs_dir_removename(tp, dp, name, namelen, ip->i_ino,
2435                                        &first_block, &free_list, 0);
2436        if (error) {
2437                ASSERT(error != ENOENT);
2438                REMOVE_DEBUG_TRACE(__LINE__);
2439                goto error1;
2440        }
2441        xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2442
2443        dp->i_gen++;
2444        xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
2445
2446        error = xfs_droplink(tp, ip);
2447        if (error) {
2448                REMOVE_DEBUG_TRACE(__LINE__);
2449                goto error1;
2450        }
2451
2452        /*
2453         * Determine if this is the last link while in the transaction.
2454         */
2455        link_zero = (ip->i_d.di_nlink == 0);
2456
2457        /*
2458         * Take an extra ref on the inode so that it doesn't
2459         * go to xfs_inactive() from within the commit.
2460         */
2461        IHOLD(ip);
2462
2463        /*
2464         * If this is a synchronous mount, make sure that the
2465         * remove transaction goes to disk before returning to
2466         * the user.
2467         */
2468        if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
2469                xfs_trans_set_sync(tp);
2470        }
2471
2472        error = xfs_bmap_finish(&tp, &free_list, &committed);
2473        if (error) {
2474                REMOVE_DEBUG_TRACE(__LINE__);
2475                goto error_rele;
2476        }
2477
2478        error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
2479        if (error) {
2480                IRELE(ip);
2481                goto std_return;
2482        }
2483
2484        /*
2485         * Before we drop our extra reference to the inode, purge it
2486         * from the refcache if it is there.  By waiting until afterwards
2487         * to do the IRELE, we ensure that we won't go inactive in the
2488         * xfs_refcache_purge_ip routine (although that would be OK).
2489         */
2490        xfs_refcache_purge_ip(ip);
2491
2492        /*
2493         * If we are using filestreams, kill the stream association.
2494         * If the file is still open it may get a new one but that
2495         * will get killed on last close in xfs_close() so we don't
2496         * have to worry about that.
2497         */
2498        if (link_zero && xfs_inode_is_filestream(ip))
2499                xfs_filestream_deassociate(ip);
2500
2501        vn_trace_exit(ip, __FUNCTION__, (inst_t *)__return_address);
2502
2503        IRELE(ip);
2504
2505/* Fall through to std_return with error = 0 */
2506 std_return:
2507        if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTREMOVE)) {
2508                (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE,
2509                                dir_vp, DM_RIGHT_NULL,
2510                                NULL, DM_RIGHT_NULL,
2511                                name, NULL, dm_di_mode, error, 0);
2512        }
2513        return error;
2514
2515 error1:
2516        xfs_bmap_cancel(&free_list);
2517        cancel_flags |= XFS_TRANS_ABORT;
2518        xfs_trans_cancel(tp, cancel_flags);
2519        goto std_return;
2520
2521 error_rele:
2522        /*
2523         * In this case make sure to not release the inode until after
2524         * the current transaction is aborted.  Releasing it beforehand
2525         * can cause us to go to xfs_inactive and start a recursive
2526         * transaction which can easily deadlock with the current one.
2527         */
2528        xfs_bmap_cancel(&free_list);
2529        cancel_flags |= XFS_TRANS_ABORT;
2530        xfs_trans_cancel(tp, cancel_flags);
2531
2532        /*
2533         * Before we drop our extra reference to the inode, purge it
2534         * from the refcache if it is there.  By waiting until afterwards
2535         * to do the IRELE, we ensure that we won't go inactive in the
2536         * xfs_refcache_purge_ip routine (although that would be OK).
2537         */
2538        xfs_refcache_purge_ip(ip);
2539
2540        IRELE(ip);
2541
2542        goto std_return;
2543}
2544
2545int
2546xfs_link(
2547        xfs_inode_t             *tdp,
2548        bhv_vnode_t             *src_vp,
2549        bhv_vname_t             *dentry)
2550{
2551        bhv_vnode_t             *target_dir_vp = XFS_ITOV(tdp);
2552        xfs_mount_t             *mp = tdp->i_mount;
2553        xfs_inode_t             *sip = xfs_vtoi(src_vp);
2554        xfs_trans_t             *tp;
2555        xfs_inode_t             *ips[2];
2556        int                     error;
2557        xfs_bmap_free_t         free_list;
2558        xfs_fsblock_t           first_block;
2559        int                     cancel_flags;
2560        int                     committed;
2561        int                     resblks;
2562        char                    *target_name = VNAME(dentry);
2563        int                     target_namelen;
2564
2565        vn_trace_entry(tdp, __FUNCTION__, (inst_t *)__return_address);
2566        vn_trace_entry(xfs_vtoi(src_vp), __FUNCTION__, (inst_t *)__return_address);
2567
2568        target_namelen = VNAMELEN(dentry);
2569        ASSERT(!VN_ISDIR(src_vp));
2570
2571        if (XFS_FORCED_SHUTDOWN(mp))
2572                return XFS_ERROR(EIO);
2573
2574        if (DM_EVENT_ENABLED(tdp, DM_EVENT_LINK)) {
2575                error = XFS_SEND_NAMESP(mp, DM_EVENT_LINK,
2576                                        target_dir_vp, DM_RIGHT_NULL,
2577                                        src_vp, DM_RIGHT_NULL,
2578                                        target_name, NULL, 0, 0, 0);
2579                if (error)
2580                        return error;
2581        }
2582
2583        /* Return through std_return after this point. */
2584
2585        error = XFS_QM_DQATTACH(mp, sip, 0);
2586        if (!error && sip != tdp)
2587                error = XFS_QM_DQATTACH(mp, tdp, 0);
2588        if (error)
2589                goto std_return;
2590
2591        tp = xfs_trans_alloc(mp, XFS_TRANS_LINK);
2592        cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
2593        resblks = XFS_LINK_SPACE_RES(mp, target_namelen);
2594        error = xfs_trans_reserve(tp, resblks, XFS_LINK_LOG_RES(mp), 0,
2595                        XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT);
2596        if (error == ENOSPC) {
2597                resblks = 0;
2598                error = xfs_trans_reserve(tp, 0, XFS_LINK_LOG_RES(mp), 0,
2599                                XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT);
2600        }
2601        if (error) {
2602                cancel_flags = 0;
2603                goto error_return;
2604        }
2605
2606        if (sip->i_ino < tdp->i_ino) {
2607                ips[0] = sip;
2608                ips[1] = tdp;
2609        } else {
2610                ips[0] = tdp;
2611                ips[1] = sip;
2612        }
2613
2614        xfs_lock_inodes(ips, 2, 0, XFS_ILOCK_EXCL);
2615
2616        /*
2617         * Increment vnode ref counts since xfs_trans_commit &
2618         * xfs_trans_cancel will both unlock the inodes and
2619         * decrement the associated ref counts.
2620         */
2621        VN_HOLD(src_vp);
2622        VN_HOLD(target_dir_vp);
2623        xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL);
2624        xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL);
2625
2626        /*
2627         * If the source has too many links, we can't make any more to it.
2628         */
2629        if (sip->i_d.di_nlink >= XFS_MAXLINK) {
2630                error = XFS_ERROR(EMLINK);
2631                goto error_return;
2632        }
2633
2634        /*
2635         * If we are using project inheritance, we only allow hard link
2636         * creation in our tree when the project IDs are the same; else
2637         * the tree quota mechanism could be circumvented.
2638         */
2639        if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
2640                     (tdp->i_d.di_projid != sip->i_d.di_projid))) {
2641                error = XFS_ERROR(EXDEV);
2642                goto error_return;
2643        }
2644
2645        if (resblks == 0 &&
2646            (error = xfs_dir_canenter(tp, tdp, target_name, target_namelen)))
2647                goto error_return;
2648
2649        XFS_BMAP_INIT(&free_list, &first_block);
2650
2651        error = xfs_dir_createname(tp, tdp, target_name, target_namelen,
2652                                   sip->i_ino, &first_block, &free_list,
2653                                   resblks);
2654        if (error)
2655                goto abort_return;
2656        xfs_ichgtime(tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2657        tdp->i_gen++;
2658        xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
2659
2660        error = xfs_bumplink(tp, sip);
2661        if (error)
2662                goto abort_return;
2663
2664        /*
2665         * If this is a synchronous mount, make sure that the
2666         * link transaction goes to disk before returning to
2667         * the user.
2668         */
2669        if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
2670                xfs_trans_set_sync(tp);
2671        }
2672
2673        error = xfs_bmap_finish(&tp, &free_list, &committed);
2674        if (error) {
2675                xfs_bmap_cancel(&free_list);
2676                goto abort_return;
2677        }
2678
2679        error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
2680        if (error)
2681                goto std_return;
2682
2683        /* Fall through to std_return with error = 0. */
2684std_return:
2685        if (DM_EVENT_ENABLED(sip, DM_EVENT_POSTLINK)) {
2686                (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTLINK,
2687                                target_dir_vp, DM_RIGHT_NULL,
2688                                src_vp, DM_RIGHT_NULL,
2689                                target_name, NULL, 0, error, 0);
2690        }
2691        return error;
2692
2693 abort_return:
2694        cancel_flags |= XFS_TRANS_ABORT;
2695        /* FALLTHROUGH */
2696
2697 error_return:
2698        xfs_trans_cancel(tp, cancel_flags);
2699        goto std_return;
2700}
2701
2702
2703int
2704xfs_mkdir(
2705        xfs_inode_t             *dp,
2706        bhv_vname_t             *dentry,
2707        mode_t                  mode,
2708        bhv_vnode_t             **vpp,
2709        cred_t                  *credp)
2710{
2711        bhv_vnode_t             *dir_vp = XFS_ITOV(dp);
2712        char                    *dir_name = VNAME(dentry);
2713        int                     dir_namelen = VNAMELEN(dentry);
2714        xfs_mount_t             *mp = dp->i_mount;
2715        xfs_inode_t             *cdp;   /* inode of created dir */
2716        bhv_vnode_t             *cvp;   /* vnode of created dir */
2717        xfs_trans_t             *tp;
2718        int                     cancel_flags;
2719        int                     error;
2720        int                     committed;
2721        xfs_bmap_free_t         free_list;
2722        xfs_fsblock_t           first_block;
2723        boolean_t               unlock_dp_on_error = B_FALSE;
2724        boolean_t               created = B_FALSE;
2725        int                     dm_event_sent = 0;
2726        xfs_prid_t              prid;
2727        struct xfs_dquot        *udqp, *gdqp;
2728        uint                    resblks;
2729
2730        if (XFS_FORCED_SHUTDOWN(mp))
2731                return XFS_ERROR(EIO);
2732
2733        tp = NULL;
2734
2735        if (DM_EVENT_ENABLED(dp, DM_EVENT_CREATE)) {
2736                error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE,
2737                                        dir_vp, DM_RIGHT_NULL, NULL,
2738                                        DM_RIGHT_NULL, dir_name, NULL,
2739                                        mode, 0, 0);
2740                if (error)
2741                        return error;
2742                dm_event_sent = 1;
2743        }
2744
2745        /* Return through std_return after this point. */
2746
2747        vn_trace_entry(dp, __FUNCTION__, (inst_t *)__return_address);
2748
2749        mp = dp->i_mount;
2750        udqp = gdqp = NULL;
2751        if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
2752                prid = dp->i_d.di_projid;
2753        else
2754                prid = (xfs_prid_t)dfltprid;
2755
2756        /*
2757         * Make sure that we have allocated dquot(s) on disk.
2758         */
2759        error = XFS_QM_DQVOPALLOC(mp, dp,
2760                        current_fsuid(credp), current_fsgid(credp), prid,
2761                        XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
2762        if (error)
2763                goto std_return;
2764
2765        tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR);
2766        cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
2767        resblks = XFS_MKDIR_SPACE_RES(mp, dir_namelen);
2768        error = xfs_trans_reserve(tp, resblks, XFS_MKDIR_LOG_RES(mp), 0,
2769                                  XFS_TRANS_PERM_LOG_RES, XFS_MKDIR_LOG_COUNT);
2770        if (error == ENOSPC) {
2771                resblks = 0;
2772                error = xfs_trans_reserve(tp, 0, XFS_MKDIR_LOG_RES(mp), 0,
2773                                          XFS_TRANS_PERM_LOG_RES,
2774                                          XFS_MKDIR_LOG_COUNT);
2775        }
2776        if (error) {
2777                cancel_flags = 0;
2778                goto error_return;
2779        }
2780
2781        xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
2782        unlock_dp_on_error = B_TRUE;
2783
2784        /*
2785         * Check for directory link count overflow.
2786         */
2787        if (dp->i_d.di_nlink >= XFS_MAXLINK) {
2788                error = XFS_ERROR(EMLINK);
2789                goto error_return;
2790        }
2791
2792        /*
2793         * Reserve disk quota and the inode.
2794         */
2795        error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0);
2796        if (error)
2797                goto error_return;
2798
2799        if (resblks == 0 &&
2800            (error = xfs_dir_canenter(tp, dp, dir_name, dir_namelen)))
2801                goto error_return;
2802        /*
2803         * Create the directory inode.
2804         */
2805        error = xfs_dir_ialloc(&tp, dp, mode, 2,
2806                        0, credp, prid, resblks > 0,
2807                &cdp, NULL);
2808        if (error) {
2809                if (error == ENOSPC)
2810                        goto error_return;
2811                goto abort_return;
2812        }
2813        ITRACE(cdp);
2814
2815        /*
2816         * Now we add the directory inode to the transaction.
2817         * We waited until now since xfs_dir_ialloc might start
2818         * a new transaction.  Had we joined the transaction
2819         * earlier, the locks might have gotten released. An error
2820         * from here on will result in the transaction cancel
2821         * unlocking dp so don't do it explicitly in the error path.
2822         */
2823        VN_HOLD(dir_vp);
2824        xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
2825        unlock_dp_on_error = B_FALSE;
2826
2827        XFS_BMAP_INIT(&free_list, &first_block);
2828
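            /*
             * Add the new entry to the parent directory.  If we still hold
             * a block reservation, pass along what remains after the inode
             * allocation consumed XFS_IALLOC_SPACE_RES(mp) blocks of it.
             */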
2829        error = xfs_dir_createname(tp, dp, dir_name, dir_namelen, cdp->i_ino,
2830                                   &first_block, &free_list, resblks ?
2831                                   resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
2832        if (error) {
2833                ASSERT(error != ENOSPC);
2834                goto error1;
2835        }
2836        xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2837
2838        /*
2839         * Bump the in-memory version number of the parent directory
2840         * so that other processes accessing it will recognize that
2841         * the directory has changed.
2842         */
2843        dp->i_gen++;
2844
2845        error = xfs_dir_init(tp, cdp, dp);
2846        if (error)
2847                goto error2;
2848
2849        cdp->i_gen = 1;
2850        error = xfs_bumplink(tp, dp);
2851        if (error)
2852                goto error2;
2853
2854        cvp = XFS_ITOV(cdp);
2855
2856        created = B_TRUE;
2857
2858        *vpp = cvp;
2859        IHOLD(cdp);
2860
2861        /*
2862         * Attach the dquots to the new inode and modify the icount incore.
2863         */
2864        XFS_QM_DQVOPCREATE(mp, tp, cdp, udqp, gdqp);
2865
2866        /*
2867         * If this is a synchronous mount, make sure that the
2868         * mkdir transaction goes to disk before returning to
2869         * the user.
2870         */
2871        if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
2872                xfs_trans_set_sync(tp);
2873        }
2874
2875        error = xfs_bmap_finish(&tp, &free_list, &committed);
2876        if (error) {
2877                IRELE(cdp);
2878                goto error2;
2879        }
2880
2881        error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
2882        XFS_QM_DQRELE(mp, udqp);
2883        XFS_QM_DQRELE(mp, gdqp);
2884        if (error) {
2885                IRELE(cdp);
2886        }
2887
2888        /* Fall through to std_return with error = 0 or errno from
2889         * xfs_trans_commit. */
2890
2891std_return:
2892        if ((created || (error != 0 && dm_event_sent != 0)) &&
2893            DM_EVENT_ENABLED(dp, DM_EVENT_POSTCREATE)) {
2894                (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE,
2895                                        dir_vp, DM_RIGHT_NULL,
2896                                        created ? XFS_ITOV(cdp):NULL,
2897                                        DM_RIGHT_NULL,
2898                                        dir_name, NULL,
2899                                        mode, error, 0);
2900        }
2901        return error;
2902
2903 error2:
2904 error1:
2905        xfs_bmap_cancel(&free_list);
2906 abort_return:
2907        cancel_flags |= XFS_TRANS_ABORT;
2908 error_return:
2909        xfs_trans_cancel(tp, cancel_flags);
2910        XFS_QM_DQRELE(mp, udqp);
2911        XFS_QM_DQRELE(mp, gdqp);
2912
2913        if (unlock_dp_on_error)
2914                xfs_iunlock(dp, XFS_ILOCK_EXCL);
2915
2916        goto std_return;
2917}
2918
2919int
2920xfs_rmdir(
2921        xfs_inode_t             *dp,
2922        bhv_vname_t             *dentry)
2923{
2924        bhv_vnode_t             *dir_vp = XFS_ITOV(dp);
2925        char                    *name = VNAME(dentry);
2926        int                     namelen = VNAMELEN(dentry);
2927        xfs_mount_t             *mp = dp->i_mount;
2928        xfs_inode_t             *cdp;   /* child directory */
2929        xfs_trans_t             *tp;
2930        int                     error;
2931        xfs_bmap_free_t         free_list;
2932        xfs_fsblock_t           first_block;
2933        int                     cancel_flags;
2934        int                     committed;
2935        int                     dm_di_mode = S_IFDIR;
2936        int                     last_cdp_link;
2937        uint                    resblks;
2938
2939        vn_trace_entry(dp, __FUNCTION__, (inst_t *)__return_address);
2940
2941        if (XFS_FORCED_SHUTDOWN(mp))
2942                return XFS_ERROR(EIO);
2943
2944        if (!xfs_get_dir_entry(dentry, &cdp)) {
2945                dm_di_mode = cdp->i_d.di_mode;
2946                IRELE(cdp);
2947        }
2948
2949        if (DM_EVENT_ENABLED(dp, DM_EVENT_REMOVE)) {
2950                error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE,
2951                                        dir_vp, DM_RIGHT_NULL,
2952                                        NULL, DM_RIGHT_NULL,
2953                                        name, NULL, dm_di_mode, 0, 0);
2954                if (error)
2955                        return XFS_ERROR(error);
2956        }
2957
2958        /* Return through std_return after this point. */
2959
2960        cdp = NULL;
2961
2962        /*
2963         * We need to get a reference to cdp before we get our log
2964         * reservation.  The reason for this is that we cannot call
2965         * xfs_iget for an inode for which we do not have a reference
2966         * once we've acquired a log reservation.  This is because the
2967         * inode we are trying to get might be in xfs_inactive, itself
2968         * going for a log reservation.  Since we'll have to wait for the
2969         * inactive code to complete before returning from xfs_iget,
2970         * we need to make sure that we don't have log space reserved
2971         * when we call xfs_iget.  Instead we get an unlocked reference
2972         * to the inode before getting our log reservation.
2973         */
2974        error = xfs_get_dir_entry(dentry, &cdp);
2975        if (error) {
2976                REMOVE_DEBUG_TRACE(__LINE__);
2977                goto std_return;
2978        }
2979        mp = dp->i_mount;
2980        dm_di_mode = cdp->i_d.di_mode;
2981
2982        /*
2983         * Get the dquots for the inodes.
2984         */
2985        error = XFS_QM_DQATTACH(mp, dp, 0);
2986        if (!error && dp != cdp)
2987                error = XFS_QM_DQATTACH(mp, cdp, 0);
2988        if (error) {
2989                IRELE(cdp);
2990                REMOVE_DEBUG_TRACE(__LINE__);
2991                goto std_return;
2992        }
2993
2994        tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR);
2995        cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
2996        /*
2997         * We try to get the real space reservation first,
2998         * allowing for directory btree deletion(s) implying
2999         * possible bmap insert(s).  If we can't get the space
3000         * reservation then we use 0 instead, and avoid the bmap
3001         * btree insert(s) in the directory code by, if the bmap
3002         * insert tries to happen, instead trimming the LAST
3003         * block from the directory.
3004         */
3005        resblks = XFS_REMOVE_SPACE_RES(mp);
3006        error = xfs_trans_reserve(tp, resblks, XFS_REMOVE_LOG_RES(mp), 0,
3007                        XFS_TRANS_PERM_LOG_RES, XFS_DEFAULT_LOG_COUNT);
3008        if (error == ENOSPC) {
3009                resblks = 0;
3010                error = xfs_trans_reserve(tp, 0, XFS_REMOVE_LOG_RES(mp), 0,
3011                                XFS_TRANS_PERM_LOG_RES, XFS_DEFAULT_LOG_COUNT);
3012        }
3013        if (error) {
3014                ASSERT(error != ENOSPC);
3015                cancel_flags = 0;
3016                IRELE(cdp);
3017                goto error_return;
3018        }
3019        XFS_BMAP_INIT(&free_list, &first_block);
3020
3021        /*
3022         * Now lock the child directory inode and the parent directory
3023         * inode in the proper order.  This will take care of validating
3024         * that the directory entry for the child directory inode has
3025         * not changed while we were obtaining a log reservation.
3026         */
3027        error = xfs_lock_dir_and_entry(dp, cdp);
3028        if (error) {
3029                xfs_trans_cancel(tp, cancel_flags);
3030                IRELE(cdp);
3031                goto std_return;
3032        }
3033
3034        xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
3035        if (dp != cdp) {
3036                /*
3037                 * Only increment the parent directory vnode count if
3038                 * we didn't bump it in looking up cdp.  The only time
3039                 * we don't bump it is when we're looking up ".".
3040                 */
3041                VN_HOLD(dir_vp);
3042        }
3043
3044        ITRACE(cdp);
3045        xfs_trans_ijoin(tp, cdp, XFS_ILOCK_EXCL);
3046
3047        ASSERT(cdp->i_d.di_nlink >= 2);
3048        if (cdp->i_d.di_nlink != 2) {
3049                error = XFS_ERROR(ENOTEMPTY);
3050                goto error_return;
3051        }
3052        if (!xfs_dir_isempty(cdp)) {
3053                error = XFS_ERROR(ENOTEMPTY);
3054                goto error_return;
3055        }
3056
3057        error = xfs_dir_removename(tp, dp, name, namelen, cdp->i_ino,
3058                                        &first_block, &free_list, resblks);
3059        if (error)
3060                goto error1;
3061
3062        xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
3063
3064        /*
3065         * Bump the in-memory generation count on the parent
3066         * directory so that other processes can see that it has changed.
3067         */
3068        dp->i_gen++;
3069
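            /*
             * An empty directory is pinned by three links: the parent's
             * extra count for the child's "..", the parent's entry pointing
             * at the child, and the child's own "." entry.  Drop all three
             * so the child's link count reaches zero.
             */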
3070        /*
3071         * Drop the link from cdp's "..".
3072         */
3073        error = xfs_droplink(tp, dp);
3074        if (error) {
3075                goto error1;
3076        }
3077
3078        /*
3079         * Drop the link from dp to cdp.
3080         */
3081        error = xfs_droplink(tp, cdp);
3082        if (error) {
3083                goto error1;
3084        }
3085
3086        /*
3087         * Drop the "." link from cdp to self.
3088         */
3089        error = xfs_droplink(tp, cdp);
3090        if (error) {
3091                goto error1;
3092        }
3093
3094        /* Determine these before committing transaction */
3095        last_cdp_link = (cdp->i_d.di_nlink == 0);
3096
3097        /*
3098         * Take an extra ref on the child vnode so that it
3099         * does not go to xfs_inactive() from within the commit.
3100         */
3101        IHOLD(cdp);
3102
3103        /*
3104         * If this is a synchronous mount, make sure that the
3105         * rmdir transaction goes to disk before returning to
3106         * the user.
3107         */
3108        if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
3109                xfs_trans_set_sync(tp);
3110        }
3111
3112        error = xfs_bmap_finish(&tp, &free_list, &committed);
3113        if (error) {
3114                xfs_bmap_cancel(&free_list);
3115                xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES |
3116                                 XFS_TRANS_ABORT));
3117                IRELE(cdp);
3118                goto std_return;
3119        }
3120
3121        error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
3122        if (error) {
3123                IRELE(cdp);
3124                goto std_return;
3125        }
3126
3127
3128        IRELE(cdp);
3129
3130        /* Fall through to std_return with error = 0 or the errno
3131         * from xfs_trans_commit. */
3132 std_return:
3133        if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTREMOVE)) {
3134                (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE,
3135                                        dir_vp, DM_RIGHT_NULL,
3136                                        NULL, DM_RIGHT_NULL,
3137                                        name, NULL, dm_di_mode,
3138                                        error, 0);
3139        }
3140        return error;
3141
3142 error1:
3143        xfs_bmap_cancel(&free_list);
3144        cancel_flags |= XFS_TRANS_ABORT;
3145        /* FALLTHROUGH */
3146
3147 error_return:
3148        xfs_trans_cancel(tp, cancel_flags);
3149        goto std_return;
3150}
3151
3152int
3153xfs_symlink(
3154        xfs_inode_t             *dp,
3155        bhv_vname_t             *dentry,
3156        char                    *target_path,
3157        mode_t                  mode,
3158        bhv_vnode_t             **vpp,
3159        cred_t                  *credp)
3160{
3161        bhv_vnode_t             *dir_vp = XFS_ITOV(dp);
3162        xfs_mount_t             *mp = dp->i_mount;
3163        xfs_trans_t             *tp;
3164        xfs_inode_t             *ip;
3165        int                     error;
3166        int                     pathlen;
3167        xfs_bmap_free_t         free_list;
3168        xfs_fsblock_t           first_block;
3169        boolean_t               unlock_dp_on_error = B_FALSE;
3170        uint                    cancel_flags;
3171        int                     committed;
3172        xfs_fileoff_t           first_fsb;
3173        xfs_filblks_t           fs_blocks;
3174        int                     nmaps;
3175        xfs_bmbt_irec_t         mval[SYMLINK_MAPS];
3176        xfs_daddr_t             d;
3177        char                    *cur_chunk;
3178        int                     byte_cnt;
3179        int                     n;
3180        xfs_buf_t               *bp;
3181        xfs_prid_t              prid;
3182        struct xfs_dquot        *udqp, *gdqp;
3183        uint                    resblks;
3184        char                    *link_name = VNAME(dentry);
3185        int                     link_namelen;
3186
3187        *vpp = NULL;
3188        error = 0;
3189        ip = NULL;
3190        tp = NULL;
3191
3192        vn_trace_entry(dp, __FUNCTION__, (inst_t *)__return_address);
3193
3194
3195        if (XFS_FORCED_SHUTDOWN(mp))
3196                return XFS_ERROR(EIO);
3197
3198        link_namelen = VNAMELEN(dentry);
3199
3200        /*
3201         * Check component lengths of the target path name.
3202         */
3203        pathlen = strlen(target_path);
3204        if (pathlen >= MAXPATHLEN)      /* total string too long */
3205                return XFS_ERROR(ENAMETOOLONG);
3206        if (pathlen >= MAXNAMELEN) {    /* is any component too long? */
3207                int len, total;
3208                char *path;
3209
3210                for (total = 0, path = target_path; total < pathlen;) {
3211                        /*
3212                         * Skip any slashes.
3213                         */
3214                        while (*path == '/') {
3215                                total++;
3216                                path++;
3217                        }
3218
3219                        /*
3220                         * Count up to the next slash or end of path.
3221                         * Error out if the component is bigger than MAXNAMELEN.
3222                         */
3223                        for (len = 0; *path != '/' && total < pathlen; total++, path++) {
3224                                if (++len >= MAXNAMELEN) {
3225                                        error = ENAMETOOLONG;
3226                                        return error;
3227                                }
3228                        }
3229                }
3230        }
3231
3232        if (DM_EVENT_ENABLED(dp, DM_EVENT_SYMLINK)) {
3233                error = XFS_SEND_NAMESP(mp, DM_EVENT_SYMLINK, dir_vp,
3234                                        DM_RIGHT_NULL, NULL, DM_RIGHT_NULL,
3235                                        link_name, target_path, 0, 0, 0);
3236                if (error)
3237                        return error;
3238        }
3239
3240        /* Return through std_return after this point. */
3241
3242        udqp = gdqp = NULL;
3243        if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
3244                prid = dp->i_d.di_projid;
3245        else
3246                prid = (xfs_prid_t)dfltprid;
3247
3248        /*
3249         * Make sure that we have allocated dquot(s) on disk.
3250         */
3251        error = XFS_QM_DQVOPALLOC(mp, dp,
3252                        current_fsuid(credp), current_fsgid(credp), prid,
3253                        XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
3254        if (error)
3255                goto std_return;
3256
3257        tp = xfs_trans_alloc(mp, XFS_TRANS_SYMLINK);
3258        cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
3259        /*
3260         * Will the symlink target fit into the inode's data fork?
3261         * No attributes exist yet, so the whole literal area is available.
3262         */
3263        if (pathlen <= XFS_LITINO(mp))
3264                fs_blocks = 0;
3265        else
3266                fs_blocks = XFS_B_TO_FSB(mp, pathlen);
3267        resblks = XFS_SYMLINK_SPACE_RES(mp, link_namelen, fs_blocks);
3268        error = xfs_trans_reserve(tp, resblks, XFS_SYMLINK_LOG_RES(mp), 0,
3269                        XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT);
3270        if (error == ENOSPC && fs_blocks == 0) {
3271                resblks = 0;
3272                error = xfs_trans_reserve(tp, 0, XFS_SYMLINK_LOG_RES(mp), 0,
3273                                XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT);
3274        }
3275        if (error) {
3276                cancel_flags = 0;
3277                goto error_return;
3278        }
3279
3280        xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
3281        unlock_dp_on_error = B_TRUE;
3282
3283        /*
3284         * Check whether the directory allows new symlinks or not.
3285         */
3286        if (dp->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) {
3287                error = XFS_ERROR(EPERM);
3288                goto error_return;
3289        }
3290
3291        /*
3292         * Reserve disk quota: blocks and inode.
3293         */
3294        error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0);
3295        if (error)
3296                goto error_return;
3297
3298        /*
3299         * If no space was reserved, check that the new directory entry will fit.
3300         */
3301        if (resblks == 0 &&
3302            (error = xfs_dir_canenter(tp, dp, link_name, link_namelen)))
3303                goto error_return;
3304        /*
3305         * Initialize the bmap freelist prior to calling either
3306         * bmapi or the directory create code.
3307         */
3308        XFS_BMAP_INIT(&free_list, &first_block);
3309
3310        /*
3311         * Allocate an inode for the symlink.
3312         */
3313        error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT),
3314                               1, 0, credp, prid, resblks > 0, &ip, NULL);
3315        if (error) {
3316                if (error == ENOSPC)
3317                        goto error_return;
3318                goto error1;
3319        }
3320        ITRACE(ip);
3321
3322        /*
3323         * An error after we've joined dp to the transaction will result in the
3324         * transaction cancel unlocking dp so don't do it explicitly in the
3325         * error path.
3326         */
3327        VN_HOLD(dir_vp);
3328        xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
3329        unlock_dp_on_error = B_FALSE;
3330
3331        /*
3332         * Also attach the dquot(s) to it, if applicable.
3333         */
3334        XFS_QM_DQVOPCREATE(mp, tp, ip, udqp, gdqp);
3335
3336        if (resblks)
3337                resblks -= XFS_IALLOC_SPACE_RES(mp);
3338        /*
3339         * If the symlink will fit into the inode, write it inline.
3340         */
3341        if (pathlen <= XFS_IFORK_DSIZE(ip)) {
3342                xfs_idata_realloc(ip, pathlen, XFS_DATA_FORK);
3343                memcpy(ip->i_df.if_u1.if_data, target_path, pathlen);
3344                ip->i_d.di_size = pathlen;
3345
3346                /*
3347                 * The inode was initially created in extent format.
3348                 */
3349                ip->i_df.if_flags &= ~(XFS_IFEXTENTS | XFS_IFBROOT);
3350                ip->i_df.if_flags |= XFS_IFINLINE;
3351
3352                ip->i_d.di_format = XFS_DINODE_FMT_LOCAL;
3353                xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE);
3354
3355        } else {
3356                first_fsb = 0;
3357                nmaps = SYMLINK_MAPS;
3358
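                    /*
                     * The target does not fit inline: allocate real blocks,
                     * then copy the path into each mapped buffer a chunk at
                     * a time and log the buffers so the contents can be
                     * replayed from the log.
                     */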
3359                error = xfs_bmapi(tp, ip, first_fsb, fs_blocks,
3360                                  XFS_BMAPI_WRITE | XFS_BMAPI_METADATA,
3361                                  &first_block, resblks, mval, &nmaps,
3362                                  &free_list, NULL);
3363                if (error) {
3364                        goto error1;
3365                }
3366
3367                if (resblks)
3368                        resblks -= fs_blocks;
3369                ip->i_d.di_size = pathlen;
3370                xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
3371
3372                cur_chunk = target_path;
3373                for (n = 0; n < nmaps; n++) {
3374                        d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
3375                        byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
3376                        bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
3377                                               BTOBB(byte_cnt), 0);
3378                        ASSERT(bp && !XFS_BUF_GETERROR(bp));
3379                        if (pathlen < byte_cnt) {
3380                                byte_cnt = pathlen;
3381                        }
3382                        pathlen -= byte_cnt;
3383
3384                        memcpy(XFS_BUF_PTR(bp), cur_chunk, byte_cnt);
3385                        cur_chunk += byte_cnt;
3386
3387                        xfs_trans_log_buf(tp, bp, 0, byte_cnt - 1);
3388                }
3389        }
3390
3391        /*
3392         * Create the directory entry for the symlink.
3393         */
3394        error = xfs_dir_createname(tp, dp, link_name, link_namelen, ip->i_ino,
3395                                   &first_block, &free_list, resblks);
3396        if (error)
3397                goto error1;
3398        xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
3399        xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
3400
3401        /*
3402         * Bump the in-memory version number of the parent directory
3403         * so that other processes accessing it will recognize that
3404         * the directory has changed.
3405         */
3406        dp->i_gen++;
3407
3408        /*
3409         * If this is a synchronous mount, make sure that the
3410         * symlink transaction goes to disk before returning to
3411         * the user.
3412         */
3413        if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
3414                xfs_trans_set_sync(tp);
3415        }
3416
3417        /*
3418         * xfs_trans_commit normally decrements the vnode ref count
3419         * when it unlocks the inode. Since we want to return the
3420         * vnode to the caller, we bump the vnode ref count now.
3421         */
3422        IHOLD(ip);
3423
3424        error = xfs_bmap_finish(&tp, &free_list, &committed);
3425        if (error) {
3426                goto error2;
3427        }
3428        error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
3429        XFS_QM_DQRELE(mp, udqp);
3430        XFS_QM_DQRELE(mp, gdqp);
3431
3432        /* Fall through to std_return with error = 0 or errno from
3433         * xfs_trans_commit. */
3434std_return:
3435        if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTSYMLINK)) {
3436                (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTSYMLINK,
3437                                        dir_vp, DM_RIGHT_NULL,
3438                                        error ? NULL : XFS_ITOV(ip),
3439                                        DM_RIGHT_NULL, link_name, target_path,
3440                                        0, error, 0);
3441        }
3442
3443        if (!error) {
3444                bhv_vnode_t *vp;
3445
3446                ASSERT(ip);
3447                vp = XFS_ITOV(ip);
3448                *vpp = vp;
3449        }
3450        return error;
3451
3452 error2:
3453        IRELE(ip);
3454 error1:
3455        xfs_bmap_cancel(&free_list);
3456        cancel_flags |= XFS_TRANS_ABORT;
3457 error_return:
3458        xfs_trans_cancel(tp, cancel_flags);
3459        XFS_QM_DQRELE(mp, udqp);
3460        XFS_QM_DQRELE(mp, gdqp);
3461
3462        if (unlock_dp_on_error)
3463                xfs_iunlock(dp, XFS_ILOCK_EXCL);
3464
3465        goto std_return;
3466}
3467
3468
3469int
3470xfs_fid2(
3471        xfs_inode_t     *ip,
3472        xfs_fid_t       *xfid)
3473{
3474        vn_trace_entry(ip, __FUNCTION__, (inst_t *)__return_address);
3475
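            /*
             * fid_len counts only the bytes that follow the length field
             * itself, hence the subtraction below.
             */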
3476        xfid->fid_len = sizeof(xfs_fid_t) - sizeof(xfid->fid_len);
3477        xfid->fid_pad = 0;
3478        /*
3479         * Use memcpy because the inode number is a long long and there's no
3480         * assurance that xfid->fid_ino is properly aligned.
3481         */
3482        memcpy(&xfid->fid_ino, &ip->i_ino, sizeof(xfid->fid_ino));
3483        xfid->fid_gen = ip->i_d.di_gen;
3484
3485        return 0;
3486}
3487
3488
3489int
3490xfs_rwlock(
3491        xfs_inode_t     *ip,
3492        bhv_vrwlock_t   locktype)
3493{
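            /*
             * Directories succeed without taking the I/O lock: that lock
             * only orders file data I/O, and directory updates are already
             * serialised by the inode lock within each transaction.
             */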
3494        if (S_ISDIR(ip->i_d.di_mode))
3495                return 1;
3496        if (locktype == VRWLOCK_WRITE) {
3497                xfs_ilock(ip, XFS_IOLOCK_EXCL);
3498        } else if (locktype == VRWLOCK_TRY_READ) {
3499                return xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED);
3500        } else if (locktype == VRWLOCK_TRY_WRITE) {
3501                return xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL);
3502        } else {
3503                ASSERT((locktype == VRWLOCK_READ) ||
3504                       (locktype == VRWLOCK_WRITE_DIRECT));
3505                xfs_ilock(ip, XFS_IOLOCK_SHARED);
3506        }
3507
3508        return 1;
3509}
3510
3511
3512void
3513xfs_rwunlock(
3514        xfs_inode_t     *ip,
3515        bhv_vrwlock_t   locktype)
3516{
3517        if (S_ISDIR(ip->i_d.di_mode))
3518                return;
3519        if (locktype == VRWLOCK_WRITE) {
3520                /*
3521                 * In the write case, we may have added a new entry to
3522                 * the reference cache.  This might store a pointer to
3523                 * an inode to be released in this inode.  If it is there,
3524                 * clear the pointer and release the inode after unlocking
3525                 * this one.
3526                 */
3527                xfs_refcache_iunlock(ip, XFS_IOLOCK_EXCL);
3528        } else {
3529                ASSERT((locktype == VRWLOCK_READ) ||
3530                       (locktype == VRWLOCK_WRITE_DIRECT));
3531                xfs_iunlock(ip, XFS_IOLOCK_SHARED);
3532        }
3533        return;
3534}
3535
3536
3537int
3538xfs_inode_flush(
3539        xfs_inode_t     *ip,
3540        int             flags)
3541{
3542        xfs_mount_t     *mp = ip->i_mount;
3543        xfs_inode_log_item_t *iip = ip->i_itemp;
3544        int             error = 0;
3545
3546        if (XFS_FORCED_SHUTDOWN(mp))
3547                return XFS_ERROR(EIO);
3548
3549        /*
3550         * Bypass inodes which have already been cleaned by
3551         * the inode flush clustering code inside xfs_iflush
3552         */
3553        if ((ip->i_update_core == 0) &&
3554            ((iip == NULL) || !(iip->ili_format.ilf_fields & XFS_ILOG_ALL)))
3555                return 0;
3556
3557        if (flags & FLUSH_LOG) {
3558                if (iip && iip->ili_last_lsn) {
3559                        xlog_t          *log = mp->m_log;
3560                        xfs_lsn_t       sync_lsn;
3561                        int             s, log_flags = XFS_LOG_FORCE;
3562
3563                        s = GRANT_LOCK(log);
3564                        sync_lsn = log->l_last_sync_lsn;
3565                        GRANT_UNLOCK(log, s);
3566
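                            /*
                             * Only force the log if this inode was modified
                             * (ili_last_lsn) after the last log sync; anything
                             * at or below l_last_sync_lsn is already on disk.
                             */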
3567                        if ((XFS_LSN_CMP(iip->ili_last_lsn, sync_lsn) > 0)) {
3568                                if (flags & FLUSH_SYNC)
3569                                        log_flags |= XFS_LOG_SYNC;
3570                                error = xfs_log_force(mp, iip->ili_last_lsn, log_flags);
3571                                if (error)
3572                                        return error;
3573                        }
3574
3575                        if (ip->i_update_core == 0)
3576                                return 0;
3577                }
3578        }
3579
3580        /*
3581         * We make this non-blocking if the inode is contended and
3582         * return EAGAIN to indicate to the caller that the flush
3583         * did not succeed.  This prevents the flush path from
3584         * blocking on inodes that are inside another operation right
3585         * now; they get caught later by xfs_sync.
3586         */
3587        if (flags & FLUSH_INODE) {
3588                int     flush_flags;
3589
3590                if (flags & FLUSH_SYNC) {
3591                        xfs_ilock(ip, XFS_ILOCK_SHARED);
3592                        xfs_iflock(ip);
3593                } else if (xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
3594                        if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip)) {
3595                                xfs_iunlock(ip, XFS_ILOCK_SHARED);
3596                                return EAGAIN;
3597                        }
3598                } else {
3599                        return EAGAIN;
3600                }
3601
3602                if (flags & FLUSH_SYNC)
3603                        flush_flags = XFS_IFLUSH_SYNC;
3604                else
3605                        flush_flags = XFS_IFLUSH_ASYNC;
3606
3607                error = xfs_iflush(ip, flush_flags);
3608                xfs_iunlock(ip, XFS_ILOCK_SHARED);
3609        }
3610
3611        return error;
3612}
3613
3614
3615int
3616xfs_set_dmattrs(
3617        xfs_inode_t     *ip,
3618        u_int           evmask,
3619        u_int16_t       state)
3620{
3621        xfs_mount_t     *mp = ip->i_mount;
3622        xfs_trans_t     *tp;
3623        int             error;
3624
3625        if (!capable(CAP_SYS_ADMIN))
3626                return XFS_ERROR(EPERM);
3627
3628        if (XFS_FORCED_SHUTDOWN(mp))
3629                return XFS_ERROR(EIO);
3630
3631        tp = xfs_trans_alloc(mp, XFS_TRANS_SET_DMATTRS);
3632        error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0);
3633        if (error) {
3634                xfs_trans_cancel(tp, 0);
3635                return error;
3636        }
3637        xfs_ilock(ip, XFS_ILOCK_EXCL);
3638        xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
3639
3640        ip->i_iocore.io_dmevmask = ip->i_d.di_dmevmask = evmask;
3641        ip->i_iocore.io_dmstate  = ip->i_d.di_dmstate  = state;
3642
3643        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
3644        IHOLD(ip);
3645        error = xfs_trans_commit(tp, 0);
3646
3647        return error;
3648}
3649
3650int
3651xfs_reclaim(
3652        xfs_inode_t     *ip)
3653{
3654        bhv_vnode_t     *vp = XFS_ITOV(ip);
3655
3656        vn_trace_entry(ip, __FUNCTION__, (inst_t *)__return_address);
3657
3658        ASSERT(!VN_MAPPED(vp));
3659
3660        /* bad inode, get out here ASAP */
3661        if (VN_BAD(vp)) {
3662                xfs_ireclaim(ip);
3663                return 0;
3664        }
3665
3666        vn_iowait(ip);
3667
3668        ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0);
3669
3670        /*
3671         * Make sure the atime in the XFS inode is correct before freeing the
3672         * Linux inode.
3673         */
3674        xfs_synchronize_atime(ip);
3675
3676        /*
3677         * If we have nothing to flush with this inode then complete the
3678         * teardown now, otherwise break the link between the xfs inode and the
3679         * linux inode and clean up the xfs inode later. This avoids flushing
3680         * the inode to disk during the delete operation itself.
3681         *
3682         * When breaking the link, we need to set the XFS_IRECLAIMABLE flag
3683         * first to ensure that xfs_iunpin() will never see an xfs inode
3684         * that has a linux inode being reclaimed. Synchronisation is provided
3685         * by the i_flags_lock.
3686         */
3687        if (!ip->i_update_core && (ip->i_itemp == NULL)) {
3688                xfs_ilock(ip, XFS_ILOCK_EXCL);
3689                xfs_iflock(ip);
3690                return xfs_finish_reclaim(ip, 1, XFS_IFLUSH_DELWRI_ELSE_SYNC);
3691        } else {
3692                xfs_mount_t     *mp = ip->i_mount;
3693
3694                /* Protect sync and unpin from us */
3695                XFS_MOUNT_ILOCK(mp);
3696                spin_lock(&ip->i_flags_lock);
3697                __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
3698                vn_to_inode(vp)->i_private = NULL;
3699                ip->i_vnode = NULL;
3700                spin_unlock(&ip->i_flags_lock);
3701                list_add_tail(&ip->i_reclaim, &mp->m_del_inodes);
3702                XFS_MOUNT_IUNLOCK(mp);
3703        }
3704        return 0;
3705}
3706
3707int
3708xfs_finish_reclaim(
3709        xfs_inode_t     *ip,
3710        int             locked,
3711        int             sync_mode)
3712{
3713        xfs_perag_t     *pag = xfs_get_perag(ip->i_mount, ip->i_ino);
3714        bhv_vnode_t     *vp = XFS_ITOV_NULL(ip);
3715        int             error;
3716
3717        if (vp && VN_BAD(vp))
3718                goto reclaim;
3719
3720        /* The hash lock here protects a thread in xfs_iget_core from
3721         * racing with us on linking the inode back with a vnode.
3722         * Once we have the XFS_IRECLAIM flag set it will not touch
3723         * us.
3724         */
3725        write_lock(&pag->pag_ici_lock);
3726        spin_lock(&ip->i_flags_lock);
3727        if (__xfs_iflags_test(ip, XFS_IRECLAIM) ||
3728            (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) && vp == NULL)) {
3729                spin_unlock(&ip->i_flags_lock);
3730                write_unlock(&pag->pag_ici_lock);
3731                if (locked) {
3732                        xfs_ifunlock(ip);
3733                        xfs_iunlock(ip, XFS_ILOCK_EXCL);
3734                }
3735                return 1;
3736        }
3737        __xfs_iflags_set(ip, XFS_IRECLAIM);
3738        spin_unlock(&ip->i_flags_lock);
3739        write_unlock(&pag->pag_ici_lock);
3740        xfs_put_perag(ip->i_mount, pag);
3741
3742        /*
3743         * If the inode is still dirty, then flush it out.  If the inode
3744         * is not in the AIL, then it will be OK to flush it delwri as
3745         * long as xfs_iflush() does not keep any references to the inode.
3746         * We leave that decision up to xfs_iflush() since it has the
3747         * knowledge of whether it's OK to simply do a delwri flush of
3748         * the inode or whether we need to wait until the inode is
3749         * pulled from the AIL.
3750         * We get the flush lock regardless, though, just to make sure
3751         * we don't free it while it is being flushed.
3752         */
3753        if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
3754                if (!locked) {
3755                        xfs_ilock(ip, XFS_ILOCK_EXCL);
3756                        xfs_iflock(ip);
3757                }
3758
3759                if (ip->i_update_core ||
3760                    ((ip->i_itemp != NULL) &&
3761                     (ip->i_itemp->ili_format.ilf_fields != 0))) {
3762                        error = xfs_iflush(ip, sync_mode);
3763                        /*
3764                         * If we hit an error, typically because of filesystem
3765                         * shutdown, we don't need to let vn_reclaim know
3766                         * because we're going to reclaim the inode anyway.
3767                         */
3768                        if (error) {
3769                                xfs_iunlock(ip, XFS_ILOCK_EXCL);
3770                                goto reclaim;
3771                        }
3772                        xfs_iflock(ip); /* synchronize with xfs_iflush_done */
3773                }
3774
3775                ASSERT(ip->i_update_core == 0);
3776                ASSERT(ip->i_itemp == NULL ||
3777                       ip->i_itemp->ili_format.ilf_fields == 0);
3778                xfs_iunlock(ip, XFS_ILOCK_EXCL);
3779        } else if (locked) {
3780                /*
3781                 * We are not interested in doing an iflush if we're
3782                 * in the process of shutting down the filesystem forcibly.
3783                 * So, just reclaim the inode.
3784                 */
3785                xfs_ifunlock(ip);
3786                xfs_iunlock(ip, XFS_ILOCK_EXCL);
3787        }
3788
3789 reclaim:
3790        xfs_ireclaim(ip);
3791        return 0;
3792}
3793
3794int
3795xfs_finish_reclaim_all(xfs_mount_t *mp, int noblock)
3796{
3797        int             purged;
3798        xfs_inode_t     *ip, *n;
3799        int             done = 0;
3800
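            /*
             * Walk m_del_inodes repeatedly: the mount lock must be dropped
             * to reclaim an inode, and the list can change while it is
             * unlocked, so restart the scan after each reclaim.  A full
             * pass that purges nothing means we are done.
             */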
3801        while (!done) {
3802                purged = 0;
3803                XFS_MOUNT_ILOCK(mp);
3804                list_for_each_entry_safe(ip, n, &mp->m_del_inodes, i_reclaim) {
3805                        if (noblock) {
3806                                if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL) == 0)
3807                                        continue;
3808                                if (xfs_ipincount(ip) ||
3809                                    !xfs_iflock_nowait(ip)) {
3810                                        xfs_iunlock(ip, XFS_ILOCK_EXCL);
3811                                        continue;
3812                                }
3813                        }
3814                        XFS_MOUNT_IUNLOCK(mp);
3815                        if (xfs_finish_reclaim(ip, noblock,
3816                                        XFS_IFLUSH_DELWRI_ELSE_ASYNC))
3817                                delay(1);
3818                        purged = 1;
3819                        break;
3820                }
3821
3822                done = !purged;
3823        }
3824
3825        XFS_MOUNT_IUNLOCK(mp);
3826        return 0;
3827}
3828
3829/*
3830 * xfs_alloc_file_space()
3831 *      This routine allocates disk space for the given file.
3832 *
3833 *      If alloc_type == 0, this request is for an ALLOCSP type
3834 *      request which will change the file size.  In this case, no
3835 *      DMAPI event will be generated by the call.  A TRUNCATE event
3836 *      will be generated later by xfs_setattr.
3837 *
3838 *      If alloc_type != 0, this request is for a RESVSP type
3839 *      request, and a DMAPI DM_EVENT_WRITE will be generated if the
3840 *      lower block boundary byte address is less than the file's
3841 *      length.
3842 *
3843 * RETURNS:
3844 *       0 on success
3845 *      errno on error
3846 *
3847 */
3848STATIC int
3849xfs_alloc_file_space(
3850        xfs_inode_t             *ip,
3851        xfs_off_t               offset,
3852        xfs_off_t               len,
3853        int                     alloc_type,
3854        int                     attr_flags)
3855{
3856        xfs_mount_t             *mp = ip->i_mount;
3857        xfs_off_t               count;
3858        xfs_filblks_t           allocated_fsb;
3859        xfs_filblks_t           allocatesize_fsb;
3860        xfs_extlen_t            extsz, temp;
3861        xfs_fileoff_t           startoffset_fsb;
3862        xfs_fsblock_t           firstfsb;
3863        int                     nimaps;
3864        int                     bmapi_flag;
3865        int                     quota_flag;
3866        int                     rt;
3867        xfs_trans_t             *tp;
3868        xfs_bmbt_irec_t         imaps[1], *imapp;
3869        xfs_bmap_free_t         free_list;
3870        uint                    qblocks, resblks, resrtextents;
3871        int                     committed;
3872        int                     error;
3873
3874        vn_trace_entry(ip, __FUNCTION__, (inst_t *)__return_address);
3875
3876        if (XFS_FORCED_SHUTDOWN(mp))
3877                return XFS_ERROR(EIO);
3878
3879        if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
3880                return error;
3881
3882        if (len <= 0)
3883                return XFS_ERROR(EINVAL);
3884
3885        rt = XFS_IS_REALTIME_INODE(ip);
3886        extsz = xfs_get_extsz_hint(ip);
3887
3888        count = len;
3889        imapp = &imaps[0];
3890        nimaps = 1;
3891        bmapi_flag = XFS_BMAPI_WRITE | (alloc_type ? XFS_BMAPI_PREALLOC : 0);
3892        startoffset_fsb = XFS_B_TO_FSBT(mp, offset);
3893        allocatesize_fsb = XFS_B_TO_FSB(mp, count);
3894
3895        /* Generate a DMAPI event if needed. */
3896        if (alloc_type != 0 && offset < ip->i_size &&
3897                        (attr_flags&ATTR_DMI) == 0  &&
3898                        DM_EVENT_ENABLED(ip, DM_EVENT_WRITE)) {
3899                xfs_off_t           end_dmi_offset;
3900
3901                end_dmi_offset = offset + len;
3902                if (end_dmi_offset > ip->i_size)
3903                        end_dmi_offset = ip->i_size;
3904                error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, XFS_ITOV(ip),
3905                        offset, end_dmi_offset - offset,
3906                        0, NULL);
3907                if (error)
3908                        return error;
3909        }
3910
3911        /*
3912         * Allocate file space until done or until there is an error
3913         */
3914retry:
3915        while (allocatesize_fsb && !error) {
3916                xfs_fileoff_t   s, e;
3917
3918                /*
3919                 * Determine space reservations for data/realtime.
3920                 */
3921                if (unlikely(extsz)) {
3922                        s = startoffset_fsb;
3923                        do_div(s, extsz);
3924                        s *= extsz;
3925                        e = startoffset_fsb + allocatesize_fsb;
3926                        if ((temp = do_mod(startoffset_fsb, extsz)))
3927                                e += temp;
3928                        if ((temp = do_mod(e, extsz)))
3929                                e += extsz - temp;
3930                } else {
3931                        s = 0;
3932                        e = allocatesize_fsb;
3933                }
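                    /*
                     * Illustrative numbers: with extsz = 4, startoffset_fsb = 5
                     * and allocatesize_fsb = 3, s rounds down to 4 and e rounds
                     * up to 12, so the reservation below conservatively covers
                     * two full extent-size units.
                     */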
3934
3935                if (unlikely(rt)) {
3936                        resrtextents = qblocks = (uint)(e - s);
3937                        resrtextents /= mp->m_sb.sb_rextsize;
3938                        resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
3939                        quota_flag = XFS_QMOPT_RES_RTBLKS;
3940                } else {
3941                        resrtextents = 0;
3942                        resblks = qblocks =
3943                                XFS_DIOSTRAT_SPACE_RES(mp, (uint)(e - s));
3944                        quota_flag = XFS_QMOPT_RES_REGBLKS;
3945                }
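                    /*
                     * For realtime files the data blocks come from the
                     * realtime device, so resblks covers only btree/metadata
                     * headroom and the extents are reserved separately in
                     * rt-extent units; quota is charged in rt blocks rather
                     * than regular blocks.
                     */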
3946
3947                /*
3948                 * Allocate and setup the transaction.
3949                 */
3950                tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
3951                error = xfs_trans_reserve(tp, resblks,
3952                                          XFS_WRITE_LOG_RES(mp), resrtextents,
3953                                          XFS_TRANS_PERM_LOG_RES,
3954                                          XFS_WRITE_LOG_COUNT);
3955                /*
3956                 * Check for running out of space
3957                 */
3958                if (error) {
3959                        /*
3960                         * Free the transaction structure.
3961                         */
3962                        ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
3963                        xfs_trans_cancel(tp, 0);
3964                        break;
3965                }
3966                xfs_ilock(ip, XFS_ILOCK_EXCL);
3967                error = XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, tp, ip,
3968                                                      qblocks, 0, quota_flag);
3969                if (error)
3970                        goto error1;
3971
3972                xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
3973                xfs_trans_ihold(tp, ip);
3974
3975                /*
3976                 * Issue the xfs_bmapi() call to allocate the blocks
3977                 */
3978                XFS_BMAP_INIT(&free_list, &firstfsb);
3979                error = XFS_BMAPI(mp, tp, &ip->i_iocore, startoffset_fsb,
3980                                  allocatesize_fsb, bmapi_flag,
3981                                  &firstfsb, 0, imapp, &nimaps,
3982                                  &free_list, NULL);
3983                if (error) {
3984                        goto error0;
3985                }
3986
3987                /*
3988                 * Complete the transaction
3989                 */
3990                error = xfs_bmap_finish(&tp, &free_list, &committed);
3991                if (error) {
3992                        goto error0;
3993                }
3994
3995                error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
3996                xfs_iunlock(ip, XFS_ILOCK_EXCL);
3997                if (error) {
3998                        break;
3999                }
4000
4001                allocated_fsb = imapp->br_blockcount;
4002
4003                if (nimaps == 0) {
4004                        error = XFS_ERROR(ENOSPC);
4005                        break;
4006                }
4007
4008                startoffset_fsb += allocated_fsb;
4009                allocatesize_fsb -= allocated_fsb;
4010        }
4011dmapi_enospc_check:
4012        if (error == ENOSPC && (attr_flags & ATTR_DMI) == 0 &&
4013            DM_EVENT_ENABLED(ip, DM_EVENT_NOSPACE)) {
4014                error = XFS_SEND_NAMESP(mp, DM_EVENT_NOSPACE,
4015                                XFS_ITOV(ip), DM_RIGHT_NULL,
4016                                XFS_ITOV(ip), DM_RIGHT_NULL,
4017                                NULL, NULL, 0, 0, 0); /* Delay flag intentionally unused */
4018                if (error == 0)
4019                        goto retry;     /* Maybe DMAPI app. has made space */
4020                /* else fall through with error from XFS_SEND_DATA */
4021        }
4022
4023        return error;
4024
4025error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
4026        xfs_bmap_cancel(&free_list);
4027        XFS_TRANS_UNRESERVE_QUOTA_NBLKS(mp, tp, ip, qblocks, 0, quota_flag);
4028
4029error1: /* Just cancel transaction */
4030        xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
4031        xfs_iunlock(ip, XFS_ILOCK_EXCL);
4032        goto dmapi_enospc_check;
4033}
4034
4035/*
4036 * Zero file bytes between startoff and endoff inclusive.
4037 * The iolock is held exclusive and no blocks are buffered.
4038 */
4039STATIC int
4040xfs_zero_remaining_bytes(
4041        xfs_inode_t             *ip,
4042        xfs_off_t               startoff,
4043        xfs_off_t               endoff)
4044{
4045        xfs_bmbt_irec_t         imap;
4046        xfs_fileoff_t           offset_fsb;
4047        xfs_off_t               lastoffset;
4048        xfs_off_t               offset;
4049        xfs_buf_t               *bp;
4050        xfs_mount_t             *mp = ip->i_mount;
4051        int                     nimap;
4052        int                     error = 0;
4053
4054        bp = xfs_buf_get_noaddr(mp->m_sb.sb_blocksize,
4055                                ip->i_d.di_flags & XFS_DIFLAG_REALTIME ?
4056                                mp->m_rtdev_targp : mp->m_ddev_targp);
4057
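            /*
             * Walk the range one mapped block at a time.  For each block
             * that is neither a hole nor an unwritten extent, read it in,
             * zero the bytes that fall inside [startoff, endoff], and write
             * it back: a read-modify-write pass through one spare buffer.
             */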
4058        for (offset = startoff; offset <= endoff; offset = lastoffset + 1) {
4059                offset_fsb = XFS_B_TO_FSBT(mp, offset);
4060                nimap = 1;
4061                error = XFS_BMAPI(mp, NULL, &ip->i_iocore, offset_fsb, 1, 0,
4062                        NULL, 0, &imap, &nimap, NULL, NULL);
4063                if (error || nimap < 1)
4064                        break;
4065                ASSERT(imap.br_blockcount >= 1);
4066                ASSERT(imap.br_startoff == offset_fsb);
4067                lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1) - 1;
4068                if (lastoffset > endoff)
4069                        lastoffset = endoff;
4070                if (imap.br_startblock == HOLESTARTBLOCK)
4071                        continue;
4072                ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
4073                if (imap.br_state == XFS_EXT_UNWRITTEN)
4074                        continue;
4075                XFS_BUF_UNDONE(bp);
4076                XFS_BUF_UNWRITE(bp);
4077                XFS_BUF_READ(bp);
4078                XFS_BUF_SET_ADDR(bp, XFS_FSB_TO_DB(ip, imap.br_startblock));
4079                xfsbdstrat(mp, bp);
4080                if ((error = xfs_iowait(bp))) {
4081                        xfs_ioerror_alert("xfs_zero_remaining_bytes(read)",
4082                                          mp, bp, XFS_BUF_ADDR(bp));
4083                        break;
4084                }
4085                memset(XFS_BUF_PTR(bp) +
4086                        (offset - XFS_FSB_TO_B(mp, imap.br_startoff)),
4087                      0, lastoffset - offset + 1);
4088                XFS_BUF_UNDONE(bp);
4089                XFS_BUF_UNREAD(bp);
4090                XFS_BUF_WRITE(bp);
4091                xfsbdstrat(mp, bp);
4092                if ((error = xfs_iowait(bp))) {
4093                        xfs_ioerror_alert("xfs_zero_remaining_bytes(write)",
4094                                          mp, bp, XFS_BUF_ADDR(bp));
4095                        break;
4096                }
4097        }
4098        xfs_buf_free(bp);
4099        return error;
4100}
4101
4102/*
4103 * xfs_free_file_space()
4104 *      This routine frees disk space for the given file.
4105 *
4106 *      This routine is only called by xfs_change_file_space
4107 *      for an UNRESVSP type call.
4108 *
4109 * RETURNS:
4110 *       0 on success
4111 *      errno on error
4112 *
4113 */
4114STATIC int
4115xfs_free_file_space(
4116        xfs_inode_t             *ip,
4117        xfs_off_t               offset,
4118        xfs_off_t               len,
4119        int                     attr_flags)
4120{
4121        bhv_vnode_t             *vp;
4122        int                     committed;
4123        int                     done;
4124        xfs_off_t               end_dmi_offset;
4125        xfs_fileoff_t           endoffset_fsb;
4126        int                     error;
4127        xfs_fsblock_t           firstfsb;
4128        xfs_bmap_free_t         free_list;
4129        xfs_bmbt_irec_t         imap;
4130        xfs_off_t               ioffset;
4131        xfs_extlen_t            mod = 0;
4132        xfs_mount_t             *mp;
4133        int                     nimap;
4134        uint                    resblks;
4135        uint                    rounding;
4136        int                     rt;
4137        xfs_fileoff_t           startoffset_fsb;
4138        xfs_trans_t             *tp;
4139        int                     need_iolock = 1;
4140
4141        vp = XFS_ITOV(ip);
4142        mp = ip->i_mount;
4143
4144        vn_trace_entry(ip, __FUNCTION__, (inst_t *)__return_address);
4145
4146        if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
4147                return error;
4148
4149        error = 0;
4150        if (len <= 0)   /* if nothing being freed */
4151                return error;
4152        rt = (ip->i_d.di_flags & XFS_DIFLAG_REALTIME);
4153        startoffset_fsb = XFS_B_TO_FSB(mp, offset);
4154        end_dmi_offset = offset + len;
4155        endoffset_fsb = XFS_B_TO_FSBT(mp, end_dmi_offset);
4156
4157        if (offset < ip->i_size && (attr_flags & ATTR_DMI) == 0 &&
4158            DM_EVENT_ENABLED(ip, DM_EVENT_WRITE)) {
4159                if (end_dmi_offset > ip->i_size)
4160                        end_dmi_offset = ip->i_size;
4161                error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, vp,
4162                                offset, end_dmi_offset - offset,
4163                                AT_DELAY_FLAG(attr_flags), NULL);
4164                if (error)
4165                        return error;
4166        }
4167
4168        if (attr_flags & ATTR_NOLOCK)
4169                need_iolock = 0;
4170        if (need_iolock) {
4171                xfs_ilock(ip, XFS_IOLOCK_EXCL);
4172                vn_iowait(ip);  /* wait for the completion of any pending DIOs */
4173        }
4174
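            /*
             * Round the invalidation offset down to the larger of the
             * filesystem block size and the page size so that whole cached
             * pages covering the punched range get flushed and tossed.
             */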
4175        rounding = max_t(uint, 1 << mp->m_sb.sb_blocklog, NBPP);
4176        ioffset = offset & ~(rounding - 1);
4177
4178        if (VN_CACHED(vp) != 0) {
4179                xfs_inval_cached_trace(&ip->i_iocore, ioffset, -1,
4180                                ctooff(offtoct(ioffset)), -1);
4181                error = xfs_flushinval_pages(ip,
4182                                ctooff(offtoct(ioffset)),
4183                                -1, FI_REMAPF_LOCKED);
4184                if (error)
4185                        goto out_unlock_iolock;
4186        }
4187
4188        /*
4189         * We need to zero, on disk, the stuff we're not freeing.
4190         * If it's a realtime file and we can't use unwritten extents then we
4191         * actually need to zero the extent edges.  Otherwise xfs_bunmapi
4192         * will take care of it for us.
4193         */
4194        if (rt && !XFS_SB_VERSION_HASEXTFLGBIT(&mp->m_sb)) {
4195                nimap = 1;
4196                error = XFS_BMAPI(mp, NULL, &ip->i_iocore, startoffset_fsb,
4197                        1, 0, NULL, 0, &imap, &nimap, NULL, NULL);
4198                if (error)
4199                        goto out_unlock_iolock;
4200                ASSERT(nimap == 0 || nimap == 1);
4201                if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
4202                        xfs_daddr_t     block;
4203
4204                        ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
4205                        block = imap.br_startblock;
4206                        mod = do_div(block, mp->m_sb.sb_rextsize);
4207                        if (mod)
4208                                startoffset_fsb += mp->m_sb.sb_rextsize - mod;
4209                }
                nimap = 1;
                error = XFS_BMAPI(mp, NULL, &ip->i_iocore, endoffset_fsb - 1,
                        1, 0, NULL, 0, &imap, &nimap, NULL, NULL);
                if (error)
                        goto out_unlock_iolock;
                ASSERT(nimap == 0 || nimap == 1);
                if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
                        ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
                        mod++;
                        if (mod && (mod != mp->m_sb.sb_rextsize))
                                endoffset_fsb -= mod;
                }
        }
        if ((done = (endoffset_fsb <= startoffset_fsb)))
                /*
                 * One contiguous piece to clear
                 */
                error = xfs_zero_remaining_bytes(ip, offset, offset + len - 1);
        else {
                /*
                 * Some full blocks, possibly two pieces to clear
                 */
                if (offset < XFS_FSB_TO_B(mp, startoffset_fsb))
                        error = xfs_zero_remaining_bytes(ip, offset,
                                XFS_FSB_TO_B(mp, startoffset_fsb) - 1);
                if (!error &&
                    XFS_FSB_TO_B(mp, endoffset_fsb) < offset + len)
                        error = xfs_zero_remaining_bytes(ip,
                                XFS_FSB_TO_B(mp, endoffset_fsb),
                                offset + len - 1);
        }
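        /*
         * Worked example with assumed 4 KiB blocks: freeing bytes
         * 1000-9000 zeroes bytes 1000-4095 and 8192-9000 on disk above,
         * while the whole block covering bytes 4096-8191 is unmapped by
         * the transaction loop below.
         */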

        /*
         * Free file space until done or until there is an error.
         */
        resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
        while (!error && !done) {

                /*
                 * Allocate and set up the transaction.  Allow this
                 * transaction to dip into the reserve blocks to ensure
                 * the freeing of the space succeeds at ENOSPC.
                 */
                tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
                tp->t_flags |= XFS_TRANS_RESERVE;
                error = xfs_trans_reserve(tp,
                                          resblks,
                                          XFS_WRITE_LOG_RES(mp),
                                          0,
                                          XFS_TRANS_PERM_LOG_RES,
                                          XFS_WRITE_LOG_COUNT);

                /*
                 * Check for running out of space.
                 */
                if (error) {
                        /*
                         * Free the transaction structure.
                         */
                        ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
                        xfs_trans_cancel(tp, 0);
                        break;
                }
                xfs_ilock(ip, XFS_ILOCK_EXCL);
                error = XFS_TRANS_RESERVE_QUOTA(mp, tp,
                                ip->i_udquot, ip->i_gdquot, resblks, 0,
                                XFS_QMOPT_RES_REGBLKS);
                if (error)
                        goto error1;

                xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
                xfs_trans_ihold(tp, ip);

                /*
                 * Issue the bunmapi() call to free the blocks; unmap at
                 * most two extents per transaction and loop until done.
                 */
                XFS_BMAP_INIT(&free_list, &firstfsb);
                error = XFS_BUNMAPI(mp, tp, &ip->i_iocore, startoffset_fsb,
                                  endoffset_fsb - startoffset_fsb,
                                  0, 2, &firstfsb, &free_list, NULL, &done);
                if (error) {
                        goto error0;
                }

                /*
                 * Complete the transaction.
                 */
                error = xfs_bmap_finish(&tp, &free_list, &committed);
                if (error) {
                        goto error0;
                }

                error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
                xfs_iunlock(ip, XFS_ILOCK_EXCL);
        }

 out_unlock_iolock:
        if (need_iolock)
                xfs_iunlock(ip, XFS_IOLOCK_EXCL);
        return error;

 error0:
        xfs_bmap_cancel(&free_list);
 error1:
        xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
        xfs_iunlock(ip, need_iolock ? (XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL) :
                    XFS_ILOCK_EXCL);
        return error;
}

/*
 * xfs_change_file_space()
 *      This routine allocates or frees disk space for the given file.
 *      The user-specified parameters are checked for alignment and size
 *      limitations.
 *
 * RETURNS:
 *       0 on success
 *      errno on error
 *
 */
int
xfs_change_file_space(
        xfs_inode_t     *ip,
        int             cmd,
        xfs_flock64_t   *bf,
        xfs_off_t       offset,
        cred_t          *credp,
        int             attr_flags)
{
        xfs_mount_t     *mp = ip->i_mount;
        int             clrprealloc;
        int             error;
        xfs_fsize_t     fsize;
        int             setprealloc;
        xfs_off_t       startoffset;
        xfs_off_t       llen;
        xfs_trans_t     *tp;
        bhv_vattr_t     va;

        vn_trace_entry(ip, __FUNCTION__, (inst_t *)__return_address);

        /*
         * The target must be a regular file and the caller must have
         * write permission.
         */
        if (!S_ISREG(ip->i_d.di_mode))
                return XFS_ERROR(EINVAL);

        xfs_ilock(ip, XFS_ILOCK_SHARED);

        if ((error = xfs_iaccess(ip, S_IWUSR, credp))) {
                xfs_iunlock(ip, XFS_ILOCK_SHARED);
                return error;
        }

        xfs_iunlock(ip, XFS_ILOCK_SHARED);

        switch (bf->l_whence) {
        case 0: /*SEEK_SET*/
                break;
        case 1: /*SEEK_CUR*/
                bf->l_start += offset;
                break;
        case 2: /*SEEK_END*/
                bf->l_start += ip->i_size;
                break;
        default:
                return XFS_ERROR(EINVAL);
        }

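        /*
         * Normalize the length for the range checks below: for a
         * positive l_len, llen is the offset of the last byte of the
         * range relative to l_start, so both endpoints are checked
         * inclusively; l_len <= 0 is passed through unchanged.
         */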
        llen = bf->l_len > 0 ? bf->l_len - 1 : bf->l_len;

        if (   (bf->l_start < 0)
            || (bf->l_start > XFS_MAXIOFFSET(mp))
            || (bf->l_start + llen < 0)
            || (bf->l_start + llen > XFS_MAXIOFFSET(mp)))
                return XFS_ERROR(EINVAL);

        bf->l_whence = 0;

        startoffset = bf->l_start;
        fsize = ip->i_size;

        /*
         * XFS_IOC_RESVSP and XFS_IOC_UNRESVSP will reserve or unreserve
         * file space.
         * These calls do NOT zero the data space allocated to the file,
         * nor do they change the file size.
         *
         * XFS_IOC_ALLOCSP and XFS_IOC_FREESP will allocate and free file
         * space.
         * These calls cause the new file data to be zeroed and the file
         * size to be changed.
         */
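        /*
         * Example (hypothetical userspace sketch, not part of this
         * file): applications typically reach the RESVSP case through
         * the XFS_IOC_RESVSP64 ioctl, roughly:
         *
         *      xfs_flock64_t bf = { 0 };
         *      bf.l_whence = 0;                    (SEEK_SET)
         *      bf.l_start = 0;
         *      bf.l_len = 16 * 1024 * 1024;        (reserve 16 MiB)
         *      if (ioctl(fd, XFS_IOC_RESVSP64, &bf) < 0)
         *              perror("XFS_IOC_RESVSP64");
         */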
        setprealloc = clrprealloc = 0;

        switch (cmd) {
        case XFS_IOC_RESVSP:
        case XFS_IOC_RESVSP64:
                error = xfs_alloc_file_space(ip, startoffset, bf->l_len,
                                                                1, attr_flags);
                if (error)
                        return error;
                setprealloc = 1;
                break;

        case XFS_IOC_UNRESVSP:
        case XFS_IOC_UNRESVSP64:
                if ((error = xfs_free_file_space(ip, startoffset, bf->l_len,
                                                                attr_flags)))
                        return error;
                break;

        case XFS_IOC_ALLOCSP:
        case XFS_IOC_ALLOCSP64:
        case XFS_IOC_FREESP:
        case XFS_IOC_FREESP64:
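                /*
                 * ALLOCSP and FREESP set the file size to l_start.  If
                 * the file is growing, allocate and zero the new range
                 * first; the xfs_setattr() call below then changes the
                 * file size (truncating it when shrinking).
                 */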
                if (startoffset > fsize) {
                        error = xfs_alloc_file_space(ip, fsize,
                                        startoffset - fsize, 0, attr_flags);
                        if (error)
                                break;
                }

                va.va_mask = XFS_AT_SIZE;
                va.va_size = startoffset;

                error = xfs_setattr(ip, &va, attr_flags, credp);

                if (error)
                        return error;

                clrprealloc = 1;
                break;

        default:
                ASSERT(0);
                return XFS_ERROR(EINVAL);
        }

        /*
         * Update the inode timestamp, mode, and prealloc flag bits.
         */
        tp = xfs_trans_alloc(mp, XFS_TRANS_WRITEID);

        if ((error = xfs_trans_reserve(tp, 0, XFS_WRITEID_LOG_RES(mp),
                                      0, 0, 0))) {
                /* ASSERT(0); */
                xfs_trans_cancel(tp, 0);
                return error;
        }

        xfs_ilock(ip, XFS_ILOCK_EXCL);

        xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
        xfs_trans_ihold(tp, ip);

        if ((attr_flags & ATTR_DMI) == 0) {
                ip->i_d.di_mode &= ~S_ISUID;

                /*
                 * Note that we don't have to worry about mandatory
                 * file locking being disabled here because we only
                 * clear the S_ISGID bit if the group execute bit is
                 * on, but if it was on then mandatory locking wouldn't
                 * have been enabled.
                 */
                if (ip->i_d.di_mode & S_IXGRP)
                        ip->i_d.di_mode &= ~S_ISGID;

                xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
        }
        if (setprealloc)
                ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
        else if (clrprealloc)
                ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC;

        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
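        /*
         * Commit synchronously so that the timestamp, mode, and prealloc
         * flag updates are stable on disk before the call returns.
         */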
        xfs_trans_set_sync(tp);

        error = xfs_trans_commit(tp, 0);

        xfs_iunlock(ip, XFS_ILOCK_EXCL);

        return error;
}
