linux/fs/xfs/xfs_inode.c
<<
>>
Prefs
   1/*
   2 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
   3 * All Rights Reserved.
   4 *
   5 * This program is free software; you can redistribute it and/or
   6 * modify it under the terms of the GNU General Public License as
   7 * published by the Free Software Foundation.
   8 *
   9 * This program is distributed in the hope that it would be useful,
  10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12 * GNU General Public License for more details.
  13 *
  14 * You should have received a copy of the GNU General Public License
  15 * along with this program; if not, write the Free Software Foundation,
  16 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  17 */
  18#include <linux/log2.h>
  19
  20#include "xfs.h"
  21#include "xfs_fs.h"
  22#include "xfs_shared.h"
  23#include "xfs_format.h"
  24#include "xfs_log_format.h"
  25#include "xfs_trans_resv.h"
  26#include "xfs_sb.h"
  27#include "xfs_mount.h"
  28#include "xfs_defer.h"
  29#include "xfs_inode.h"
  30#include "xfs_da_format.h"
  31#include "xfs_da_btree.h"
  32#include "xfs_dir2.h"
  33#include "xfs_attr_sf.h"
  34#include "xfs_attr.h"
  35#include "xfs_trans_space.h"
  36#include "xfs_trans.h"
  37#include "xfs_buf_item.h"
  38#include "xfs_inode_item.h"
  39#include "xfs_ialloc.h"
  40#include "xfs_bmap.h"
  41#include "xfs_bmap_util.h"
  42#include "xfs_error.h"
  43#include "xfs_quota.h"
  44#include "xfs_filestream.h"
  45#include "xfs_cksum.h"
  46#include "xfs_trace.h"
  47#include "xfs_icache.h"
  48#include "xfs_symlink.h"
  49#include "xfs_trans_priv.h"
  50#include "xfs_log.h"
  51#include "xfs_bmap_btree.h"
  52#include "xfs_reflink.h"
  53
  54kmem_zone_t *xfs_inode_zone;
  55
  56/*
  57 * Used in xfs_itruncate_extents().  This is the maximum number of extents
  58 * freed from a file in a single transaction.
  59 */
  60#define XFS_ITRUNC_MAX_EXTENTS  2
  61
  62STATIC int xfs_iflush_int(struct xfs_inode *, struct xfs_buf *);
  63STATIC int xfs_iunlink(struct xfs_trans *, struct xfs_inode *);
  64STATIC int xfs_iunlink_remove(struct xfs_trans *, struct xfs_inode *);
  65
  66/*
  67 * helper function to extract extent size hint from inode
  68 */
  69xfs_extlen_t
  70xfs_get_extsz_hint(
  71        struct xfs_inode        *ip)
  72{
  73        if ((ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE) && ip->i_d.di_extsize)
  74                return ip->i_d.di_extsize;
  75        if (XFS_IS_REALTIME_INODE(ip))
  76                return ip->i_mount->m_sb.sb_rextsize;
  77        return 0;
  78}
  79
  80/*
  81 * Helper function to extract CoW extent size hint from inode.
  82 * Between the extent size hint and the CoW extent size hint, we
  83 * return the greater of the two.  If the value is zero (automatic),
  84 * use the default size.
  85 */
  86xfs_extlen_t
  87xfs_get_cowextsz_hint(
  88        struct xfs_inode        *ip)
  89{
  90        xfs_extlen_t            a, b;
  91
  92        a = 0;
  93        if (ip->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE)
  94                a = ip->i_d.di_cowextsize;
  95        b = xfs_get_extsz_hint(ip);
  96
  97        a = max(a, b);
  98        if (a == 0)
  99                return XFS_DEFAULT_COWEXTSZ_HINT;
 100        return a;
 101}
 102
 103/*
 104 * These two are wrapper routines around the xfs_ilock() routine used to
 105 * centralize some grungy code.  They are used in places that wish to lock the
 106 * inode solely for reading the extents.  The reason these places can't just
 107 * call xfs_ilock(ip, XFS_ILOCK_SHARED) is that the inode lock also guards to
 108 * bringing in of the extents from disk for a file in b-tree format.  If the
 109 * inode is in b-tree format, then we need to lock the inode exclusively until
 110 * the extents are read in.  Locking it exclusively all the time would limit
 111 * our parallelism unnecessarily, though.  What we do instead is check to see
 112 * if the extents have been read in yet, and only lock the inode exclusively
 113 * if they have not.
 114 *
 115 * The functions return a value which should be given to the corresponding
 116 * xfs_iunlock() call.
 117 */
 118uint
 119xfs_ilock_data_map_shared(
 120        struct xfs_inode        *ip)
 121{
 122        uint                    lock_mode = XFS_ILOCK_SHARED;
 123
 124        if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE &&
 125            (ip->i_df.if_flags & XFS_IFEXTENTS) == 0)
 126                lock_mode = XFS_ILOCK_EXCL;
 127        xfs_ilock(ip, lock_mode);
 128        return lock_mode;
 129}
 130
 131uint
 132xfs_ilock_attr_map_shared(
 133        struct xfs_inode        *ip)
 134{
 135        uint                    lock_mode = XFS_ILOCK_SHARED;
 136
 137        if (ip->i_d.di_aformat == XFS_DINODE_FMT_BTREE &&
 138            (ip->i_afp->if_flags & XFS_IFEXTENTS) == 0)
 139                lock_mode = XFS_ILOCK_EXCL;
 140        xfs_ilock(ip, lock_mode);
 141        return lock_mode;
 142}
 143
 144/*
 145 * The xfs inode contains 3 multi-reader locks: the i_iolock the i_mmap_lock and
 146 * the i_lock.  This routine allows various combinations of the locks to be
 147 * obtained.
 148 *
 149 * The 3 locks should always be ordered so that the IO lock is obtained first,
 150 * the mmap lock second and the ilock last in order to prevent deadlock.
 151 *
 152 * Basic locking order:
 153 *
 154 * i_iolock -> i_mmap_lock -> page_lock -> i_ilock
 155 *
 156 * mmap_sem locking order:
 157 *
 158 * i_iolock -> page lock -> mmap_sem
 159 * mmap_sem -> i_mmap_lock -> page_lock
 160 *
 161 * The difference in mmap_sem locking order mean that we cannot hold the
 162 * i_mmap_lock over syscall based read(2)/write(2) based IO. These IO paths can
 163 * fault in pages during copy in/out (for buffered IO) or require the mmap_sem
 164 * in get_user_pages() to map the user pages into the kernel address space for
 165 * direct IO. Similarly the i_iolock cannot be taken inside a page fault because
 166 * page faults already hold the mmap_sem.
 167 *
 168 * Hence to serialise fully against both syscall and mmap based IO, we need to
 169 * take both the i_iolock and the i_mmap_lock. These locks should *only* be both
 170 * taken in places where we need to invalidate the page cache in a race
 171 * free manner (e.g. truncate, hole punch and other extent manipulation
 172 * functions).
 173 */
 174void
 175xfs_ilock(
 176        xfs_inode_t             *ip,
 177        uint                    lock_flags)
 178{
 179        trace_xfs_ilock(ip, lock_flags, _RET_IP_);
 180
 181        /*
 182         * You can't set both SHARED and EXCL for the same lock,
 183         * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
 184         * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
 185         */
 186        ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
 187               (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
 188        ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) !=
 189               (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL));
 190        ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
 191               (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
 192        ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0);
 193
 194        if (lock_flags & XFS_IOLOCK_EXCL)
 195                mrupdate_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));
 196        else if (lock_flags & XFS_IOLOCK_SHARED)
 197                mraccess_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));
 198
 199        if (lock_flags & XFS_MMAPLOCK_EXCL)
 200                mrupdate_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags));
 201        else if (lock_flags & XFS_MMAPLOCK_SHARED)
 202                mraccess_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags));
 203
 204        if (lock_flags & XFS_ILOCK_EXCL)
 205                mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
 206        else if (lock_flags & XFS_ILOCK_SHARED)
 207                mraccess_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
 208}
 209
 210/*
 211 * This is just like xfs_ilock(), except that the caller
 212 * is guaranteed not to sleep.  It returns 1 if it gets
 213 * the requested locks and 0 otherwise.  If the IO lock is
 214 * obtained but the inode lock cannot be, then the IO lock
 215 * is dropped before returning.
 216 *
 217 * ip -- the inode being locked
 218 * lock_flags -- this parameter indicates the inode's locks to be
 219 *       to be locked.  See the comment for xfs_ilock() for a list
 220 *       of valid values.
 221 */
 222int
 223xfs_ilock_nowait(
 224        xfs_inode_t             *ip,
 225        uint                    lock_flags)
 226{
 227        trace_xfs_ilock_nowait(ip, lock_flags, _RET_IP_);
 228
 229        /*
 230         * You can't set both SHARED and EXCL for the same lock,
 231         * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
 232         * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
 233         */
 234        ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
 235               (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
 236        ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) !=
 237               (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL));
 238        ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
 239               (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
 240        ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0);
 241
 242        if (lock_flags & XFS_IOLOCK_EXCL) {
 243                if (!mrtryupdate(&ip->i_iolock))
 244                        goto out;
 245        } else if (lock_flags & XFS_IOLOCK_SHARED) {
 246                if (!mrtryaccess(&ip->i_iolock))
 247                        goto out;
 248        }
 249
 250        if (lock_flags & XFS_MMAPLOCK_EXCL) {
 251                if (!mrtryupdate(&ip->i_mmaplock))
 252                        goto out_undo_iolock;
 253        } else if (lock_flags & XFS_MMAPLOCK_SHARED) {
 254                if (!mrtryaccess(&ip->i_mmaplock))
 255                        goto out_undo_iolock;
 256        }
 257
 258        if (lock_flags & XFS_ILOCK_EXCL) {
 259                if (!mrtryupdate(&ip->i_lock))
 260                        goto out_undo_mmaplock;
 261        } else if (lock_flags & XFS_ILOCK_SHARED) {
 262                if (!mrtryaccess(&ip->i_lock))
 263                        goto out_undo_mmaplock;
 264        }
 265        return 1;
 266
 267out_undo_mmaplock:
 268        if (lock_flags & XFS_MMAPLOCK_EXCL)
 269                mrunlock_excl(&ip->i_mmaplock);
 270        else if (lock_flags & XFS_MMAPLOCK_SHARED)
 271                mrunlock_shared(&ip->i_mmaplock);
 272out_undo_iolock:
 273        if (lock_flags & XFS_IOLOCK_EXCL)
 274                mrunlock_excl(&ip->i_iolock);
 275        else if (lock_flags & XFS_IOLOCK_SHARED)
 276                mrunlock_shared(&ip->i_iolock);
 277out:
 278        return 0;
 279}
 280
 281/*
 282 * xfs_iunlock() is used to drop the inode locks acquired with
 283 * xfs_ilock() and xfs_ilock_nowait().  The caller must pass
 284 * in the flags given to xfs_ilock() or xfs_ilock_nowait() so
 285 * that we know which locks to drop.
 286 *
 287 * ip -- the inode being unlocked
 288 * lock_flags -- this parameter indicates the inode's locks to be
 289 *       to be unlocked.  See the comment for xfs_ilock() for a list
 290 *       of valid values for this parameter.
 291 *
 292 */
 293void
 294xfs_iunlock(
 295        xfs_inode_t             *ip,
 296        uint                    lock_flags)
 297{
 298        /*
 299         * You can't set both SHARED and EXCL for the same lock,
 300         * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
 301         * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
 302         */
 303        ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
 304               (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
 305        ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) !=
 306               (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL));
 307        ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
 308               (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
 309        ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0);
 310        ASSERT(lock_flags != 0);
 311
 312        if (lock_flags & XFS_IOLOCK_EXCL)
 313                mrunlock_excl(&ip->i_iolock);
 314        else if (lock_flags & XFS_IOLOCK_SHARED)
 315                mrunlock_shared(&ip->i_iolock);
 316
 317        if (lock_flags & XFS_MMAPLOCK_EXCL)
 318                mrunlock_excl(&ip->i_mmaplock);
 319        else if (lock_flags & XFS_MMAPLOCK_SHARED)
 320                mrunlock_shared(&ip->i_mmaplock);
 321
 322        if (lock_flags & XFS_ILOCK_EXCL)
 323                mrunlock_excl(&ip->i_lock);
 324        else if (lock_flags & XFS_ILOCK_SHARED)
 325                mrunlock_shared(&ip->i_lock);
 326
 327        trace_xfs_iunlock(ip, lock_flags, _RET_IP_);
 328}
 329
 330/*
 331 * give up write locks.  the i/o lock cannot be held nested
 332 * if it is being demoted.
 333 */
 334void
 335xfs_ilock_demote(
 336        xfs_inode_t             *ip,
 337        uint                    lock_flags)
 338{
 339        ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_MMAPLOCK_EXCL|XFS_ILOCK_EXCL));
 340        ASSERT((lock_flags &
 341                ~(XFS_IOLOCK_EXCL|XFS_MMAPLOCK_EXCL|XFS_ILOCK_EXCL)) == 0);
 342
 343        if (lock_flags & XFS_ILOCK_EXCL)
 344                mrdemote(&ip->i_lock);
 345        if (lock_flags & XFS_MMAPLOCK_EXCL)
 346                mrdemote(&ip->i_mmaplock);
 347        if (lock_flags & XFS_IOLOCK_EXCL)
 348                mrdemote(&ip->i_iolock);
 349
 350        trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_);
 351}
 352
 #if defined(DEBUG) || defined(XFS_WARN)
/*
 * Debug helper: report whether one of the inode's locks is held.
 *
 * Only the first lock class found in lock_flags is examined, checked in the
 * order ilock, mmaplock, iolock.  If the SHARED bit of that class is set the
 * answer is rwsem_is_locked() on the underlying semaphore; if only the EXCL
 * bit is set the answer is whether the lock's mr_writer field is non-zero.
 * Calling this with no lock bit set trips an ASSERT and returns 0.
 */
int
xfs_isilocked(
        xfs_inode_t             *ip,
        uint                    lock_flags)
{
        if (lock_flags & XFS_ILOCK_SHARED)
                return rwsem_is_locked(&ip->i_lock.mr_lock);
        if (lock_flags & XFS_ILOCK_EXCL)
                return !!ip->i_lock.mr_writer;

        if (lock_flags & XFS_MMAPLOCK_SHARED)
                return rwsem_is_locked(&ip->i_mmaplock.mr_lock);
        if (lock_flags & XFS_MMAPLOCK_EXCL)
                return !!ip->i_mmaplock.mr_writer;

        if (lock_flags & XFS_IOLOCK_SHARED)
                return rwsem_is_locked(&ip->i_iolock.mr_lock);
        if (lock_flags & XFS_IOLOCK_EXCL)
                return !!ip->i_iolock.mr_writer;

        /* No recognised lock bit was passed in. */
        ASSERT(0);
        return 0;
}
#endif
 381
 #ifdef DEBUG
/*
 * Statistics maintained by xfs_lock_inodes() on DEBUG builds.
 */
int xfs_lock_delays;            /* times a retry backed off with delay(1) */
int xfs_locked_n;               /* calls that needed no retry */
int xfs_small_retries;          /* calls that retried fewer than 5 times */
int xfs_middle_retries;         /* calls that retried 5..99 times */
int xfs_lots_retries;           /* calls that retried 100 times or more */
#endif
 389
 /*
 * xfs_lockdep_subclass_ok() is only ever used inside an ASSERT, so it is
 * only called when DEBUG or XFS_WARN is set.  MAX_LOCKDEP_SUBCLASSES in turn
 * is only defined when CONFIG_LOCKDEP is set.  The real check is therefore
 * built only when both conditions hold; in every other configuration the
 * name expands to a constant true, which avoids build errors and warnings.
 */
#if !(defined(DEBUG) || defined(XFS_WARN)) || !defined(CONFIG_LOCKDEP)
#define xfs_lockdep_subclass_ok(subclass)       (true)
#else
/* Return true if @subclass is below MAX_LOCKDEP_SUBCLASSES. */
static bool
xfs_lockdep_subclass_ok(
        int subclass)
{
        return subclass < MAX_LOCKDEP_SUBCLASSES;
}
#endif
 406
 407/*
 408 * Bump the subclass so xfs_lock_inodes() acquires each lock with a different
 409 * value. This can be called for any type of inode lock combination, including
 410 * parent locking. Care must be taken to ensure we don't overrun the subclass
 411 * storage fields in the class mask we build.
 412 */
 413static inline int
 414xfs_lock_inumorder(int lock_mode, int subclass)
 415{
 416        int     class = 0;
 417
 418        ASSERT(!(lock_mode & (XFS_ILOCK_PARENT | XFS_ILOCK_RTBITMAP |
 419                              XFS_ILOCK_RTSUM)));
 420        ASSERT(xfs_lockdep_subclass_ok(subclass));
 421
 422        if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) {
 423                ASSERT(subclass <= XFS_IOLOCK_MAX_SUBCLASS);
 424                ASSERT(xfs_lockdep_subclass_ok(subclass +
 425                                                XFS_IOLOCK_PARENT_VAL));
 426                class += subclass << XFS_IOLOCK_SHIFT;
 427                if (lock_mode & XFS_IOLOCK_PARENT)
 428                        class += XFS_IOLOCK_PARENT_VAL << XFS_IOLOCK_SHIFT;
 429        }
 430
 431        if (lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) {
 432                ASSERT(subclass <= XFS_MMAPLOCK_MAX_SUBCLASS);
 433                class += subclass << XFS_MMAPLOCK_SHIFT;
 434        }
 435
 436        if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) {
 437                ASSERT(subclass <= XFS_ILOCK_MAX_SUBCLASS);
 438                class += subclass << XFS_ILOCK_SHIFT;
 439        }
 440
 441        return (lock_mode & ~XFS_LOCK_SUBCLASS_MASK) | class;
 442}
 443
 444/*
 445 * The following routine will lock n inodes in exclusive mode.  We assume the
 446 * caller calls us with the inodes in i_ino order.
 447 *
 448 * We need to detect deadlock where an inode that we lock is in the AIL and we
 449 * start waiting for another inode that is locked by a thread in a long running
 450 * transaction (such as truncate). This can result in deadlock since the long
 451 * running trans might need to wait for the inode we just locked in order to
 452 * push the tail and free space in the log.
 453 *
 454 * xfs_lock_inodes() can only be used to lock one type of lock at a time -
 455 * the iolock, the mmaplock or the ilock, but not more than one at a time. If we
 456 * lock more than one at a time, lockdep will report false positives saying we
 457 * have violated locking orders.
 458 */
 459static void
 460xfs_lock_inodes(
 461        xfs_inode_t     **ips,
 462        int             inodes,
 463        uint            lock_mode)
 464{
 465        int             attempts = 0, i, j, try_lock;
 466        xfs_log_item_t  *lp;
 467
 468        /*
 469         * Currently supports between 2 and 5 inodes with exclusive locking.  We
 470         * support an arbitrary depth of locking here, but absolute limits on
 471         * inodes depend on the the type of locking and the limits placed by
 472         * lockdep annotations in xfs_lock_inumorder.  These are all checked by
 473         * the asserts.
 474         */
 475        ASSERT(ips && inodes >= 2 && inodes <= 5);
 476        ASSERT(lock_mode & (XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL |
 477                            XFS_ILOCK_EXCL));
 478        ASSERT(!(lock_mode & (XFS_IOLOCK_SHARED | XFS_MMAPLOCK_SHARED |
 479                              XFS_ILOCK_SHARED)));
 480        ASSERT(!(lock_mode & XFS_IOLOCK_EXCL) ||
 481                inodes <= XFS_IOLOCK_MAX_SUBCLASS + 1);
 482        ASSERT(!(lock_mode & XFS_MMAPLOCK_EXCL) ||
 483                inodes <= XFS_MMAPLOCK_MAX_SUBCLASS + 1);
 484        ASSERT(!(lock_mode & XFS_ILOCK_EXCL) ||
 485                inodes <= XFS_ILOCK_MAX_SUBCLASS + 1);
 486
 487        if (lock_mode & XFS_IOLOCK_EXCL) {
 488                ASSERT(!(lock_mode & (XFS_MMAPLOCK_EXCL | XFS_ILOCK_EXCL)));
 489        } else if (lock_mode & XFS_MMAPLOCK_EXCL)
 490                ASSERT(!(lock_mode & XFS_ILOCK_EXCL));
 491
 492        try_lock = 0;
 493        i = 0;
 494again:
 495        for (; i < inodes; i++) {
 496                ASSERT(ips[i]);
 497
 498                if (i && (ips[i] == ips[i - 1]))        /* Already locked */
 499                        continue;
 500
 501                /*
 502                 * If try_lock is not set yet, make sure all locked inodes are
 503                 * not in the AIL.  If any are, set try_lock to be used later.
 504                 */
 505                if (!try_lock) {
 506                        for (j = (i - 1); j >= 0 && !try_lock; j--) {
 507                                lp = (xfs_log_item_t *)ips[j]->i_itemp;
 508                                if (lp && (lp->li_flags & XFS_LI_IN_AIL))
 509                                        try_lock++;
 510                        }
 511                }
 512
 513                /*
 514                 * If any of the previous locks we have locked is in the AIL,
 515                 * we must TRY to get the second and subsequent locks. If
 516                 * we can't get any, we must release all we have
 517                 * and try again.
 518                 */
 519                if (!try_lock) {
 520                        xfs_ilock(ips[i], xfs_lock_inumorder(lock_mode, i));
 521                        continue;
 522                }
 523
 524                /* try_lock means we have an inode locked that is in the AIL. */
 525                ASSERT(i != 0);
 526                if (xfs_ilock_nowait(ips[i], xfs_lock_inumorder(lock_mode, i)))
 527                        continue;
 528
 529                /*
 530                 * Unlock all previous guys and try again.  xfs_iunlock will try
 531                 * to push the tail if the inode is in the AIL.
 532                 */
 533                attempts++;
 534                for (j = i - 1; j >= 0; j--) {
 535                        /*
 536                         * Check to see if we've already unlocked this one.  Not
 537                         * the first one going back, and the inode ptr is the
 538                         * same.
 539                         */
 540                        if (j != (i - 1) && ips[j] == ips[j + 1])
 541                                continue;
 542
 543                        xfs_iunlock(ips[j], lock_mode);
 544                }
 545
 546                if ((attempts % 5) == 0) {
 547                        delay(1); /* Don't just spin the CPU */
 548#ifdef DEBUG
 549                        xfs_lock_delays++;
 550#endif
 551                }
 552                i = 0;
 553                try_lock = 0;
 554                goto again;
 555        }
 556
 557#ifdef DEBUG
 558        if (attempts) {
 559                if (attempts < 5) xfs_small_retries++;
 560                else if (attempts < 100) xfs_middle_retries++;
 561                else xfs_lots_retries++;
 562        } else {
 563                xfs_locked_n++;
 564        }
 565#endif
 566}
 567
 568/*
 569 * xfs_lock_two_inodes() can only be used to lock one type of lock at a time -
 570 * the iolock, the mmaplock or the ilock, but not more than one at a time. If we
 571 * lock more than one at a time, lockdep will report false positives saying we
 572 * have violated locking orders.
 573 */
 574void
 575xfs_lock_two_inodes(
 576        xfs_inode_t             *ip0,
 577        xfs_inode_t             *ip1,
 578        uint                    lock_mode)
 579{
 580        xfs_inode_t             *temp;
 581        int                     attempts = 0;
 582        xfs_log_item_t          *lp;
 583
 584        if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) {
 585                ASSERT(!(lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)));
 586                ASSERT(!(lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)));
 587        } else if (lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL))
 588                ASSERT(!(lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)));
 589
 590        ASSERT(ip0->i_ino != ip1->i_ino);
 591
 592        if (ip0->i_ino > ip1->i_ino) {
 593                temp = ip0;
 594                ip0 = ip1;
 595                ip1 = temp;
 596        }
 597
 598 again:
 599        xfs_ilock(ip0, xfs_lock_inumorder(lock_mode, 0));
 600
 601        /*
 602         * If the first lock we have locked is in the AIL, we must TRY to get
 603         * the second lock. If we can't get it, we must release the first one
 604         * and try again.
 605         */
 606        lp = (xfs_log_item_t *)ip0->i_itemp;
 607        if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
 608                if (!xfs_ilock_nowait(ip1, xfs_lock_inumorder(lock_mode, 1))) {
 609                        xfs_iunlock(ip0, lock_mode);
 610                        if ((++attempts % 5) == 0)
 611                                delay(1); /* Don't just spin the CPU */
 612                        goto again;
 613                }
 614        } else {
 615                xfs_ilock(ip1, xfs_lock_inumorder(lock_mode, 1));
 616        }
 617}
 618
 619
 620void
 621__xfs_iflock(
 622        struct xfs_inode        *ip)
 623{
 624        wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT);
 625        DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT);
 626
 627        do {
 628                prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
 629                if (xfs_isiflocked(ip))
 630                        io_schedule();
 631        } while (!xfs_iflock_nowait(ip));
 632
 633        finish_wait(wq, &wait.wait);
 634}
 635
 636STATIC uint
 637_xfs_dic2xflags(
 638        __uint16_t              di_flags,
 639        uint64_t                di_flags2,
 640        bool                    has_attr)
 641{
 642        uint                    flags = 0;
 643
 644        if (di_flags & XFS_DIFLAG_ANY) {
 645                if (di_flags & XFS_DIFLAG_REALTIME)
 646                        flags |= FS_XFLAG_REALTIME;
 647                if (di_flags & XFS_DIFLAG_PREALLOC)
 648                        flags |= FS_XFLAG_PREALLOC;
 649                if (di_flags & XFS_DIFLAG_IMMUTABLE)
 650                        flags |= FS_XFLAG_IMMUTABLE;
 651                if (di_flags & XFS_DIFLAG_APPEND)
 652                        flags |= FS_XFLAG_APPEND;
 653                if (di_flags & XFS_DIFLAG_SYNC)
 654                        flags |= FS_XFLAG_SYNC;
 655                if (di_flags & XFS_DIFLAG_NOATIME)
 656                        flags |= FS_XFLAG_NOATIME;
 657                if (di_flags & XFS_DIFLAG_NODUMP)
 658                        flags |= FS_XFLAG_NODUMP;
 659                if (di_flags & XFS_DIFLAG_RTINHERIT)
 660                        flags |= FS_XFLAG_RTINHERIT;
 661                if (di_flags & XFS_DIFLAG_PROJINHERIT)
 662                        flags |= FS_XFLAG_PROJINHERIT;
 663                if (di_flags & XFS_DIFLAG_NOSYMLINKS)
 664                        flags |= FS_XFLAG_NOSYMLINKS;
 665                if (di_flags & XFS_DIFLAG_EXTSIZE)
 666                        flags |= FS_XFLAG_EXTSIZE;
 667                if (di_flags & XFS_DIFLAG_EXTSZINHERIT)
 668                        flags |= FS_XFLAG_EXTSZINHERIT;
 669                if (di_flags & XFS_DIFLAG_NODEFRAG)
 670                        flags |= FS_XFLAG_NODEFRAG;
 671                if (di_flags & XFS_DIFLAG_FILESTREAM)
 672                        flags |= FS_XFLAG_FILESTREAM;
 673        }
 674
 675        if (di_flags2 & XFS_DIFLAG2_ANY) {
 676                if (di_flags2 & XFS_DIFLAG2_DAX)
 677                        flags |= FS_XFLAG_DAX;
 678                if (di_flags2 & XFS_DIFLAG2_COWEXTSIZE)
 679                        flags |= FS_XFLAG_COWEXTSIZE;
 680        }
 681
 682        if (has_attr)
 683                flags |= FS_XFLAG_HASATTR;
 684
 685        return flags;
 686}
 687
 688uint
 689xfs_ip2xflags(
 690        struct xfs_inode        *ip)
 691{
 692        struct xfs_icdinode     *dic = &ip->i_d;
 693
 694        return _xfs_dic2xflags(dic->di_flags, dic->di_flags2, XFS_IFORK_Q(ip));
 695}
 696
 697/*
 698 * Lookups up an inode from "name". If ci_name is not NULL, then a CI match
 699 * is allowed, otherwise it has to be an exact match. If a CI match is found,
 700 * ci_name->name will point to a the actual name (caller must free) or
 701 * will be set to NULL if an exact match is found.
 702 */
 703int
 704xfs_lookup(
 705        xfs_inode_t             *dp,
 706        struct xfs_name         *name,
 707        xfs_inode_t             **ipp,
 708        struct xfs_name         *ci_name)
 709{
 710        xfs_ino_t               inum;
 711        int                     error;
 712
 713        trace_xfs_lookup(dp, name);
 714
 715        if (XFS_FORCED_SHUTDOWN(dp->i_mount))
 716                return -EIO;
 717
 718        xfs_ilock(dp, XFS_IOLOCK_SHARED);
 719        error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name);
 720        if (error)
 721                goto out_unlock;
 722
 723        error = xfs_iget(dp->i_mount, NULL, inum, 0, 0, ipp);
 724        if (error)
 725                goto out_free_name;
 726
 727        xfs_iunlock(dp, XFS_IOLOCK_SHARED);
 728        return 0;
 729
 730out_free_name:
 731        if (ci_name)
 732                kmem_free(ci_name->name);
 733out_unlock:
 734        xfs_iunlock(dp, XFS_IOLOCK_SHARED);
 735        *ipp = NULL;
 736        return error;
 737}
 738
 739/*
 740 * Allocate an inode on disk and return a copy of its in-core version.
 741 * The in-core inode is locked exclusively.  Set mode, nlink, and rdev
 742 * appropriately within the inode.  The uid and gid for the inode are
 743 * set according to the contents of the given cred structure.
 744 *
 745 * Use xfs_dialloc() to allocate the on-disk inode. If xfs_dialloc()
 746 * has a free inode available, call xfs_iget() to obtain the in-core
 747 * version of the allocated inode.  Finally, fill in the inode and
 748 * log its initial contents.  In this case, ialloc_context would be
 749 * set to NULL.
 750 *
 751 * If xfs_dialloc() does not have an available inode, it will replenish
 752 * its supply by doing an allocation. Since we can only do one
 753 * allocation within a transaction without deadlocks, we must commit
 754 * the current transaction before returning the inode itself.
 755 * In this case, therefore, we will set ialloc_context and return.
 756 * The caller should then commit the current transaction, start a new
 757 * transaction, and call xfs_ialloc() again to actually get the inode.
 758 *
 759 * To ensure that some other process does not grab the inode that
 760 * was allocated during the first call to xfs_ialloc(), this routine
 761 * also returns the [locked] bp pointing to the head of the freelist
 762 * as ialloc_context.  The caller should hold this buffer across
 763 * the commit and pass it back into this routine on the second call.
 764 *
 765 * If we are allocating quota inodes, we do not have a parent inode
 766 * to attach to or associate with (i.e. pip == NULL) because they
 767 * are not linked into the directory structure - they are attached
 768 * directly to the superblock - and so have no parent.
 769 */
 770static int
 771xfs_ialloc(
 772        xfs_trans_t     *tp,
 773        xfs_inode_t     *pip,
 774        umode_t         mode,
 775        xfs_nlink_t     nlink,
 776        xfs_dev_t       rdev,
 777        prid_t          prid,
 778        int             okalloc,
 779        xfs_buf_t       **ialloc_context,
 780        xfs_inode_t     **ipp)
 781{
 782        struct xfs_mount *mp = tp->t_mountp;
 783        xfs_ino_t       ino;
 784        xfs_inode_t     *ip;
 785        uint            flags;
 786        int             error;
 787        struct timespec tv;
 788        struct inode    *inode;
 789
 790        /*
 791         * Call the space management code to pick
 792         * the on-disk inode to be allocated.
 793         */
 794        error = xfs_dialloc(tp, pip ? pip->i_ino : 0, mode, okalloc,
 795                            ialloc_context, &ino);
 796        if (error)
 797                return error;
 798        if (*ialloc_context || ino == NULLFSINO) {
 799                *ipp = NULL;
 800                return 0;
 801        }
 802        ASSERT(*ialloc_context == NULL);
 803
 804        /*
 805         * Get the in-core inode with the lock held exclusively.
 806         * This is because we're setting fields here we need
 807         * to prevent others from looking at until we're done.
 808         */
 809        error = xfs_iget(mp, tp, ino, XFS_IGET_CREATE,
 810                         XFS_ILOCK_EXCL, &ip);
 811        if (error)
 812                return error;
 813        ASSERT(ip != NULL);
 814        inode = VFS_I(ip);
 815
 816        /*
 817         * We always convert v1 inodes to v2 now - we only support filesystems
 818         * with >= v2 inode capability, so there is no reason for ever leaving
 819         * an inode in v1 format.
 820         */
 821        if (ip->i_d.di_version == 1)
 822                ip->i_d.di_version = 2;
 823
 824        inode->i_mode = mode;
 825        set_nlink(inode, nlink);
 826        ip->i_d.di_uid = xfs_kuid_to_uid(current_fsuid());
 827        ip->i_d.di_gid = xfs_kgid_to_gid(current_fsgid());
 828        xfs_set_projid(ip, prid);
 829
 830        if (pip && XFS_INHERIT_GID(pip)) {
 831                ip->i_d.di_gid = pip->i_d.di_gid;
 832                if ((VFS_I(pip)->i_mode & S_ISGID) && S_ISDIR(mode))
 833                        inode->i_mode |= S_ISGID;
 834        }
 835
 836        /*
 837         * If the group ID of the new file does not match the effective group
 838         * ID or one of the supplementary group IDs, the S_ISGID bit is cleared
 839         * (and only if the irix_sgid_inherit compatibility variable is set).
 840         */
 841        if ((irix_sgid_inherit) &&
 842            (inode->i_mode & S_ISGID) &&
 843            (!in_group_p(xfs_gid_to_kgid(ip->i_d.di_gid))))
 844                inode->i_mode &= ~S_ISGID;
 845
 846        ip->i_d.di_size = 0;
 847        ip->i_d.di_nextents = 0;
 848        ASSERT(ip->i_d.di_nblocks == 0);
 849
 850        tv = current_time(inode);
 851        inode->i_mtime = tv;
 852        inode->i_atime = tv;
 853        inode->i_ctime = tv;
 854
 855        ip->i_d.di_extsize = 0;
 856        ip->i_d.di_dmevmask = 0;
 857        ip->i_d.di_dmstate = 0;
 858        ip->i_d.di_flags = 0;
 859
 860        if (ip->i_d.di_version == 3) {
 861                inode->i_version = 1;
 862                ip->i_d.di_flags2 = 0;
 863                ip->i_d.di_cowextsize = 0;
 864                ip->i_d.di_crtime.t_sec = (__int32_t)tv.tv_sec;
 865                ip->i_d.di_crtime.t_nsec = (__int32_t)tv.tv_nsec;
 866        }
 867
 868
 869        flags = XFS_ILOG_CORE;
 870        switch (mode & S_IFMT) {
 871        case S_IFIFO:
 872        case S_IFCHR:
 873        case S_IFBLK:
 874        case S_IFSOCK:
 875                ip->i_d.di_format = XFS_DINODE_FMT_DEV;
 876                ip->i_df.if_u2.if_rdev = rdev;
 877                ip->i_df.if_flags = 0;
 878                flags |= XFS_ILOG_DEV;
 879                break;
 880        case S_IFREG:
 881        case S_IFDIR:
 882                if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) {
 883                        uint64_t        di_flags2 = 0;
 884                        uint            di_flags = 0;
 885
 886                        if (S_ISDIR(mode)) {
 887                                if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT)
 888                                        di_flags |= XFS_DIFLAG_RTINHERIT;
 889                                if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) {
 890                                        di_flags |= XFS_DIFLAG_EXTSZINHERIT;
 891                                        ip->i_d.di_extsize = pip->i_d.di_extsize;
 892                                }
 893                                if (pip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
 894                                        di_flags |= XFS_DIFLAG_PROJINHERIT;
 895                        } else if (S_ISREG(mode)) {
 896                                if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT)
 897                                        di_flags |= XFS_DIFLAG_REALTIME;
 898                                if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) {
 899                                        di_flags |= XFS_DIFLAG_EXTSIZE;
 900                                        ip->i_d.di_extsize = pip->i_d.di_extsize;
 901                                }
 902                        }
 903                        if ((pip->i_d.di_flags & XFS_DIFLAG_NOATIME) &&
 904                            xfs_inherit_noatime)
 905                                di_flags |= XFS_DIFLAG_NOATIME;
 906                        if ((pip->i_d.di_flags & XFS_DIFLAG_NODUMP) &&
 907                            xfs_inherit_nodump)
 908                                di_flags |= XFS_DIFLAG_NODUMP;
 909                        if ((pip->i_d.di_flags & XFS_DIFLAG_SYNC) &&
 910                            xfs_inherit_sync)
 911                                di_flags |= XFS_DIFLAG_SYNC;
 912                        if ((pip->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) &&
 913                            xfs_inherit_nosymlinks)
 914                                di_flags |= XFS_DIFLAG_NOSYMLINKS;
 915                        if ((pip->i_d.di_flags & XFS_DIFLAG_NODEFRAG) &&
 916                            xfs_inherit_nodefrag)
 917                                di_flags |= XFS_DIFLAG_NODEFRAG;
 918                        if (pip->i_d.di_flags & XFS_DIFLAG_FILESTREAM)
 919                                di_flags |= XFS_DIFLAG_FILESTREAM;
 920                        if (pip->i_d.di_flags2 & XFS_DIFLAG2_DAX)
 921                                di_flags2 |= XFS_DIFLAG2_DAX;
 922
 923                        ip->i_d.di_flags |= di_flags;
 924                        ip->i_d.di_flags2 |= di_flags2;
 925                }
 926                if (pip &&
 927                    (pip->i_d.di_flags2 & XFS_DIFLAG2_ANY) &&
 928                    pip->i_d.di_version == 3 &&
 929                    ip->i_d.di_version == 3) {
 930                        if (pip->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) {
 931                                ip->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE;
 932                                ip->i_d.di_cowextsize = pip->i_d.di_cowextsize;
 933                        }
 934                }
 935                /* FALLTHROUGH */
 936        case S_IFLNK:
 937                ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS;
 938                ip->i_df.if_flags = XFS_IFEXTENTS;
 939                ip->i_df.if_bytes = ip->i_df.if_real_bytes = 0;
 940                ip->i_df.if_u1.if_extents = NULL;
 941                break;
 942        default:
 943                ASSERT(0);
 944        }
 945        /*
 946         * Attribute fork settings for new inode.
 947         */
 948        ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
 949        ip->i_d.di_anextents = 0;
 950
 951        /*
 952         * Log the new values stuffed into the inode.
 953         */
 954        xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
 955        xfs_trans_log_inode(tp, ip, flags);
 956
 957        /* now that we have an i_mode we can setup the inode structure */
 958        xfs_setup_inode(ip);
 959
 960        *ipp = ip;
 961        return 0;
 962}
 963
 964/*
 965 * Allocates a new inode from disk and return a pointer to the
 966 * incore copy. This routine will internally commit the current
 967 * transaction and allocate a new one if the Space Manager needed
 968 * to do an allocation to replenish the inode free-list.
 969 *
 970 * This routine is designed to be called from xfs_create and
 971 * xfs_create_dir.
 972 *
 973 */
 974int
 975xfs_dir_ialloc(
 976        xfs_trans_t     **tpp,          /* input: current transaction;
 977                                           output: may be a new transaction. */
 978        xfs_inode_t     *dp,            /* directory within whose allocate
 979                                           the inode. */
 980        umode_t         mode,
 981        xfs_nlink_t     nlink,
 982        xfs_dev_t       rdev,
 983        prid_t          prid,           /* project id */
 984        int             okalloc,        /* ok to allocate new space */
 985        xfs_inode_t     **ipp,          /* pointer to inode; it will be
 986                                           locked. */
 987        int             *committed)
 988
 989{
 990        xfs_trans_t     *tp;
 991        xfs_inode_t     *ip;
 992        xfs_buf_t       *ialloc_context = NULL;
 993        int             code;
 994        void            *dqinfo;
 995        uint            tflags;
 996
 997        tp = *tpp;
 998        ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
 999
1000        /*
1001         * xfs_ialloc will return a pointer to an incore inode if
1002         * the Space Manager has an available inode on the free
1003         * list. Otherwise, it will do an allocation and replenish
1004         * the freelist.  Since we can only do one allocation per
1005         * transaction without deadlocks, we will need to commit the
1006         * current transaction and start a new one.  We will then
1007         * need to call xfs_ialloc again to get the inode.
1008         *
1009         * If xfs_ialloc did an allocation to replenish the freelist,
1010         * it returns the bp containing the head of the freelist as
1011         * ialloc_context. We will hold a lock on it across the
1012         * transaction commit so that no other process can steal
1013         * the inode(s) that we've just allocated.
1014         */
1015        code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid, okalloc,
1016                          &ialloc_context, &ip);
1017
1018        /*
1019         * Return an error if we were unable to allocate a new inode.
1020         * This should only happen if we run out of space on disk or
1021         * encounter a disk error.
1022         */
1023        if (code) {
1024                *ipp = NULL;
1025                return code;
1026        }
1027        if (!ialloc_context && !ip) {
1028                *ipp = NULL;
1029                return -ENOSPC;
1030        }
1031
1032        /*
1033         * If the AGI buffer is non-NULL, then we were unable to get an
1034         * inode in one operation.  We need to commit the current
1035         * transaction and call xfs_ialloc() again.  It is guaranteed
1036         * to succeed the second time.
1037         */
1038        if (ialloc_context) {
1039                /*
1040                 * Normally, xfs_trans_commit releases all the locks.
1041                 * We call bhold to hang on to the ialloc_context across
1042                 * the commit.  Holding this buffer prevents any other
1043                 * processes from doing any allocations in this
1044                 * allocation group.
1045                 */
1046                xfs_trans_bhold(tp, ialloc_context);
1047
1048                /*
1049                 * We want the quota changes to be associated with the next
1050                 * transaction, NOT this one. So, detach the dqinfo from this
1051                 * and attach it to the next transaction.
1052                 */
1053                dqinfo = NULL;
1054                tflags = 0;
1055                if (tp->t_dqinfo) {
1056                        dqinfo = (void *)tp->t_dqinfo;
1057                        tp->t_dqinfo = NULL;
1058                        tflags = tp->t_flags & XFS_TRANS_DQ_DIRTY;
1059                        tp->t_flags &= ~(XFS_TRANS_DQ_DIRTY);
1060                }
1061
1062                code = xfs_trans_roll(&tp, NULL);
1063                if (committed != NULL)
1064                        *committed = 1;
1065
1066                /*
1067                 * Re-attach the quota info that we detached from prev trx.
1068                 */
1069                if (dqinfo) {
1070                        tp->t_dqinfo = dqinfo;
1071                        tp->t_flags |= tflags;
1072                }
1073
1074                if (code) {
1075                        xfs_buf_relse(ialloc_context);
1076                        *tpp = tp;
1077                        *ipp = NULL;
1078                        return code;
1079                }
1080                xfs_trans_bjoin(tp, ialloc_context);
1081
1082                /*
1083                 * Call ialloc again. Since we've locked out all
1084                 * other allocations in this allocation group,
1085                 * this call should always succeed.
1086                 */
1087                code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid,
1088                                  okalloc, &ialloc_context, &ip);
1089
1090                /*
1091                 * If we get an error at this point, return to the caller
1092                 * so that the current transaction can be aborted.
1093                 */
1094                if (code) {
1095                        *tpp = tp;
1096                        *ipp = NULL;
1097                        return code;
1098                }
1099                ASSERT(!ialloc_context && ip);
1100
1101        } else {
1102                if (committed != NULL)
1103                        *committed = 0;
1104        }
1105
1106        *ipp = ip;
1107        *tpp = tp;
1108
1109        return 0;
1110}
1111
1112/*
1113 * Decrement the link count on an inode & log the change.  If this causes the
1114 * link count to go to zero, move the inode to AGI unlinked list so that it can
1115 * be freed when the last active reference goes away via xfs_inactive().
1116 */
1117static int                      /* error */
1118xfs_droplink(
1119        xfs_trans_t *tp,
1120        xfs_inode_t *ip)
1121{
1122        xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
1123
1124        drop_nlink(VFS_I(ip));
1125        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1126
1127        if (VFS_I(ip)->i_nlink)
1128                return 0;
1129
1130        return xfs_iunlink(tp, ip);
1131}
1132
1133/*
1134 * Increment the link count on an inode & log the change.
1135 */
1136static int
1137xfs_bumplink(
1138        xfs_trans_t *tp,
1139        xfs_inode_t *ip)
1140{
1141        xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
1142
1143        ASSERT(ip->i_d.di_version > 1);
1144        inc_nlink(VFS_I(ip));
1145        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1146        return 0;
1147}
1148
1149int
1150xfs_create(
1151        xfs_inode_t             *dp,
1152        struct xfs_name         *name,
1153        umode_t                 mode,
1154        xfs_dev_t               rdev,
1155        xfs_inode_t             **ipp)
1156{
1157        int                     is_dir = S_ISDIR(mode);
1158        struct xfs_mount        *mp = dp->i_mount;
1159        struct xfs_inode        *ip = NULL;
1160        struct xfs_trans        *tp = NULL;
1161        int                     error;
1162        struct xfs_defer_ops    dfops;
1163        xfs_fsblock_t           first_block;
1164        bool                    unlock_dp_on_error = false;
1165        prid_t                  prid;
1166        struct xfs_dquot        *udqp = NULL;
1167        struct xfs_dquot        *gdqp = NULL;
1168        struct xfs_dquot        *pdqp = NULL;
1169        struct xfs_trans_res    *tres;
1170        uint                    resblks;
1171
1172        trace_xfs_create(dp, name);
1173
1174        if (XFS_FORCED_SHUTDOWN(mp))
1175                return -EIO;
1176
1177        prid = xfs_get_initial_prid(dp);
1178
1179        /*
1180         * Make sure that we have allocated dquot(s) on disk.
1181         */
1182        error = xfs_qm_vop_dqalloc(dp, xfs_kuid_to_uid(current_fsuid()),
1183                                        xfs_kgid_to_gid(current_fsgid()), prid,
1184                                        XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
1185                                        &udqp, &gdqp, &pdqp);
1186        if (error)
1187                return error;
1188
1189        if (is_dir) {
1190                rdev = 0;
1191                resblks = XFS_MKDIR_SPACE_RES(mp, name->len);
1192                tres = &M_RES(mp)->tr_mkdir;
1193        } else {
1194                resblks = XFS_CREATE_SPACE_RES(mp, name->len);
1195                tres = &M_RES(mp)->tr_create;
1196        }
1197
1198        /*
1199         * Initially assume that the file does not exist and
1200         * reserve the resources for that case.  If that is not
1201         * the case we'll drop the one we have and get a more
1202         * appropriate transaction later.
1203         */
1204        error = xfs_trans_alloc(mp, tres, resblks, 0, 0, &tp);
1205        if (error == -ENOSPC) {
1206                /* flush outstanding delalloc blocks and retry */
1207                xfs_flush_inodes(mp);
1208                error = xfs_trans_alloc(mp, tres, resblks, 0, 0, &tp);
1209        }
1210        if (error == -ENOSPC) {
1211                /* No space at all so try a "no-allocation" reservation */
1212                resblks = 0;
1213                error = xfs_trans_alloc(mp, tres, 0, 0, 0, &tp);
1214        }
1215        if (error)
1216                goto out_release_inode;
1217
1218        xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL |
1219                      XFS_IOLOCK_PARENT | XFS_ILOCK_PARENT);
1220        unlock_dp_on_error = true;
1221
1222        xfs_defer_init(&dfops, &first_block);
1223
1224        /*
1225         * Reserve disk quota and the inode.
1226         */
1227        error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp,
1228                                                pdqp, resblks, 1, 0);
1229        if (error)
1230                goto out_trans_cancel;
1231
1232        if (!resblks) {
1233                error = xfs_dir_canenter(tp, dp, name);
1234                if (error)
1235                        goto out_trans_cancel;
1236        }
1237
1238        /*
1239         * A newly created regular or special file just has one directory
1240         * entry pointing to them, but a directory also the "." entry
1241         * pointing to itself.
1242         */
1243        error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev,
1244                               prid, resblks > 0, &ip, NULL);
1245        if (error)
1246                goto out_trans_cancel;
1247
1248        /*
1249         * Now we join the directory inode to the transaction.  We do not do it
1250         * earlier because xfs_dir_ialloc might commit the previous transaction
1251         * (and release all the locks).  An error from here on will result in
1252         * the transaction cancel unlocking dp so don't do it explicitly in the
1253         * error path.
1254         */
1255        xfs_trans_ijoin(tp, dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1256        unlock_dp_on_error = false;
1257
1258        error = xfs_dir_createname(tp, dp, name, ip->i_ino,
1259                                        &first_block, &dfops, resblks ?
1260                                        resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
1261        if (error) {
1262                ASSERT(error != -ENOSPC);
1263                goto out_trans_cancel;
1264        }
1265        xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
1266        xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
1267
1268        if (is_dir) {
1269                error = xfs_dir_init(tp, ip, dp);
1270                if (error)
1271                        goto out_bmap_cancel;
1272
1273                error = xfs_bumplink(tp, dp);
1274                if (error)
1275                        goto out_bmap_cancel;
1276        }
1277
1278        /*
1279         * If this is a synchronous mount, make sure that the
1280         * create transaction goes to disk before returning to
1281         * the user.
1282         */
1283        if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
1284                xfs_trans_set_sync(tp);
1285
1286        /*
1287         * Attach the dquot(s) to the inodes and modify them incore.
1288         * These ids of the inode couldn't have changed since the new
1289         * inode has been locked ever since it was created.
1290         */
1291        xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);
1292
1293        error = xfs_defer_finish(&tp, &dfops, NULL);
1294        if (error)
1295                goto out_bmap_cancel;
1296
1297        error = xfs_trans_commit(tp);
1298        if (error)
1299                goto out_release_inode;
1300
1301        xfs_qm_dqrele(udqp);
1302        xfs_qm_dqrele(gdqp);
1303        xfs_qm_dqrele(pdqp);
1304
1305        *ipp = ip;
1306        return 0;
1307
1308 out_bmap_cancel:
1309        xfs_defer_cancel(&dfops);
1310 out_trans_cancel:
1311        xfs_trans_cancel(tp);
1312 out_release_inode:
1313        /*
1314         * Wait until after the current transaction is aborted to finish the
1315         * setup of the inode and release the inode.  This prevents recursive
1316         * transactions and deadlocks from xfs_inactive.
1317         */
1318        if (ip) {
1319                xfs_finish_inode_setup(ip);
1320                IRELE(ip);
1321        }
1322
1323        xfs_qm_dqrele(udqp);
1324        xfs_qm_dqrele(gdqp);
1325        xfs_qm_dqrele(pdqp);
1326
1327        if (unlock_dp_on_error)
1328                xfs_iunlock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1329        return error;
1330}
1331
1332int
1333xfs_create_tmpfile(
1334        struct xfs_inode        *dp,
1335        struct dentry           *dentry,
1336        umode_t                 mode,
1337        struct xfs_inode        **ipp)
1338{
1339        struct xfs_mount        *mp = dp->i_mount;
1340        struct xfs_inode        *ip = NULL;
1341        struct xfs_trans        *tp = NULL;
1342        int                     error;
1343        prid_t                  prid;
1344        struct xfs_dquot        *udqp = NULL;
1345        struct xfs_dquot        *gdqp = NULL;
1346        struct xfs_dquot        *pdqp = NULL;
1347        struct xfs_trans_res    *tres;
1348        uint                    resblks;
1349
1350        if (XFS_FORCED_SHUTDOWN(mp))
1351                return -EIO;
1352
1353        prid = xfs_get_initial_prid(dp);
1354
1355        /*
1356         * Make sure that we have allocated dquot(s) on disk.
1357         */
1358        error = xfs_qm_vop_dqalloc(dp, xfs_kuid_to_uid(current_fsuid()),
1359                                xfs_kgid_to_gid(current_fsgid()), prid,
1360                                XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
1361                                &udqp, &gdqp, &pdqp);
1362        if (error)
1363                return error;
1364
1365        resblks = XFS_IALLOC_SPACE_RES(mp);
1366        tres = &M_RES(mp)->tr_create_tmpfile;
1367
1368        error = xfs_trans_alloc(mp, tres, resblks, 0, 0, &tp);
1369        if (error == -ENOSPC) {
1370                /* No space at all so try a "no-allocation" reservation */
1371                resblks = 0;
1372                error = xfs_trans_alloc(mp, tres, 0, 0, 0, &tp);
1373        }
1374        if (error)
1375                goto out_release_inode;
1376
1377        error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp,
1378                                                pdqp, resblks, 1, 0);
1379        if (error)
1380                goto out_trans_cancel;
1381
1382        error = xfs_dir_ialloc(&tp, dp, mode, 1, 0,
1383                                prid, resblks > 0, &ip, NULL);
1384        if (error)
1385                goto out_trans_cancel;
1386
1387        if (mp->m_flags & XFS_MOUNT_WSYNC)
1388                xfs_trans_set_sync(tp);
1389
1390        /*
1391         * Attach the dquot(s) to the inodes and modify them incore.
1392         * These ids of the inode couldn't have changed since the new
1393         * inode has been locked ever since it was created.
1394         */
1395        xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);
1396
1397        error = xfs_iunlink(tp, ip);
1398        if (error)
1399                goto out_trans_cancel;
1400
1401        error = xfs_trans_commit(tp);
1402        if (error)
1403                goto out_release_inode;
1404
1405        xfs_qm_dqrele(udqp);
1406        xfs_qm_dqrele(gdqp);
1407        xfs_qm_dqrele(pdqp);
1408
1409        *ipp = ip;
1410        return 0;
1411
1412 out_trans_cancel:
1413        xfs_trans_cancel(tp);
1414 out_release_inode:
1415        /*
1416         * Wait until after the current transaction is aborted to finish the
1417         * setup of the inode and release the inode.  This prevents recursive
1418         * transactions and deadlocks from xfs_inactive.
1419         */
1420        if (ip) {
1421                xfs_finish_inode_setup(ip);
1422                IRELE(ip);
1423        }
1424
1425        xfs_qm_dqrele(udqp);
1426        xfs_qm_dqrele(gdqp);
1427        xfs_qm_dqrele(pdqp);
1428
1429        return error;
1430}
1431
1432int
1433xfs_link(
1434        xfs_inode_t             *tdp,
1435        xfs_inode_t             *sip,
1436        struct xfs_name         *target_name)
1437{
1438        xfs_mount_t             *mp = tdp->i_mount;
1439        xfs_trans_t             *tp;
1440        int                     error;
1441        struct xfs_defer_ops    dfops;
1442        xfs_fsblock_t           first_block;
1443        int                     resblks;
1444
1445        trace_xfs_link(tdp, target_name);
1446
1447        ASSERT(!S_ISDIR(VFS_I(sip)->i_mode));
1448
1449        if (XFS_FORCED_SHUTDOWN(mp))
1450                return -EIO;
1451
1452        error = xfs_qm_dqattach(sip, 0);
1453        if (error)
1454                goto std_return;
1455
1456        error = xfs_qm_dqattach(tdp, 0);
1457        if (error)
1458                goto std_return;
1459
1460        resblks = XFS_LINK_SPACE_RES(mp, target_name->len);
1461        error = xfs_trans_alloc(mp, &M_RES(mp)->tr_link, resblks, 0, 0, &tp);
1462        if (error == -ENOSPC) {
1463                resblks = 0;
1464                error = xfs_trans_alloc(mp, &M_RES(mp)->tr_link, 0, 0, 0, &tp);
1465        }
1466        if (error)
1467                goto std_return;
1468
1469        xfs_ilock(tdp, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT);
1470        xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL);
1471
1472        xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL);
1473        xfs_trans_ijoin(tp, tdp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1474
1475        /*
1476         * If we are using project inheritance, we only allow hard link
1477         * creation in our tree when the project IDs are the same; else
1478         * the tree quota mechanism could be circumvented.
1479         */
1480        if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
1481                     (xfs_get_projid(tdp) != xfs_get_projid(sip)))) {
1482                error = -EXDEV;
1483                goto error_return;
1484        }
1485
1486        if (!resblks) {
1487                error = xfs_dir_canenter(tp, tdp, target_name);
1488                if (error)
1489                        goto error_return;
1490        }
1491
1492        xfs_defer_init(&dfops, &first_block);
1493
1494        /*
1495         * Handle initial link state of O_TMPFILE inode
1496         */
1497        if (VFS_I(sip)->i_nlink == 0) {
1498                error = xfs_iunlink_remove(tp, sip);
1499                if (error)
1500                        goto error_return;
1501        }
1502
1503        error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino,
1504                                        &first_block, &dfops, resblks);
1505        if (error)
1506                goto error_return;
1507        xfs_trans_ichgtime(tp, tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
1508        xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
1509
1510        error = xfs_bumplink(tp, sip);
1511        if (error)
1512                goto error_return;
1513
1514        /*
1515         * If this is a synchronous mount, make sure that the
1516         * link transaction goes to disk before returning to
1517         * the user.
1518         */
1519        if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
1520                xfs_trans_set_sync(tp);
1521
1522        error = xfs_defer_finish(&tp, &dfops, NULL);
1523        if (error) {
1524                xfs_defer_cancel(&dfops);
1525                goto error_return;
1526        }
1527
1528        return xfs_trans_commit(tp);
1529
1530 error_return:
1531        xfs_trans_cancel(tp);
1532 std_return:
1533        return error;
1534}
1535
1536/*
1537 * Free up the underlying blocks past new_size.  The new size must be smaller
1538 * than the current size.  This routine can be used both for the attribute and
1539 * data fork, and does not modify the inode size, which is left to the caller.
1540 *
1541 * The transaction passed to this routine must have made a permanent log
1542 * reservation of at least XFS_ITRUNCATE_LOG_RES.  This routine may commit the
1543 * given transaction and start new ones, so make sure everything involved in
1544 * the transaction is tidy before calling here.  Some transaction will be
1545 * returned to the caller to be committed.  The incoming transaction must
1546 * already include the inode, and both inode locks must be held exclusively.
1547 * The inode must also be "held" within the transaction.  On return the inode
1548 * will be "held" within the returned transaction.  This routine does NOT
1549 * require any disk space to be reserved for it within the transaction.
1550 *
1551 * If we get an error, we must return with the inode locked and linked into the
1552 * current transaction. This keeps things simple for the higher level code,
1553 * because it always knows that the inode is locked and held in the transaction
1554 * that returns to it whether errors occur or not.  We don't mark the inode
1555 * dirty on error so that transactions can be easily aborted if possible.
1556 */
1557int
1558xfs_itruncate_extents(
1559        struct xfs_trans        **tpp,
1560        struct xfs_inode        *ip,
1561        int                     whichfork,
1562        xfs_fsize_t             new_size)
1563{
1564        struct xfs_mount        *mp = ip->i_mount;
1565        struct xfs_trans        *tp = *tpp;
1566        struct xfs_defer_ops    dfops;
1567        xfs_fsblock_t           first_block;
1568        xfs_fileoff_t           first_unmap_block;
1569        xfs_fileoff_t           last_block;
1570        xfs_filblks_t           unmap_len;
1571        int                     error = 0;
1572        int                     done = 0;
1573
1574        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
1575        ASSERT(!atomic_read(&VFS_I(ip)->i_count) ||
1576               xfs_isilocked(ip, XFS_IOLOCK_EXCL));
1577        ASSERT(new_size <= XFS_ISIZE(ip));
1578        ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
1579        ASSERT(ip->i_itemp != NULL);
1580        ASSERT(ip->i_itemp->ili_lock_flags == 0);
1581        ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
1582
1583        trace_xfs_itruncate_extents_start(ip, new_size);
1584
1585        /*
1586         * Since it is possible for space to become allocated beyond
1587         * the end of the file (in a crash where the space is allocated
1588         * but the inode size is not yet updated), simply remove any
1589         * blocks which show up between the new EOF and the maximum
1590         * possible file size.  If the first block to be removed is
1591         * beyond the maximum file size (ie it is the same as last_block),
1592         * then there is nothing to do.
1593         */
1594        first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size);
1595        last_block = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
1596        if (first_unmap_block == last_block)
1597                return 0;
1598
1599        ASSERT(first_unmap_block < last_block);
1600        unmap_len = last_block - first_unmap_block + 1;
1601        while (!done) {
1602                xfs_defer_init(&dfops, &first_block);
1603                error = xfs_bunmapi(tp, ip,
1604                                    first_unmap_block, unmap_len,
1605                                    xfs_bmapi_aflag(whichfork),
1606                                    XFS_ITRUNC_MAX_EXTENTS,
1607                                    &first_block, &dfops,
1608                                    &done);
1609                if (error)
1610                        goto out_bmap_cancel;
1611
1612                /*
1613                 * Duplicate the transaction that has the permanent
1614                 * reservation and commit the old transaction.
1615                 */
1616                error = xfs_defer_finish(&tp, &dfops, ip);
1617                if (error)
1618                        goto out_bmap_cancel;
1619
1620                error = xfs_trans_roll(&tp, ip);
1621                if (error)
1622                        goto out;
1623        }
1624
1625        /* Remove all pending CoW reservations. */
1626        error = xfs_reflink_cancel_cow_blocks(ip, &tp, first_unmap_block,
1627                        last_block);
1628        if (error)
1629                goto out;
1630
1631        /*
1632         * Clear the reflink flag if we truncated everything.
1633         */
1634        if (ip->i_d.di_nblocks == 0 && xfs_is_reflink_inode(ip)) {
1635                ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK;
1636                xfs_inode_clear_cowblocks_tag(ip);
1637        }
1638
1639        /*
1640         * Always re-log the inode so that our permanent transaction can keep
1641         * on rolling it forward in the log.
1642         */
1643        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1644
1645        trace_xfs_itruncate_extents_end(ip, new_size);
1646
1647out:
1648        *tpp = tp;
1649        return error;
1650out_bmap_cancel:
1651        /*
1652         * If the bunmapi call encounters an error, return to the caller where
1653         * the transaction can be properly aborted.  We just need to make sure
1654         * we're not holding any resources that we were not when we came in.
1655         */
1656        xfs_defer_cancel(&dfops);
1657        goto out;
1658}
1659
1660int
1661xfs_release(
1662        xfs_inode_t     *ip)
1663{
1664        xfs_mount_t     *mp = ip->i_mount;
1665        int             error;
1666
1667        if (!S_ISREG(VFS_I(ip)->i_mode) || (VFS_I(ip)->i_mode == 0))
1668                return 0;
1669
1670        /* If this is a read-only mount, don't do this (would generate I/O) */
1671        if (mp->m_flags & XFS_MOUNT_RDONLY)
1672                return 0;
1673
1674        if (!XFS_FORCED_SHUTDOWN(mp)) {
1675                int truncated;
1676
1677                /*
1678                 * If we previously truncated this file and removed old data
1679                 * in the process, we want to initiate "early" writeout on
1680                 * the last close.  This is an attempt to combat the notorious
1681                 * NULL files problem which is particularly noticeable from a
1682                 * truncate down, buffered (re-)write (delalloc), followed by
1683                 * a crash.  What we are effectively doing here is
1684                 * significantly reducing the time window where we'd otherwise
1685                 * be exposed to that problem.
1686                 */
1687                truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED);
1688                if (truncated) {
1689                        xfs_iflags_clear(ip, XFS_IDIRTY_RELEASE);
1690                        if (ip->i_delayed_blks > 0) {
1691                                error = filemap_flush(VFS_I(ip)->i_mapping);
1692                                if (error)
1693                                        return error;
1694                        }
1695                }
1696        }
1697
1698        if (VFS_I(ip)->i_nlink == 0)
1699                return 0;
1700
1701        if (xfs_can_free_eofblocks(ip, false)) {
1702
1703                /*
1704                 * If we can't get the iolock just skip truncating the blocks
1705                 * past EOF because we could deadlock with the mmap_sem
1706                 * otherwise.  We'll get another chance to drop them once the
1707                 * last reference to the inode is dropped, so we'll never leak
1708                 * blocks permanently.
1709                 *
1710                 * Further, check if the inode is being opened, written and
1711                 * closed frequently and we have delayed allocation blocks
1712                 * outstanding (e.g. streaming writes from the NFS server),
1713                 * truncating the blocks past EOF will cause fragmentation to
1714                 * occur.
1715                 *
1716                 * In this case don't do the truncation, either, but we have to
1717                 * be careful how we detect this case. Blocks beyond EOF show
1718                 * up as i_delayed_blks even when the inode is clean, so we
1719                 * need to truncate them away first before checking for a dirty
1720                 * release. Hence on the first dirty close we will still remove
1721                 * the speculative allocation, but after that we will leave it
1722                 * in place.
1723                 */
1724                if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE))
1725                        return 0;
1726
1727                error = xfs_free_eofblocks(mp, ip, true);
1728                if (error && error != -EAGAIN)
1729                        return error;
1730
1731                /* delalloc blocks after truncation means it really is dirty */
1732                if (ip->i_delayed_blks)
1733                        xfs_iflags_set(ip, XFS_IDIRTY_RELEASE);
1734        }
1735        return 0;
1736}
1737
1738/*
1739 * xfs_inactive_truncate
1740 *
1741 * Called to perform a truncate when an inode becomes unlinked.
1742 */
1743STATIC int
1744xfs_inactive_truncate(
1745        struct xfs_inode *ip)
1746{
1747        struct xfs_mount        *mp = ip->i_mount;
1748        struct xfs_trans        *tp;
1749        int                     error;
1750
1751        error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
1752        if (error) {
1753                ASSERT(XFS_FORCED_SHUTDOWN(mp));
1754                return error;
1755        }
1756
1757        xfs_ilock(ip, XFS_ILOCK_EXCL);
1758        xfs_trans_ijoin(tp, ip, 0);
1759
1760        /*
1761         * Log the inode size first to prevent stale data exposure in the event
1762         * of a system crash before the truncate completes. See the related
1763         * comment in xfs_vn_setattr_size() for details.
1764         */
1765        ip->i_d.di_size = 0;
1766        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1767
1768        error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0);
1769        if (error)
1770                goto error_trans_cancel;
1771
1772        ASSERT(ip->i_d.di_nextents == 0);
1773
1774        error = xfs_trans_commit(tp);
1775        if (error)
1776                goto error_unlock;
1777
1778        xfs_iunlock(ip, XFS_ILOCK_EXCL);
1779        return 0;
1780
1781error_trans_cancel:
1782        xfs_trans_cancel(tp);
1783error_unlock:
1784        xfs_iunlock(ip, XFS_ILOCK_EXCL);
1785        return error;
1786}
1787
1788/*
1789 * xfs_inactive_ifree()
1790 *
1791 * Perform the inode free when an inode is unlinked.
1792 */
1793STATIC int
1794xfs_inactive_ifree(
1795        struct xfs_inode *ip)
1796{
1797        struct xfs_defer_ops    dfops;
1798        xfs_fsblock_t           first_block;
1799        struct xfs_mount        *mp = ip->i_mount;
1800        struct xfs_trans        *tp;
1801        int                     error;
1802
1803        /*
1804         * The ifree transaction might need to allocate blocks for record
1805         * insertion to the finobt. We don't want to fail here at ENOSPC, so
1806         * allow ifree to dip into the reserved block pool if necessary.
1807         *
1808         * Freeing large sets of inodes generally means freeing inode chunks,
1809         * directory and file data blocks, so this should be relatively safe.
1810         * Only under severe circumstances should it be possible to free enough
1811         * inodes to exhaust the reserve block pool via finobt expansion while
1812         * at the same time not creating free space in the filesystem.
1813         *
1814         * Send a warning if the reservation does happen to fail, as the inode
1815         * now remains allocated and sits on the unlinked list until the fs is
1816         * repaired.
1817         */
1818        error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree,
1819                        XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, &tp);
1820        if (error) {
1821                if (error == -ENOSPC) {
1822                        xfs_warn_ratelimited(mp,
1823                        "Failed to remove inode(s) from unlinked list. "
1824                        "Please free space, unmount and run xfs_repair.");
1825                } else {
1826                        ASSERT(XFS_FORCED_SHUTDOWN(mp));
1827                }
1828                return error;
1829        }
1830
1831        xfs_ilock(ip, XFS_ILOCK_EXCL);
1832        xfs_trans_ijoin(tp, ip, 0);
1833
1834        xfs_defer_init(&dfops, &first_block);
1835        error = xfs_ifree(tp, ip, &dfops);
1836        if (error) {
1837                /*
1838                 * If we fail to free the inode, shut down.  The cancel
1839                 * might do that, we need to make sure.  Otherwise the
1840                 * inode might be lost for a long time or forever.
1841                 */
1842                if (!XFS_FORCED_SHUTDOWN(mp)) {
1843                        xfs_notice(mp, "%s: xfs_ifree returned error %d",
1844                                __func__, error);
1845                        xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
1846                }
1847                xfs_trans_cancel(tp);
1848                xfs_iunlock(ip, XFS_ILOCK_EXCL);
1849                return error;
1850        }
1851
1852        /*
1853         * Credit the quota account(s). The inode is gone.
1854         */
1855        xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_ICOUNT, -1);
1856
1857        /*
1858         * Just ignore errors at this point.  There is nothing we can do except
1859         * to try to keep going. Make sure it's not a silent error.
1860         */
1861        error = xfs_defer_finish(&tp, &dfops, NULL);
1862        if (error) {
1863                xfs_notice(mp, "%s: xfs_defer_finish returned error %d",
1864                        __func__, error);
1865                xfs_defer_cancel(&dfops);
1866        }
1867        error = xfs_trans_commit(tp);
1868        if (error)
1869                xfs_notice(mp, "%s: xfs_trans_commit returned error %d",
1870                        __func__, error);
1871
1872        xfs_iunlock(ip, XFS_ILOCK_EXCL);
1873        return 0;
1874}
1875
1876/*
1877 * xfs_inactive
1878 *
1879 * This is called when the vnode reference count for the vnode
1880 * goes to zero.  If the file has been unlinked, then it must
1881 * now be truncated.  Also, we clear all of the read-ahead state
1882 * kept for the inode here since the file is now closed.
1883 */
1884void
1885xfs_inactive(
1886        xfs_inode_t     *ip)
1887{
1888        struct xfs_mount        *mp;
1889        int                     error;
1890        int                     truncate = 0;
1891
1892        /*
1893         * If the inode is already free, then there can be nothing
1894         * to clean up here.
1895         */
1896        if (VFS_I(ip)->i_mode == 0) {
1897                ASSERT(ip->i_df.if_real_bytes == 0);
1898                ASSERT(ip->i_df.if_broot_bytes == 0);
1899                return;
1900        }
1901
1902        mp = ip->i_mount;
1903        ASSERT(!xfs_iflags_test(ip, XFS_IRECOVERY));
1904
1905        /* If this is a read-only mount, don't do this (would generate I/O) */
1906        if (mp->m_flags & XFS_MOUNT_RDONLY)
1907                return;
1908
1909        if (VFS_I(ip)->i_nlink != 0) {
1910                /*
1911                 * force is true because we are evicting an inode from the
1912                 * cache. Post-eof blocks must be freed, lest we end up with
1913                 * broken free space accounting.
1914                 */
1915                if (xfs_can_free_eofblocks(ip, true))
1916                        xfs_free_eofblocks(mp, ip, false);
1917
1918                return;
1919        }
1920
1921        if (S_ISREG(VFS_I(ip)->i_mode) &&
1922            (ip->i_d.di_size != 0 || XFS_ISIZE(ip) != 0 ||
1923             ip->i_d.di_nextents > 0 || ip->i_delayed_blks > 0))
1924                truncate = 1;
1925
1926        error = xfs_qm_dqattach(ip, 0);
1927        if (error)
1928                return;
1929
1930        if (S_ISLNK(VFS_I(ip)->i_mode))
1931                error = xfs_inactive_symlink(ip);
1932        else if (truncate)
1933                error = xfs_inactive_truncate(ip);
1934        if (error)
1935                return;
1936
1937        /*
1938         * If there are attributes associated with the file then blow them away
1939         * now.  The code calls a routine that recursively deconstructs the
1940         * attribute fork. If also blows away the in-core attribute fork.
1941         */
1942        if (XFS_IFORK_Q(ip)) {
1943                error = xfs_attr_inactive(ip);
1944                if (error)
1945                        return;
1946        }
1947
1948        ASSERT(!ip->i_afp);
1949        ASSERT(ip->i_d.di_anextents == 0);
1950        ASSERT(ip->i_d.di_forkoff == 0);
1951
1952        /*
1953         * Free the inode.
1954         */
1955        error = xfs_inactive_ifree(ip);
1956        if (error)
1957                return;
1958
1959        /*
1960         * Release the dquots held by inode, if any.
1961         */
1962        xfs_qm_dqdetach(ip);
1963}
1964
1965/*
1966 * This is called when the inode's link count goes to 0 or we are creating a
1967 * tmpfile via O_TMPFILE. In the case of a tmpfile, @ignore_linkcount will be
1968 * set to true as the link count is dropped to zero by the VFS after we've
1969 * created the file successfully, so we have to add it to the unlinked list
1970 * while the link count is non-zero.
1971 *
1972 * We place the on-disk inode on a list in the AGI.  It will be pulled from this
1973 * list when the inode is freed.
1974 */
1975STATIC int
1976xfs_iunlink(
1977        struct xfs_trans *tp,
1978        struct xfs_inode *ip)
1979{
1980        xfs_mount_t     *mp = tp->t_mountp;
1981        xfs_agi_t       *agi;
1982        xfs_dinode_t    *dip;
1983        xfs_buf_t       *agibp;
1984        xfs_buf_t       *ibp;
1985        xfs_agino_t     agino;
1986        short           bucket_index;
1987        int             offset;
1988        int             error;
1989
1990        ASSERT(VFS_I(ip)->i_mode != 0);
1991
1992        /*
1993         * Get the agi buffer first.  It ensures lock ordering
1994         * on the list.
1995         */
1996        error = xfs_read_agi(mp, tp, XFS_INO_TO_AGNO(mp, ip->i_ino), &agibp);
1997        if (error)
1998                return error;
1999        agi = XFS_BUF_TO_AGI(agibp);
2000
2001        /*
2002         * Get the index into the agi hash table for the
2003         * list this inode will go on.
2004         */
2005        agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
2006        ASSERT(agino != 0);
2007        bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
2008        ASSERT(agi->agi_unlinked[bucket_index]);
2009        ASSERT(be32_to_cpu(agi->agi_unlinked[bucket_index]) != agino);
2010
2011        if (agi->agi_unlinked[bucket_index] != cpu_to_be32(NULLAGINO)) {
2012                /*
2013                 * There is already another inode in the bucket we need
2014                 * to add ourselves to.  Add us at the front of the list.
2015                 * Here we put the head pointer into our next pointer,
2016                 * and then we fall through to point the head at us.
2017                 */
2018                error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
2019                                       0, 0);
2020                if (error)
2021                        return error;
2022
2023                ASSERT(dip->di_next_unlinked == cpu_to_be32(NULLAGINO));
2024                dip->di_next_unlinked = agi->agi_unlinked[bucket_index];
2025                offset = ip->i_imap.im_boffset +
2026                        offsetof(xfs_dinode_t, di_next_unlinked);
2027
2028                /* need to recalc the inode CRC if appropriate */
2029                xfs_dinode_calc_crc(mp, dip);
2030
2031                xfs_trans_inode_buf(tp, ibp);
2032                xfs_trans_log_buf(tp, ibp, offset,
2033                                  (offset + sizeof(xfs_agino_t) - 1));
2034                xfs_inobp_check(mp, ibp);
2035        }
2036
2037        /*
2038         * Point the bucket head pointer at the inode being inserted.
2039         */
2040        ASSERT(agino != 0);
2041        agi->agi_unlinked[bucket_index] = cpu_to_be32(agino);
2042        offset = offsetof(xfs_agi_t, agi_unlinked) +
2043                (sizeof(xfs_agino_t) * bucket_index);
2044        xfs_trans_buf_set_type(tp, agibp, XFS_BLFT_AGI_BUF);
2045        xfs_trans_log_buf(tp, agibp, offset,
2046                          (offset + sizeof(xfs_agino_t) - 1));
2047        return 0;
2048}
2049
2050/*
2051 * Pull the on-disk inode from the AGI unlinked list.
2052 */
2053STATIC int
2054xfs_iunlink_remove(
2055        xfs_trans_t     *tp,
2056        xfs_inode_t     *ip)
2057{
2058        xfs_ino_t       next_ino;
2059        xfs_mount_t     *mp;
2060        xfs_agi_t       *agi;
2061        xfs_dinode_t    *dip;
2062        xfs_buf_t       *agibp;
2063        xfs_buf_t       *ibp;
2064        xfs_agnumber_t  agno;
2065        xfs_agino_t     agino;
2066        xfs_agino_t     next_agino;
2067        xfs_buf_t       *last_ibp;
2068        xfs_dinode_t    *last_dip = NULL;
2069        short           bucket_index;
2070        int             offset, last_offset = 0;
2071        int             error;
2072
2073        mp = tp->t_mountp;
2074        agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
2075
2076        /*
2077         * Get the agi buffer first.  It ensures lock ordering
2078         * on the list.
2079         */
2080        error = xfs_read_agi(mp, tp, agno, &agibp);
2081        if (error)
2082                return error;
2083
2084        agi = XFS_BUF_TO_AGI(agibp);
2085
2086        /*
2087         * Get the index into the agi hash table for the
2088         * list this inode will go on.
2089         */
2090        agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
2091        ASSERT(agino != 0);
2092        bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
2093        ASSERT(agi->agi_unlinked[bucket_index] != cpu_to_be32(NULLAGINO));
2094        ASSERT(agi->agi_unlinked[bucket_index]);
2095
2096        if (be32_to_cpu(agi->agi_unlinked[bucket_index]) == agino) {
2097                /*
2098                 * We're at the head of the list.  Get the inode's on-disk
2099                 * buffer to see if there is anyone after us on the list.
2100                 * Only modify our next pointer if it is not already NULLAGINO.
2101                 * This saves us the overhead of dealing with the buffer when
2102                 * there is no need to change it.
2103                 */
2104                error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
2105                                       0, 0);
2106                if (error) {
2107                        xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.",
2108                                __func__, error);
2109                        return error;
2110                }
2111                next_agino = be32_to_cpu(dip->di_next_unlinked);
2112                ASSERT(next_agino != 0);
2113                if (next_agino != NULLAGINO) {
2114                        dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
2115                        offset = ip->i_imap.im_boffset +
2116                                offsetof(xfs_dinode_t, di_next_unlinked);
2117
2118                        /* need to recalc the inode CRC if appropriate */
2119                        xfs_dinode_calc_crc(mp, dip);
2120
2121                        xfs_trans_inode_buf(tp, ibp);
2122                        xfs_trans_log_buf(tp, ibp, offset,
2123                                          (offset + sizeof(xfs_agino_t) - 1));
2124                        xfs_inobp_check(mp, ibp);
2125                } else {
2126                        xfs_trans_brelse(tp, ibp);
2127                }
2128                /*
2129                 * Point the bucket head pointer at the next inode.
2130                 */
2131                ASSERT(next_agino != 0);
2132                ASSERT(next_agino != agino);
2133                agi->agi_unlinked[bucket_index] = cpu_to_be32(next_agino);
2134                offset = offsetof(xfs_agi_t, agi_unlinked) +
2135                        (sizeof(xfs_agino_t) * bucket_index);
2136                xfs_trans_buf_set_type(tp, agibp, XFS_BLFT_AGI_BUF);
2137                xfs_trans_log_buf(tp, agibp, offset,
2138                                  (offset + sizeof(xfs_agino_t) - 1));
2139        } else {
2140                /*
2141                 * We need to search the list for the inode being freed.
2142                 */
2143                next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
2144                last_ibp = NULL;
2145                while (next_agino != agino) {
2146                        struct xfs_imap imap;
2147
2148                        if (last_ibp)
2149                                xfs_trans_brelse(tp, last_ibp);
2150
2151                        imap.im_blkno = 0;
2152                        next_ino = XFS_AGINO_TO_INO(mp, agno, next_agino);
2153
2154                        error = xfs_imap(mp, tp, next_ino, &imap, 0);
2155                        if (error) {
2156                                xfs_warn(mp,
2157        "%s: xfs_imap returned error %d.",
2158                                         __func__, error);
2159                                return error;
2160                        }
2161
2162                        error = xfs_imap_to_bp(mp, tp, &imap, &last_dip,
2163                                               &last_ibp, 0, 0);
2164                        if (error) {
2165                                xfs_warn(mp,
2166        "%s: xfs_imap_to_bp returned error %d.",
2167                                        __func__, error);
2168                                return error;
2169                        }
2170
2171                        last_offset = imap.im_boffset;
2172                        next_agino = be32_to_cpu(last_dip->di_next_unlinked);
2173                        ASSERT(next_agino != NULLAGINO);
2174                        ASSERT(next_agino != 0);
2175                }
2176
2177                /*
2178                 * Now last_ibp points to the buffer previous to us on the
2179                 * unlinked list.  Pull us from the list.
2180                 */
2181                error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
2182                                       0, 0);
2183                if (error) {
2184                        xfs_warn(mp, "%s: xfs_imap_to_bp(2) returned error %d.",
2185                                __func__, error);
2186                        return error;
2187                }
2188                next_agino = be32_to_cpu(dip->di_next_unlinked);
2189                ASSERT(next_agino != 0);
2190                ASSERT(next_agino != agino);
2191                if (next_agino != NULLAGINO) {
2192                        dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
2193                        offset = ip->i_imap.im_boffset +
2194                                offsetof(xfs_dinode_t, di_next_unlinked);
2195
2196                        /* need to recalc the inode CRC if appropriate */
2197                        xfs_dinode_calc_crc(mp, dip);
2198
2199                        xfs_trans_inode_buf(tp, ibp);
2200                        xfs_trans_log_buf(tp, ibp, offset,
2201                                          (offset + sizeof(xfs_agino_t) - 1));
2202                        xfs_inobp_check(mp, ibp);
2203                } else {
2204                        xfs_trans_brelse(tp, ibp);
2205                }
2206                /*
2207                 * Point the previous inode on the list to the next inode.
2208                 */
2209                last_dip->di_next_unlinked = cpu_to_be32(next_agino);
2210                ASSERT(next_agino != 0);
2211                offset = last_offset + offsetof(xfs_dinode_t, di_next_unlinked);
2212
2213                /* need to recalc the inode CRC if appropriate */
2214                xfs_dinode_calc_crc(mp, last_dip);
2215
2216                xfs_trans_inode_buf(tp, last_ibp);
2217                xfs_trans_log_buf(tp, last_ibp, offset,
2218                                  (offset + sizeof(xfs_agino_t) - 1));
2219                xfs_inobp_check(mp, last_ibp);
2220        }
2221        return 0;
2222}
2223
2224/*
2225 * A big issue when freeing the inode cluster is that we _cannot_ skip any
2226 * inodes that are in memory - they all must be marked stale and attached to
2227 * the cluster buffer.
2228 */
2229STATIC int
2230xfs_ifree_cluster(
2231        xfs_inode_t             *free_ip,
2232        xfs_trans_t             *tp,
2233        struct xfs_icluster     *xic)
2234{
2235        xfs_mount_t             *mp = free_ip->i_mount;
2236        int                     blks_per_cluster;
2237        int                     inodes_per_cluster;
2238        int                     nbufs;
2239        int                     i, j;
2240        int                     ioffset;
2241        xfs_daddr_t             blkno;
2242        xfs_buf_t               *bp;
2243        xfs_inode_t             *ip;
2244        xfs_inode_log_item_t    *iip;
2245        xfs_log_item_t          *lip;
2246        struct xfs_perag        *pag;
2247        xfs_ino_t               inum;
2248
2249        inum = xic->first_ino;
2250        pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum));
2251        blks_per_cluster = xfs_icluster_size_fsb(mp);
2252        inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog;
2253        nbufs = mp->m_ialloc_blks / blks_per_cluster;
2254
2255        for (j = 0; j < nbufs; j++, inum += inodes_per_cluster) {
2256                /*
2257                 * The allocation bitmap tells us which inodes of the chunk were
2258                 * physically allocated. Skip the cluster if an inode falls into
2259                 * a sparse region.
2260                 */
2261                ioffset = inum - xic->first_ino;
2262                if ((xic->alloc & XFS_INOBT_MASK(ioffset)) == 0) {
2263                        ASSERT(do_mod(ioffset, inodes_per_cluster) == 0);
2264                        continue;
2265                }
2266
2267                blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum),
2268                                         XFS_INO_TO_AGBNO(mp, inum));
2269
2270                /*
2271                 * We obtain and lock the backing buffer first in the process
2272                 * here, as we have to ensure that any dirty inode that we
2273                 * can't get the flush lock on is attached to the buffer.
2274                 * If we scan the in-memory inodes first, then buffer IO can
2275                 * complete before we get a lock on it, and hence we may fail
2276                 * to mark all the active inodes on the buffer stale.
2277                 */
2278                bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
2279                                        mp->m_bsize * blks_per_cluster,
2280                                        XBF_UNMAPPED);
2281
2282                if (!bp)
2283                        return -ENOMEM;
2284
2285                /*
2286                 * This buffer may not have been correctly initialised as we
2287                 * didn't read it from disk. That's not important because we are
2288                 * only using it to mark the buffer as stale in the log, and to
2289                 * attach stale cached inodes on it. That means it will never be
2290                 * dispatched for IO. If it is, we want to know about it, and we
2291                 * want it to fail. We can achieve this by adding a write
2292                 * verifier to the buffer.
2293                 */
2294                 bp->b_ops = &xfs_inode_buf_ops;
2295
2296                /*
2297                 * Walk the inodes already attached to the buffer and mark them
2298                 * stale. These will all have the flush locks held, so an
2299                 * in-memory inode walk can't lock them. By marking them all
2300                 * stale first, we will not attempt to lock them in the loop
2301                 * below as the XFS_ISTALE flag will be set.
2302                 */
2303                lip = bp->b_fspriv;
2304                while (lip) {
2305                        if (lip->li_type == XFS_LI_INODE) {
2306                                iip = (xfs_inode_log_item_t *)lip;
2307                                ASSERT(iip->ili_logged == 1);
2308                                lip->li_cb = xfs_istale_done;
2309                                xfs_trans_ail_copy_lsn(mp->m_ail,
2310                                                        &iip->ili_flush_lsn,
2311                                                        &iip->ili_item.li_lsn);
2312                                xfs_iflags_set(iip->ili_inode, XFS_ISTALE);
2313                        }
2314                        lip = lip->li_bio_list;
2315                }
2316
2317
2318                /*
2319                 * For each inode in memory attempt to add it to the inode
2320                 * buffer and set it up for being staled on buffer IO
2321                 * completion.  This is safe as we've locked out tail pushing
2322                 * and flushing by locking the buffer.
2323                 *
2324                 * We have already marked every inode that was part of a
2325                 * transaction stale above, which means there is no point in
2326                 * even trying to lock them.
2327                 */
2328                for (i = 0; i < inodes_per_cluster; i++) {
2329retry:
2330                        rcu_read_lock();
2331                        ip = radix_tree_lookup(&pag->pag_ici_root,
2332                                        XFS_INO_TO_AGINO(mp, (inum + i)));
2333
2334                        /* Inode not in memory, nothing to do */
2335                        if (!ip) {
2336                                rcu_read_unlock();
2337                                continue;
2338                        }
2339
2340                        /*
2341                         * because this is an RCU protected lookup, we could
2342                         * find a recently freed or even reallocated inode
2343                         * during the lookup. We need to check under the
2344                         * i_flags_lock for a valid inode here. Skip it if it
2345                         * is not valid, the wrong inode or stale.
2346                         */
2347                        spin_lock(&ip->i_flags_lock);
2348                        if (ip->i_ino != inum + i ||
2349                            __xfs_iflags_test(ip, XFS_ISTALE)) {
2350                                spin_unlock(&ip->i_flags_lock);
2351                                rcu_read_unlock();
2352                                continue;
2353                        }
2354                        spin_unlock(&ip->i_flags_lock);
2355
2356                        /*
2357                         * Don't try to lock/unlock the current inode, but we
2358                         * _cannot_ skip the other inodes that we did not find
2359                         * in the list attached to the buffer and are not
2360                         * already marked stale. If we can't lock it, back off
2361                         * and retry.
2362                         */
2363                        if (ip != free_ip &&
2364                            !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
2365                                rcu_read_unlock();
2366                                delay(1);
2367                                goto retry;
2368                        }
2369                        rcu_read_unlock();
2370
2371                        xfs_iflock(ip);
2372                        xfs_iflags_set(ip, XFS_ISTALE);
2373
2374                        /*
2375                         * we don't need to attach clean inodes or those only
2376                         * with unlogged changes (which we throw away, anyway).
2377                         */
2378                        iip = ip->i_itemp;
2379                        if (!iip || xfs_inode_clean(ip)) {
2380                                ASSERT(ip != free_ip);
2381                                xfs_ifunlock(ip);
2382                                xfs_iunlock(ip, XFS_ILOCK_EXCL);
2383                                continue;
2384                        }
2385
2386                        iip->ili_last_fields = iip->ili_fields;
2387                        iip->ili_fields = 0;
2388                        iip->ili_fsync_fields = 0;
2389                        iip->ili_logged = 1;
2390                        xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
2391                                                &iip->ili_item.li_lsn);
2392
2393                        xfs_buf_attach_iodone(bp, xfs_istale_done,
2394                                                  &iip->ili_item);
2395
2396                        if (ip != free_ip)
2397                                xfs_iunlock(ip, XFS_ILOCK_EXCL);
2398                }
2399
2400                xfs_trans_stale_inode_buf(tp, bp);
2401                xfs_trans_binval(tp, bp);
2402        }
2403
2404        xfs_perag_put(pag);
2405        return 0;
2406}
2407
2408/*
2409 * This is called to return an inode to the inode free list.
2410 * The inode should already be truncated to 0 length and have
2411 * no pages associated with it.  This routine also assumes that
2412 * the inode is already a part of the transaction.
2413 *
2414 * The on-disk copy of the inode will have been added to the list
2415 * of unlinked inodes in the AGI. We need to remove the inode from
2416 * that list atomically with respect to freeing it here.
2417 */
2418int
2419xfs_ifree(
2420        xfs_trans_t     *tp,
2421        xfs_inode_t     *ip,
2422        struct xfs_defer_ops    *dfops)
2423{
2424        int                     error;
2425        struct xfs_icluster     xic = { 0 };
2426
2427        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
2428        ASSERT(VFS_I(ip)->i_nlink == 0);
2429        ASSERT(ip->i_d.di_nextents == 0);
2430        ASSERT(ip->i_d.di_anextents == 0);
2431        ASSERT(ip->i_d.di_size == 0 || !S_ISREG(VFS_I(ip)->i_mode));
2432        ASSERT(ip->i_d.di_nblocks == 0);
2433
2434        /*
2435         * Pull the on-disk inode from the AGI unlinked list.
2436         */
2437        error = xfs_iunlink_remove(tp, ip);
2438        if (error)
2439                return error;
2440
2441        error = xfs_difree(tp, ip->i_ino, dfops, &xic);
2442        if (error)
2443                return error;
2444
2445        VFS_I(ip)->i_mode = 0;          /* mark incore inode as free */
2446        ip->i_d.di_flags = 0;
2447        ip->i_d.di_dmevmask = 0;
2448        ip->i_d.di_forkoff = 0;         /* mark the attr fork not in use */
2449        ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS;
2450        ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
2451        /*
2452         * Bump the generation count so no one will be confused
2453         * by reincarnations of this inode.
2454         */
2455        VFS_I(ip)->i_generation++;
2456        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
2457
2458        if (xic.deleted)
2459                error = xfs_ifree_cluster(ip, tp, &xic);
2460
2461        return error;
2462}
2463
2464/*
2465 * This is called to unpin an inode.  The caller must have the inode locked
2466 * in at least shared mode so that the buffer cannot be subsequently pinned
2467 * once someone is waiting for it to be unpinned.
2468 */
2469static void
2470xfs_iunpin(
2471        struct xfs_inode        *ip)
2472{
2473        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
2474
2475        trace_xfs_inode_unpin_nowait(ip, _RET_IP_);
2476
2477        /* Give the log a push to start the unpinning I/O */
2478        xfs_log_force_lsn(ip->i_mount, ip->i_itemp->ili_last_lsn, 0);
2479
2480}
2481
2482static void
2483__xfs_iunpin_wait(
2484        struct xfs_inode        *ip)
2485{
2486        wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IPINNED_BIT);
2487        DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IPINNED_BIT);
2488
2489        xfs_iunpin(ip);
2490
2491        do {
2492                prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
2493                if (xfs_ipincount(ip))
2494                        io_schedule();
2495        } while (xfs_ipincount(ip));
2496        finish_wait(wq, &wait.wait);
2497}
2498
/*
 * Wait for an inode to be unpinned.  Returns straight away when the pin
 * count is already zero; otherwise defers to __xfs_iunpin_wait().
 */
void
xfs_iunpin_wait(
        struct xfs_inode        *ip)
{
        if (!xfs_ipincount(ip))
                return;

        __xfs_iunpin_wait(ip);
}
2506
2507/*
2508 * Removing an inode from the namespace involves removing the directory entry
2509 * and dropping the link count on the inode. Removing the directory entry can
2510 * result in locking an AGF (directory blocks were freed) and removing a link
2511 * count can result in placing the inode on an unlinked list which results in
2512 * locking an AGI.
2513 *
2514 * The big problem here is that we have an ordering constraint on AGF and AGI
2515 * locking - inode allocation locks the AGI, then can allocate a new extent for
2516 * new inodes, locking the AGF after the AGI. Similarly, freeing the inode
2517 * removes the inode from the unlinked list, requiring that we lock the AGI
2518 * first, and then freeing the inode can result in an inode chunk being freed
2519 * and hence freeing disk space requiring that we lock an AGF.
2520 *
2521 * Hence the ordering that is imposed by other parts of the code is AGI before
2522 * AGF. This means we cannot remove the directory entry before we drop the inode
2523 * reference count and put it on the unlinked list as this results in a lock
2524 * order of AGF then AGI, and this can deadlock against inode allocation and
2525 * freeing. Therefore we must drop the link counts before we remove the
2526 * directory entry.
2527 *
2528 * This is still safe from a transactional point of view - it is not until we
2529 * get to xfs_defer_finish() that we have the possibility of multiple
2530 * transactions in this operation. Hence as long as we remove the directory
2531 * entry and drop the link count in the first transaction of the remove
2532 * operation, there are no transactional constraints on the ordering here.
2533 */
2534int
2535xfs_remove(
2536        xfs_inode_t             *dp,
2537        struct xfs_name         *name,
2538        xfs_inode_t             *ip)
2539{
2540        xfs_mount_t             *mp = dp->i_mount;
2541        xfs_trans_t             *tp = NULL;
2542        int                     is_dir = S_ISDIR(VFS_I(ip)->i_mode);
2543        int                     error = 0;
2544        struct xfs_defer_ops    dfops;
2545        xfs_fsblock_t           first_block;
2546        uint                    resblks;
2547
2548        trace_xfs_remove(dp, name);
2549
2550        if (XFS_FORCED_SHUTDOWN(mp))
2551                return -EIO;
2552
2553        error = xfs_qm_dqattach(dp, 0);
2554        if (error)
2555                goto std_return;
2556
2557        error = xfs_qm_dqattach(ip, 0);
2558        if (error)
2559                goto std_return;
2560
2561        /*
2562         * We try to get the real space reservation first,
2563         * allowing for directory btree deletion(s) implying
2564         * possible bmap insert(s).  If we can't get the space
2565         * reservation then we use 0 instead, and avoid the bmap
2566         * btree insert(s) in the directory code by, if the bmap
2567         * insert tries to happen, instead trimming the LAST
2568         * block from the directory.
2569         */
2570        resblks = XFS_REMOVE_SPACE_RES(mp);
2571        error = xfs_trans_alloc(mp, &M_RES(mp)->tr_remove, resblks, 0, 0, &tp);
2572        if (error == -ENOSPC) {
2573                resblks = 0;
2574                error = xfs_trans_alloc(mp, &M_RES(mp)->tr_remove, 0, 0, 0,
2575                                &tp);
2576        }
2577        if (error) {
2578                ASSERT(error != -ENOSPC);
2579                goto std_return;
2580        }
2581
2582        xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT);
2583        xfs_lock_two_inodes(dp, ip, XFS_ILOCK_EXCL);
2584
2585        xfs_trans_ijoin(tp, dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
2586        xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
2587
2588        /*
2589         * If we're removing a directory perform some additional validation.
2590         */
2591        if (is_dir) {
2592                ASSERT(VFS_I(ip)->i_nlink >= 2);
2593                if (VFS_I(ip)->i_nlink != 2) {
2594                        error = -ENOTEMPTY;
2595                        goto out_trans_cancel;
2596                }
2597                if (!xfs_dir_isempty(ip)) {
2598                        error = -ENOTEMPTY;
2599                        goto out_trans_cancel;
2600                }
2601
2602                /* Drop the link from ip's "..".  */
2603                error = xfs_droplink(tp, dp);
2604                if (error)
2605                        goto out_trans_cancel;
2606
2607                /* Drop the "." link from ip to self.  */
2608                error = xfs_droplink(tp, ip);
2609                if (error)
2610                        goto out_trans_cancel;
2611        } else {
2612                /*
2613                 * When removing a non-directory we need to log the parent
2614                 * inode here.  For a directory this is done implicitly
2615                 * by the xfs_droplink call for the ".." entry.
2616                 */
2617                xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
2618        }
2619        xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2620
2621        /* Drop the link from dp to ip. */
2622        error = xfs_droplink(tp, ip);
2623        if (error)
2624                goto out_trans_cancel;
2625
2626        xfs_defer_init(&dfops, &first_block);
2627        error = xfs_dir_removename(tp, dp, name, ip->i_ino,
2628                                        &first_block, &dfops, resblks);
2629        if (error) {
2630                ASSERT(error != -ENOENT);
2631                goto out_bmap_cancel;
2632        }
2633
2634        /*
2635         * If this is a synchronous mount, make sure that the
2636         * remove transaction goes to disk before returning to
2637         * the user.
2638         */
2639        if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
2640                xfs_trans_set_sync(tp);
2641
2642        error = xfs_defer_finish(&tp, &dfops, NULL);
2643        if (error)
2644                goto out_bmap_cancel;
2645
2646        error = xfs_trans_commit(tp);
2647        if (error)
2648                goto std_return;
2649
2650        if (is_dir && xfs_inode_is_filestream(ip))
2651                xfs_filestream_deassociate(ip);
2652
2653        return 0;
2654
2655 out_bmap_cancel:
2656        xfs_defer_cancel(&dfops);
2657 out_trans_cancel:
2658        xfs_trans_cancel(tp);
2659 std_return:
2660        return error;
2661}
2662
2663/*
2664 * Enter all inodes for a rename transaction into a sorted array.
2665 */
2666#define __XFS_SORT_INODES       5
2667STATIC void
2668xfs_sort_for_rename(
2669        struct xfs_inode        *dp1,   /* in: old (source) directory inode */
2670        struct xfs_inode        *dp2,   /* in: new (target) directory inode */
2671        struct xfs_inode        *ip1,   /* in: inode of old entry */
2672        struct xfs_inode        *ip2,   /* in: inode of new entry */
2673        struct xfs_inode        *wip,   /* in: whiteout inode */
2674        struct xfs_inode        **i_tab,/* out: sorted array of inodes */
2675        int                     *num_inodes)  /* in/out: inodes in array */
2676{
2677        int                     i, j;
2678
2679        ASSERT(*num_inodes == __XFS_SORT_INODES);
2680        memset(i_tab, 0, *num_inodes * sizeof(struct xfs_inode *));
2681
2682        /*
2683         * i_tab contains a list of pointers to inodes.  We initialize
2684         * the table here & we'll sort it.  We will then use it to
2685         * order the acquisition of the inode locks.
2686         *
2687         * Note that the table may contain duplicates.  e.g., dp1 == dp2.
2688         */
2689        i = 0;
2690        i_tab[i++] = dp1;
2691        i_tab[i++] = dp2;
2692        i_tab[i++] = ip1;
2693        if (ip2)
2694                i_tab[i++] = ip2;
2695        if (wip)
2696                i_tab[i++] = wip;
2697        *num_inodes = i;
2698
2699        /*
2700         * Sort the elements via bubble sort.  (Remember, there are at
2701         * most 5 elements to sort, so this is adequate.)
2702         */
2703        for (i = 0; i < *num_inodes; i++) {
2704                for (j = 1; j < *num_inodes; j++) {
2705                        if (i_tab[j]->i_ino < i_tab[j-1]->i_ino) {
2706                                struct xfs_inode *temp = i_tab[j];
2707                                i_tab[j] = i_tab[j-1];
2708                                i_tab[j-1] = temp;
2709                        }
2710                }
2711        }
2712}
2713
2714static int
2715xfs_finish_rename(
2716        struct xfs_trans        *tp,
2717        struct xfs_defer_ops    *dfops)
2718{
2719        int                     error;
2720
2721        /*
2722         * If this is a synchronous mount, make sure that the rename transaction
2723         * goes to disk before returning to the user.
2724         */
2725        if (tp->t_mountp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
2726                xfs_trans_set_sync(tp);
2727
2728        error = xfs_defer_finish(&tp, dfops, NULL);
2729        if (error) {
2730                xfs_defer_cancel(dfops);
2731                xfs_trans_cancel(tp);
2732                return error;
2733        }
2734
2735        return xfs_trans_commit(tp);
2736}
2737
2738/*
2739 * xfs_cross_rename()
2740 *
2741 * responsible for handling RENAME_EXCHANGE flag in renameat2() sytemcall
2742 */
2743STATIC int
2744xfs_cross_rename(
2745        struct xfs_trans        *tp,
2746        struct xfs_inode        *dp1,
2747        struct xfs_name         *name1,
2748        struct xfs_inode        *ip1,
2749        struct xfs_inode        *dp2,
2750        struct xfs_name         *name2,
2751        struct xfs_inode        *ip2,
2752        struct xfs_defer_ops    *dfops,
2753        xfs_fsblock_t           *first_block,
2754        int                     spaceres)
2755{
2756        int             error = 0;
2757        int             ip1_flags = 0;
2758        int             ip2_flags = 0;
2759        int             dp2_flags = 0;
2760
2761        /* Swap inode number for dirent in first parent */
2762        error = xfs_dir_replace(tp, dp1, name1,
2763                                ip2->i_ino,
2764                                first_block, dfops, spaceres);
2765        if (error)
2766                goto out_trans_abort;
2767
2768        /* Swap inode number for dirent in second parent */
2769        error = xfs_dir_replace(tp, dp2, name2,
2770                                ip1->i_ino,
2771                                first_block, dfops, spaceres);
2772        if (error)
2773                goto out_trans_abort;
2774
2775        /*
2776         * If we're renaming one or more directories across different parents,
2777         * update the respective ".." entries (and link counts) to match the new
2778         * parents.
2779         */
2780        if (dp1 != dp2) {
2781                dp2_flags = XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
2782
2783                if (S_ISDIR(VFS_I(ip2)->i_mode)) {
2784                        error = xfs_dir_replace(tp, ip2, &xfs_name_dotdot,
2785                                                dp1->i_ino, first_block,
2786                                                dfops, spaceres);
2787                        if (error)
2788                                goto out_trans_abort;
2789
2790                        /* transfer ip2 ".." reference to dp1 */
2791                        if (!S_ISDIR(VFS_I(ip1)->i_mode)) {
2792                                error = xfs_droplink(tp, dp2);
2793                                if (error)
2794                                        goto out_trans_abort;
2795                                error = xfs_bumplink(tp, dp1);
2796                                if (error)
2797                                        goto out_trans_abort;
2798                        }
2799
2800                        /*
2801                         * Although ip1 isn't changed here, userspace needs
2802                         * to be warned about the change, so that applications
2803                         * relying on it (like backup ones), will properly
2804                         * notify the change
2805                         */
2806                        ip1_flags |= XFS_ICHGTIME_CHG;
2807                        ip2_flags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
2808                }
2809
2810                if (S_ISDIR(VFS_I(ip1)->i_mode)) {
2811                        error = xfs_dir_replace(tp, ip1, &xfs_name_dotdot,
2812                                                dp2->i_ino, first_block,
2813                                                dfops, spaceres);
2814                        if (error)
2815                                goto out_trans_abort;
2816
2817                        /* transfer ip1 ".." reference to dp2 */
2818                        if (!S_ISDIR(VFS_I(ip2)->i_mode)) {
2819                                error = xfs_droplink(tp, dp1);
2820                                if (error)
2821                                        goto out_trans_abort;
2822                                error = xfs_bumplink(tp, dp2);
2823                                if (error)
2824                                        goto out_trans_abort;
2825                        }
2826
2827                        /*
2828                         * Although ip2 isn't changed here, userspace needs
2829                         * to be warned about the change, so that applications
2830                         * relying on it (like backup ones), will properly
2831                         * notify the change
2832                         */
2833                        ip1_flags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
2834                        ip2_flags |= XFS_ICHGTIME_CHG;
2835                }
2836        }
2837
2838        if (ip1_flags) {
2839                xfs_trans_ichgtime(tp, ip1, ip1_flags);
2840                xfs_trans_log_inode(tp, ip1, XFS_ILOG_CORE);
2841        }
2842        if (ip2_flags) {
2843                xfs_trans_ichgtime(tp, ip2, ip2_flags);
2844                xfs_trans_log_inode(tp, ip2, XFS_ILOG_CORE);
2845        }
2846        if (dp2_flags) {
2847                xfs_trans_ichgtime(tp, dp2, dp2_flags);
2848                xfs_trans_log_inode(tp, dp2, XFS_ILOG_CORE);
2849        }
2850        xfs_trans_ichgtime(tp, dp1, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2851        xfs_trans_log_inode(tp, dp1, XFS_ILOG_CORE);
2852        return xfs_finish_rename(tp, dfops);
2853
2854out_trans_abort:
2855        xfs_defer_cancel(dfops);
2856        xfs_trans_cancel(tp);
2857        return error;
2858}
2859
2860/*
2861 * xfs_rename_alloc_whiteout()
2862 *
2863 * Return a referenced, unlinked, unlocked inode that that can be used as a
2864 * whiteout in a rename transaction. We use a tmpfile inode here so that if we
2865 * crash between allocating the inode and linking it into the rename transaction
2866 * recovery will free the inode and we won't leak it.
2867 */
2868static int
2869xfs_rename_alloc_whiteout(
2870        struct xfs_inode        *dp,
2871        struct xfs_inode        **wip)
2872{
2873        struct xfs_inode        *tmpfile;
2874        int                     error;
2875
2876        error = xfs_create_tmpfile(dp, NULL, S_IFCHR | WHITEOUT_MODE, &tmpfile);
2877        if (error)
2878                return error;
2879
2880        /*
2881         * Prepare the tmpfile inode as if it were created through the VFS.
2882         * Otherwise, the link increment paths will complain about nlink 0->1.
2883         * Drop the link count as done by d_tmpfile(), complete the inode setup
2884         * and flag it as linkable.
2885         */
2886        drop_nlink(VFS_I(tmpfile));
2887        xfs_setup_iops(tmpfile);
2888        xfs_finish_inode_setup(tmpfile);
2889        VFS_I(tmpfile)->i_state |= I_LINKABLE;
2890
2891        *wip = tmpfile;
2892        return 0;
2893}
2894
2895/*
2896 * xfs_rename
2897 */
2898int
2899xfs_rename(
2900        struct xfs_inode        *src_dp,
2901        struct xfs_name         *src_name,
2902        struct xfs_inode        *src_ip,
2903        struct xfs_inode        *target_dp,
2904        struct xfs_name         *target_name,
2905        struct xfs_inode        *target_ip,
2906        unsigned int            flags)
2907{
2908        struct xfs_mount        *mp = src_dp->i_mount;
2909        struct xfs_trans        *tp;
2910        struct xfs_defer_ops    dfops;
2911        xfs_fsblock_t           first_block;
2912        struct xfs_inode        *wip = NULL;            /* whiteout inode */
2913        struct xfs_inode        *inodes[__XFS_SORT_INODES];
2914        int                     num_inodes = __XFS_SORT_INODES;
2915        bool                    new_parent = (src_dp != target_dp);
2916        bool                    src_is_directory = S_ISDIR(VFS_I(src_ip)->i_mode);
2917        int                     spaceres;
2918        int                     error;
2919
2920        trace_xfs_rename(src_dp, target_dp, src_name, target_name);
2921
2922        if ((flags & RENAME_EXCHANGE) && !target_ip)
2923                return -EINVAL;
2924
2925        /*
2926         * If we are doing a whiteout operation, allocate the whiteout inode
2927         * we will be placing at the target and ensure the type is set
2928         * appropriately.
2929         */
2930        if (flags & RENAME_WHITEOUT) {
2931                ASSERT(!(flags & (RENAME_NOREPLACE | RENAME_EXCHANGE)));
2932                error = xfs_rename_alloc_whiteout(target_dp, &wip);
2933                if (error)
2934                        return error;
2935
2936                /* setup target dirent info as whiteout */
2937                src_name->type = XFS_DIR3_FT_CHRDEV;
2938        }
2939
2940        xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip, wip,
2941                                inodes, &num_inodes);
2942
2943        spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len);
2944        error = xfs_trans_alloc(mp, &M_RES(mp)->tr_rename, spaceres, 0, 0, &tp);
2945        if (error == -ENOSPC) {
2946                spaceres = 0;
2947                error = xfs_trans_alloc(mp, &M_RES(mp)->tr_rename, 0, 0, 0,
2948                                &tp);
2949        }
2950        if (error)
2951                goto out_release_wip;
2952
2953        /*
2954         * Attach the dquots to the inodes
2955         */
2956        error = xfs_qm_vop_rename_dqattach(inodes);
2957        if (error)
2958                goto out_trans_cancel;
2959
2960        /*
2961         * Lock all the participating inodes. Depending upon whether
2962         * the target_name exists in the target directory, and
2963         * whether the target directory is the same as the source
2964         * directory, we can lock from 2 to 4 inodes.
2965         */
2966        if (!new_parent)
2967                xfs_ilock(src_dp, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT);
2968        else
2969                xfs_lock_two_inodes(src_dp, target_dp,
2970                                    XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT);
2971
2972        xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL);
2973
2974        /*
2975         * Join all the inodes to the transaction. From this point on,
2976         * we can rely on either trans_commit or trans_cancel to unlock
2977         * them.
2978         */
2979        xfs_trans_ijoin(tp, src_dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
2980        if (new_parent)
2981                xfs_trans_ijoin(tp, target_dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
2982        xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL);
2983        if (target_ip)
2984                xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL);
2985        if (wip)
2986                xfs_trans_ijoin(tp, wip, XFS_ILOCK_EXCL);
2987
2988        /*
2989         * If we are using project inheritance, we only allow renames
2990         * into our tree when the project IDs are the same; else the
2991         * tree quota mechanism would be circumvented.
2992         */
2993        if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
2994                     (xfs_get_projid(target_dp) != xfs_get_projid(src_ip)))) {
2995                error = -EXDEV;
2996                goto out_trans_cancel;
2997        }
2998
2999        xfs_defer_init(&dfops, &first_block);
3000
3001        /* RENAME_EXCHANGE is unique from here on. */
3002        if (flags & RENAME_EXCHANGE)
3003                return xfs_cross_rename(tp, src_dp, src_name, src_ip,
3004                                        target_dp, target_name, target_ip,
3005                                        &dfops, &first_block, spaceres);
3006
3007        /*
3008         * Set up the target.
3009         */
3010        if (target_ip == NULL) {
3011                /*
3012                 * If there's no space reservation, check the entry will
3013                 * fit before actually inserting it.
3014                 */
3015                if (!spaceres) {
3016                        error = xfs_dir_canenter(tp, target_dp, target_name);
3017                        if (error)
3018                                goto out_trans_cancel;
3019                }
3020                /*
3021                 * If target does not exist and the rename crosses
3022                 * directories, adjust the target directory link count
3023                 * to account for the ".." reference from the new entry.
3024                 */
3025                error = xfs_dir_createname(tp, target_dp, target_name,
3026                                                src_ip->i_ino, &first_block,
3027                                                &dfops, spaceres);
3028                if (error)
3029                        goto out_bmap_cancel;
3030
3031                xfs_trans_ichgtime(tp, target_dp,
3032                                        XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
3033
3034                if (new_parent && src_is_directory) {
3035                        error = xfs_bumplink(tp, target_dp);
3036                        if (error)
3037                                goto out_bmap_cancel;
3038                }
3039        } else { /* target_ip != NULL */
3040                /*
3041                 * If target exists and it's a directory, check that both
3042                 * target and source are directories and that target can be
3043                 * destroyed, or that neither is a directory.
3044                 */
3045                if (S_ISDIR(VFS_I(target_ip)->i_mode)) {
3046                        /*
3047                         * Make sure target dir is empty.
3048                         */
3049                        if (!(xfs_dir_isempty(target_ip)) ||
3050                            (VFS_I(target_ip)->i_nlink > 2)) {
3051                                error = -EEXIST;
3052                                goto out_trans_cancel;
3053                        }
3054                }
3055
3056                /*
3057                 * Link the source inode under the target name.
3058                 * If the source inode is a directory and we are moving
3059                 * it across directories, its ".." entry will be
3060                 * inconsistent until we replace that down below.
3061                 *
3062                 * In case there is already an entry with the same
3063                 * name at the destination directory, remove it first.
3064                 */
3065                error = xfs_dir_replace(tp, target_dp, target_name,
3066                                        src_ip->i_ino,
3067                                        &first_block, &dfops, spaceres);
3068                if (error)
3069                        goto out_bmap_cancel;
3070
3071                xfs_trans_ichgtime(tp, target_dp,
3072                                        XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
3073
3074                /*
3075                 * Decrement the link count on the target since the target
3076                 * dir no longer points to it.
3077                 */
3078                error = xfs_droplink(tp, target_ip);
3079                if (error)
3080                        goto out_bmap_cancel;
3081
3082                if (src_is_directory) {
3083                        /*
3084                         * Drop the link from the old "." entry.
3085                         */
3086                        error = xfs_droplink(tp, target_ip);
3087                        if (error)
3088                                goto out_bmap_cancel;
3089                }
3090        } /* target_ip != NULL */
3091
3092        /*
3093         * Remove the source.
3094         */
3095        if (new_parent && src_is_directory) {
3096                /*
3097                 * Rewrite the ".." entry to point to the new
3098                 * directory.
3099                 */
3100                error = xfs_dir_replace(tp, src_ip, &xfs_name_dotdot,
3101                                        target_dp->i_ino,
3102                                        &first_block, &dfops, spaceres);
3103                ASSERT(error != -EEXIST);
3104                if (error)
3105                        goto out_bmap_cancel;
3106        }
3107
3108        /*
3109         * We always want to hit the ctime on the source inode.
3110         *
3111         * This isn't strictly required by the standards since the source
3112         * inode isn't really being changed, but old unix file systems did
3113         * it and some incremental backup programs won't work without it.
3114         */
3115        xfs_trans_ichgtime(tp, src_ip, XFS_ICHGTIME_CHG);
3116        xfs_trans_log_inode(tp, src_ip, XFS_ILOG_CORE);
3117
3118        /*
3119         * Adjust the link count on src_dp.  This is necessary when
3120         * renaming a directory, either within one parent when
3121         * the target existed, or across two parent directories.
3122         */
3123        if (src_is_directory && (new_parent || target_ip != NULL)) {
3124
3125                /*
3126                 * Decrement link count on src_directory since the
3127                 * entry that's moved no longer points to it.
3128                 */
3129                error = xfs_droplink(tp, src_dp);
3130                if (error)
3131                        goto out_bmap_cancel;
3132        }
3133
3134        /*
3135         * For whiteouts, we only need to update the source dirent with the
3136         * inode number of the whiteout inode rather than removing it
3137         * altogether.
3138         */
3139        if (wip) {
3140                error = xfs_dir_replace(tp, src_dp, src_name, wip->i_ino,
3141                                        &first_block, &dfops, spaceres);
3142        } else
3143                error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino,
3144                                           &first_block, &dfops, spaceres);
3145        if (error)
3146                goto out_bmap_cancel;
3147
3148        /*
3149         * For whiteouts, we need to bump the link count on the whiteout inode.
3150         * This means that failures all the way up to this point leave the inode
3151         * on the unlinked list and so cleanup is a simple matter of dropping
3152         * the remaining reference to it. If we fail here after bumping the link
3153         * count, we're shutting down the filesystem so we'll never see the
3154         * intermediate state on disk.
3155         */
3156        if (wip) {
3157                ASSERT(VFS_I(wip)->i_nlink == 0);
3158                error = xfs_bumplink(tp, wip);
3159                if (error)
3160                        goto out_bmap_cancel;
3161                error = xfs_iunlink_remove(tp, wip);
3162                if (error)
3163                        goto out_bmap_cancel;
3164                xfs_trans_log_inode(tp, wip, XFS_ILOG_CORE);
3165
3166                /*
3167                 * Now we have a real link, clear the "I'm a tmpfile" state
3168                 * flag from the inode so it doesn't accidentally get misused in
3169                 * future.
3170                 */
3171                VFS_I(wip)->i_state &= ~I_LINKABLE;
3172        }
3173
3174        xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
3175        xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE);
3176        if (new_parent)
3177                xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE);
3178
3179        error = xfs_finish_rename(tp, &dfops);
3180        if (wip)
3181                IRELE(wip);
3182        return error;
3183
3184out_bmap_cancel:
3185        xfs_defer_cancel(&dfops);
3186out_trans_cancel:
3187        xfs_trans_cancel(tp);
3188out_release_wip:
3189        if (wip)
3190                IRELE(wip);
3191        return error;
3192}
3193
3194STATIC int
3195xfs_iflush_cluster(
3196        struct xfs_inode        *ip,
3197        struct xfs_buf          *bp)
3198{
3199        struct xfs_mount        *mp = ip->i_mount;
3200        struct xfs_perag        *pag;
3201        unsigned long           first_index, mask;
3202        unsigned long           inodes_per_cluster;
3203        int                     cilist_size;
3204        struct xfs_inode        **cilist;
3205        struct xfs_inode        *cip;
3206        int                     nr_found;
3207        int                     clcount = 0;
3208        int                     bufwasdelwri;
3209        int                     i;
3210
3211        pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
3212
3213        inodes_per_cluster = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog;
3214        cilist_size = inodes_per_cluster * sizeof(xfs_inode_t *);
3215        cilist = kmem_alloc(cilist_size, KM_MAYFAIL|KM_NOFS);
3216        if (!cilist)
3217                goto out_put;
3218
3219        mask = ~(((mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog)) - 1);
3220        first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask;
3221        rcu_read_lock();
3222        /* really need a gang lookup range call here */
3223        nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)cilist,
3224                                        first_index, inodes_per_cluster);
3225        if (nr_found == 0)
3226                goto out_free;
3227
3228        for (i = 0; i < nr_found; i++) {
3229                cip = cilist[i];
3230                if (cip == ip)
3231                        continue;
3232
3233                /*
3234                 * because this is an RCU protected lookup, we could find a
3235                 * recently freed or even reallocated inode during the lookup.
3236                 * We need to check under the i_flags_lock for a valid inode
3237                 * here. Skip it if it is not valid or the wrong inode.
3238                 */
3239                spin_lock(&cip->i_flags_lock);
3240                if (!cip->i_ino ||
3241                    __xfs_iflags_test(cip, XFS_ISTALE)) {
3242                        spin_unlock(&cip->i_flags_lock);
3243                        continue;
3244                }
3245
3246                /*
3247                 * Once we fall off the end of the cluster, no point checking
3248                 * any more inodes in the list because they will also all be
3249                 * outside the cluster.
3250                 */
3251                if ((XFS_INO_TO_AGINO(mp, cip->i_ino) & mask) != first_index) {
3252                        spin_unlock(&cip->i_flags_lock);
3253                        break;
3254                }
3255                spin_unlock(&cip->i_flags_lock);
3256
3257                /*
3258                 * Do an un-protected check to see if the inode is dirty and
3259                 * is a candidate for flushing.  These checks will be repeated
3260                 * later after the appropriate locks are acquired.
3261                 */
3262                if (xfs_inode_clean(cip) && xfs_ipincount(cip) == 0)
3263                        continue;
3264
3265                /*
3266                 * Try to get locks.  If any are unavailable or it is pinned,
3267                 * then this inode cannot be flushed and is skipped.
3268                 */
3269
3270                if (!xfs_ilock_nowait(cip, XFS_ILOCK_SHARED))
3271                        continue;
3272                if (!xfs_iflock_nowait(cip)) {
3273                        xfs_iunlock(cip, XFS_ILOCK_SHARED);
3274                        continue;
3275                }
3276                if (xfs_ipincount(cip)) {
3277                        xfs_ifunlock(cip);
3278                        xfs_iunlock(cip, XFS_ILOCK_SHARED);
3279                        continue;
3280                }
3281
3282
3283                /*
3284                 * Check the inode number again, just to be certain we are not
3285                 * racing with freeing in xfs_reclaim_inode(). See the comments
3286                 * in that function for more information as to why the initial
3287                 * check is not sufficient.
3288                 */
3289                if (!cip->i_ino) {
3290                        xfs_ifunlock(cip);
3291                        xfs_iunlock(cip, XFS_ILOCK_SHARED);
3292                        continue;
3293                }
3294
3295                /*
3296                 * arriving here means that this inode can be flushed.  First
3297                 * re-check that it's dirty before flushing.
3298                 */
3299                if (!xfs_inode_clean(cip)) {
3300                        int     error;
3301                        error = xfs_iflush_int(cip, bp);
3302                        if (error) {
3303                                xfs_iunlock(cip, XFS_ILOCK_SHARED);
3304                                goto cluster_corrupt_out;
3305                        }
3306                        clcount++;
3307                } else {
3308                        xfs_ifunlock(cip);
3309                }
3310                xfs_iunlock(cip, XFS_ILOCK_SHARED);
3311        }
3312
3313        if (clcount) {
3314                XFS_STATS_INC(mp, xs_icluster_flushcnt);
3315                XFS_STATS_ADD(mp, xs_icluster_flushinode, clcount);
3316        }
3317
3318out_free:
3319        rcu_read_unlock();
3320        kmem_free(cilist);
3321out_put:
3322        xfs_perag_put(pag);
3323        return 0;
3324
3325
3326cluster_corrupt_out:
3327        /*
3328         * Corruption detected in the clustering loop.  Invalidate the
3329         * inode buffer and shut down the filesystem.
3330         */
3331        rcu_read_unlock();
3332        /*
3333         * Clean up the buffer.  If it was delwri, just release it --
3334         * brelse can handle it with no problems.  If not, shut down the
3335         * filesystem before releasing the buffer.
3336         */
3337        bufwasdelwri = (bp->b_flags & _XBF_DELWRI_Q);
3338        if (bufwasdelwri)
3339                xfs_buf_relse(bp);
3340
3341        xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
3342
3343        if (!bufwasdelwri) {
3344                /*
3345                 * Just like incore_relse: if we have b_iodone functions,
3346                 * mark the buffer as an error and call them.  Otherwise
3347                 * mark it as stale and brelse.
3348                 */
3349                if (bp->b_iodone) {
3350                        bp->b_flags &= ~XBF_DONE;
3351                        xfs_buf_stale(bp);
3352                        xfs_buf_ioerror(bp, -EIO);
3353                        xfs_buf_ioend(bp);
3354                } else {
3355                        xfs_buf_stale(bp);
3356                        xfs_buf_relse(bp);
3357                }
3358        }
3359
3360        /*
3361         * Unlocks the flush lock
3362         */
3363        xfs_iflush_abort(cip, false);
3364        kmem_free(cilist);
3365        xfs_perag_put(pag);
3366        return -EFSCORRUPTED;
3367}
3368
3369/*
3370 * Flush dirty inode metadata into the backing buffer.
3371 *
3372 * The caller must have the inode lock and the inode flush lock held.  The
3373 * inode lock will still be held upon return to the caller, and the inode
3374 * flush lock will be released after the inode has reached the disk.
3375 *
3376 * The caller must write out the buffer returned in *bpp and release it.
3377 */
3378int
3379xfs_iflush(
3380        struct xfs_inode        *ip,
3381        struct xfs_buf          **bpp)
3382{
3383        struct xfs_mount        *mp = ip->i_mount;
3384        struct xfs_buf          *bp = NULL;
3385        struct xfs_dinode       *dip;
3386        int                     error;
3387
3388        XFS_STATS_INC(mp, xs_iflush_count);
3389
3390        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
3391        ASSERT(xfs_isiflocked(ip));
3392        ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
3393               ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
3394
3395        *bpp = NULL;
3396
3397        xfs_iunpin_wait(ip);
3398
3399        /*
3400         * For stale inodes we cannot rely on the backing buffer remaining
3401         * stale in cache for the remaining life of the stale inode and so
3402         * xfs_imap_to_bp() below may give us a buffer that no longer contains
3403         * inodes below. We have to check this after ensuring the inode is
3404         * unpinned so that it is safe to reclaim the stale inode after the
3405         * flush call.
3406         */
3407        if (xfs_iflags_test(ip, XFS_ISTALE)) {
3408                xfs_ifunlock(ip);
3409                return 0;
3410        }
3411
3412        /*
3413         * This may have been unpinned because the filesystem is shutting
3414         * down forcibly. If that's the case we must not write this inode
3415         * to disk, because the log record didn't make it to disk.
3416         *
3417         * We also have to remove the log item from the AIL in this case,
3418         * as we wait for an empty AIL as part of the unmount process.
3419         */
3420        if (XFS_FORCED_SHUTDOWN(mp)) {
3421                error = -EIO;
3422                goto abort_out;
3423        }
3424
3425        /*
3426         * Get the buffer containing the on-disk inode. We are doing a try-lock
3427         * operation here, so we may get  an EAGAIN error. In that case, we
3428         * simply want to return with the inode still dirty.
3429         *
3430         * If we get any other error, we effectively have a corruption situation
3431         * and we cannot flush the inode, so we treat it the same as failing
3432         * xfs_iflush_int().
3433         */
3434        error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &bp, XBF_TRYLOCK,
3435                               0);
3436        if (error == -EAGAIN) {
3437                xfs_ifunlock(ip);
3438                return error;
3439        }
3440        if (error)
3441                goto corrupt_out;
3442
3443        /*
3444         * First flush out the inode that xfs_iflush was called with.
3445         */
3446        error = xfs_iflush_int(ip, bp);
3447        if (error)
3448                goto corrupt_out;
3449
3450        /*
3451         * If the buffer is pinned then push on the log now so we won't
3452         * get stuck waiting in the write for too long.
3453         */
3454        if (xfs_buf_ispinned(bp))
3455                xfs_log_force(mp, 0);
3456
3457        /*
3458         * inode clustering:
3459         * see if other inodes can be gathered into this write
3460         */
3461        error = xfs_iflush_cluster(ip, bp);
3462        if (error)
3463                goto cluster_corrupt_out;
3464
3465        *bpp = bp;
3466        return 0;
3467
3468corrupt_out:
3469        if (bp)
3470                xfs_buf_relse(bp);
3471        xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
3472cluster_corrupt_out:
3473        error = -EFSCORRUPTED;
3474abort_out:
3475        /*
3476         * Unlocks the flush lock
3477         */
3478        xfs_iflush_abort(ip, false);
3479        return error;
3480}
3481
3482STATIC int
3483xfs_iflush_int(
3484        struct xfs_inode        *ip,
3485        struct xfs_buf          *bp)
3486{
3487        struct xfs_inode_log_item *iip = ip->i_itemp;
3488        struct xfs_dinode       *dip;
3489        struct xfs_mount        *mp = ip->i_mount;
3490
3491        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
3492        ASSERT(xfs_isiflocked(ip));
3493        ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
3494               ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
3495        ASSERT(iip != NULL && iip->ili_fields != 0);
3496        ASSERT(ip->i_d.di_version > 1);
3497
3498        /* set *dip = inode's place in the buffer */
3499        dip = xfs_buf_offset(bp, ip->i_imap.im_boffset);
3500
3501        if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC),
3502                               mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) {
3503                xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
3504                        "%s: Bad inode %Lu magic number 0x%x, ptr 0x%p",
3505                        __func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip);
3506                goto corrupt_out;
3507        }
3508        if (S_ISREG(VFS_I(ip)->i_mode)) {
3509                if (XFS_TEST_ERROR(
3510                    (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) &&
3511                    (ip->i_d.di_format != XFS_DINODE_FMT_BTREE),
3512                    mp, XFS_ERRTAG_IFLUSH_3, XFS_RANDOM_IFLUSH_3)) {
3513                        xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
3514                                "%s: Bad regular inode %Lu, ptr 0x%p",
3515                                __func__, ip->i_ino, ip);
3516                        goto corrupt_out;
3517                }
3518        } else if (S_ISDIR(VFS_I(ip)->i_mode)) {
3519                if (XFS_TEST_ERROR(
3520                    (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) &&
3521                    (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) &&
3522                    (ip->i_d.di_format != XFS_DINODE_FMT_LOCAL),
3523                    mp, XFS_ERRTAG_IFLUSH_4, XFS_RANDOM_IFLUSH_4)) {
3524                        xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
3525                                "%s: Bad directory inode %Lu, ptr 0x%p",
3526                                __func__, ip->i_ino, ip);
3527                        goto corrupt_out;
3528                }
3529        }
3530        if (XFS_TEST_ERROR(ip->i_d.di_nextents + ip->i_d.di_anextents >
3531                                ip->i_d.di_nblocks, mp, XFS_ERRTAG_IFLUSH_5,
3532                                XFS_RANDOM_IFLUSH_5)) {
3533                xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
3534                        "%s: detected corrupt incore inode %Lu, "
3535                        "total extents = %d, nblocks = %Ld, ptr 0x%p",
3536                        __func__, ip->i_ino,
3537                        ip->i_d.di_nextents + ip->i_d.di_anextents,
3538                        ip->i_d.di_nblocks, ip);
3539                goto corrupt_out;
3540        }
3541        if (XFS_TEST_ERROR(ip->i_d.di_forkoff > mp->m_sb.sb_inodesize,
3542                                mp, XFS_ERRTAG_IFLUSH_6, XFS_RANDOM_IFLUSH_6)) {
3543                xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
3544                        "%s: bad inode %Lu, forkoff 0x%x, ptr 0x%p",
3545                        __func__, ip->i_ino, ip->i_d.di_forkoff, ip);
3546                goto corrupt_out;
3547        }
3548
3549        /*
3550         * Inode item log recovery for v2 inodes are dependent on the
3551         * di_flushiter count for correct sequencing. We bump the flush
3552         * iteration count so we can detect flushes which postdate a log record
3553         * during recovery. This is redundant as we now log every change and
3554         * hence this can't happen but we need to still do it to ensure
3555         * backwards compatibility with old kernels that predate logging all
3556         * inode changes.
3557         */
3558        if (ip->i_d.di_version < 3)
3559                ip->i_d.di_flushiter++;
3560
3561        /*
3562         * Copy the dirty parts of the inode into the on-disk inode.  We always
3563         * copy out the core of the inode, because if the inode is dirty at all
3564         * the core must be.
3565         */
3566        xfs_inode_to_disk(ip, dip, iip->ili_item.li_lsn);
3567
3568        /* Wrap, we never let the log put out DI_MAX_FLUSH */
3569        if (ip->i_d.di_flushiter == DI_MAX_FLUSH)
3570                ip->i_d.di_flushiter = 0;
3571
3572        xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK);
3573        if (XFS_IFORK_Q(ip))
3574                xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK);
3575        xfs_inobp_check(mp, bp);
3576
3577        /*
3578         * We've recorded everything logged in the inode, so we'd like to clear
3579         * the ili_fields bits so we don't log and flush things unnecessarily.
3580         * However, we can't stop logging all this information until the data
3581         * we've copied into the disk buffer is written to disk.  If we did we
3582         * might overwrite the copy of the inode in the log with all the data
3583         * after re-logging only part of it, and in the face of a crash we
3584         * wouldn't have all the data we need to recover.
3585         *
3586         * What we do is move the bits to the ili_last_fields field.  When
3587         * logging the inode, these bits are moved back to the ili_fields field.
3588         * In the xfs_iflush_done() routine we clear ili_last_fields, since we
3589         * know that the information those bits represent is permanently on
3590         * disk.  As long as the flush completes before the inode is logged
3591         * again, then both ili_fields and ili_last_fields will be cleared.
3592         *
3593         * We can play with the ili_fields bits here, because the inode lock
3594         * must be held exclusively in order to set bits there and the flush
3595         * lock protects the ili_last_fields bits.  Set ili_logged so the flush
3596         * done routine can tell whether or not to look in the AIL.  Also, store
3597         * the current LSN of the inode so that we can tell whether the item has
3598         * moved in the AIL from xfs_iflush_done().  In order to read the lsn we
3599         * need the AIL lock, because it is a 64 bit value that cannot be read
3600         * atomically.
3601         */
3602        iip->ili_last_fields = iip->ili_fields;
3603        iip->ili_fields = 0;
3604        iip->ili_fsync_fields = 0;
3605        iip->ili_logged = 1;
3606
3607        xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
3608                                &iip->ili_item.li_lsn);
3609
3610        /*
3611         * Attach the function xfs_iflush_done to the inode's
3612         * buffer.  This will remove the inode from the AIL
3613         * and unlock the inode's flush lock when the inode is
3614         * completely written to disk.
3615         */
3616        xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item);
3617
3618        /* generate the checksum. */
3619        xfs_dinode_calc_crc(mp, dip);
3620
3621        ASSERT(bp->b_fspriv != NULL);
3622        ASSERT(bp->b_iodone != NULL);
3623        return 0;
3624
3625corrupt_out:
3626        return -EFSCORRUPTED;
3627}
3628