linux/fs/xfs/xfs_inode.c
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
   4 * All Rights Reserved.
   5 */
   6#include <linux/iversion.h>
   7
   8#include "xfs.h"
   9#include "xfs_fs.h"
  10#include "xfs_shared.h"
  11#include "xfs_format.h"
  12#include "xfs_log_format.h"
  13#include "xfs_trans_resv.h"
  14#include "xfs_mount.h"
  15#include "xfs_defer.h"
  16#include "xfs_inode.h"
  17#include "xfs_dir2.h"
  18#include "xfs_attr.h"
  19#include "xfs_trans_space.h"
  20#include "xfs_trans.h"
  21#include "xfs_buf_item.h"
  22#include "xfs_inode_item.h"
  23#include "xfs_ialloc.h"
  24#include "xfs_bmap.h"
  25#include "xfs_bmap_util.h"
  26#include "xfs_errortag.h"
  27#include "xfs_error.h"
  28#include "xfs_quota.h"
  29#include "xfs_filestream.h"
  30#include "xfs_trace.h"
  31#include "xfs_icache.h"
  32#include "xfs_symlink.h"
  33#include "xfs_trans_priv.h"
  34#include "xfs_log.h"
  35#include "xfs_bmap_btree.h"
  36#include "xfs_reflink.h"
  37#include "xfs_ag.h"
  38
  39kmem_zone_t *xfs_inode_zone;
  40
  41/*
  42 * Used in xfs_itruncate_extents().  This is the maximum number of extents
  43 * freed from a file in a single transaction.
  44 */
  45#define XFS_ITRUNC_MAX_EXTENTS  2
  46
  47STATIC int xfs_iunlink(struct xfs_trans *, struct xfs_inode *);
  48STATIC int xfs_iunlink_remove(struct xfs_trans *tp, struct xfs_perag *pag,
  49        struct xfs_inode *);
  50
  51/*
  52 * helper function to extract extent size hint from inode
  53 */
  54xfs_extlen_t
  55xfs_get_extsz_hint(
  56        struct xfs_inode        *ip)
  57{
  58        /*
  59         * No point in aligning allocations if we need to COW to actually
  60         * write to them.
  61         */
  62        if (xfs_is_always_cow_inode(ip))
  63                return 0;
  64        if ((ip->i_diflags & XFS_DIFLAG_EXTSIZE) && ip->i_extsize)
  65                return ip->i_extsize;
  66        if (XFS_IS_REALTIME_INODE(ip))
  67                return ip->i_mount->m_sb.sb_rextsize;
  68        return 0;
  69}
  70
  71/*
  72 * Helper function to extract CoW extent size hint from inode.
  73 * Between the extent size hint and the CoW extent size hint, we
  74 * return the greater of the two.  If the value is zero (automatic),
  75 * use the default size.
  76 */
  77xfs_extlen_t
  78xfs_get_cowextsz_hint(
  79        struct xfs_inode        *ip)
  80{
  81        xfs_extlen_t            a, b;
  82
  83        a = 0;
  84        if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE)
  85                a = ip->i_cowextsize;
  86        b = xfs_get_extsz_hint(ip);
  87
  88        a = max(a, b);
  89        if (a == 0)
  90                return XFS_DEFAULT_COWEXTSZ_HINT;
  91        return a;
  92}
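
/*
 * Worked example (illustrative only): with XFS_DIFLAG_EXTSIZE set and
 * i_extsize = 16 but no explicit CoW hint, this returns 16 because the
 * larger of the two hints wins; with neither hint set it falls back to
 * XFS_DEFAULT_COWEXTSZ_HINT.
 */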
  93
  94/*
  95 * These two are wrapper routines around the xfs_ilock() routine used to
  96 * centralize some grungy code.  They are used in places that wish to lock the
  97 * inode solely for reading the extents.  The reason these places can't just
  98 * call xfs_ilock(ip, XFS_ILOCK_SHARED) is that the inode lock also guards the
  99 * bringing in of the extents from disk for a file in b-tree format.  If the
 100 * inode is in b-tree format, then we need to lock the inode exclusively until
 101 * the extents are read in.  Locking it exclusively all the time would limit
 102 * our parallelism unnecessarily, though.  What we do instead is check to see
 103 * if the extents have been read in yet, and only lock the inode exclusively
 104 * if they have not.
 105 *
 106 * The functions return a value which should be given to the corresponding
 107 * xfs_iunlock() call.
 108 */
 109uint
 110xfs_ilock_data_map_shared(
 111        struct xfs_inode        *ip)
 112{
 113        uint                    lock_mode = XFS_ILOCK_SHARED;
 114
 115        if (xfs_need_iread_extents(&ip->i_df))
 116                lock_mode = XFS_ILOCK_EXCL;
 117        xfs_ilock(ip, lock_mode);
 118        return lock_mode;
 119}
 120
 121uint
 122xfs_ilock_attr_map_shared(
 123        struct xfs_inode        *ip)
 124{
 125        uint                    lock_mode = XFS_ILOCK_SHARED;
 126
 127        if (ip->i_afp && xfs_need_iread_extents(ip->i_afp))
 128                lock_mode = XFS_ILOCK_EXCL;
 129        xfs_ilock(ip, lock_mode);
 130        return lock_mode;
 131}
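
/*
 * Rough sketch of the usual calling pattern (illustrative only, the local
 * variable names are made up): the returned lock mode is passed back to
 * xfs_iunlock() unchanged once the extent data has been read:
 *
 *	lock_mode = xfs_ilock_data_map_shared(ip);
 *	error = xfs_bmapi_read(ip, offset_fsb, count_fsb, &imap, &nimaps, 0);
 *	xfs_iunlock(ip, lock_mode);
 */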
 132
 133/*
 134 * In addition to i_rwsem in the VFS inode, the xfs inode contains 2
 135 * multi-reader locks: invalidate_lock and the i_lock.  This routine allows
 136 * various combinations of the locks to be obtained.
 137 *
 138 * The 3 locks should always be ordered so that the IO lock is obtained first,
 139 * the mmap lock second and the ilock last in order to prevent deadlock.
 140 *
 141 * Basic locking order:
 142 *
 143 * i_rwsem -> invalidate_lock -> page_lock -> i_ilock
 144 *
 145 * mmap_lock locking order:
 146 *
 147 * i_rwsem -> page lock -> mmap_lock
 148 * mmap_lock -> invalidate_lock -> page_lock
 149 *
 150 * The difference in mmap_lock locking order means that we cannot hold the
 151 * invalidate_lock over syscall based read(2)/write(2) based IO. These IO paths
 152 * can fault in pages during copy in/out (for buffered IO) or require the
 153 * mmap_lock in get_user_pages() to map the user pages into the kernel address
 154 * space for direct IO. Similarly the i_rwsem cannot be taken inside a page
 155 * fault because page faults already hold the mmap_lock.
 156 *
 157 * Hence to serialise fully against both syscall and mmap based IO, we need to
 158 * take both the i_rwsem and the invalidate_lock. These locks should *only* be
 159 * both taken in places where we need to invalidate the page cache in a race
 160 * free manner (e.g. truncate, hole punch and other extent manipulation
 161 * functions).
 162 */
 163void
 164xfs_ilock(
 165        xfs_inode_t             *ip,
 166        uint                    lock_flags)
 167{
 168        trace_xfs_ilock(ip, lock_flags, _RET_IP_);
 169
 170        /*
 171         * You can't set both SHARED and EXCL for the same lock,
 172         * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
 173         * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
 174         */
 175        ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
 176               (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
 177        ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) !=
 178               (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL));
 179        ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
 180               (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
 181        ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0);
 182
 183        if (lock_flags & XFS_IOLOCK_EXCL) {
 184                down_write_nested(&VFS_I(ip)->i_rwsem,
 185                                  XFS_IOLOCK_DEP(lock_flags));
 186        } else if (lock_flags & XFS_IOLOCK_SHARED) {
 187                down_read_nested(&VFS_I(ip)->i_rwsem,
 188                                 XFS_IOLOCK_DEP(lock_flags));
 189        }
 190
 191        if (lock_flags & XFS_MMAPLOCK_EXCL) {
 192                down_write_nested(&VFS_I(ip)->i_mapping->invalidate_lock,
 193                                  XFS_MMAPLOCK_DEP(lock_flags));
 194        } else if (lock_flags & XFS_MMAPLOCK_SHARED) {
 195                down_read_nested(&VFS_I(ip)->i_mapping->invalidate_lock,
 196                                 XFS_MMAPLOCK_DEP(lock_flags));
 197        }
 198
 199        if (lock_flags & XFS_ILOCK_EXCL)
 200                mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
 201        else if (lock_flags & XFS_ILOCK_SHARED)
 202                mraccess_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
 203}
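
/*
 * Illustrative sketch of the ordering rules above: a path that invalidates
 * the page cache (truncate, hole punch, ...) takes all three locks,
 * outermost first, and releases them in the reverse order:
 *
 *	xfs_ilock(ip, XFS_IOLOCK_EXCL);
 *	xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
 *	xfs_ilock(ip, XFS_ILOCK_EXCL);
 *	...
 *	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 *	xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
 *	xfs_iunlock(ip, XFS_IOLOCK_EXCL);
 */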
 204
 205/*
 206 * This is just like xfs_ilock(), except that the caller
 207 * is guaranteed not to sleep.  It returns 1 if it gets
 208 * the requested locks and 0 otherwise.  If the IO lock is
 209 * obtained but the inode lock cannot be, then the IO lock
 210 * is dropped before returning.
 211 *
 212 * ip -- the inode being locked
 213 * lock_flags -- this parameter indicates the inode's locks to be
 214 *       locked.  See the comment for xfs_ilock() for a list
 215 *       of valid values.
 216 */
 217int
 218xfs_ilock_nowait(
 219        xfs_inode_t             *ip,
 220        uint                    lock_flags)
 221{
 222        trace_xfs_ilock_nowait(ip, lock_flags, _RET_IP_);
 223
 224        /*
 225         * You can't set both SHARED and EXCL for the same lock,
 226         * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
 227         * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
 228         */
 229        ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
 230               (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
 231        ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) !=
 232               (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL));
 233        ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
 234               (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
 235        ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0);
 236
 237        if (lock_flags & XFS_IOLOCK_EXCL) {
 238                if (!down_write_trylock(&VFS_I(ip)->i_rwsem))
 239                        goto out;
 240        } else if (lock_flags & XFS_IOLOCK_SHARED) {
 241                if (!down_read_trylock(&VFS_I(ip)->i_rwsem))
 242                        goto out;
 243        }
 244
 245        if (lock_flags & XFS_MMAPLOCK_EXCL) {
 246                if (!down_write_trylock(&VFS_I(ip)->i_mapping->invalidate_lock))
 247                        goto out_undo_iolock;
 248        } else if (lock_flags & XFS_MMAPLOCK_SHARED) {
 249                if (!down_read_trylock(&VFS_I(ip)->i_mapping->invalidate_lock))
 250                        goto out_undo_iolock;
 251        }
 252
 253        if (lock_flags & XFS_ILOCK_EXCL) {
 254                if (!mrtryupdate(&ip->i_lock))
 255                        goto out_undo_mmaplock;
 256        } else if (lock_flags & XFS_ILOCK_SHARED) {
 257                if (!mrtryaccess(&ip->i_lock))
 258                        goto out_undo_mmaplock;
 259        }
 260        return 1;
 261
 262out_undo_mmaplock:
 263        if (lock_flags & XFS_MMAPLOCK_EXCL)
 264                up_write(&VFS_I(ip)->i_mapping->invalidate_lock);
 265        else if (lock_flags & XFS_MMAPLOCK_SHARED)
 266                up_read(&VFS_I(ip)->i_mapping->invalidate_lock);
 267out_undo_iolock:
 268        if (lock_flags & XFS_IOLOCK_EXCL)
 269                up_write(&VFS_I(ip)->i_rwsem);
 270        else if (lock_flags & XFS_IOLOCK_SHARED)
 271                up_read(&VFS_I(ip)->i_rwsem);
 272out:
 273        return 0;
 274}
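
/*
 * Typical use (illustrative only): non-blocking I/O paths prefer to bail
 * out with -EAGAIN rather than sleep on a contended lock, roughly:
 *
 *	if (iocb->ki_flags & IOCB_NOWAIT) {
 *		if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
 *			return -EAGAIN;
 *	} else {
 *		xfs_ilock(ip, XFS_IOLOCK_SHARED);
 *	}
 */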
 275
 276/*
 277 * xfs_iunlock() is used to drop the inode locks acquired with
 278 * xfs_ilock() and xfs_ilock_nowait().  The caller must pass
 279 * in the flags given to xfs_ilock() or xfs_ilock_nowait() so
 280 * that we know which locks to drop.
 281 *
 282 * ip -- the inode being unlocked
 283 * lock_flags -- this parameter indicates the inode's locks to be
 284 *       unlocked.  See the comment for xfs_ilock() for a list
 285 *       of valid values for this parameter.
 286 *
 287 */
 288void
 289xfs_iunlock(
 290        xfs_inode_t             *ip,
 291        uint                    lock_flags)
 292{
 293        /*
 294         * You can't set both SHARED and EXCL for the same lock,
 295         * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
 296         * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
 297         */
 298        ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
 299               (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
 300        ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) !=
 301               (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL));
 302        ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
 303               (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
 304        ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0);
 305        ASSERT(lock_flags != 0);
 306
 307        if (lock_flags & XFS_IOLOCK_EXCL)
 308                up_write(&VFS_I(ip)->i_rwsem);
 309        else if (lock_flags & XFS_IOLOCK_SHARED)
 310                up_read(&VFS_I(ip)->i_rwsem);
 311
 312        if (lock_flags & XFS_MMAPLOCK_EXCL)
 313                up_write(&VFS_I(ip)->i_mapping->invalidate_lock);
 314        else if (lock_flags & XFS_MMAPLOCK_SHARED)
 315                up_read(&VFS_I(ip)->i_mapping->invalidate_lock);
 316
 317        if (lock_flags & XFS_ILOCK_EXCL)
 318                mrunlock_excl(&ip->i_lock);
 319        else if (lock_flags & XFS_ILOCK_SHARED)
 320                mrunlock_shared(&ip->i_lock);
 321
 322        trace_xfs_iunlock(ip, lock_flags, _RET_IP_);
 323}
 324
 325/*
 326 * Give up write locks.  The i/o lock cannot be held nested
 327 * if it is being demoted.
 328 */
 329void
 330xfs_ilock_demote(
 331        xfs_inode_t             *ip,
 332        uint                    lock_flags)
 333{
 334        ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_MMAPLOCK_EXCL|XFS_ILOCK_EXCL));
 335        ASSERT((lock_flags &
 336                ~(XFS_IOLOCK_EXCL|XFS_MMAPLOCK_EXCL|XFS_ILOCK_EXCL)) == 0);
 337
 338        if (lock_flags & XFS_ILOCK_EXCL)
 339                mrdemote(&ip->i_lock);
 340        if (lock_flags & XFS_MMAPLOCK_EXCL)
 341                downgrade_write(&VFS_I(ip)->i_mapping->invalidate_lock);
 342        if (lock_flags & XFS_IOLOCK_EXCL)
 343                downgrade_write(&VFS_I(ip)->i_rwsem);
 344
 345        trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_);
 346}
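
/*
 * Illustrative sketch: a caller that only needs exclusivity for part of an
 * operation (e.g. zeroing around EOF before a write) can take the lock
 * exclusive, do that work, and then demote instead of dropping and
 * re-acquiring the lock:
 *
 *	xfs_ilock(ip, XFS_IOLOCK_EXCL);
 *	... work that requires exclusion ...
 *	xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
 *	... continue under XFS_IOLOCK_SHARED ...
 *	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
 */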
 347
 348#if defined(DEBUG) || defined(XFS_WARN)
 349static inline bool
 350__xfs_rwsem_islocked(
 351        struct rw_semaphore     *rwsem,
 352        bool                    shared)
 353{
 354        if (!debug_locks)
 355                return rwsem_is_locked(rwsem);
 356
 357        if (!shared)
 358                return lockdep_is_held_type(rwsem, 0);
 359
 360        /*
 361         * We are checking that the lock is held at least in shared
 362         * mode but don't care that it might be held exclusively
 363         * (i.e. shared | excl). Hence we check if the lock is held
 364         * in any mode rather than an explicit shared mode.
 365         */
 366        return lockdep_is_held_type(rwsem, -1);
 367}
 368
 369bool
 370xfs_isilocked(
 371        struct xfs_inode        *ip,
 372        uint                    lock_flags)
 373{
 374        if (lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) {
 375                if (!(lock_flags & XFS_ILOCK_SHARED))
 376                        return !!ip->i_lock.mr_writer;
 377                return rwsem_is_locked(&ip->i_lock.mr_lock);
 378        }
 379
 380        if (lock_flags & (XFS_MMAPLOCK_EXCL|XFS_MMAPLOCK_SHARED)) {
 381                return __xfs_rwsem_islocked(&VFS_I(ip)->i_mapping->invalidate_lock,
 382                                (lock_flags & XFS_MMAPLOCK_SHARED));
 383        }
 384
 385        if (lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) {
 386                return __xfs_rwsem_islocked(&VFS_I(ip)->i_rwsem,
 387                                (lock_flags & XFS_IOLOCK_SHARED));
 388        }
 389
 390        ASSERT(0);
 391        return false;
 392}
 393#endif
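
/*
 * Note that xfs_isilocked() is only built for DEBUG/XFS_WARN kernels, so it
 * is meant for assertions rather than locking decisions, e.g.:
 *
 *	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
 */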
 394
 395/*
 396 * xfs_lockdep_subclass_ok() is only used in an ASSERT, so is only called when
 397 * DEBUG or XFS_WARN is set. And MAX_LOCKDEP_SUBCLASSES is then only defined
 398 * when CONFIG_LOCKDEP is set. Hence the complex define below to avoid build
 399 * errors and warnings.
 400 */
 401#if (defined(DEBUG) || defined(XFS_WARN)) && defined(CONFIG_LOCKDEP)
 402static bool
 403xfs_lockdep_subclass_ok(
 404        int subclass)
 405{
 406        return subclass < MAX_LOCKDEP_SUBCLASSES;
 407}
 408#else
 409#define xfs_lockdep_subclass_ok(subclass)       (true)
 410#endif
 411
 412/*
 413 * Bump the subclass so xfs_lock_inodes() acquires each lock with a different
 414 * value. This can be called for any type of inode lock combination, including
 415 * parent locking. Care must be taken to ensure we don't overrun the subclass
 416 * storage fields in the class mask we build.
 417 */
 418static inline int
 419xfs_lock_inumorder(int lock_mode, int subclass)
 420{
 421        int     class = 0;
 422
 423        ASSERT(!(lock_mode & (XFS_ILOCK_PARENT | XFS_ILOCK_RTBITMAP |
 424                              XFS_ILOCK_RTSUM)));
 425        ASSERT(xfs_lockdep_subclass_ok(subclass));
 426
 427        if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) {
 428                ASSERT(subclass <= XFS_IOLOCK_MAX_SUBCLASS);
 429                class += subclass << XFS_IOLOCK_SHIFT;
 430        }
 431
 432        if (lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) {
 433                ASSERT(subclass <= XFS_MMAPLOCK_MAX_SUBCLASS);
 434                class += subclass << XFS_MMAPLOCK_SHIFT;
 435        }
 436
 437        if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) {
 438                ASSERT(subclass <= XFS_ILOCK_MAX_SUBCLASS);
 439                class += subclass << XFS_ILOCK_SHIFT;
 440        }
 441
 442        return (lock_mode & ~XFS_LOCK_SUBCLASS_MASK) | class;
 443}
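
/*
 * Illustrative sketch: when locking a batch of inodes, the loop index is
 * used as the subclass so lockdep sees a distinct nesting level per inode,
 * roughly:
 *
 *	for (i = 0; i < inodes; i++)
 *		xfs_ilock(ips[i], xfs_lock_inumorder(XFS_ILOCK_EXCL, i));
 */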
 444
 445/*
 446 * The following routine will lock n inodes in exclusive mode.  We assume the
 447 * caller calls us with the inodes in i_ino order.
 448 *
 449 * We need to detect deadlock where an inode that we lock is in the AIL and we
 450 * start waiting for another inode that is locked by a thread in a long running
 451 * transaction (such as truncate). This can result in deadlock since the long
 452 * running trans might need to wait for the inode we just locked in order to
 453 * push the tail and free space in the log.
 454 *
 455 * xfs_lock_inodes() can only be used to lock one type of lock at a time -
 456 * the iolock, the mmaplock or the ilock, but not more than one at a time. If we
 457 * lock more than one at a time, lockdep will report false positives saying we
 458 * have violated locking orders.
 459 */
 460static void
 461xfs_lock_inodes(
 462        struct xfs_inode        **ips,
 463        int                     inodes,
 464        uint                    lock_mode)
 465{
 466        int                     attempts = 0, i, j, try_lock;
 467        struct xfs_log_item     *lp;
 468
 469        /*
 470         * Currently supports between 2 and 5 inodes with exclusive locking.  We
 471         * support an arbitrary depth of locking here, but absolute limits on
 472         * inodes depend on the type of locking and the limits placed by
 473         * lockdep annotations in xfs_lock_inumorder.  These are all checked by
 474         * the asserts.
 475         */
 476        ASSERT(ips && inodes >= 2 && inodes <= 5);
 477        ASSERT(lock_mode & (XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL |
 478                            XFS_ILOCK_EXCL));
 479        ASSERT(!(lock_mode & (XFS_IOLOCK_SHARED | XFS_MMAPLOCK_SHARED |
 480                              XFS_ILOCK_SHARED)));
 481        ASSERT(!(lock_mode & XFS_MMAPLOCK_EXCL) ||
 482                inodes <= XFS_MMAPLOCK_MAX_SUBCLASS + 1);
 483        ASSERT(!(lock_mode & XFS_ILOCK_EXCL) ||
 484                inodes <= XFS_ILOCK_MAX_SUBCLASS + 1);
 485
 486        if (lock_mode & XFS_IOLOCK_EXCL) {
 487                ASSERT(!(lock_mode & (XFS_MMAPLOCK_EXCL | XFS_ILOCK_EXCL)));
 488        } else if (lock_mode & XFS_MMAPLOCK_EXCL)
 489                ASSERT(!(lock_mode & XFS_ILOCK_EXCL));
 490
 491        try_lock = 0;
 492        i = 0;
 493again:
 494        for (; i < inodes; i++) {
 495                ASSERT(ips[i]);
 496
 497                if (i && (ips[i] == ips[i - 1]))        /* Already locked */
 498                        continue;
 499
 500                /*
 501                 * If try_lock is not set yet, make sure all locked inodes are
 502                 * not in the AIL.  If any are, set try_lock to be used later.
 503                 */
 504                if (!try_lock) {
 505                        for (j = (i - 1); j >= 0 && !try_lock; j--) {
 506                                lp = &ips[j]->i_itemp->ili_item;
 507                                if (lp && test_bit(XFS_LI_IN_AIL, &lp->li_flags))
 508                                        try_lock++;
 509                        }
 510                }
 511
 512                /*
 513                 * If any of the previous locks we have locked is in the AIL,
 514                 * we must TRY to get the second and subsequent locks. If
 515                 * we can't get any, we must release all we have
 516                 * and try again.
 517                 */
 518                if (!try_lock) {
 519                        xfs_ilock(ips[i], xfs_lock_inumorder(lock_mode, i));
 520                        continue;
 521                }
 522
 523                /* try_lock means we have an inode locked that is in the AIL. */
 524                ASSERT(i != 0);
 525                if (xfs_ilock_nowait(ips[i], xfs_lock_inumorder(lock_mode, i)))
 526                        continue;
 527
 528                /*
 529                 * Unlock all previous guys and try again.  xfs_iunlock will try
 530                 * to push the tail if the inode is in the AIL.
 531                 */
 532                attempts++;
 533                for (j = i - 1; j >= 0; j--) {
 534                        /*
 535                         * Check to see if we've already unlocked this one.  Not
 536                         * the first one going back, and the inode ptr is the
 537                         * same.
 538                         */
 539                        if (j != (i - 1) && ips[j] == ips[j + 1])
 540                                continue;
 541
 542                        xfs_iunlock(ips[j], lock_mode);
 543                }
 544
 545                if ((attempts % 5) == 0) {
 546                        delay(1); /* Don't just spin the CPU */
 547                }
 548                i = 0;
 549                try_lock = 0;
 550                goto again;
 551        }
 552}
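
/*
 * Example caller (rough sketch, argument names are illustrative): rename
 * sorts the up to five inodes involved by inode number and then locks them
 * all with the ilock held exclusively:
 *
 *	xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip, wip,
 *			    inodes, &num_inodes);
 *	xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL);
 */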
 553
 554/*
 555 * xfs_lock_two_inodes() can only be used to lock ilock. The iolock and
 556 * mmaplock must be double-locked separately since we use i_rwsem and
 557 * invalidate_lock for that. We now support taking one lock EXCL and the
 558 * other SHARED.
 559 */
 560void
 561xfs_lock_two_inodes(
 562        struct xfs_inode        *ip0,
 563        uint                    ip0_mode,
 564        struct xfs_inode        *ip1,
 565        uint                    ip1_mode)
 566{
 567        struct xfs_inode        *temp;
 568        uint                    mode_temp;
 569        int                     attempts = 0;
 570        struct xfs_log_item     *lp;
 571
 572        ASSERT(hweight32(ip0_mode) == 1);
 573        ASSERT(hweight32(ip1_mode) == 1);
 574        ASSERT(!(ip0_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)));
 575        ASSERT(!(ip1_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)));
 576        ASSERT(!(ip0_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)));
 577        ASSERT(!(ip1_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)));
 578        ASSERT(ip0->i_ino != ip1->i_ino);
 579
 580        if (ip0->i_ino > ip1->i_ino) {
 581                temp = ip0;
 582                ip0 = ip1;
 583                ip1 = temp;
 584                mode_temp = ip0_mode;
 585                ip0_mode = ip1_mode;
 586                ip1_mode = mode_temp;
 587        }
 588
 589 again:
 590        xfs_ilock(ip0, xfs_lock_inumorder(ip0_mode, 0));
 591
 592        /*
 593         * If the first lock we have locked is in the AIL, we must TRY to get
 594         * the second lock. If we can't get it, we must release the first one
 595         * and try again.
 596         */
 597        lp = &ip0->i_itemp->ili_item;
 598        if (lp && test_bit(XFS_LI_IN_AIL, &lp->li_flags)) {
 599                if (!xfs_ilock_nowait(ip1, xfs_lock_inumorder(ip1_mode, 1))) {
 600                        xfs_iunlock(ip0, ip0_mode);
 601                        if ((++attempts % 5) == 0)
 602                                delay(1); /* Don't just spin the CPU */
 603                        goto again;
 604                }
 605        } else {
 606                xfs_ilock(ip1, xfs_lock_inumorder(ip1_mode, 1));
 607        }
 608}
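
/*
 * Example (see xfs_link() below): the source inode and target directory are
 * locked together before both are joined to the transaction:
 *
 *	xfs_lock_two_inodes(sip, XFS_ILOCK_EXCL, tdp, XFS_ILOCK_EXCL);
 */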
 609
 610uint
 611xfs_ip2xflags(
 612        struct xfs_inode        *ip)
 613{
 614        uint                    flags = 0;
 615
 616        if (ip->i_diflags & XFS_DIFLAG_ANY) {
 617                if (ip->i_diflags & XFS_DIFLAG_REALTIME)
 618                        flags |= FS_XFLAG_REALTIME;
 619                if (ip->i_diflags & XFS_DIFLAG_PREALLOC)
 620                        flags |= FS_XFLAG_PREALLOC;
 621                if (ip->i_diflags & XFS_DIFLAG_IMMUTABLE)
 622                        flags |= FS_XFLAG_IMMUTABLE;
 623                if (ip->i_diflags & XFS_DIFLAG_APPEND)
 624                        flags |= FS_XFLAG_APPEND;
 625                if (ip->i_diflags & XFS_DIFLAG_SYNC)
 626                        flags |= FS_XFLAG_SYNC;
 627                if (ip->i_diflags & XFS_DIFLAG_NOATIME)
 628                        flags |= FS_XFLAG_NOATIME;
 629                if (ip->i_diflags & XFS_DIFLAG_NODUMP)
 630                        flags |= FS_XFLAG_NODUMP;
 631                if (ip->i_diflags & XFS_DIFLAG_RTINHERIT)
 632                        flags |= FS_XFLAG_RTINHERIT;
 633                if (ip->i_diflags & XFS_DIFLAG_PROJINHERIT)
 634                        flags |= FS_XFLAG_PROJINHERIT;
 635                if (ip->i_diflags & XFS_DIFLAG_NOSYMLINKS)
 636                        flags |= FS_XFLAG_NOSYMLINKS;
 637                if (ip->i_diflags & XFS_DIFLAG_EXTSIZE)
 638                        flags |= FS_XFLAG_EXTSIZE;
 639                if (ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT)
 640                        flags |= FS_XFLAG_EXTSZINHERIT;
 641                if (ip->i_diflags & XFS_DIFLAG_NODEFRAG)
 642                        flags |= FS_XFLAG_NODEFRAG;
 643                if (ip->i_diflags & XFS_DIFLAG_FILESTREAM)
 644                        flags |= FS_XFLAG_FILESTREAM;
 645        }
 646
 647        if (ip->i_diflags2 & XFS_DIFLAG2_ANY) {
 648                if (ip->i_diflags2 & XFS_DIFLAG2_DAX)
 649                        flags |= FS_XFLAG_DAX;
 650                if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE)
 651                        flags |= FS_XFLAG_COWEXTSIZE;
 652        }
 653
 654        if (XFS_IFORK_Q(ip))
 655                flags |= FS_XFLAG_HASATTR;
 656        return flags;
 657}
 658
 659/*
 660 * Looks up an inode from "name". If ci_name is not NULL, then a CI match
 661 * is allowed, otherwise it has to be an exact match. If a CI match is found,
 662 * ci_name->name will point to the actual name (caller must free) or
 663 * will be set to NULL if an exact match is found.
 664 */
 665int
 666xfs_lookup(
 667        xfs_inode_t             *dp,
 668        struct xfs_name         *name,
 669        xfs_inode_t             **ipp,
 670        struct xfs_name         *ci_name)
 671{
 672        xfs_ino_t               inum;
 673        int                     error;
 674
 675        trace_xfs_lookup(dp, name);
 676
 677        if (xfs_is_shutdown(dp->i_mount))
 678                return -EIO;
 679
 680        error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name);
 681        if (error)
 682                goto out_unlock;
 683
 684        error = xfs_iget(dp->i_mount, NULL, inum, 0, 0, ipp);
 685        if (error)
 686                goto out_free_name;
 687
 688        return 0;
 689
 690out_free_name:
 691        if (ci_name)
 692                kmem_free(ci_name->name);
 693out_unlock:
 694        *ipp = NULL;
 695        return error;
 696}
 697
 698/* Propagate di_flags from a parent inode to a child inode. */
 699static void
 700xfs_inode_inherit_flags(
 701        struct xfs_inode        *ip,
 702        const struct xfs_inode  *pip)
 703{
 704        unsigned int            di_flags = 0;
 705        xfs_failaddr_t          failaddr;
 706        umode_t                 mode = VFS_I(ip)->i_mode;
 707
 708        if (S_ISDIR(mode)) {
 709                if (pip->i_diflags & XFS_DIFLAG_RTINHERIT)
 710                        di_flags |= XFS_DIFLAG_RTINHERIT;
 711                if (pip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) {
 712                        di_flags |= XFS_DIFLAG_EXTSZINHERIT;
 713                        ip->i_extsize = pip->i_extsize;
 714                }
 715                if (pip->i_diflags & XFS_DIFLAG_PROJINHERIT)
 716                        di_flags |= XFS_DIFLAG_PROJINHERIT;
 717        } else if (S_ISREG(mode)) {
 718                if ((pip->i_diflags & XFS_DIFLAG_RTINHERIT) &&
 719                    xfs_has_realtime(ip->i_mount))
 720                        di_flags |= XFS_DIFLAG_REALTIME;
 721                if (pip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) {
 722                        di_flags |= XFS_DIFLAG_EXTSIZE;
 723                        ip->i_extsize = pip->i_extsize;
 724                }
 725        }
 726        if ((pip->i_diflags & XFS_DIFLAG_NOATIME) &&
 727            xfs_inherit_noatime)
 728                di_flags |= XFS_DIFLAG_NOATIME;
 729        if ((pip->i_diflags & XFS_DIFLAG_NODUMP) &&
 730            xfs_inherit_nodump)
 731                di_flags |= XFS_DIFLAG_NODUMP;
 732        if ((pip->i_diflags & XFS_DIFLAG_SYNC) &&
 733            xfs_inherit_sync)
 734                di_flags |= XFS_DIFLAG_SYNC;
 735        if ((pip->i_diflags & XFS_DIFLAG_NOSYMLINKS) &&
 736            xfs_inherit_nosymlinks)
 737                di_flags |= XFS_DIFLAG_NOSYMLINKS;
 738        if ((pip->i_diflags & XFS_DIFLAG_NODEFRAG) &&
 739            xfs_inherit_nodefrag)
 740                di_flags |= XFS_DIFLAG_NODEFRAG;
 741        if (pip->i_diflags & XFS_DIFLAG_FILESTREAM)
 742                di_flags |= XFS_DIFLAG_FILESTREAM;
 743
 744        ip->i_diflags |= di_flags;
 745
 746        /*
 747         * Inode verifiers on older kernels only check that the extent size
 748         * hint is an integer multiple of the rt extent size on realtime files.
 749         * They did not check the hint alignment on a directory with both
 750         * rtinherit and extszinherit flags set.  If the misaligned hint is
 751         * propagated from a directory into a new realtime file, new file
 752         * allocations will fail due to math errors in the rt allocator and/or
 753         * trip the verifiers.  Validate the hint settings in the new file so
 754         * that we don't let broken hints propagate.
 755         */
 756        failaddr = xfs_inode_validate_extsize(ip->i_mount, ip->i_extsize,
 757                        VFS_I(ip)->i_mode, ip->i_diflags);
 758        if (failaddr) {
 759                ip->i_diflags &= ~(XFS_DIFLAG_EXTSIZE |
 760                                   XFS_DIFLAG_EXTSZINHERIT);
 761                ip->i_extsize = 0;
 762        }
 763}
 764
 765/* Propagate di_flags2 from a parent inode to a child inode. */
 766static void
 767xfs_inode_inherit_flags2(
 768        struct xfs_inode        *ip,
 769        const struct xfs_inode  *pip)
 770{
 771        xfs_failaddr_t          failaddr;
 772
 773        if (pip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) {
 774                ip->i_diflags2 |= XFS_DIFLAG2_COWEXTSIZE;
 775                ip->i_cowextsize = pip->i_cowextsize;
 776        }
 777        if (pip->i_diflags2 & XFS_DIFLAG2_DAX)
 778                ip->i_diflags2 |= XFS_DIFLAG2_DAX;
 779
 780        /* Don't let invalid cowextsize hints propagate. */
 781        failaddr = xfs_inode_validate_cowextsize(ip->i_mount, ip->i_cowextsize,
 782                        VFS_I(ip)->i_mode, ip->i_diflags, ip->i_diflags2);
 783        if (failaddr) {
 784                ip->i_diflags2 &= ~XFS_DIFLAG2_COWEXTSIZE;
 785                ip->i_cowextsize = 0;
 786        }
 787}
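
/*
 * Worked example (illustrative only): a parent directory carrying
 * XFS_DIFLAG_EXTSZINHERIT with i_extsize = 16 gives a new regular file
 * XFS_DIFLAG_EXTSIZE and i_extsize = 16, while a new subdirectory keeps
 * XFS_DIFLAG_EXTSZINHERIT so the hint continues to propagate downwards.
 */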
 788
 789/*
 790 * Initialise a newly allocated inode and return the in-core inode to the
 791 * caller locked exclusively.
 792 */
 793int
 794xfs_init_new_inode(
 795        struct user_namespace   *mnt_userns,
 796        struct xfs_trans        *tp,
 797        struct xfs_inode        *pip,
 798        xfs_ino_t               ino,
 799        umode_t                 mode,
 800        xfs_nlink_t             nlink,
 801        dev_t                   rdev,
 802        prid_t                  prid,
 803        bool                    init_xattrs,
 804        struct xfs_inode        **ipp)
 805{
 806        struct inode            *dir = pip ? VFS_I(pip) : NULL;
 807        struct xfs_mount        *mp = tp->t_mountp;
 808        struct xfs_inode        *ip;
 809        unsigned int            flags;
 810        int                     error;
 811        struct timespec64       tv;
 812        struct inode            *inode;
 813
 814        /*
 815         * Protect against obviously corrupt allocation btree records. Later
 816         * xfs_iget checks will catch re-allocation of other active in-memory
 817         * and on-disk inodes. If we don't catch reallocating the parent inode
 818         * here we will deadlock in xfs_iget() so we have to do these checks
 819         * first.
 820         */
 821        if ((pip && ino == pip->i_ino) || !xfs_verify_dir_ino(mp, ino)) {
 822                xfs_alert(mp, "Allocated a known in-use inode 0x%llx!", ino);
 823                return -EFSCORRUPTED;
 824        }
 825
 826        /*
 827         * Get the in-core inode with the lock held exclusively to prevent
  828         * others from looking at it until we're done.
 829         */
 830        error = xfs_iget(mp, tp, ino, XFS_IGET_CREATE, XFS_ILOCK_EXCL, &ip);
 831        if (error)
 832                return error;
 833
 834        ASSERT(ip != NULL);
 835        inode = VFS_I(ip);
 836        set_nlink(inode, nlink);
 837        inode->i_rdev = rdev;
 838        ip->i_projid = prid;
 839
 840        if (dir && !(dir->i_mode & S_ISGID) && xfs_has_grpid(mp)) {
 841                inode_fsuid_set(inode, mnt_userns);
 842                inode->i_gid = dir->i_gid;
 843                inode->i_mode = mode;
 844        } else {
 845                inode_init_owner(mnt_userns, inode, dir, mode);
 846        }
 847
 848        /*
 849         * If the group ID of the new file does not match the effective group
 850         * ID or one of the supplementary group IDs, the S_ISGID bit is cleared
 851         * (and only if the irix_sgid_inherit compatibility variable is set).
 852         */
 853        if (irix_sgid_inherit &&
 854            (inode->i_mode & S_ISGID) &&
 855            !in_group_p(i_gid_into_mnt(mnt_userns, inode)))
 856                inode->i_mode &= ~S_ISGID;
 857
 858        ip->i_disk_size = 0;
 859        ip->i_df.if_nextents = 0;
 860        ASSERT(ip->i_nblocks == 0);
 861
 862        tv = current_time(inode);
 863        inode->i_mtime = tv;
 864        inode->i_atime = tv;
 865        inode->i_ctime = tv;
 866
 867        ip->i_extsize = 0;
 868        ip->i_diflags = 0;
 869
 870        if (xfs_has_v3inodes(mp)) {
 871                inode_set_iversion(inode, 1);
 872                ip->i_cowextsize = 0;
 873                ip->i_crtime = tv;
 874        }
 875
 876        flags = XFS_ILOG_CORE;
 877        switch (mode & S_IFMT) {
 878        case S_IFIFO:
 879        case S_IFCHR:
 880        case S_IFBLK:
 881        case S_IFSOCK:
 882                ip->i_df.if_format = XFS_DINODE_FMT_DEV;
 883                flags |= XFS_ILOG_DEV;
 884                break;
 885        case S_IFREG:
 886        case S_IFDIR:
 887                if (pip && (pip->i_diflags & XFS_DIFLAG_ANY))
 888                        xfs_inode_inherit_flags(ip, pip);
 889                if (pip && (pip->i_diflags2 & XFS_DIFLAG2_ANY))
 890                        xfs_inode_inherit_flags2(ip, pip);
 891                fallthrough;
 892        case S_IFLNK:
 893                ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS;
 894                ip->i_df.if_bytes = 0;
 895                ip->i_df.if_u1.if_root = NULL;
 896                break;
 897        default:
 898                ASSERT(0);
 899        }
 900
 901        /*
 902         * If we need to create attributes immediately after allocating the
 903         * inode, initialise an empty attribute fork right now. We use the
 904         * default fork offset for attributes here as we don't know exactly what
 905         * size or how many attributes we might be adding. We can do this
 906         * safely here because we know the data fork is completely empty and
 907         * this saves us from needing to run a separate transaction to set the
 908         * fork offset in the immediate future.
 909         */
 910        if (init_xattrs && xfs_has_attr(mp)) {
 911                ip->i_forkoff = xfs_default_attroffset(ip) >> 3;
 912                ip->i_afp = xfs_ifork_alloc(XFS_DINODE_FMT_EXTENTS, 0);
 913        }
 914
 915        /*
 916         * Log the new values stuffed into the inode.
 917         */
 918        xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
 919        xfs_trans_log_inode(tp, ip, flags);
 920
 921        /* now that we have an i_mode we can setup the inode structure */
 922        xfs_setup_inode(ip);
 923
 924        *ipp = ip;
 925        return 0;
 926}
 927
 928/*
 929 * Decrement the link count on an inode & log the change.  If this causes the
 930 * link count to go to zero, move the inode to AGI unlinked list so that it can
 931 * be freed when the last active reference goes away via xfs_inactive().
 932 */
 933static int                      /* error */
 934xfs_droplink(
 935        xfs_trans_t *tp,
 936        xfs_inode_t *ip)
 937{
 938        xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
 939
 940        drop_nlink(VFS_I(ip));
 941        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 942
 943        if (VFS_I(ip)->i_nlink)
 944                return 0;
 945
 946        return xfs_iunlink(tp, ip);
 947}
 948
 949/*
 950 * Increment the link count on an inode & log the change.
 951 */
 952static void
 953xfs_bumplink(
 954        xfs_trans_t *tp,
 955        xfs_inode_t *ip)
 956{
 957        xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
 958
 959        inc_nlink(VFS_I(ip));
 960        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 961}
 962
 963int
 964xfs_create(
 965        struct user_namespace   *mnt_userns,
 966        xfs_inode_t             *dp,
 967        struct xfs_name         *name,
 968        umode_t                 mode,
 969        dev_t                   rdev,
 970        bool                    init_xattrs,
 971        xfs_inode_t             **ipp)
 972{
 973        int                     is_dir = S_ISDIR(mode);
 974        struct xfs_mount        *mp = dp->i_mount;
 975        struct xfs_inode        *ip = NULL;
 976        struct xfs_trans        *tp = NULL;
 977        int                     error;
 978        bool                    unlock_dp_on_error = false;
 979        prid_t                  prid;
 980        struct xfs_dquot        *udqp = NULL;
 981        struct xfs_dquot        *gdqp = NULL;
 982        struct xfs_dquot        *pdqp = NULL;
 983        struct xfs_trans_res    *tres;
 984        uint                    resblks;
 985        xfs_ino_t               ino;
 986
 987        trace_xfs_create(dp, name);
 988
 989        if (xfs_is_shutdown(mp))
 990                return -EIO;
 991
 992        prid = xfs_get_initial_prid(dp);
 993
 994        /*
 995         * Make sure that we have allocated dquot(s) on disk.
 996         */
 997        error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(mnt_userns),
 998                        mapped_fsgid(mnt_userns), prid,
 999                        XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
1000                        &udqp, &gdqp, &pdqp);
1001        if (error)
1002                return error;
1003
1004        if (is_dir) {
1005                resblks = XFS_MKDIR_SPACE_RES(mp, name->len);
1006                tres = &M_RES(mp)->tr_mkdir;
1007        } else {
1008                resblks = XFS_CREATE_SPACE_RES(mp, name->len);
1009                tres = &M_RES(mp)->tr_create;
1010        }
1011
1012        /*
1013         * Initially assume that the file does not exist and
1014         * reserve the resources for that case.  If that is not
1015         * the case we'll drop the one we have and get a more
1016         * appropriate transaction later.
1017         */
1018        error = xfs_trans_alloc_icreate(mp, tres, udqp, gdqp, pdqp, resblks,
1019                        &tp);
1020        if (error == -ENOSPC) {
1021                /* flush outstanding delalloc blocks and retry */
1022                xfs_flush_inodes(mp);
1023                error = xfs_trans_alloc_icreate(mp, tres, udqp, gdqp, pdqp,
1024                                resblks, &tp);
1025        }
1026        if (error)
1027                goto out_release_dquots;
1028
1029        xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
1030        unlock_dp_on_error = true;
1031
1032        error = xfs_iext_count_may_overflow(dp, XFS_DATA_FORK,
1033                        XFS_IEXT_DIR_MANIP_CNT(mp));
1034        if (error)
1035                goto out_trans_cancel;
1036
1037        /*
1038         * A newly created regular or special file just has one directory
1039         * entry pointing to it, but a directory also has the "." entry
1040         * pointing to itself.
1041         */
1042        error = xfs_dialloc(&tp, dp->i_ino, mode, &ino);
1043        if (!error)
1044                error = xfs_init_new_inode(mnt_userns, tp, dp, ino, mode,
1045                                is_dir ? 2 : 1, rdev, prid, init_xattrs, &ip);
1046        if (error)
1047                goto out_trans_cancel;
1048
1049        /*
1050         * Now we join the directory inode to the transaction.  We do not do it
1051         * earlier because xfs_dialloc might commit the previous transaction
1052         * (and release all the locks).  An error from here on will result in
1053         * the transaction cancel unlocking dp so don't do it explicitly in the
1054         * error path.
1055         */
1056        xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
1057        unlock_dp_on_error = false;
1058
1059        error = xfs_dir_createname(tp, dp, name, ip->i_ino,
1060                                        resblks - XFS_IALLOC_SPACE_RES(mp));
1061        if (error) {
1062                ASSERT(error != -ENOSPC);
1063                goto out_trans_cancel;
1064        }
1065        xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
1066        xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
1067
1068        if (is_dir) {
1069                error = xfs_dir_init(tp, ip, dp);
1070                if (error)
1071                        goto out_trans_cancel;
1072
1073                xfs_bumplink(tp, dp);
1074        }
1075
1076        /*
1077         * If this is a synchronous mount, make sure that the
1078         * create transaction goes to disk before returning to
1079         * the user.
1080         */
1081        if (xfs_has_wsync(mp) || xfs_has_dirsync(mp))
1082                xfs_trans_set_sync(tp);
1083
1084        /*
1085         * Attach the dquot(s) to the inodes and modify them incore.
1086         * The inode's ids couldn't have changed since the new
1087         * inode has been locked ever since it was created.
1088         */
1089        xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);
1090
1091        error = xfs_trans_commit(tp);
1092        if (error)
1093                goto out_release_inode;
1094
1095        xfs_qm_dqrele(udqp);
1096        xfs_qm_dqrele(gdqp);
1097        xfs_qm_dqrele(pdqp);
1098
1099        *ipp = ip;
1100        return 0;
1101
1102 out_trans_cancel:
1103        xfs_trans_cancel(tp);
1104 out_release_inode:
1105        /*
1106         * Wait until after the current transaction is aborted to finish the
1107         * setup of the inode and release the inode.  This prevents recursive
1108         * transactions and deadlocks from xfs_inactive.
1109         */
1110        if (ip) {
1111                xfs_finish_inode_setup(ip);
1112                xfs_irele(ip);
1113        }
1114 out_release_dquots:
1115        xfs_qm_dqrele(udqp);
1116        xfs_qm_dqrele(gdqp);
1117        xfs_qm_dqrele(pdqp);
1118
1119        if (unlock_dp_on_error)
1120                xfs_iunlock(dp, XFS_ILOCK_EXCL);
1121        return error;
1122}
1123
1124int
1125xfs_create_tmpfile(
1126        struct user_namespace   *mnt_userns,
1127        struct xfs_inode        *dp,
1128        umode_t                 mode,
1129        struct xfs_inode        **ipp)
1130{
1131        struct xfs_mount        *mp = dp->i_mount;
1132        struct xfs_inode        *ip = NULL;
1133        struct xfs_trans        *tp = NULL;
1134        int                     error;
1135        prid_t                  prid;
1136        struct xfs_dquot        *udqp = NULL;
1137        struct xfs_dquot        *gdqp = NULL;
1138        struct xfs_dquot        *pdqp = NULL;
1139        struct xfs_trans_res    *tres;
1140        uint                    resblks;
1141        xfs_ino_t               ino;
1142
1143        if (xfs_is_shutdown(mp))
1144                return -EIO;
1145
1146        prid = xfs_get_initial_prid(dp);
1147
1148        /*
1149         * Make sure that we have allocated dquot(s) on disk.
1150         */
1151        error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(mnt_userns),
1152                        mapped_fsgid(mnt_userns), prid,
1153                        XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
1154                        &udqp, &gdqp, &pdqp);
1155        if (error)
1156                return error;
1157
1158        resblks = XFS_IALLOC_SPACE_RES(mp);
1159        tres = &M_RES(mp)->tr_create_tmpfile;
1160
1161        error = xfs_trans_alloc_icreate(mp, tres, udqp, gdqp, pdqp, resblks,
1162                        &tp);
1163        if (error)
1164                goto out_release_dquots;
1165
1166        error = xfs_dialloc(&tp, dp->i_ino, mode, &ino);
1167        if (!error)
1168                error = xfs_init_new_inode(mnt_userns, tp, dp, ino, mode,
1169                                0, 0, prid, false, &ip);
1170        if (error)
1171                goto out_trans_cancel;
1172
1173        if (xfs_has_wsync(mp))
1174                xfs_trans_set_sync(tp);
1175
1176        /*
1177         * Attach the dquot(s) to the inodes and modify them incore.
1178         * These ids of the inode couldn't have changed since the new
1179         * The inode's ids couldn't have changed since the new
1180         */
1181        xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);
1182
1183        error = xfs_iunlink(tp, ip);
1184        if (error)
1185                goto out_trans_cancel;
1186
1187        error = xfs_trans_commit(tp);
1188        if (error)
1189                goto out_release_inode;
1190
1191        xfs_qm_dqrele(udqp);
1192        xfs_qm_dqrele(gdqp);
1193        xfs_qm_dqrele(pdqp);
1194
1195        *ipp = ip;
1196        return 0;
1197
1198 out_trans_cancel:
1199        xfs_trans_cancel(tp);
1200 out_release_inode:
1201        /*
1202         * Wait until after the current transaction is aborted to finish the
1203         * setup of the inode and release the inode.  This prevents recursive
1204         * transactions and deadlocks from xfs_inactive.
1205         */
1206        if (ip) {
1207                xfs_finish_inode_setup(ip);
1208                xfs_irele(ip);
1209        }
1210 out_release_dquots:
1211        xfs_qm_dqrele(udqp);
1212        xfs_qm_dqrele(gdqp);
1213        xfs_qm_dqrele(pdqp);
1214
1215        return error;
1216}
1217
1218int
1219xfs_link(
1220        xfs_inode_t             *tdp,
1221        xfs_inode_t             *sip,
1222        struct xfs_name         *target_name)
1223{
1224        xfs_mount_t             *mp = tdp->i_mount;
1225        xfs_trans_t             *tp;
1226        int                     error;
1227        int                     resblks;
1228
1229        trace_xfs_link(tdp, target_name);
1230
1231        ASSERT(!S_ISDIR(VFS_I(sip)->i_mode));
1232
1233        if (xfs_is_shutdown(mp))
1234                return -EIO;
1235
1236        error = xfs_qm_dqattach(sip);
1237        if (error)
1238                goto std_return;
1239
1240        error = xfs_qm_dqattach(tdp);
1241        if (error)
1242                goto std_return;
1243
1244        resblks = XFS_LINK_SPACE_RES(mp, target_name->len);
1245        error = xfs_trans_alloc(mp, &M_RES(mp)->tr_link, resblks, 0, 0, &tp);
1246        if (error == -ENOSPC) {
1247                resblks = 0;
1248                error = xfs_trans_alloc(mp, &M_RES(mp)->tr_link, 0, 0, 0, &tp);
1249        }
1250        if (error)
1251                goto std_return;
1252
1253        xfs_lock_two_inodes(sip, XFS_ILOCK_EXCL, tdp, XFS_ILOCK_EXCL);
1254
1255        xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL);
1256        xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL);
1257
1258        error = xfs_iext_count_may_overflow(tdp, XFS_DATA_FORK,
1259                        XFS_IEXT_DIR_MANIP_CNT(mp));
1260        if (error)
1261                goto error_return;
1262
1263        /*
1264         * If we are using project inheritance, we only allow hard link
1265         * creation in our tree when the project IDs are the same; else
1266         * the tree quota mechanism could be circumvented.
1267         */
1268        if (unlikely((tdp->i_diflags & XFS_DIFLAG_PROJINHERIT) &&
1269                     tdp->i_projid != sip->i_projid)) {
1270                error = -EXDEV;
1271                goto error_return;
1272        }
1273
1274        if (!resblks) {
1275                error = xfs_dir_canenter(tp, tdp, target_name);
1276                if (error)
1277                        goto error_return;
1278        }
1279
1280        /*
1281         * Handle initial link state of O_TMPFILE inode
1282         */
1283        if (VFS_I(sip)->i_nlink == 0) {
1284                struct xfs_perag        *pag;
1285
1286                pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, sip->i_ino));
1287                error = xfs_iunlink_remove(tp, pag, sip);
1288                xfs_perag_put(pag);
1289                if (error)
1290                        goto error_return;
1291        }
1292
1293        error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino,
1294                                   resblks);
1295        if (error)
1296                goto error_return;
1297        xfs_trans_ichgtime(tp, tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
1298        xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
1299
1300        xfs_bumplink(tp, sip);
1301
1302        /*
1303         * If this is a synchronous mount, make sure that the
1304         * link transaction goes to disk before returning to
1305         * the user.
1306         */
1307        if (xfs_has_wsync(mp) || xfs_has_dirsync(mp))
1308                xfs_trans_set_sync(tp);
1309
1310        return xfs_trans_commit(tp);
1311
1312 error_return:
1313        xfs_trans_cancel(tp);
1314 std_return:
1315        return error;
1316}
1317
1318/* Clear the reflink flag and the cowblocks tag if possible. */
1319static void
1320xfs_itruncate_clear_reflink_flags(
1321        struct xfs_inode        *ip)
1322{
1323        struct xfs_ifork        *dfork;
1324        struct xfs_ifork        *cfork;
1325
1326        if (!xfs_is_reflink_inode(ip))
1327                return;
1328        dfork = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
1329        cfork = XFS_IFORK_PTR(ip, XFS_COW_FORK);
1330        if (dfork->if_bytes == 0 && cfork->if_bytes == 0)
1331                ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
1332        if (cfork->if_bytes == 0)
1333                xfs_inode_clear_cowblocks_tag(ip);
1334}
1335
1336/*
1337 * Free up the underlying blocks past new_size.  The new size must be smaller
1338 * than the current size.  This routine can be used both for the attribute and
1339 * data fork, and does not modify the inode size, which is left to the caller.
1340 *
1341 * The transaction passed to this routine must have made a permanent log
1342 * reservation of at least XFS_ITRUNCATE_LOG_RES.  This routine may commit the
1343 * given transaction and start new ones, so make sure everything involved in
1344 * the transaction is tidy before calling here.  Some transaction will be
1345 * returned to the caller to be committed.  The incoming transaction must
1346 * already include the inode, and both inode locks must be held exclusively.
1347 * The inode must also be "held" within the transaction.  On return the inode
1348 * will be "held" within the returned transaction.  This routine does NOT
1349 * require any disk space to be reserved for it within the transaction.
1350 *
1351 * If we get an error, we must return with the inode locked and linked into the
1352 * current transaction. This keeps things simple for the higher level code,
1353 * because it always knows that the inode is locked and held in the transaction
1354 * that returns to it whether errors occur or not.  We don't mark the inode
1355 * dirty on error so that transactions can be easily aborted if possible.
1356 */
1357int
1358xfs_itruncate_extents_flags(
1359        struct xfs_trans        **tpp,
1360        struct xfs_inode        *ip,
1361        int                     whichfork,
1362        xfs_fsize_t             new_size,
1363        int                     flags)
1364{
1365        struct xfs_mount        *mp = ip->i_mount;
1366        struct xfs_trans        *tp = *tpp;
1367        xfs_fileoff_t           first_unmap_block;
1368        xfs_filblks_t           unmap_len;
1369        int                     error = 0;
1370
1371        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
1372        ASSERT(!atomic_read(&VFS_I(ip)->i_count) ||
1373               xfs_isilocked(ip, XFS_IOLOCK_EXCL));
1374        ASSERT(new_size <= XFS_ISIZE(ip));
1375        ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
1376        ASSERT(ip->i_itemp != NULL);
1377        ASSERT(ip->i_itemp->ili_lock_flags == 0);
1378        ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
1379
1380        trace_xfs_itruncate_extents_start(ip, new_size);
1381
1382        flags |= xfs_bmapi_aflag(whichfork);
1383
1384        /*
1385         * Since it is possible for space to become allocated beyond
1386         * the end of the file (in a crash where the space is allocated
1387         * but the inode size is not yet updated), simply remove any
1388         * blocks which show up between the new EOF and the maximum
1389         * possible file size.
1390         *
1391         * We have to free all the blocks to the bmbt maximum offset, even if
1392         * the page cache can't scale that far.
1393         */
1394        first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size);
1395        if (!xfs_verify_fileoff(mp, first_unmap_block)) {
1396                WARN_ON_ONCE(first_unmap_block > XFS_MAX_FILEOFF);
1397                return 0;
1398        }
1399
1400        unmap_len = XFS_MAX_FILEOFF - first_unmap_block + 1;
1401        while (unmap_len > 0) {
1402                ASSERT(tp->t_firstblock == NULLFSBLOCK);
1403                error = __xfs_bunmapi(tp, ip, first_unmap_block, &unmap_len,
1404                                flags, XFS_ITRUNC_MAX_EXTENTS);
1405                if (error)
1406                        goto out;
1407
1408                /* free the just unmapped extents */
1409                error = xfs_defer_finish(&tp);
1410                if (error)
1411                        goto out;
1412        }
1413
1414        if (whichfork == XFS_DATA_FORK) {
1415                /* Remove all pending CoW reservations. */
1416                error = xfs_reflink_cancel_cow_blocks(ip, &tp,
1417                                first_unmap_block, XFS_MAX_FILEOFF, true);
1418                if (error)
1419                        goto out;
1420
1421                xfs_itruncate_clear_reflink_flags(ip);
1422        }
1423
1424        /*
1425         * Always re-log the inode so that our permanent transaction can keep
1426         * on rolling it forward in the log.
1427         */
1428        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1429
1430        trace_xfs_itruncate_extents_end(ip, new_size);
1431
1432out:
1433        *tpp = tp;
1434        return error;
1435}
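
/*
 * Rough sketch of the calling convention (illustrative only): the caller
 * holds the ILOCK exclusively, has joined the inode to a permanent
 * transaction, and must commit, or cancel on error, whatever transaction
 * comes back, e.g. via the xfs_itruncate_extents() wrapper that passes
 * flags = 0:
 *
 *	xfs_trans_ijoin(tp, ip, 0);
 *	error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0);
 *	if (!error)
 *		error = xfs_trans_commit(tp);
 *	else
 *		xfs_trans_cancel(tp);
 */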
1436
1437int
1438xfs_release(
1439        xfs_inode_t     *ip)
1440{
1441        xfs_mount_t     *mp = ip->i_mount;
1442        int             error = 0;
1443
1444        if (!S_ISREG(VFS_I(ip)->i_mode) || (VFS_I(ip)->i_mode == 0))
1445                return 0;
1446
1447        /* If this is a read-only mount, don't do this (would generate I/O) */
1448        if (xfs_is_readonly(mp))
1449                return 0;
1450
1451        if (!xfs_is_shutdown(mp)) {
1452                int truncated;
1453
1454                /*
1455                 * If we previously truncated this file and removed old data
1456                 * in the process, we want to initiate "early" writeout on
1457                 * the last close.  This is an attempt to combat the notorious
1458                 * NULL files problem which is particularly noticeable from a
1459                 * truncate down, buffered (re-)write (delalloc), followed by
1460                 * a crash.  What we are effectively doing here is
1461                 * significantly reducing the time window where we'd otherwise
1462                 * be exposed to that problem.
1463                 */
1464                truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED);
1465                if (truncated) {
1466                        xfs_iflags_clear(ip, XFS_IDIRTY_RELEASE);
1467                        if (ip->i_delayed_blks > 0) {
1468                                error = filemap_flush(VFS_I(ip)->i_mapping);
1469                                if (error)
1470                                        return error;
1471                        }
1472                }
1473        }
1474
1475        if (VFS_I(ip)->i_nlink == 0)
1476                return 0;
1477
1478        /*
1479         * If we can't get the iolock just skip truncating the blocks past EOF
1480         * because we could deadlock with the mmap_lock otherwise. We'll get
1481         * another chance to drop them once the last reference to the inode is
1482         * dropped, so we'll never leak blocks permanently.
1483         */
1484        if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL))
1485                return 0;
1486
1487        if (xfs_can_free_eofblocks(ip, false)) {
1488                /*
1489                 * If the inode is being opened, written and closed
1490                 * frequently and we have delayed allocation blocks outstanding
1491                 * (e.g. streaming writes from the NFS server), truncating the
1492                 * blocks past EOF will cause fragmentation to occur.
1493                 *
1494                 * In this case don't do the truncation, but we have to be
1495                 * careful how we detect this case. Blocks beyond EOF show up as
1496                 * i_delayed_blks even when the inode is clean, so we need to
1497                 * truncate them away first before checking for a dirty release.
1498                 * Hence on the first dirty close we will still remove the
1499                 * speculative allocation, but after that we will leave it in
1500                 * place.
1501                 */
1502                if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE))
1503                        goto out_unlock;
1504
1505                error = xfs_free_eofblocks(ip);
1506                if (error)
1507                        goto out_unlock;
1508
1509                /* delalloc blocks after truncation means it really is dirty */
1510                if (ip->i_delayed_blks)
1511                        xfs_iflags_set(ip, XFS_IDIRTY_RELEASE);
1512        }
1513
1514out_unlock:
1515        xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1516        return error;
1517}
1518
1519/*
1520 * xfs_inactive_truncate
1521 *
1522 * Called to perform a truncate when an inode becomes unlinked.
1523 */
1524STATIC int
1525xfs_inactive_truncate(
1526        struct xfs_inode *ip)
1527{
1528        struct xfs_mount        *mp = ip->i_mount;
1529        struct xfs_trans        *tp;
1530        int                     error;
1531
1532        error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
1533        if (error) {
1534                ASSERT(xfs_is_shutdown(mp));
1535                return error;
1536        }
1537        xfs_ilock(ip, XFS_ILOCK_EXCL);
1538        xfs_trans_ijoin(tp, ip, 0);
1539
1540        /*
1541         * Log the inode size first to prevent stale data exposure in the event
1542         * of a system crash before the truncate completes. See the related
1543         * comment in xfs_vn_setattr_size() for details.
1544         */
1545        ip->i_disk_size = 0;
1546        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1547
1548        error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0);
1549        if (error)
1550                goto error_trans_cancel;
1551
1552        ASSERT(ip->i_df.if_nextents == 0);
1553
1554        error = xfs_trans_commit(tp);
1555        if (error)
1556                goto error_unlock;
1557
1558        xfs_iunlock(ip, XFS_ILOCK_EXCL);
1559        return 0;
1560
1561error_trans_cancel:
1562        xfs_trans_cancel(tp);
1563error_unlock:
1564        xfs_iunlock(ip, XFS_ILOCK_EXCL);
1565        return error;
1566}
1567
1568/*
1569 * xfs_inactive_ifree()
1570 *
1571 * Perform the inode free when an inode is unlinked.
1572 */
1573STATIC int
1574xfs_inactive_ifree(
1575        struct xfs_inode *ip)
1576{
1577        struct xfs_mount        *mp = ip->i_mount;
1578        struct xfs_trans        *tp;
1579        int                     error;
1580
1581        /*
1582         * We try to use a per-AG reservation for any block needed by the finobt
1583         * tree, but as the finobt feature predates the per-AG reservation
1584         * support, a degraded file system might not have enough space for the
1585         * reservation at mount time.  In that case try to dip into the reserved
1586         * pool and pray.
1587         *
1588         * Send a warning if the reservation does happen to fail, as the inode
1589         * now remains allocated and sits on the unlinked list until the fs is
1590         * repaired.
1591         */
1592        if (unlikely(mp->m_finobt_nores)) {
1593                error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree,
1594                                XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE,
1595                                &tp);
1596        } else {
1597                error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree, 0, 0, 0, &tp);
1598        }
1599        if (error) {
1600                if (error == -ENOSPC) {
1601                        xfs_warn_ratelimited(mp,
1602                        "Failed to remove inode(s) from unlinked list. "
1603                        "Please free space, unmount and run xfs_repair.");
1604                } else {
1605                        ASSERT(xfs_is_shutdown(mp));
1606                }
1607                return error;
1608        }
1609
1610        /*
1611         * We do not hold the inode locked across the entire rolling transaction
1612         * here. We only need to hold it for the first transaction that
1613         * xfs_ifree() builds, which may mark the inode XFS_ISTALE if the
1614         * underlying cluster buffer is freed. Relogging an XFS_ISTALE inode
1615         * here breaks the relationship between cluster buffer invalidation and
1616         * stale inode invalidation on cluster buffer item journal commit
1617         * completion, and can result in leaving dirty stale inodes hanging
1618         * around in memory.
1619         *
1620         * We have no need for serialising this inode operation against other
1621         * operations - we freed the inode and hence reallocation is required
1622         * and that will serialise on reallocating the space the deferops need
1623         * to free. Hence we can unlock the inode on the first commit of
1624         * the transaction rather than roll it right through the deferops. This
1625         * avoids relogging the XFS_ISTALE inode.
1626         *
1627         * We check that xfs_ifree() hasn't grown an internal transaction roll
1628         * by asserting that the inode is still locked when it returns.
1629         */
1630        xfs_ilock(ip, XFS_ILOCK_EXCL);
1631        xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
1632
1633        error = xfs_ifree(tp, ip);
1634        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
1635        if (error) {
1636                /*
1637                 * If we fail to free the inode, shut down.  The cancel
1638                 * might do that, we need to make sure.  Otherwise the
1639                 * inode might be lost for a long time or forever.
1640                 */
1641                if (!xfs_is_shutdown(mp)) {
1642                        xfs_notice(mp, "%s: xfs_ifree returned error %d",
1643                                __func__, error);
1644                        xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
1645                }
1646                xfs_trans_cancel(tp);
1647                return error;
1648        }
1649
1650        /*
1651         * Credit the quota account(s). The inode is gone.
1652         */
1653        xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_ICOUNT, -1);
1654
1655        /*
1656         * Just ignore errors at this point.  There is nothing we can do except
1657         * to try to keep going. Make sure it's not a silent error.
1658         */
1659        error = xfs_trans_commit(tp);
1660        if (error)
1661                xfs_notice(mp, "%s: xfs_trans_commit returned error %d",
1662                        __func__, error);
1663
1664        return 0;
1665}
1666
1667/*
1668 * Returns true if we need to update the on-disk metadata before we can free
1669 * the memory used by this inode.  Updates include freeing post-eof
1670 * preallocations; freeing COW staging extents; and marking the inode free in
1671 * the inobt if it is on the unlinked list.
1672 */
1673bool
1674xfs_inode_needs_inactive(
1675        struct xfs_inode        *ip)
1676{
1677        struct xfs_mount        *mp = ip->i_mount;
1678        struct xfs_ifork        *cow_ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
1679
1680        /*
1681         * If the inode is already free, then there can be nothing
1682         * to clean up here.
1683         */
1684        if (VFS_I(ip)->i_mode == 0)
1685                return false;
1686
1687        /* If this is a read-only mount, don't do this (would generate I/O) */
1688        if (xfs_is_readonly(mp))
1689                return false;
1690
1691        /* If the log isn't running, push inodes straight to reclaim. */
1692        if (xfs_is_shutdown(mp) || xfs_has_norecovery(mp))
1693                return false;
1694
1695        /* Metadata inodes require explicit resource cleanup. */
1696        if (xfs_is_metadata_inode(ip))
1697                return false;
1698
1699        /* Want to clean out the cow blocks if there are any. */
1700        if (cow_ifp && cow_ifp->if_bytes > 0)
1701                return true;
1702
1703        /* Unlinked files must be freed. */
1704        if (VFS_I(ip)->i_nlink == 0)
1705                return true;
1706
1707        /*
1708         * This file isn't being freed, so check if there are post-eof blocks
1709         * to free.  @force is true because we are evicting an inode from the
1710         * cache.  Post-eof blocks must be freed, lest we end up with broken
1711         * free space accounting.
1712         *
1713         * Note: don't bother with iolock here since lockdep complains about
1714         * acquiring it in reclaim context. We have the only reference to the
1715         * inode at this point anyways.
1716         */
1717        return xfs_can_free_eofblocks(ip, true);
1718}
1719
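/*
 * Illustrative sketch only: a hypothetical eviction-time caller could use
 * xfs_inode_needs_inactive() to decide whether an inode being evicted must
 * be queued for inactivation or can be handed straight to reclaim.  The
 * helper names below are made up for illustration; the real policy lives
 * in the inode cache code.
 *
 *	if (xfs_inode_needs_inactive(ip))
 *		example_queue_inodegc(ip);
 *	else
 *		example_make_reclaimable(ip);
 */
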
1720/*
1721 * xfs_inactive
1722 *
1723 * This is called when the reference count for the vnode
1724 * goes to zero.  If the file has been unlinked, then it must
1725 * now be truncated.  Also, we clear all of the read-ahead state
1726 * kept for the inode here since the file is now closed.
1727 */
1728void
1729xfs_inactive(
1730        xfs_inode_t     *ip)
1731{
1732        struct xfs_mount        *mp;
1733        int                     error;
1734        int                     truncate = 0;
1735
1736        /*
1737         * If the inode is already free, then there can be nothing
1738         * to clean up here.
1739         */
1740        if (VFS_I(ip)->i_mode == 0) {
1741                ASSERT(ip->i_df.if_broot_bytes == 0);
1742                goto out;
1743        }
1744
1745        mp = ip->i_mount;
1746        ASSERT(!xfs_iflags_test(ip, XFS_IRECOVERY));
1747
1748        /* If this is a read-only mount, don't do this (would generate I/O) */
1749        if (xfs_is_readonly(mp))
1750                goto out;
1751
1752        /* Metadata inodes require explicit resource cleanup. */
1753        if (xfs_is_metadata_inode(ip))
1754                goto out;
1755
1756        /* Try to clean out the cow blocks if there are any. */
1757        if (xfs_inode_has_cow_data(ip))
1758                xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, true);
1759
1760        if (VFS_I(ip)->i_nlink != 0) {
1761                /*
1762                 * force is true because we are evicting an inode from the
1763                 * cache. Post-eof blocks must be freed, lest we end up with
1764                 * broken free space accounting.
1765                 *
1766                 * Note: don't bother with iolock here since lockdep complains
1767                 * about acquiring it in reclaim context. We have the only
1768                 * reference to the inode at this point anyways.
1769                 */
1770                if (xfs_can_free_eofblocks(ip, true))
1771                        xfs_free_eofblocks(ip);
1772
1773                goto out;
1774        }
1775
1776        if (S_ISREG(VFS_I(ip)->i_mode) &&
1777            (ip->i_disk_size != 0 || XFS_ISIZE(ip) != 0 ||
1778             ip->i_df.if_nextents > 0 || ip->i_delayed_blks > 0))
1779                truncate = 1;
1780
1781        error = xfs_qm_dqattach(ip);
1782        if (error)
1783                goto out;
1784
1785        if (S_ISLNK(VFS_I(ip)->i_mode))
1786                error = xfs_inactive_symlink(ip);
1787        else if (truncate)
1788                error = xfs_inactive_truncate(ip);
1789        if (error)
1790                goto out;
1791
1792        /*
1793         * If there are attributes associated with the file then blow them away
1794         * now.  The code calls a routine that recursively deconstructs the
1795         * attribute fork. It also blows away the in-core attribute fork.
1796         */
1797        if (XFS_IFORK_Q(ip)) {
1798                error = xfs_attr_inactive(ip);
1799                if (error)
1800                        goto out;
1801        }
1802
1803        ASSERT(!ip->i_afp);
1804        ASSERT(ip->i_forkoff == 0);
1805
1806        /*
1807         * Free the inode.
1808         */
1809        xfs_inactive_ifree(ip);
1810
1811out:
1812        /*
1813         * We're done making metadata updates for this inode, so we can release
1814         * the attached dquots.
1815         */
1816        xfs_qm_dqdetach(ip);
1817}
1818
1819/*
1820 * In-Core Unlinked List Lookups
1821 * =============================
1822 *
1823 * Every inode is supposed to be reachable from some other piece of metadata
1824 * with the exception of the root directory.  Inodes with a connection to a
1825 * file descriptor but not linked from anywhere in the on-disk directory tree
1826 * are collectively known as unlinked inodes, though the filesystem itself
1827 * maintains links to these inodes so that on-disk metadata are consistent.
1828 *
1829 * XFS implements a per-AG on-disk hash table of unlinked inodes.  The AGI
1830 * header contains a number of buckets that point to an inode, and each inode
1831 * record has a pointer to the next inode in the hash chain.  This
1832 * singly-linked list causes scaling problems in the iunlink remove function
1833 * because we must walk that list to find the inode that points to the inode
1834 * being removed from the unlinked hash bucket list.
1835 *
1836 * What if we modelled the unlinked list as a collection of records capturing
1837 * "X.next_unlinked = Y" relations?  If we indexed those records on Y, we'd
1838 * have a fast way to look up unlinked list predecessors, which avoids the
1839 * slow list walk.  That's exactly what we do here (in-core) with a per-AG
1840 * rhashtable.
1841 *
1842 * Because this is a backref cache, we ignore operational failures since the
1843 * iunlink code can fall back to the slow bucket walk.  The only errors that
1844 * should bubble out are for obviously incorrect situations.
1845 *
1846 * All users of the backref cache MUST hold the AGI buffer lock to serialize
1847 * access or have otherwise provided for concurrency control.
1848 */
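
/*
 * Worked example (illustration only): suppose AGI bucket 0 points at inode
 * A and the on-disk chain is A -> B -> C, i.e. A.next_unlinked = B and
 * B.next_unlinked = C.  The backref cache then holds two records, each
 * keyed on the right-hand side (Y) of its relation:
 *
 *	{ iu_agino = A, iu_next_unlinked = B }		keyed on B
 *	{ iu_agino = B, iu_next_unlinked = C }		keyed on C
 *
 * Removing B from the middle of the chain can therefore find its
 * predecessor A with a single lookup keyed on B, rather than walking the
 * bucket list from the head.
 */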
1849
1850/* Capture a "X.next_unlinked = Y" relationship. */
1851struct xfs_iunlink {
1852        struct rhash_head       iu_rhash_head;
1853        xfs_agino_t             iu_agino;               /* X */
1854        xfs_agino_t             iu_next_unlinked;       /* Y */
1855};
1856
1857/* Unlinked list predecessor lookup hashtable construction */
1858static int
1859xfs_iunlink_obj_cmpfn(
1860        struct rhashtable_compare_arg   *arg,
1861        const void                      *obj)
1862{
1863        const xfs_agino_t               *key = arg->key;
1864        const struct xfs_iunlink        *iu = obj;
1865
1866        if (iu->iu_next_unlinked != *key)
1867                return 1;
1868        return 0;
1869}
1870
1871static const struct rhashtable_params xfs_iunlink_hash_params = {
1872        .min_size               = XFS_AGI_UNLINKED_BUCKETS,
1873        .key_len                = sizeof(xfs_agino_t),
1874        .key_offset             = offsetof(struct xfs_iunlink,
1875                                           iu_next_unlinked),
1876        .head_offset            = offsetof(struct xfs_iunlink, iu_rhash_head),
1877        .automatic_shrinking    = true,
1878        .obj_cmpfn              = xfs_iunlink_obj_cmpfn,
1879};
1880
1881/*
1882 * Return X, where X.next_unlinked == @agino.  Returns NULLAGINO if no such
1883 * relation is found.
1884 */
1885static xfs_agino_t
1886xfs_iunlink_lookup_backref(
1887        struct xfs_perag        *pag,
1888        xfs_agino_t             agino)
1889{
1890        struct xfs_iunlink      *iu;
1891
1892        iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &agino,
1893                        xfs_iunlink_hash_params);
1894        return iu ? iu->iu_agino : NULLAGINO;
1895}
1896
1897/*
1898 * Take ownership of an iunlink cache entry and insert it into the hash table.
1899 * If successful, the entry will be owned by the cache; if not, it is freed.
1900 * Either way, the caller does not own @iu after this call.
1901 */
1902static int
1903xfs_iunlink_insert_backref(
1904        struct xfs_perag        *pag,
1905        struct xfs_iunlink      *iu)
1906{
1907        int                     error;
1908
1909        error = rhashtable_insert_fast(&pag->pagi_unlinked_hash,
1910                        &iu->iu_rhash_head, xfs_iunlink_hash_params);
1911        /*
1912         * Fail loudly if there already was an entry because that's a sign of
1913         * corruption of in-memory data.  Also fail loudly if we see an error
1914         * code we didn't anticipate from the rhashtable code.  Currently we
1915         * only anticipate ENOMEM.
1916         */
1917        if (error) {
1918                WARN(error != -ENOMEM, "iunlink cache insert error %d", error);
1919                kmem_free(iu);
1920        }
1921        /*
1922         * Absorb any runtime errors that aren't a result of corruption because
1923         * this is a cache and we can always fall back to bucket list scanning.
1924         */
1925        if (error != 0 && error != -EEXIST)
1926                error = 0;
1927        return error;
1928}
1929
1930/* Remember that @prev_agino.next_unlinked = @this_agino. */
1931static int
1932xfs_iunlink_add_backref(
1933        struct xfs_perag        *pag,
1934        xfs_agino_t             prev_agino,
1935        xfs_agino_t             this_agino)
1936{
1937        struct xfs_iunlink      *iu;
1938
1939        if (XFS_TEST_ERROR(false, pag->pag_mount, XFS_ERRTAG_IUNLINK_FALLBACK))
1940                return 0;
1941
1942        iu = kmem_zalloc(sizeof(*iu), KM_NOFS);
1943        iu->iu_agino = prev_agino;
1944        iu->iu_next_unlinked = this_agino;
1945
1946        return xfs_iunlink_insert_backref(pag, iu);
1947}
1948
1949/*
1950 * Replace X.next_unlinked = @agino with X.next_unlinked = @next_unlinked.
1951 * If @next_unlinked is NULLAGINO, we drop the backref and exit.  If there
1952 * wasn't any such entry then we don't bother.
1953 */
1954static int
1955xfs_iunlink_change_backref(
1956        struct xfs_perag        *pag,
1957        xfs_agino_t             agino,
1958        xfs_agino_t             next_unlinked)
1959{
1960        struct xfs_iunlink      *iu;
1961        int                     error;
1962
1963        /* Look up the old entry; if there wasn't one then exit. */
1964        iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &agino,
1965                        xfs_iunlink_hash_params);
1966        if (!iu)
1967                return 0;
1968
1969        /*
1970         * Remove the entry.  This shouldn't ever return an error, but if we
1971         * couldn't remove the old entry we don't want to add it again to the
1972         * hash table, and if the entry disappeared on us then someone's
1973         * violated the locking rules and we need to fail loudly.  Either way
1974         * we cannot remove the inode because internal state is or would have
1975         * been corrupt.
1976         */
1977        error = rhashtable_remove_fast(&pag->pagi_unlinked_hash,
1978                        &iu->iu_rhash_head, xfs_iunlink_hash_params);
1979        if (error)
1980                return error;
1981
1982        /* If there is no new next entry just free our item and return. */
1983        if (next_unlinked == NULLAGINO) {
1984                kmem_free(iu);
1985                return 0;
1986        }
1987
1988        /* Update the entry and re-add it to the hash table. */
1989        iu->iu_next_unlinked = next_unlinked;
1990        return xfs_iunlink_insert_backref(pag, iu);
1991}
1992
1993/* Set up the in-core predecessor structures. */
1994int
1995xfs_iunlink_init(
1996        struct xfs_perag        *pag)
1997{
1998        return rhashtable_init(&pag->pagi_unlinked_hash,
1999                        &xfs_iunlink_hash_params);
2000}
2001
2002/* Free the in-core predecessor structures. */
2003static void
2004xfs_iunlink_free_item(
2005        void                    *ptr,
2006        void                    *arg)
2007{
2008        struct xfs_iunlink      *iu = ptr;
2009        bool                    *freed_anything = arg;
2010
2011        *freed_anything = true;
2012        kmem_free(iu);
2013}
2014
2015void
2016xfs_iunlink_destroy(
2017        struct xfs_perag        *pag)
2018{
2019        bool                    freed_anything = false;
2020
2021        rhashtable_free_and_destroy(&pag->pagi_unlinked_hash,
2022                        xfs_iunlink_free_item, &freed_anything);
2023
2024        ASSERT(freed_anything == false || xfs_is_shutdown(pag->pag_mount));
2025}
2026
2027/*
2028 * Point the AGI unlinked bucket at an inode and log the results.  The caller
2029 * is responsible for validating the old value.
2030 */
2031STATIC int
2032xfs_iunlink_update_bucket(
2033        struct xfs_trans        *tp,
2034        struct xfs_perag        *pag,
2035        struct xfs_buf          *agibp,
2036        unsigned int            bucket_index,
2037        xfs_agino_t             new_agino)
2038{
2039        struct xfs_agi          *agi = agibp->b_addr;
2040        xfs_agino_t             old_value;
2041        int                     offset;
2042
2043        ASSERT(xfs_verify_agino_or_null(tp->t_mountp, pag->pag_agno, new_agino));
2044
2045        old_value = be32_to_cpu(agi->agi_unlinked[bucket_index]);
2046        trace_xfs_iunlink_update_bucket(tp->t_mountp, pag->pag_agno, bucket_index,
2047                        old_value, new_agino);
2048
2049        /*
2050         * We should never find the head of the list already set to the value
2051         * passed in because either we're adding or removing ourselves from the
2052         * head of the list.
2053         */
2054        if (old_value == new_agino) {
2055                xfs_buf_mark_corrupt(agibp);
2056                return -EFSCORRUPTED;
2057        }
2058
2059        agi->agi_unlinked[bucket_index] = cpu_to_be32(new_agino);
2060        offset = offsetof(struct xfs_agi, agi_unlinked) +
2061                        (sizeof(xfs_agino_t) * bucket_index);
2062        xfs_trans_log_buf(tp, agibp, offset, offset + sizeof(xfs_agino_t) - 1);
2063        return 0;
2064}
2065
2066/* Set an on-disk inode's next_unlinked pointer. */
2067STATIC void
2068xfs_iunlink_update_dinode(
2069        struct xfs_trans        *tp,
2070        struct xfs_perag        *pag,
2071        xfs_agino_t             agino,
2072        struct xfs_buf          *ibp,
2073        struct xfs_dinode       *dip,
2074        struct xfs_imap         *imap,
2075        xfs_agino_t             next_agino)
2076{
2077        struct xfs_mount        *mp = tp->t_mountp;
2078        int                     offset;
2079
2080        ASSERT(xfs_verify_agino_or_null(mp, pag->pag_agno, next_agino));
2081
2082        trace_xfs_iunlink_update_dinode(mp, pag->pag_agno, agino,
2083                        be32_to_cpu(dip->di_next_unlinked), next_agino);
2084
2085        dip->di_next_unlinked = cpu_to_be32(next_agino);
2086        offset = imap->im_boffset +
2087                        offsetof(struct xfs_dinode, di_next_unlinked);
2088
2089        /* need to recalc the inode CRC if appropriate */
2090        xfs_dinode_calc_crc(mp, dip);
2091        xfs_trans_inode_buf(tp, ibp);
2092        xfs_trans_log_buf(tp, ibp, offset, offset + sizeof(xfs_agino_t) - 1);
2093}
2094
2095/* Set an in-core inode's unlinked pointer and return the old value. */
2096STATIC int
2097xfs_iunlink_update_inode(
2098        struct xfs_trans        *tp,
2099        struct xfs_inode        *ip,
2100        struct xfs_perag        *pag,
2101        xfs_agino_t             next_agino,
2102        xfs_agino_t             *old_next_agino)
2103{
2104        struct xfs_mount        *mp = tp->t_mountp;
2105        struct xfs_dinode       *dip;
2106        struct xfs_buf          *ibp;
2107        xfs_agino_t             old_value;
2108        int                     error;
2109
2110        ASSERT(xfs_verify_agino_or_null(mp, pag->pag_agno, next_agino));
2111
2112        error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &ibp);
2113        if (error)
2114                return error;
2115        dip = xfs_buf_offset(ibp, ip->i_imap.im_boffset);
2116
2117        /* Make sure the old pointer isn't garbage. */
2118        old_value = be32_to_cpu(dip->di_next_unlinked);
2119        if (!xfs_verify_agino_or_null(mp, pag->pag_agno, old_value)) {
2120                xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, dip,
2121                                sizeof(*dip), __this_address);
2122                error = -EFSCORRUPTED;
2123                goto out;
2124        }
2125
2126        /*
2127         * Since we're updating a linked list, we should never find that the
2128         * current pointer is the same as the new value, unless we're
2129         * terminating the list.
2130         */
2131        *old_next_agino = old_value;
2132        if (old_value == next_agino) {
2133                if (next_agino != NULLAGINO) {
2134                        xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__,
2135                                        dip, sizeof(*dip), __this_address);
2136                        error = -EFSCORRUPTED;
2137                }
2138                goto out;
2139        }
2140
2141        /* Ok, update the new pointer. */
2142        xfs_iunlink_update_dinode(tp, pag, XFS_INO_TO_AGINO(mp, ip->i_ino),
2143                        ibp, dip, &ip->i_imap, next_agino);
2144        return 0;
2145out:
2146        xfs_trans_brelse(tp, ibp);
2147        return error;
2148}
2149
2150/*
2151 * This is called when the inode's link count has gone to 0 or we are creating
2152 * a tmpfile via O_TMPFILE.  The inode @ip must have nlink == 0.
2153 *
2154 * We place the on-disk inode on a list in the AGI.  It will be pulled from this
2155 * list when the inode is freed.
2156 */
2157STATIC int
2158xfs_iunlink(
2159        struct xfs_trans        *tp,
2160        struct xfs_inode        *ip)
2161{
2162        struct xfs_mount        *mp = tp->t_mountp;
2163        struct xfs_perag        *pag;
2164        struct xfs_agi          *agi;
2165        struct xfs_buf          *agibp;
2166        xfs_agino_t             next_agino;
2167        xfs_agino_t             agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
2168        short                   bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
2169        int                     error;
2170
2171        ASSERT(VFS_I(ip)->i_nlink == 0);
2172        ASSERT(VFS_I(ip)->i_mode != 0);
2173        trace_xfs_iunlink(ip);
2174
2175        pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
2176
2177        /* Get the agi buffer first.  It ensures lock ordering on the list. */
2178        error = xfs_read_agi(mp, tp, pag->pag_agno, &agibp);
2179        if (error)
2180                goto out;
2181        agi = agibp->b_addr;
2182
2183        /*
2184         * Get the index into the agi hash table for the list this inode will
2185         * go on.  Make sure the pointer isn't garbage and that this inode
2186         * isn't already on the list.
2187         */
2188        next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
2189        if (next_agino == agino ||
2190            !xfs_verify_agino_or_null(mp, pag->pag_agno, next_agino)) {
2191                xfs_buf_mark_corrupt(agibp);
2192                error = -EFSCORRUPTED;
2193                goto out;
2194        }
2195
2196        if (next_agino != NULLAGINO) {
2197                xfs_agino_t             old_agino;
2198
2199                /*
2200                 * There is already another inode in the bucket, so point this
2201                 * inode to the current head of the list.
2202                 */
2203                error = xfs_iunlink_update_inode(tp, ip, pag, next_agino,
2204                                &old_agino);
2205                if (error)
2206                        goto out;
2207                ASSERT(old_agino == NULLAGINO);
2208
2209                /*
2210                 * agino has been unlinked, add a backref from the next inode
2211                 * back to agino.
2212                 */
2213                error = xfs_iunlink_add_backref(pag, agino, next_agino);
2214                if (error)
2215                        goto out;
2216        }
2217
2218        /* Point the head of the list at this inode. */
2219        error = xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index, agino);
2220out:
2221        xfs_perag_put(pag);
2222        return error;
2223}
2224
2225/* Return the imap, dinode pointer, and buffer for an inode. */
2226STATIC int
2227xfs_iunlink_map_ino(
2228        struct xfs_trans        *tp,
2229        xfs_agnumber_t          agno,
2230        xfs_agino_t             agino,
2231        struct xfs_imap         *imap,
2232        struct xfs_dinode       **dipp,
2233        struct xfs_buf          **bpp)
2234{
2235        struct xfs_mount        *mp = tp->t_mountp;
2236        int                     error;
2237
2238        imap->im_blkno = 0;
2239        error = xfs_imap(mp, tp, XFS_AGINO_TO_INO(mp, agno, agino), imap, 0);
2240        if (error) {
2241                xfs_warn(mp, "%s: xfs_imap returned error %d.",
2242                                __func__, error);
2243                return error;
2244        }
2245
2246        error = xfs_imap_to_bp(mp, tp, imap, bpp);
2247        if (error) {
2248                xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.",
2249                                __func__, error);
2250                return error;
2251        }
2252
2253        *dipp = xfs_buf_offset(*bpp, imap->im_boffset);
2254        return 0;
2255}
2256
2257/*
2258 * Walk the unlinked chain from @head_agino until we find the inode that
2259 * points to @target_agino.  Return the inode number, map, dinode pointer,
2260 * and inode cluster buffer of that inode as @agino, @imap, @dipp, and @bpp.
2261 *
2262 * @tp, @pag, @head_agino, and @target_agino are input parameters.
2263 * @agino, @imap, @dipp, and @bpp are all output parameters.
2264 *
2265 * Do not call this function if @target_agino is the head of the list.
2266 */
2267STATIC int
2268xfs_iunlink_map_prev(
2269        struct xfs_trans        *tp,
2270        struct xfs_perag        *pag,
2271        xfs_agino_t             head_agino,
2272        xfs_agino_t             target_agino,
2273        xfs_agino_t             *agino,
2274        struct xfs_imap         *imap,
2275        struct xfs_dinode       **dipp,
2276        struct xfs_buf          **bpp)
2277{
2278        struct xfs_mount        *mp = tp->t_mountp;
2279        xfs_agino_t             next_agino;
2280        int                     error;
2281
2282        ASSERT(head_agino != target_agino);
2283        *bpp = NULL;
2284
2285        /* See if our backref cache can find it faster. */
2286        *agino = xfs_iunlink_lookup_backref(pag, target_agino);
2287        if (*agino != NULLAGINO) {
2288                error = xfs_iunlink_map_ino(tp, pag->pag_agno, *agino, imap,
2289                                dipp, bpp);
2290                if (error)
2291                        return error;
2292
2293                if (be32_to_cpu((*dipp)->di_next_unlinked) == target_agino)
2294                        return 0;
2295
2296                /*
2297                 * If we get here the cache contents were corrupt, so drop the
2298                 * buffer and fall back to walking the bucket list.
2299                 */
2300                xfs_trans_brelse(tp, *bpp);
2301                *bpp = NULL;
2302                WARN_ON_ONCE(1);
2303        }
2304
2305        trace_xfs_iunlink_map_prev_fallback(mp, pag->pag_agno);
2306
2307        /* Otherwise, walk the entire bucket until we find it. */
2308        next_agino = head_agino;
2309        while (next_agino != target_agino) {
2310                xfs_agino_t     unlinked_agino;
2311
2312                if (*bpp)
2313                        xfs_trans_brelse(tp, *bpp);
2314
2315                *agino = next_agino;
2316                error = xfs_iunlink_map_ino(tp, pag->pag_agno, next_agino, imap,
2317                                dipp, bpp);
2318                if (error)
2319                        return error;
2320
2321                unlinked_agino = be32_to_cpu((*dipp)->di_next_unlinked);
2322                /*
2323                 * Make sure this pointer is valid and isn't an obvious
2324                 * infinite loop.
2325                 */
2326                if (!xfs_verify_agino(mp, pag->pag_agno, unlinked_agino) ||
2327                    next_agino == unlinked_agino) {
2328                        XFS_CORRUPTION_ERROR(__func__,
2329                                        XFS_ERRLEVEL_LOW, mp,
2330                                        *dipp, sizeof(**dipp));
2331                        error = -EFSCORRUPTED;
2332                        return error;
2333                }
2334                next_agino = unlinked_agino;
2335        }
2336
2337        return 0;
2338}
2339
2340/*
2341 * Pull the on-disk inode from the AGI unlinked list.
2342 */
2343STATIC int
2344xfs_iunlink_remove(
2345        struct xfs_trans        *tp,
2346        struct xfs_perag        *pag,
2347        struct xfs_inode        *ip)
2348{
2349        struct xfs_mount        *mp = tp->t_mountp;
2350        struct xfs_agi          *agi;
2351        struct xfs_buf          *agibp;
2352        struct xfs_buf          *last_ibp;
2353        struct xfs_dinode       *last_dip = NULL;
2354        xfs_agino_t             agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
2355        xfs_agino_t             next_agino;
2356        xfs_agino_t             head_agino;
2357        short                   bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
2358        int                     error;
2359
2360        trace_xfs_iunlink_remove(ip);
2361
2362        /* Get the agi buffer first.  It ensures lock ordering on the list. */
2363        error = xfs_read_agi(mp, tp, pag->pag_agno, &agibp);
2364        if (error)
2365                return error;
2366        agi = agibp->b_addr;
2367
2368        /*
2369         * Get the index into the agi hash table for the list this inode will
2370         * go on.  Make sure the head pointer isn't garbage.
2371         */
2372        head_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
2373        if (!xfs_verify_agino(mp, pag->pag_agno, head_agino)) {
2374                XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
2375                                agi, sizeof(*agi));
2376                return -EFSCORRUPTED;
2377        }
2378
2379        /*
2380         * Set our inode's next_unlinked pointer to NULLAGINO and then return
2381         * the old pointer value so that we can update whatever was previous
2382         * to us in the list to point to whatever was next in the list.
2383         */
2384        error = xfs_iunlink_update_inode(tp, ip, pag, NULLAGINO, &next_agino);
2385        if (error)
2386                return error;
2387
2388        /*
2389         * If there was a backref pointing from the next inode back to this
2390         * one, remove it because we've removed this inode from the list.
2391         *
2392         * Later, if this inode was in the middle of the list we'll update
2393         * this inode's backref to point from the next inode.
2394         */
2395        if (next_agino != NULLAGINO) {
2396                error = xfs_iunlink_change_backref(pag, next_agino, NULLAGINO);
2397                if (error)
2398                        return error;
2399        }
2400
2401        if (head_agino != agino) {
2402                struct xfs_imap imap;
2403                xfs_agino_t     prev_agino;
2404
2405                /* We need to search the list for the inode being freed. */
2406                error = xfs_iunlink_map_prev(tp, pag, head_agino, agino,
2407                                &prev_agino, &imap, &last_dip, &last_ibp);
2408                if (error)
2409                        return error;
2410
2411                /* Point the previous inode on the list to the next inode. */
2412                xfs_iunlink_update_dinode(tp, pag, prev_agino, last_ibp,
2413                                last_dip, &imap, next_agino);
2414
2415                /*
2416                 * Now we deal with the backref for this inode.  If this inode
2417                 * pointed at a real inode, change the backref that pointed to
2418                 * us to point to our old next.  If this inode was the end of
2419                 * the list, delete the backref that pointed to us.  Note that
2420                 * change_backref takes care of deleting the backref if
2421                 * next_agino is NULLAGINO.
2422                 */
2423                return xfs_iunlink_change_backref(agibp->b_pag, agino,
2424                                next_agino);
2425        }
2426
2427        /* Point the head of the list to the next unlinked inode. */
2428        return xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index,
2429                        next_agino);
2430}
2431
2432/*
2433 * Look up the inode number specified and if it is not already marked XFS_ISTALE
2434 * mark it stale. We should only find clean inodes in this lookup that aren't
2435 * already stale.
2436 */
2437static void
2438xfs_ifree_mark_inode_stale(
2439        struct xfs_perag        *pag,
2440        struct xfs_inode        *free_ip,
2441        xfs_ino_t               inum)
2442{
2443        struct xfs_mount        *mp = pag->pag_mount;
2444        struct xfs_inode_log_item *iip;
2445        struct xfs_inode        *ip;
2446
2447retry:
2448        rcu_read_lock();
2449        ip = radix_tree_lookup(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, inum));
2450
2451        /* Inode not in memory, nothing to do */
2452        if (!ip) {
2453                rcu_read_unlock();
2454                return;
2455        }
2456
2457        /*
2458         * Because this is an RCU-protected lookup, we could find a recently
2459         * freed or even reallocated inode during the lookup. We need to check
2460         * under the i_flags_lock for a valid inode here. Skip it if it is not
2461         * valid, the wrong inode or stale.
2462         */
2463        spin_lock(&ip->i_flags_lock);
2464        if (ip->i_ino != inum || __xfs_iflags_test(ip, XFS_ISTALE))
2465                goto out_iflags_unlock;
2466
2467        /*
2468         * Don't try to lock/unlock the current inode, but we _cannot_ skip the
2469         * other inodes that we did not find in the list attached to the buffer
2470         * and are not already marked stale. If we can't lock it, back off and
2471         * retry.
2472         */
2473        if (ip != free_ip) {
2474                if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
2475                        spin_unlock(&ip->i_flags_lock);
2476                        rcu_read_unlock();
2477                        delay(1);
2478                        goto retry;
2479                }
2480        }
2481        ip->i_flags |= XFS_ISTALE;
2482
2483        /*
2484         * If the inode is flushing, it is already attached to the buffer.  All
2485         * we need to do here is mark the inode stale so buffer IO completion
2486         * will remove it from the AIL.
2487         */
2488        iip = ip->i_itemp;
2489        if (__xfs_iflags_test(ip, XFS_IFLUSHING)) {
2490                ASSERT(!list_empty(&iip->ili_item.li_bio_list));
2491                ASSERT(iip->ili_last_fields);
2492                goto out_iunlock;
2493        }
2494
2495        /*
2496         * Inodes not attached to the buffer can be released immediately.
2497         * Everything else has to go through xfs_iflush_abort() on journal
2498         * commit as the flock synchronises removal of the inode from the
2499         * cluster buffer against inode reclaim.
2500         */
2501        if (!iip || list_empty(&iip->ili_item.li_bio_list))
2502                goto out_iunlock;
2503
2504        __xfs_iflags_set(ip, XFS_IFLUSHING);
2505        spin_unlock(&ip->i_flags_lock);
2506        rcu_read_unlock();
2507
2508        /* we have a dirty inode in memory that has not yet been flushed. */
2509        spin_lock(&iip->ili_lock);
2510        iip->ili_last_fields = iip->ili_fields;
2511        iip->ili_fields = 0;
2512        iip->ili_fsync_fields = 0;
2513        spin_unlock(&iip->ili_lock);
2514        ASSERT(iip->ili_last_fields);
2515
2516        if (ip != free_ip)
2517                xfs_iunlock(ip, XFS_ILOCK_EXCL);
2518        return;
2519
2520out_iunlock:
2521        if (ip != free_ip)
2522                xfs_iunlock(ip, XFS_ILOCK_EXCL);
2523out_iflags_unlock:
2524        spin_unlock(&ip->i_flags_lock);
2525        rcu_read_unlock();
2526}
2527
2528/*
2529 * A big issue when freeing the inode cluster is that we _cannot_ skip any
2530 * inodes that are in memory - they all must be marked stale and attached to
2531 * the cluster buffer.
2532 */
2533static int
2534xfs_ifree_cluster(
2535        struct xfs_trans        *tp,
2536        struct xfs_perag        *pag,
2537        struct xfs_inode        *free_ip,
2538        struct xfs_icluster     *xic)
2539{
2540        struct xfs_mount        *mp = free_ip->i_mount;
2541        struct xfs_ino_geometry *igeo = M_IGEO(mp);
2542        struct xfs_buf          *bp;
2543        xfs_daddr_t             blkno;
2544        xfs_ino_t               inum = xic->first_ino;
2545        int                     nbufs;
2546        int                     i, j;
2547        int                     ioffset;
2548        int                     error;
2549
2550        nbufs = igeo->ialloc_blks / igeo->blocks_per_cluster;
2551
2552        for (j = 0; j < nbufs; j++, inum += igeo->inodes_per_cluster) {
2553                /*
2554                 * The allocation bitmap tells us which inodes of the chunk were
2555                 * physically allocated. Skip the cluster if an inode falls into
2556                 * a sparse region.
2557                 */
2558                ioffset = inum - xic->first_ino;
2559                if ((xic->alloc & XFS_INOBT_MASK(ioffset)) == 0) {
2560                        ASSERT(ioffset % igeo->inodes_per_cluster == 0);
2561                        continue;
2562                }
2563
2564                blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum),
2565                                         XFS_INO_TO_AGBNO(mp, inum));
2566
2567                /*
2568                 * We obtain and lock the backing buffer first in the process
2569                 * here to ensure dirty inodes attached to the buffer remain in
2570                 * the flushing state while we mark them stale.
2571                 *
2572                 * If we scan the in-memory inodes first, then buffer IO can
2573                 * complete before we get a lock on it, and hence we may fail
2574                 * to mark all the active inodes on the buffer stale.
2575                 */
2576                error = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
2577                                mp->m_bsize * igeo->blocks_per_cluster,
2578                                XBF_UNMAPPED, &bp);
2579                if (error)
2580                        return error;
2581
2582                /*
2583                 * This buffer may not have been correctly initialised as we
2584                 * didn't read it from disk. That's not important because we are
2585                 * only using it to mark the buffer as stale in the log, and to
2586                 * attach stale cached inodes on it. That means it will never be
2587                 * dispatched for IO. If it is, we want to know about it, and we
2588                 * want it to fail. We can achieve this by adding a write
2589                 * verifier to the buffer.
2590                 */
2591                bp->b_ops = &xfs_inode_buf_ops;
2592
2593                /*
2594                 * Now we need to set all the cached clean inodes as XFS_ISTALE,
2595                 * too. This requires lookups, and will skip inodes that we've
2596                 * already marked XFS_ISTALE.
2597                 */
2598                for (i = 0; i < igeo->inodes_per_cluster; i++)
2599                        xfs_ifree_mark_inode_stale(pag, free_ip, inum + i);
2600
2601                xfs_trans_stale_inode_buf(tp, bp);
2602                xfs_trans_binval(tp, bp);
2603        }
2604        return 0;
2605}
2606
2607/*
2608 * This is called to return an inode to the inode free list.
2609 * The inode should already be truncated to 0 length and have
2610 * no pages associated with it.  This routine also assumes that
2611 * the inode is already a part of the transaction.
2612 *
2613 * The on-disk copy of the inode will have been added to the list
2614 * of unlinked inodes in the AGI. We need to remove the inode from
2615 * that list atomically with respect to freeing it here.
2616 */
2617int
2618xfs_ifree(
2619        struct xfs_trans        *tp,
2620        struct xfs_inode        *ip)
2621{
2622        struct xfs_mount        *mp = ip->i_mount;
2623        struct xfs_perag        *pag;
2624        struct xfs_icluster     xic = { 0 };
2625        struct xfs_inode_log_item *iip = ip->i_itemp;
2626        int                     error;
2627
2628        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
2629        ASSERT(VFS_I(ip)->i_nlink == 0);
2630        ASSERT(ip->i_df.if_nextents == 0);
2631        ASSERT(ip->i_disk_size == 0 || !S_ISREG(VFS_I(ip)->i_mode));
2632        ASSERT(ip->i_nblocks == 0);
2633
2634        pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
2635
2636        /*
2637         * Pull the on-disk inode from the AGI unlinked list.
2638         */
2639        error = xfs_iunlink_remove(tp, pag, ip);
2640        if (error)
2641                goto out;
2642
2643        error = xfs_difree(tp, pag, ip->i_ino, &xic);
2644        if (error)
2645                goto out;
2646
2647        /*
2648         * Free any local-format data sitting around before we reset the
2649         * data fork to extents format.  Note that the attr fork data has
2650         * already been freed by xfs_attr_inactive.
2651         */
2652        if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL) {
2653                kmem_free(ip->i_df.if_u1.if_data);
2654                ip->i_df.if_u1.if_data = NULL;
2655                ip->i_df.if_bytes = 0;
2656        }
2657
2658        VFS_I(ip)->i_mode = 0;          /* mark incore inode as free */
2659        ip->i_diflags = 0;
2660        ip->i_diflags2 = mp->m_ino_geo.new_diflags2;
2661        ip->i_forkoff = 0;              /* mark the attr fork not in use */
2662        ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS;
2663        if (xfs_iflags_test(ip, XFS_IPRESERVE_DM_FIELDS))
2664                xfs_iflags_clear(ip, XFS_IPRESERVE_DM_FIELDS);
2665
2666        /* Don't attempt to replay owner changes for a deleted inode */
2667        spin_lock(&iip->ili_lock);
2668        iip->ili_fields &= ~(XFS_ILOG_AOWNER | XFS_ILOG_DOWNER);
2669        spin_unlock(&iip->ili_lock);
2670
2671        /*
2672         * Bump the generation count so no one will be confused
2673         * by reincarnations of this inode.
2674         */
2675        VFS_I(ip)->i_generation++;
2676        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
2677
2678        if (xic.deleted)
2679                error = xfs_ifree_cluster(tp, pag, ip, &xic);
2680out:
2681        xfs_perag_put(pag);
2682        return error;
2683}
2684
2685/*
2686 * This is called to unpin an inode.  The caller must have the inode locked
2687 * in at least shared mode so that the buffer cannot be subsequently pinned
2688 * once someone is waiting for it to be unpinned.
2689 */
2690static void
2691xfs_iunpin(
2692        struct xfs_inode        *ip)
2693{
2694        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
2695
2696        trace_xfs_inode_unpin_nowait(ip, _RET_IP_);
2697
2698        /* Give the log a push to start the unpinning I/O */
2699        xfs_log_force_seq(ip->i_mount, ip->i_itemp->ili_commit_seq, 0, NULL);
2700
2701}
2702
2703static void
2704__xfs_iunpin_wait(
2705        struct xfs_inode        *ip)
2706{
2707        wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IPINNED_BIT);
2708        DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IPINNED_BIT);
2709
2710        xfs_iunpin(ip);
2711
2712        do {
2713                prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
2714                if (xfs_ipincount(ip))
2715                        io_schedule();
2716        } while (xfs_ipincount(ip));
2717        finish_wait(wq, &wait.wq_entry);
2718}
2719
2720void
2721xfs_iunpin_wait(
2722        struct xfs_inode        *ip)
2723{
2724        if (xfs_ipincount(ip))
2725                __xfs_iunpin_wait(ip);
2726}
2727
2728/*
2729 * Removing an inode from the namespace involves removing the directory entry
2730 * and dropping the link count on the inode. Removing the directory entry can
2731 * result in locking an AGF (directory blocks were freed) and removing a link
2732 * count can result in placing the inode on an unlinked list which results in
2733 * locking an AGI.
2734 *
2735 * The big problem here is that we have an ordering constraint on AGF and AGI
2736 * locking - inode allocation locks the AGI, then can allocate a new extent for
2737 * new inodes, locking the AGF after the AGI. Similarly, freeing the inode
2738 * removes the inode from the unlinked list, requiring that we lock the AGI
2739 * first, and then freeing the inode can result in an inode chunk being freed
2740 * and hence freeing disk space requiring that we lock an AGF.
2741 *
2742 * Hence the ordering that is imposed by other parts of the code is AGI before
2743 * AGF. This means we cannot remove the directory entry before we drop the inode
2744 * reference count and put it on the unlinked list as this results in a lock
2745 * order of AGF then AGI, and this can deadlock against inode allocation and
2746 * freeing. Therefore we must drop the link counts before we remove the
2747 * directory entry.
2748 *
2749 * This is still safe from a transactional point of view - it is not until we
2750 * get to xfs_defer_finish() that we have the possibility of multiple
2751 * transactions in this operation. Hence as long as we remove the directory
2752 * entry and drop the link count in the first transaction of the remove
2753 * operation, there are no transactional constraints on the ordering here.
2754 */
2755int
2756xfs_remove(
2757        xfs_inode_t             *dp,
2758        struct xfs_name         *name,
2759        xfs_inode_t             *ip)
2760{
2761        xfs_mount_t             *mp = dp->i_mount;
2762        xfs_trans_t             *tp = NULL;
2763        int                     is_dir = S_ISDIR(VFS_I(ip)->i_mode);
2764        int                     error = 0;
2765        uint                    resblks;
2766
2767        trace_xfs_remove(dp, name);
2768
2769        if (xfs_is_shutdown(mp))
2770                return -EIO;
2771
2772        error = xfs_qm_dqattach(dp);
2773        if (error)
2774                goto std_return;
2775
2776        error = xfs_qm_dqattach(ip);
2777        if (error)
2778                goto std_return;
2779
2780        /*
2781         * We try to get the real space reservation first, allowing
2782         * for directory btree deletion(s) implying possible bmap
2783         * insert(s).  If we can't get the space reservation we fall
2784         * back to a reservation of 0, and the directory code avoids
2785         * the bmap btree insert(s) by trimming the LAST block from
2786         * the directory if such an insert would otherwise be
2787         * required.
2788         */
2789        resblks = XFS_REMOVE_SPACE_RES(mp);
2790        error = xfs_trans_alloc(mp, &M_RES(mp)->tr_remove, resblks, 0, 0, &tp);
2791        if (error == -ENOSPC) {
2792                resblks = 0;
2793                error = xfs_trans_alloc(mp, &M_RES(mp)->tr_remove, 0, 0, 0,
2794                                &tp);
2795        }
2796        if (error) {
2797                ASSERT(error != -ENOSPC);
2798                goto std_return;
2799        }
2800
2801        xfs_lock_two_inodes(dp, XFS_ILOCK_EXCL, ip, XFS_ILOCK_EXCL);
2802
2803        xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
2804        xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
2805
2806        /*
2807         * If we're removing a directory perform some additional validation.
2808         */
2809        if (is_dir) {
2810                ASSERT(VFS_I(ip)->i_nlink >= 2);
2811                if (VFS_I(ip)->i_nlink != 2) {
2812                        error = -ENOTEMPTY;
2813                        goto out_trans_cancel;
2814                }
2815                if (!xfs_dir_isempty(ip)) {
2816                        error = -ENOTEMPTY;
2817                        goto out_trans_cancel;
2818                }
2819
2820                /* Drop the link from ip's "..".  */
2821                error = xfs_droplink(tp, dp);
2822                if (error)
2823                        goto out_trans_cancel;
2824
2825                /* Drop the "." link from ip to self.  */
2826                error = xfs_droplink(tp, ip);
2827                if (error)
2828                        goto out_trans_cancel;
2829
2830                /*
2831                 * Point the unlinked child directory's ".." entry to the root
2832                 * directory to eliminate back-references to inodes that may
2833                 * get freed before the child directory is closed.  If the fs
2834                 * gets shrunk, this can lead to dirent inode validation errors.
2835                 */
2836                if (dp->i_ino != tp->t_mountp->m_sb.sb_rootino) {
2837                        error = xfs_dir_replace(tp, ip, &xfs_name_dotdot,
2838                                        tp->t_mountp->m_sb.sb_rootino, 0);
2839                        if (error)
2840                                goto out_trans_cancel;
2841                }
2842        } else {
2843                /*
2844                 * When removing a non-directory we need to log the parent
2845                 * inode here.  For a directory this is done implicitly
2846                 * by the xfs_droplink call for the ".." entry.
2847                 */
2848                xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
2849        }
2850        xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2851
2852        /* Drop the link from dp to ip. */
2853        error = xfs_droplink(tp, ip);
2854        if (error)
2855                goto out_trans_cancel;
2856
2857        error = xfs_dir_removename(tp, dp, name, ip->i_ino, resblks);
2858        if (error) {
2859                ASSERT(error != -ENOENT);
2860                goto out_trans_cancel;
2861        }
2862
2863        /*
2864         * If this is a synchronous mount, make sure that the
2865         * remove transaction goes to disk before returning to
2866         * the user.
2867         */
2868        if (xfs_has_wsync(mp) || xfs_has_dirsync(mp))
2869                xfs_trans_set_sync(tp);
2870
2871        error = xfs_trans_commit(tp);
2872        if (error)
2873                goto std_return;
2874
2875        if (is_dir && xfs_inode_is_filestream(ip))
2876                xfs_filestream_deassociate(ip);
2877
2878        return 0;
2879
2880 out_trans_cancel:
2881        xfs_trans_cancel(tp);
2882 std_return:
2883        return error;
2884}
2885
2886/*
2887 * Enter all inodes for a rename transaction into a sorted array.
2888 */
2889#define __XFS_SORT_INODES       5
2890STATIC void
2891xfs_sort_for_rename(
2892        struct xfs_inode        *dp1,   /* in: old (source) directory inode */
2893        struct xfs_inode        *dp2,   /* in: new (target) directory inode */
2894        struct xfs_inode        *ip1,   /* in: inode of old entry */
2895        struct xfs_inode        *ip2,   /* in: inode of new entry */
2896        struct xfs_inode        *wip,   /* in: whiteout inode */
2897        struct xfs_inode        **i_tab,/* out: sorted array of inodes */
2898        int                     *num_inodes)  /* in/out: inodes in array */
2899{
2900        int                     i, j;
2901
2902        ASSERT(*num_inodes == __XFS_SORT_INODES);
2903        memset(i_tab, 0, *num_inodes * sizeof(struct xfs_inode *));
2904
2905        /*
2906         * i_tab contains a list of pointers to inodes.  We initialize
2907         * the table here and sort it.  We will then use it to
2908         * order the acquisition of the inode locks.
2909         *
2910         * Note that the table may contain duplicates.  e.g., dp1 == dp2.
2911         */
2912        i = 0;
2913        i_tab[i++] = dp1;
2914        i_tab[i++] = dp2;
2915        i_tab[i++] = ip1;
2916        if (ip2)
2917                i_tab[i++] = ip2;
2918        if (wip)
2919                i_tab[i++] = wip;
2920        *num_inodes = i;
2921
2922        /*
2923         * Sort the elements via bubble sort.  (Remember, there are at
2924         * most 5 elements to sort, so this is adequate.)
2925         */
2926        for (i = 0; i < *num_inodes; i++) {
2927                for (j = 1; j < *num_inodes; j++) {
2928                        if (i_tab[j]->i_ino < i_tab[j-1]->i_ino) {
2929                                struct xfs_inode *temp = i_tab[j];
2930                                i_tab[j] = i_tab[j-1];
2931                                i_tab[j-1] = temp;
2932                        }
2933                }
2934        }
2935}
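
/*
 * Example (hypothetical inode numbers, for illustration only): renaming
 * "a/foo" to "b/bar" where dp1 = "a" (ino 128), dp2 = "b" (ino 131),
 * ip1 = "foo" (ino 260) and an existing "b/bar" is ip2 (ino 97) yields
 *
 *	i_tab[] = { ip2 (97), dp1 (128), dp2 (131), ip1 (260) }
 *
 * so that xfs_lock_inodes() always acquires the ILOCKs in ascending inode
 * number order, which is what prevents ABBA deadlocks between concurrent
 * renames that touch the same inodes.
 */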
2936
2937static int
2938xfs_finish_rename(
2939        struct xfs_trans        *tp)
2940{
2941        /*
2942         * If this is a synchronous mount, make sure that the rename transaction
2943         * goes to disk before returning to the user.
2944         */
2945        if (xfs_has_wsync(tp->t_mountp) || xfs_has_dirsync(tp->t_mountp))
2946                xfs_trans_set_sync(tp);
2947
2948        return xfs_trans_commit(tp);
2949}
2950
2951/*
2952 * xfs_cross_rename()
2953 *
2954 * Handles the RENAME_EXCHANGE flag in the renameat2() syscall.
2955 */
2956STATIC int
2957xfs_cross_rename(
2958        struct xfs_trans        *tp,
2959        struct xfs_inode        *dp1,
2960        struct xfs_name         *name1,
2961        struct xfs_inode        *ip1,
2962        struct xfs_inode        *dp2,
2963        struct xfs_name         *name2,
2964        struct xfs_inode        *ip2,
2965        int                     spaceres)
2966{
2967        int             error = 0;
2968        int             ip1_flags = 0;
2969        int             ip2_flags = 0;
2970        int             dp2_flags = 0;
2971
2972        /* Swap inode number for dirent in first parent */
2973        error = xfs_dir_replace(tp, dp1, name1, ip2->i_ino, spaceres);
2974        if (error)
2975                goto out_trans_abort;
2976
2977        /* Swap inode number for dirent in second parent */
2978        error = xfs_dir_replace(tp, dp2, name2, ip1->i_ino, spaceres);
2979        if (error)
2980                goto out_trans_abort;
2981
2982        /*
2983         * If we're renaming one or more directories across different parents,
2984         * update the respective ".." entries (and link counts) to match the new
2985         * parents.
2986         */
2987        if (dp1 != dp2) {
2988                dp2_flags = XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
2989
2990                if (S_ISDIR(VFS_I(ip2)->i_mode)) {
2991                        error = xfs_dir_replace(tp, ip2, &xfs_name_dotdot,
2992                                                dp1->i_ino, spaceres);
2993                        if (error)
2994                                goto out_trans_abort;
2995
2996                        /* transfer ip2 ".." reference to dp1 */
2997                        if (!S_ISDIR(VFS_I(ip1)->i_mode)) {
2998                                error = xfs_droplink(tp, dp2);
2999                                if (error)
3000                                        goto out_trans_abort;
3001                                xfs_bumplink(tp, dp1);
3002                        }
3003
3004                        /*
3005                         * Although ip1 isn't changed here, userspace needs
3006                         * to be notified of the change so that applications
3007                         * relying on it (such as backup tools) can properly
3008                         * detect the change.
3009                         */
3010                        ip1_flags |= XFS_ICHGTIME_CHG;
3011                        ip2_flags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
3012                }
3013
3014                if (S_ISDIR(VFS_I(ip1)->i_mode)) {
3015                        error = xfs_dir_replace(tp, ip1, &xfs_name_dotdot,
3016                                                dp2->i_ino, spaceres);
3017                        if (error)
3018                                goto out_trans_abort;
3019
3020                        /* transfer ip1 ".." reference to dp2 */
3021                        if (!S_ISDIR(VFS_I(ip2)->i_mode)) {
3022                                error = xfs_droplink(tp, dp1);
3023                                if (error)
3024                                        goto out_trans_abort;
3025                                xfs_bumplink(tp, dp2);
3026                        }
3027
3028                        /*
3029                         * Although ip2 isn't changed here, userspace needs
3030                         * to be notified of the change so that applications
3031                         * relying on it (such as backup tools) can properly
3032                         * detect the change.
3033                         */
3034                        ip1_flags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
3035                        ip2_flags |= XFS_ICHGTIME_CHG;
3036                }
3037        }
3038
3039        if (ip1_flags) {
3040                xfs_trans_ichgtime(tp, ip1, ip1_flags);
3041                xfs_trans_log_inode(tp, ip1, XFS_ILOG_CORE);
3042        }
3043        if (ip2_flags) {
3044                xfs_trans_ichgtime(tp, ip2, ip2_flags);
3045                xfs_trans_log_inode(tp, ip2, XFS_ILOG_CORE);
3046        }
3047        if (dp2_flags) {
3048                xfs_trans_ichgtime(tp, dp2, dp2_flags);
3049                xfs_trans_log_inode(tp, dp2, XFS_ILOG_CORE);
3050        }
3051        xfs_trans_ichgtime(tp, dp1, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
3052        xfs_trans_log_inode(tp, dp1, XFS_ILOG_CORE);
3053        return xfs_finish_rename(tp);
3054
3055out_trans_abort:
3056        xfs_trans_cancel(tp);
3057        return error;
3058}
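
/*
 * Illustrative only: userspace reaches xfs_cross_rename() via renameat2()
 * with the RENAME_EXCHANGE flag, e.g.
 *
 *	renameat2(AT_FDCWD, "a", AT_FDCWD, "b", RENAME_EXCHANGE);
 *
 * which atomically swaps the two directory entries; both names must already
 * exist, hence the target_ip check at the top of xfs_rename() below.
 */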
3059
3060/*
3061 * xfs_rename_alloc_whiteout()
3062 *
3063 * Return a referenced, unlinked, unlocked inode that can be used as a
3064 * whiteout in a rename transaction. We use a tmpfile inode here so that if we
3065 * crash between allocating the inode and linking it into the rename
3066 * transaction, recovery will free the inode and we won't leak it.
3067 */
3068static int
3069xfs_rename_alloc_whiteout(
3070        struct user_namespace   *mnt_userns,
3071        struct xfs_inode        *dp,
3072        struct xfs_inode        **wip)
3073{
3074        struct xfs_inode        *tmpfile;
3075        int                     error;
3076
3077        error = xfs_create_tmpfile(mnt_userns, dp, S_IFCHR | WHITEOUT_MODE,
3078                                   &tmpfile);
3079        if (error)
3080                return error;
3081
3082        /*
3083         * Prepare the tmpfile inode as if it were created through the VFS.
3084         * Complete the inode setup and flag it as linkable.  nlink is already
3085         * zero, so we can skip the drop_nlink.
3086         */
3087        xfs_setup_iops(tmpfile);
3088        xfs_finish_inode_setup(tmpfile);
3089        VFS_I(tmpfile)->i_state |= I_LINKABLE;
3090
3091        *wip = tmpfile;
3092        return 0;
3093}
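
/*
 * Illustrative only: a RENAME_WHITEOUT rename (used by overlayfs, or issued
 * directly via renameat2(AT_FDCWD, "old", AT_FDCWD, "new", RENAME_WHITEOUT))
 * moves "old" to "new" and leaves this tmpfile behind at the old name as a
 * character-device whiteout; see the xfs_dir_replace() of src_name near the
 * end of xfs_rename() below.
 */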
3094
3095/*
3096 * xfs_rename
3097 */
3098int
3099xfs_rename(
3100        struct user_namespace   *mnt_userns,
3101        struct xfs_inode        *src_dp,
3102        struct xfs_name         *src_name,
3103        struct xfs_inode        *src_ip,
3104        struct xfs_inode        *target_dp,
3105        struct xfs_name         *target_name,
3106        struct xfs_inode        *target_ip,
3107        unsigned int            flags)
3108{
3109        struct xfs_mount        *mp = src_dp->i_mount;
3110        struct xfs_trans        *tp;
3111        struct xfs_inode        *wip = NULL;            /* whiteout inode */
3112        struct xfs_inode        *inodes[__XFS_SORT_INODES];
3113        int                     i;
3114        int                     num_inodes = __XFS_SORT_INODES;
3115        bool                    new_parent = (src_dp != target_dp);
3116        bool                    src_is_directory = S_ISDIR(VFS_I(src_ip)->i_mode);
3117        int                     spaceres;
3118        int                     error;
3119
3120        trace_xfs_rename(src_dp, target_dp, src_name, target_name);
3121
3122        if ((flags & RENAME_EXCHANGE) && !target_ip)
3123                return -EINVAL;
3124
3125        /*
3126         * If we are doing a whiteout operation, allocate the whiteout inode
3127         * that will be left behind at the source name and ensure the dirent
3128         * type is set appropriately.
3129         */
3130        if (flags & RENAME_WHITEOUT) {
3131                ASSERT(!(flags & (RENAME_NOREPLACE | RENAME_EXCHANGE)));
3132                error = xfs_rename_alloc_whiteout(mnt_userns, target_dp, &wip);
3133                if (error)
3134                        return error;
3135
3136                /* set up the source dirent info as a whiteout */
3137                src_name->type = XFS_DIR3_FT_CHRDEV;
3138        }
3139
3140        xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip, wip,
3141                                inodes, &num_inodes);
3142
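        /*
         * As in xfs_remove() above, try to get a real space reservation
         * first; if that fails with ENOSPC, fall back to a zero reservation,
         * in which case xfs_dir_canenter() below verifies that a newly
         * created entry would fit in existing directory space.
         */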
3143        spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len);
3144        error = xfs_trans_alloc(mp, &M_RES(mp)->tr_rename, spaceres, 0, 0, &tp);
3145        if (error == -ENOSPC) {
3146                spaceres = 0;
3147                error = xfs_trans_alloc(mp, &M_RES(mp)->tr_rename, 0, 0, 0,
3148                                &tp);
3149        }
3150        if (error)
3151                goto out_release_wip;
3152
3153        /*
3154         * Attach the dquots to the inodes
3155         */
3156        error = xfs_qm_vop_rename_dqattach(inodes);
3157        if (error)
3158                goto out_trans_cancel;
3159
3160        /*
3161         * Lock all the participating inodes. Depending upon whether
3162         * the target_name exists in the target directory, whether the
3163         * target directory is the same as the source directory, and
3164         * whether a whiteout inode is in use, we can lock from 2 to 5 inodes.
3165         */
3166        xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL);
3167
3168        /*
3169         * Join all the inodes to the transaction. From this point on,
3170         * we can rely on either trans_commit or trans_cancel to unlock
3171         * them.
3172         */
3173        xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL);
3174        if (new_parent)
3175                xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL);
3176        xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL);
3177        if (target_ip)
3178                xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL);
3179        if (wip)
3180                xfs_trans_ijoin(tp, wip, XFS_ILOCK_EXCL);
3181
3182        /*
3183         * If we are using project inheritance, we only allow renames
3184         * into our tree when the project IDs are the same; else the
3185         * tree quota mechanism would be circumvented.
3186         */
3187        if (unlikely((target_dp->i_diflags & XFS_DIFLAG_PROJINHERIT) &&
3188                     target_dp->i_projid != src_ip->i_projid)) {
3189                error = -EXDEV;
3190                goto out_trans_cancel;
3191        }
3192
3193        /* RENAME_EXCHANGE is unique from here on; xfs_cross_rename() commits or cancels tp itself. */
3194        if (flags & RENAME_EXCHANGE)
3195                return xfs_cross_rename(tp, src_dp, src_name, src_ip,
3196                                        target_dp, target_name, target_ip,
3197                                        spaceres);
3198
3199        /*
3200         * Check for expected errors before we dirty the transaction
3201         * so we can return an error without a transaction abort.
3202         *
3203         * Extent count overflow check:
3204         *
3205         * From the perspective of src_dp, a rename operation is essentially a
3206         * directory entry remove operation. Hence the only place where we check
3207         * for extent count overflow for src_dp is in
3208         * xfs_bmap_del_extent_real(). xfs_bmap_del_extent_real() returns
3209         * -ENOSPC when it detects a possible extent count overflow and in
3210         * response, the higher layers of directory handling code do the
3211         * following:
3212         * 1. Data/Free blocks: XFS lets these blocks linger until a
3213         *    future remove operation removes them.
3214         * 2. Dabtree blocks: XFS swaps the blocks with the last block in the
3215         *    Leaf space and unmaps the last block.
3216         *
3217         * For target_dp, there are two cases depending on whether the
3218         * destination directory entry exists or not.
3219         *
3220         * When the destination directory entry does not exist (i.e. target_ip ==
3221         * NULL), the extent count overflow check is performed only when the
3222         * transaction has a non-zero sized space reservation associated with it.  With a
3223         * zero-sized space reservation, XFS allows a rename operation to
3224         * continue only when the directory has sufficient free space in its
3225         * data/leaf/free space blocks to hold the new entry.
3226         *
3227         * When the destination directory entry exists (i.e. target_ip != NULL), all
3228         * we need to do is change the inode number associated with the already
3229         * existing entry. Hence there is no need to perform an extent count
3230         * overflow check.
3231         */
3232        if (target_ip == NULL) {
3233                /*
3234                 * If there's no space reservation, check the entry will
3235                 * fit before actually inserting it.
3236                 */
3237                if (!spaceres) {
3238                        error = xfs_dir_canenter(tp, target_dp, target_name);
3239                        if (error)
3240                                goto out_trans_cancel;
3241                } else {
3242                        error = xfs_iext_count_may_overflow(target_dp,
3243                                        XFS_DATA_FORK,
3244                                        XFS_IEXT_DIR_MANIP_CNT(mp));
3245                        if (error)
3246                                goto out_trans_cancel;
3247                }
3248        } else {
3249                /*
3250                 * If the target exists and it's a directory, check whether
3251                 * it can be destroyed.
3252                 */
3253                if (S_ISDIR(VFS_I(target_ip)->i_mode) &&
3254                    (!xfs_dir_isempty(target_ip) ||
3255                     (VFS_I(target_ip)->i_nlink > 2))) {
3256                        error = -EEXIST;
3257                        goto out_trans_cancel;
3258                }
3259        }
3260
3261        /*
3262         * Lock the AGI buffers we need to handle bumping the nlink of the
3263         * whiteout inode off the unlinked list and to handle dropping the
3264         * nlink of the target inode.  Per locking order rules, do this in
3265         * increasing AG order and before directory block allocation tries to
3266         * grab AGFs because we grab AGIs before AGFs.
3267         *
3268         * The (vfs) caller must ensure that if src is a directory then
3269         * target_ip is either null or an empty directory.
3270         */
3271        for (i = 0; i < num_inodes && inodes[i] != NULL; i++) {
3272                if (inodes[i] == wip ||
3273                    (inodes[i] == target_ip &&
3274                     (VFS_I(target_ip)->i_nlink == 1 || src_is_directory))) {
3275                        struct xfs_buf  *bp;
3276                        xfs_agnumber_t  agno;
3277
3278                        agno = XFS_INO_TO_AGNO(mp, inodes[i]->i_ino);
3279                        error = xfs_read_agi(mp, tp, agno, &bp);
3280                        if (error)
3281                                goto out_trans_cancel;
3282                }
3283        }
3284
3285        /*
3286         * Directory entry creation below may acquire the AGF. Remove
3287         * the whiteout from the unlinked list first to preserve correct
3288         * AGI/AGF locking order. This dirties the transaction so failures
3289         * after this point will abort and log recovery will clean up the
3290         * mess.
3291         *
3292         * For whiteouts, we need to bump the link count on the whiteout
3293         * inode. After this point we have a real link, so clear the tmpfile
3294         * state flag from the inode so that it doesn't accidentally get
3295         * misused in the future.
3296         */
3297        if (wip) {
3298                struct xfs_perag        *pag;
3299
3300                ASSERT(VFS_I(wip)->i_nlink == 0);
3301
3302                pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, wip->i_ino));
3303                error = xfs_iunlink_remove(tp, pag, wip);
3304                xfs_perag_put(pag);
3305                if (error)
3306                        goto out_trans_cancel;
3307
3308                xfs_bumplink(tp, wip);
3309                VFS_I(wip)->i_state &= ~I_LINKABLE;
3310        }
3311
3312        /*
3313         * Set up the target.
3314         */
3315        if (target_ip == NULL) {
3316                /*
3317                 * If target does not exist and the rename crosses
3318                 * directories, adjust the target directory link count
3319                 * to account for the ".." reference from the new entry.
3320                 */
3321                error = xfs_dir_createname(tp, target_dp, target_name,
3322                                           src_ip->i_ino, spaceres);
3323                if (error)
3324                        goto out_trans_cancel;
3325
3326                xfs_trans_ichgtime(tp, target_dp,
3327                                        XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
3328
3329                if (new_parent && src_is_directory) {
3330                        xfs_bumplink(tp, target_dp);
3331                }
3332        } else { /* target_ip != NULL */
3333                /*
3334                 * Link the source inode under the target name.
3335                 * If the source inode is a directory and we are moving
3336                 * it across directories, its ".." entry will be
3337                 * inconsistent until we replace that down below.
3338                 *
3339                 * In case there is already an entry with the same
3340                 * name at the destination directory, remove it first.
3341                 */
3342                error = xfs_dir_replace(tp, target_dp, target_name,
3343                                        src_ip->i_ino, spaceres);
3344                if (error)
3345                        goto out_trans_cancel;
3346
3347                xfs_trans_ichgtime(tp, target_dp,
3348                                        XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
3349
3350                /*
3351                 * Decrement the link count on the target since the target
3352                 * dir no longer points to it.
3353                 */
3354                error = xfs_droplink(tp, target_ip);
3355                if (error)
3356                        goto out_trans_cancel;
3357
3358                if (src_is_directory) {
3359                        /*
3360                         * Drop the link from the old "." entry.
3361                         */
3362                        error = xfs_droplink(tp, target_ip);
3363                        if (error)
3364                                goto out_trans_cancel;
3365                }
3366        } /* target_ip != NULL */
3367
3368        /*
3369         * Remove the source.
3370         */
3371        if (new_parent && src_is_directory) {
3372                /*
3373                 * Rewrite the ".." entry to point to the new
3374                 * directory.
3375                 */
3376                error = xfs_dir_replace(tp, src_ip, &xfs_name_dotdot,
3377                                        target_dp->i_ino, spaceres);
3378                ASSERT(error != -EEXIST);
3379                if (error)
3380                        goto out_trans_cancel;
3381        }
3382
3383        /*
3384         * We always want to hit the ctime on the source inode.
3385         *
3386         * This isn't strictly required by the standards since the source
3387         * inode isn't really being changed, but old unix file systems did
3388         * it and some incremental backup programs won't work without it.
3389         */
3390        xfs_trans_ichgtime(tp, src_ip, XFS_ICHGTIME_CHG);
3391        xfs_trans_log_inode(tp, src_ip, XFS_ILOG_CORE);
3392
3393        /*
3394         * Adjust the link count on src_dp.  This is necessary when
3395         * renaming a directory, either within one parent when
3396         * the target existed, or across two parent directories.
3397         */
3398        if (src_is_directory && (new_parent || target_ip != NULL)) {
3399
3400                /*
3401                 * Decrement link count on src_directory since the
3402                 * entry that's moved no longer points to it.
3403                 */
3404                error = xfs_droplink(tp, src_dp);
3405                if (error)
3406                        goto out_trans_cancel;
3407        }
3408
3409        /*
3410         * For whiteouts, we only need to update the source dirent with the
3411         * inode number of the whiteout inode rather than removing it
3412         * altogether.
3413         */
3414        if (wip) {
3415                error = xfs_dir_replace(tp, src_dp, src_name, wip->i_ino,
3416                                        spaceres);
3417        } else {
3418                /*
3419                 * NOTE: We don't need to check for extent count overflow here
3420                 * because the dir remove name code will leave the dir block in
3421                 * place if the extent count would overflow.
3422                 */
3423                error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino,
3424                                           spaceres);
3425        }
3426
3427        if (error)
3428                goto out_trans_cancel;
3429
3430        xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
3431        xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE);
3432        if (new_parent)
3433                xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE);
3434
3435        error = xfs_finish_rename(tp);
3436        if (wip)
3437                xfs_irele(wip);
3438        return error;
3439
3440out_trans_cancel:
3441        xfs_trans_cancel(tp);
3442out_release_wip:
3443        if (wip)
3444                xfs_irele(wip);
3445        return error;
3446}
3447
3448static int
3449xfs_iflush(
3450        struct xfs_inode        *ip,
3451        struct xfs_buf          *bp)
3452{
3453        struct xfs_inode_log_item *iip = ip->i_itemp;
3454        struct xfs_dinode       *dip;
3455        struct xfs_mount        *mp = ip->i_mount;
3456        int                     error;
3457
3458        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
3459        ASSERT(xfs_iflags_test(ip, XFS_IFLUSHING));
3460        ASSERT(ip->i_df.if_format != XFS_DINODE_FMT_BTREE ||
3461               ip->i_df.if_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
3462        ASSERT(iip->ili_item.li_buf == bp);
3463
3464        dip = xfs_buf_offset(bp, ip->i_imap.im_boffset);
3465
3466        /*
3467         * We don't flush the inode if any of the following checks fail, but we
3468         * do still update the log item and attach to the backing buffer as if
3469         * the flush happened. This is a formality to facilitate predictable
3470         * error handling as the caller will shut down and fail the buffer.
3471         */
3472        error = -EFSCORRUPTED;
3473        if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC),
3474                               mp, XFS_ERRTAG_IFLUSH_1)) {
3475                xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
3476                        "%s: Bad inode %Lu magic number 0x%x, ptr "PTR_FMT,
3477                        __func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip);
3478                goto flush_out;
3479        }
3480        if (S_ISREG(VFS_I(ip)->i_mode)) {
3481                if (XFS_TEST_ERROR(
3482                    ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS &&
3483                    ip->i_df.if_format != XFS_DINODE_FMT_BTREE,
3484                    mp, XFS_ERRTAG_IFLUSH_3)) {
3485                        xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
3486                                "%s: Bad regular inode %Lu, ptr "PTR_FMT,
3487                                __func__, ip->i_ino, ip);
3488                        goto flush_out;
3489                }
3490        } else if (S_ISDIR(VFS_I(ip)->i_mode)) {
3491                if (XFS_TEST_ERROR(
3492                    ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS &&
3493                    ip->i_df.if_format != XFS_DINODE_FMT_BTREE &&
3494                    ip->i_df.if_format != XFS_DINODE_FMT_LOCAL,
3495                    mp, XFS_ERRTAG_IFLUSH_4)) {
3496                        xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
3497                                "%s: Bad directory inode %Lu, ptr "PTR_FMT,
3498                                __func__, ip->i_ino, ip);
3499                        goto flush_out;
3500                }
3501        }
3502        if (XFS_TEST_ERROR(ip->i_df.if_nextents + xfs_ifork_nextents(ip->i_afp) >
3503                                ip->i_nblocks, mp, XFS_ERRTAG_IFLUSH_5)) {
3504                xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
3505                        "%s: detected corrupt incore inode %Lu, "
3506                        "total extents = %d, nblocks = %Ld, ptr "PTR_FMT,
3507                        __func__, ip->i_ino,
3508                        ip->i_df.if_nextents + xfs_ifork_nextents(ip->i_afp),
3509                        ip->i_nblocks, ip);
3510                goto flush_out;
3511        }
3512        if (XFS_TEST_ERROR(ip->i_forkoff > mp->m_sb.sb_inodesize,
3513                                mp, XFS_ERRTAG_IFLUSH_6)) {
3514                xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
3515                        "%s: bad inode %Lu, forkoff 0x%x, ptr "PTR_FMT,
3516                        __func__, ip->i_ino, ip->i_forkoff, ip);
3517                goto flush_out;
3518        }
3519
3520        /*
3521         * Inode item log recovery for v2 inodes is dependent on the flushiter
3522         * count for correct sequencing.  We bump the flush iteration count so
3523         * we can detect flushes which postdate a log record during recovery.
3524         * This is redundant as we now log every change and hence this can't
3525         * happen, but we still need to do it to ensure backwards compatibility
3526         * with old kernels that predate logging all inode changes.
3527         */
3528        if (!xfs_has_v3inodes(mp))
3529                ip->i_flushiter++;
3530
3531        /*
3532         * If there are inline format data / attr forks attached to this inode,
3533         * make sure they are not corrupt.
3534         */
3535        if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL &&
3536            xfs_ifork_verify_local_data(ip))
3537                goto flush_out;
3538        if (ip->i_afp && ip->i_afp->if_format == XFS_DINODE_FMT_LOCAL &&
3539            xfs_ifork_verify_local_attr(ip))
3540                goto flush_out;
3541
3542        /*
3543         * Copy the dirty parts of the inode into the on-disk inode.  We always
3544         * copy out the core of the inode, because if the inode is dirty at all
3545         * the core must be.
3546         */
3547        xfs_inode_to_disk(ip, dip, iip->ili_item.li_lsn);
3548
3549        /* Wrap, we never let the log put out DI_MAX_FLUSH */
3550        if (!xfs_has_v3inodes(mp)) {
3551                if (ip->i_flushiter == DI_MAX_FLUSH)
3552                        ip->i_flushiter = 0;
3553        }
3554
3555        xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK);
3556        if (XFS_IFORK_Q(ip))
3557                xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK);
3558
3559        /*
3560         * We've recorded everything logged in the inode, so we'd like to clear
3561         * the ili_fields bits so we don't log and flush things unnecessarily.
3562         * However, we can't stop logging all this information until the data
3563         * we've copied into the disk buffer is written to disk.  If we did we
3564         * might overwrite the copy of the inode in the log with all the data
3565         * after re-logging only part of it, and in the face of a crash we
3566         * wouldn't have all the data we need to recover.
3567         *
3568         * What we do is move the bits to the ili_last_fields field.  When
3569         * logging the inode, these bits are moved back to the ili_fields field.
3570         * In the xfs_buf_inode_iodone() routine we clear ili_last_fields, since
3571         * we know that the information those bits represent is permanently on
3572         * disk.  As long as the flush completes before the inode is logged
3573         * again, then both ili_fields and ili_last_fields will be cleared.
3574         */
3575        error = 0;
3576flush_out:
3577        spin_lock(&iip->ili_lock);
3578        iip->ili_last_fields = iip->ili_fields;
3579        iip->ili_fields = 0;
3580        iip->ili_fsync_fields = 0;
3581        spin_unlock(&iip->ili_lock);
3582
3583        /*
3584         * Store the current LSN of the inode so that we can tell whether the
3585         * item has moved in the AIL from xfs_buf_inode_iodone().
3586         */
3587        xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
3588                                &iip->ili_item.li_lsn);
3589
3590        /* generate the checksum. */
3591        xfs_dinode_calc_crc(mp, dip);
3592        return error;
3593}
3594
3595/*
3596 * Non-blocking flush of dirty inode metadata into the backing buffer.
3597 *
3598 * The caller must have a reference to the inode and hold the cluster buffer
3599 * locked. The function walks all the inodes attached to the cluster buffer that
3600 * it can find and lock without blocking, and flushes them to the cluster buffer.
3601 *
3602 * On successful flushing of at least one inode, the caller must write out the
3603 * buffer and release it. If no inodes are flushed, -EAGAIN will be returned and
3604 * the caller needs to release the buffer. On failure, the filesystem will be
3605 * shut down, the buffer will have been unlocked and released, and EFSCORRUPTED
3606 * will be returned.
3607 */
3608int
3609xfs_iflush_cluster(
3610        struct xfs_buf          *bp)
3611{
3612        struct xfs_mount        *mp = bp->b_mount;
3613        struct xfs_log_item     *lip, *n;
3614        struct xfs_inode        *ip;
3615        struct xfs_inode_log_item *iip;
3616        int                     clcount = 0;
3617        int                     error = 0;
3618
3619        /*
3620         * We must use the safe variant here as on shutdown xfs_iflush_abort()
3621         * can remove itself from the list.
3622         */
3623        list_for_each_entry_safe(lip, n, &bp->b_li_list, li_bio_list) {
3624                iip = (struct xfs_inode_log_item *)lip;
3625                ip = iip->ili_inode;
3626
3627                /*
3628                 * Quick and dirty check to avoid locks if possible.
3629                 */
3630                if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLUSHING))
3631                        continue;
3632                if (xfs_ipincount(ip))
3633                        continue;
3634
3635                /*
3636                 * The inode is still attached to the buffer, which means it is
3637                 * dirty but reclaim might try to grab it. Check carefully for
3638                 * that, and grab the ilock while still holding the i_flags_lock
3639                 * to guarantee reclaim will not be able to reclaim this inode
3640                 * once we drop the i_flags_lock.
3641                 */
3642                spin_lock(&ip->i_flags_lock);
3643                ASSERT(!__xfs_iflags_test(ip, XFS_ISTALE));
3644                if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLUSHING)) {
3645                        spin_unlock(&ip->i_flags_lock);
3646                        continue;
3647                }
3648
3649                /*
3650                 * ILOCK will pin the inode against reclaim and prevent
3651                 * concurrent transactions modifying the inode while we are
3652                 * flushing the inode. If we get the lock, set the flushing
3653                 * state before we drop the i_flags_lock.
3654                 */
3655                if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
3656                        spin_unlock(&ip->i_flags_lock);
3657                        continue;
3658                }
3659                __xfs_iflags_set(ip, XFS_IFLUSHING);
3660                spin_unlock(&ip->i_flags_lock);
3661
3662                /*
3663                 * Abort flushing this inode if we are shut down because the
3664                 * inode may not currently be in the AIL. This can occur when
3665                 * log I/O failure unpins the inode without inserting into the
3666                 * AIL, leaving a dirty/unpinned inode attached to the buffer
3667                 * that otherwise looks like it should be flushed.
3668                 */
3669                if (xfs_is_shutdown(mp)) {
3670                        xfs_iunpin_wait(ip);
3671                        xfs_iflush_abort(ip);
3672                        xfs_iunlock(ip, XFS_ILOCK_SHARED);
3673                        error = -EIO;
3674                        continue;
3675                }
3676
3677                /* don't block waiting on a log force to unpin dirty inodes */
3678                if (xfs_ipincount(ip)) {
3679                        xfs_iflags_clear(ip, XFS_IFLUSHING);
3680                        xfs_iunlock(ip, XFS_ILOCK_SHARED);
3681                        continue;
3682                }
3683
3684                if (!xfs_inode_clean(ip))
3685                        error = xfs_iflush(ip, bp);
3686                else
3687                        xfs_iflags_clear(ip, XFS_IFLUSHING);
3688                xfs_iunlock(ip, XFS_ILOCK_SHARED);
3689                if (error)
3690                        break;
3691                clcount++;
3692        }
3693
3694        if (error) {
3695                bp->b_flags |= XBF_ASYNC;
3696                xfs_buf_ioend_fail(bp);
3697                xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
3698                return error;
3699        }
3700
3701        if (!clcount)
3702                return -EAGAIN;
3703
3704        XFS_STATS_INC(mp, xs_icluster_flushcnt);
3705        XFS_STATS_ADD(mp, xs_icluster_flushinode, clcount);
3706        return 0;
3707
3708}
3709
3710/* Release an inode. */
3711void
3712xfs_irele(
3713        struct xfs_inode        *ip)
3714{
3715        trace_xfs_irele(ip, _RET_IP_);
3716        iput(VFS_I(ip));
3717}
3718
3719/*
3720 * Ensure all committed transactions touching the inode are written to the log.
3721 */
3722int
3723xfs_log_force_inode(
3724        struct xfs_inode        *ip)
3725{
3726        xfs_csn_t               seq = 0;
3727
3728        xfs_ilock(ip, XFS_ILOCK_SHARED);
3729        if (xfs_ipincount(ip))
3730                seq = ip->i_itemp->ili_commit_seq;
3731        xfs_iunlock(ip, XFS_ILOCK_SHARED);
3732
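        /*
         * An unpinned inode has no committed changes that are still only in
         * the in-memory log, so there is no CIL sequence to force and we can
         * return immediately.
         */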
3733        if (!seq)
3734                return 0;
3735        return xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC, NULL);
3736}
3737
3738/*
3739 * Grab the exclusive iolock for a data copy from src to dest, making sure to
3740 * abide by the vfs locking order (lowest pointer value goes first) and to break
3741 * the layout leases before proceeding.  The loop is needed because we cannot call
3742 * the blocking break_layout() with the iolocks held, and therefore have to
3743 * back out both locks.
3744 */
3745static int
3746xfs_iolock_two_inodes_and_break_layout(
3747        struct inode            *src,
3748        struct inode            *dest)
3749{
3750        int                     error;
3751
3752        if (src > dest)
3753                swap(src, dest);
3754
3755retry:
3756        /* Wait to break both inodes' layouts before we start locking. */
3757        error = break_layout(src, true);
3758        if (error)
3759                return error;
3760        if (src != dest) {
3761                error = break_layout(dest, true);
3762                if (error)
3763                        return error;
3764        }
3765
3766        /* Lock one inode and make sure nobody got in and leased it. */
3767        inode_lock(src);
3768        error = break_layout(src, false);
3769        if (error) {
3770                inode_unlock(src);
3771                if (error == -EWOULDBLOCK)
3772                        goto retry;
3773                return error;
3774        }
3775
3776        if (src == dest)
3777                return 0;
3778
3779        /* Lock the other inode and make sure nobody got in and leased it. */
3780        inode_lock_nested(dest, I_MUTEX_NONDIR2);
3781        error = break_layout(dest, false);
3782        if (error) {
3783                inode_unlock(src);
3784                inode_unlock(dest);
3785                if (error == -EWOULDBLOCK)
3786                        goto retry;
3787                return error;
3788        }
3789
3790        return 0;
3791}
3792
3793/*
3794 * Lock two inodes so that userspace cannot initiate I/O via file syscalls or
3795 * mmap activity.
3796 */
3797int
3798xfs_ilock2_io_mmap(
3799        struct xfs_inode        *ip1,
3800        struct xfs_inode        *ip2)
3801{
3802        int                     ret;
3803
3804        ret = xfs_iolock_two_inodes_and_break_layout(VFS_I(ip1), VFS_I(ip2));
3805        if (ret)
3806                return ret;
3807        filemap_invalidate_lock_two(VFS_I(ip1)->i_mapping,
3808                                    VFS_I(ip2)->i_mapping);
3809        return 0;
3810}
3811
3812/* Unlock both inodes to allow IO and mmap activity. */
3813void
3814xfs_iunlock2_io_mmap(
3815        struct xfs_inode        *ip1,
3816        struct xfs_inode        *ip2)
3817{
3818        filemap_invalidate_unlock_two(VFS_I(ip1)->i_mapping,
3819                                      VFS_I(ip2)->i_mapping);
3820        inode_unlock(VFS_I(ip2));
3821        if (ip1 != ip2)
3822                inode_unlock(VFS_I(ip1));
3823}
3824