linux/fs/xfs/xfs_inode.c
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
 * All Rights Reserved.
 */
#include <linux/iversion.h>

#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_inode.h"
#include "xfs_dir2.h"
#include "xfs_attr.h"
#include "xfs_trans_space.h"
#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_inode_item.h"
#include "xfs_ialloc.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_quota.h"
#include "xfs_filestream.h"
#include "xfs_trace.h"
#include "xfs_icache.h"
#include "xfs_symlink.h"
#include "xfs_trans_priv.h"
#include "xfs_log.h"
#include "xfs_bmap_btree.h"
#include "xfs_reflink.h"

kmem_zone_t *xfs_inode_zone;

/*
 * Used in xfs_itruncate_extents().  This is the maximum number of extents
 * freed from a file in a single transaction.
 */
#define XFS_ITRUNC_MAX_EXTENTS  2

STATIC int xfs_iflush_int(struct xfs_inode *, struct xfs_buf *);
STATIC int xfs_iunlink(struct xfs_trans *, struct xfs_inode *);
STATIC int xfs_iunlink_remove(struct xfs_trans *, struct xfs_inode *);

/*
 * Helper function to extract the extent size hint from an inode.
 */
xfs_extlen_t
xfs_get_extsz_hint(
        struct xfs_inode        *ip)
{
        if ((ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE) && ip->i_d.di_extsize)
                return ip->i_d.di_extsize;
        if (XFS_IS_REALTIME_INODE(ip))
                return ip->i_mount->m_sb.sb_rextsize;
        return 0;
}

/*
 * Helper function to extract the CoW extent size hint from an inode.
 * Between the extent size hint and the CoW extent size hint, we
 * return the greater of the two.  If the value is zero (automatic),
 * use the default size.
 */
xfs_extlen_t
xfs_get_cowextsz_hint(
        struct xfs_inode        *ip)
{
        xfs_extlen_t            a, b;

        a = 0;
        if (ip->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE)
                a = ip->i_d.di_cowextsize;
        b = xfs_get_extsz_hint(ip);

        a = max(a, b);
        if (a == 0)
                return XFS_DEFAULT_COWEXTSZ_HINT;
        return a;
}
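
/*
 * Worked example (illustrative only, derived from the logic above): with
 * XFS_DIFLAG2_COWEXTSIZE clear and an extent size hint of 16 blocks,
 * xfs_get_cowextsz_hint() returns 16; with both hints zero it falls back
 * to XFS_DEFAULT_COWEXTSZ_HINT.
 */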

/*
 * These two are wrapper routines around the xfs_ilock() routine used to
 * centralize some grungy code.  They are used in places that wish to lock the
 * inode solely for reading the extents.  The reason these places can't just
 * call xfs_ilock(ip, XFS_ILOCK_SHARED) is that the inode lock also guards the
 * bringing in of the extents from disk for a file in b-tree format.  If the
 * inode is in b-tree format, then we need to lock the inode exclusively until
 * the extents are read in.  Locking it exclusively all the time would limit
 * our parallelism unnecessarily, though.  What we do instead is check to see
 * if the extents have been read in yet, and only lock the inode exclusively
 * if they have not.
 *
 * The functions return a value which should be given to the corresponding
 * xfs_iunlock() call.
 */
uint
xfs_ilock_data_map_shared(
        struct xfs_inode        *ip)
{
        uint                    lock_mode = XFS_ILOCK_SHARED;

        if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE &&
            (ip->i_df.if_flags & XFS_IFEXTENTS) == 0)
                lock_mode = XFS_ILOCK_EXCL;
        xfs_ilock(ip, lock_mode);
        return lock_mode;
}

uint
xfs_ilock_attr_map_shared(
        struct xfs_inode        *ip)
{
        uint                    lock_mode = XFS_ILOCK_SHARED;

        if (ip->i_d.di_aformat == XFS_DINODE_FMT_BTREE &&
            (ip->i_afp->if_flags & XFS_IFEXTENTS) == 0)
                lock_mode = XFS_ILOCK_EXCL;
        xfs_ilock(ip, lock_mode);
        return lock_mode;
}
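
/*
 * Typical caller pattern (a sketch, not a quote of any one call site):
 * take the returned lock mode, read the extent data, and hand the same
 * mode back to xfs_iunlock():
 *
 *        lock_mode = xfs_ilock_data_map_shared(ip);
 *        ... walk the ip->i_df extent list ...
 *        xfs_iunlock(ip, lock_mode);
 */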

/*
 * In addition to i_rwsem in the VFS inode, the xfs inode contains 2
 * multi-reader locks: i_mmap_lock and the i_lock.  This routine allows
 * various combinations of the locks to be obtained.
 *
 * The 3 locks should always be ordered so that the IO lock is obtained first,
 * the mmap lock second and the ilock last in order to prevent deadlock.
 *
 * Basic locking order:
 *
 * i_rwsem -> i_mmap_lock -> page_lock -> i_ilock
 *
 * mmap_sem locking order:
 *
 * i_rwsem -> page lock -> mmap_sem
 * mmap_sem -> i_mmap_lock -> page_lock
 *
 * The difference in mmap_sem locking order means that we cannot hold the
 * i_mmap_lock over syscall based read(2)/write(2) based IO. These IO paths can
 * fault in pages during copy in/out (for buffered IO) or require the mmap_sem
 * in get_user_pages() to map the user pages into the kernel address space for
 * direct IO. Similarly the i_rwsem cannot be taken inside a page fault because
 * page faults already hold the mmap_sem.
 *
 * Hence to serialise fully against both syscall and mmap based IO, we need to
 * take both the i_rwsem and the i_mmap_lock. These locks should *only* be both
 * taken in places where we need to invalidate the page cache in a race
 * free manner (e.g. truncate, hole punch and other extent manipulation
 * functions).
 */
void
xfs_ilock(
        xfs_inode_t             *ip,
        uint                    lock_flags)
{
        trace_xfs_ilock(ip, lock_flags, _RET_IP_);

        /*
         * You can't set both SHARED and EXCL for the same lock,
         * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
         * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
         */
        ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
               (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
        ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) !=
               (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL));
        ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
               (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
        ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0);

        if (lock_flags & XFS_IOLOCK_EXCL) {
                down_write_nested(&VFS_I(ip)->i_rwsem,
                                  XFS_IOLOCK_DEP(lock_flags));
        } else if (lock_flags & XFS_IOLOCK_SHARED) {
                down_read_nested(&VFS_I(ip)->i_rwsem,
                                 XFS_IOLOCK_DEP(lock_flags));
        }

        if (lock_flags & XFS_MMAPLOCK_EXCL)
                mrupdate_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags));
        else if (lock_flags & XFS_MMAPLOCK_SHARED)
                mraccess_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags));

        if (lock_flags & XFS_ILOCK_EXCL)
                mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
        else if (lock_flags & XFS_ILOCK_SHARED)
                mraccess_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
}
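
/*
 * Illustrative sketch of the "serialise against both syscall and mmap IO"
 * rule described above (hypothetical call site, following the documented
 * ordering; the routine takes the classes in the correct order itself):
 *
 *        xfs_ilock(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL | XFS_ILOCK_EXCL);
 *        ... invalidate the page cache and manipulate extents ...
 *        xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL | XFS_ILOCK_EXCL);
 */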

/*
 * This is just like xfs_ilock(), except that the caller
 * is guaranteed not to sleep.  It returns 1 if it gets
 * the requested locks and 0 otherwise.  If the IO lock is
 * obtained but the inode lock cannot be, then the IO lock
 * is dropped before returning.
 *
 * ip -- the inode being locked
 * lock_flags -- this parameter indicates the inode's locks to be
 *       locked.  See the comment for xfs_ilock() for a list
 *       of valid values.
 */
int
xfs_ilock_nowait(
        xfs_inode_t             *ip,
        uint                    lock_flags)
{
        trace_xfs_ilock_nowait(ip, lock_flags, _RET_IP_);

        /*
         * You can't set both SHARED and EXCL for the same lock,
         * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
         * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
         */
        ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
               (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
        ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) !=
               (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL));
        ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
               (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
        ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0);

        if (lock_flags & XFS_IOLOCK_EXCL) {
                if (!down_write_trylock(&VFS_I(ip)->i_rwsem))
                        goto out;
        } else if (lock_flags & XFS_IOLOCK_SHARED) {
                if (!down_read_trylock(&VFS_I(ip)->i_rwsem))
                        goto out;
        }

        if (lock_flags & XFS_MMAPLOCK_EXCL) {
                if (!mrtryupdate(&ip->i_mmaplock))
                        goto out_undo_iolock;
        } else if (lock_flags & XFS_MMAPLOCK_SHARED) {
                if (!mrtryaccess(&ip->i_mmaplock))
                        goto out_undo_iolock;
        }

        if (lock_flags & XFS_ILOCK_EXCL) {
                if (!mrtryupdate(&ip->i_lock))
                        goto out_undo_mmaplock;
        } else if (lock_flags & XFS_ILOCK_SHARED) {
                if (!mrtryaccess(&ip->i_lock))
                        goto out_undo_mmaplock;
        }
        return 1;

out_undo_mmaplock:
        if (lock_flags & XFS_MMAPLOCK_EXCL)
                mrunlock_excl(&ip->i_mmaplock);
        else if (lock_flags & XFS_MMAPLOCK_SHARED)
                mrunlock_shared(&ip->i_mmaplock);
out_undo_iolock:
        if (lock_flags & XFS_IOLOCK_EXCL)
                up_write(&VFS_I(ip)->i_rwsem);
        else if (lock_flags & XFS_IOLOCK_SHARED)
                up_read(&VFS_I(ip)->i_rwsem);
out:
        return 0;
}
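
/*
 * Sketch of a nonblocking caller (hypothetical): trylock paths back off
 * rather than sleeping on contention.
 *
 *        if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))
 *                return -EAGAIN;        (caller retries later)
 */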

/*
 * xfs_iunlock() is used to drop the inode locks acquired with
 * xfs_ilock() and xfs_ilock_nowait().  The caller must pass
 * in the flags given to xfs_ilock() or xfs_ilock_nowait() so
 * that we know which locks to drop.
 *
 * ip -- the inode being unlocked
 * lock_flags -- this parameter indicates the inode's locks to be
 *       unlocked.  See the comment for xfs_ilock() for a list
 *       of valid values for this parameter.
 */
void
xfs_iunlock(
        xfs_inode_t             *ip,
        uint                    lock_flags)
{
        /*
         * You can't set both SHARED and EXCL for the same lock,
         * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
         * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
         */
        ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
               (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
        ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) !=
               (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL));
        ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
               (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
        ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0);
        ASSERT(lock_flags != 0);

        if (lock_flags & XFS_IOLOCK_EXCL)
                up_write(&VFS_I(ip)->i_rwsem);
        else if (lock_flags & XFS_IOLOCK_SHARED)
                up_read(&VFS_I(ip)->i_rwsem);

        if (lock_flags & XFS_MMAPLOCK_EXCL)
                mrunlock_excl(&ip->i_mmaplock);
        else if (lock_flags & XFS_MMAPLOCK_SHARED)
                mrunlock_shared(&ip->i_mmaplock);

        if (lock_flags & XFS_ILOCK_EXCL)
                mrunlock_excl(&ip->i_lock);
        else if (lock_flags & XFS_ILOCK_SHARED)
                mrunlock_shared(&ip->i_lock);

        trace_xfs_iunlock(ip, lock_flags, _RET_IP_);
}

/*
 * Give up write locks.  The i/o lock cannot be held nested
 * if it is being demoted.
 */
void
xfs_ilock_demote(
        xfs_inode_t             *ip,
        uint                    lock_flags)
{
        ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_MMAPLOCK_EXCL|XFS_ILOCK_EXCL));
        ASSERT((lock_flags &
                ~(XFS_IOLOCK_EXCL|XFS_MMAPLOCK_EXCL|XFS_ILOCK_EXCL)) == 0);

        if (lock_flags & XFS_ILOCK_EXCL)
                mrdemote(&ip->i_lock);
        if (lock_flags & XFS_MMAPLOCK_EXCL)
                mrdemote(&ip->i_mmaplock);
        if (lock_flags & XFS_IOLOCK_EXCL)
                downgrade_write(&VFS_I(ip)->i_rwsem);

        trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_);
}
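
/*
 * Illustrative demotion pattern (a sketch, not a specific call site):
 * do the exclusive-only work first, then continue under the shared lock
 * so that readers can make progress.
 *
 *        xfs_ilock(ip, XFS_IOLOCK_EXCL);
 *        ... exclusive-only setup ...
 *        xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
 *        ... remainder runs holding XFS_IOLOCK_SHARED ...
 *        xfs_iunlock(ip, XFS_IOLOCK_SHARED);
 */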

#if defined(DEBUG) || defined(XFS_WARN)
int
xfs_isilocked(
        xfs_inode_t             *ip,
        uint                    lock_flags)
{
        if (lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) {
                if (!(lock_flags & XFS_ILOCK_SHARED))
                        return !!ip->i_lock.mr_writer;
                return rwsem_is_locked(&ip->i_lock.mr_lock);
        }

        if (lock_flags & (XFS_MMAPLOCK_EXCL|XFS_MMAPLOCK_SHARED)) {
                if (!(lock_flags & XFS_MMAPLOCK_SHARED))
                        return !!ip->i_mmaplock.mr_writer;
                return rwsem_is_locked(&ip->i_mmaplock.mr_lock);
        }

        if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) {
                if (!(lock_flags & XFS_IOLOCK_SHARED))
                        return !debug_locks ||
                                lockdep_is_held_type(&VFS_I(ip)->i_rwsem, 0);
                return rwsem_is_locked(&VFS_I(ip)->i_rwsem);
        }

        ASSERT(0);
        return 0;
}
#endif

/*
 * xfs_lockdep_subclass_ok() is only used in an ASSERT, so is only called when
 * DEBUG or XFS_WARN is set. And MAX_LOCKDEP_SUBCLASSES is then only defined
 * when CONFIG_LOCKDEP is set. Hence the complex define below to avoid build
 * errors and warnings.
 */
#if (defined(DEBUG) || defined(XFS_WARN)) && defined(CONFIG_LOCKDEP)
static bool
xfs_lockdep_subclass_ok(
        int subclass)
{
        return subclass < MAX_LOCKDEP_SUBCLASSES;
}
#else
#define xfs_lockdep_subclass_ok(subclass)       (true)
#endif

/*
 * Bump the subclass so xfs_lock_inodes() acquires each lock with a different
 * value. This can be called for any type of inode lock combination, including
 * parent locking. Care must be taken to ensure we don't overrun the subclass
 * storage fields in the class mask we build.
 */
static inline int
xfs_lock_inumorder(int lock_mode, int subclass)
{
        int     class = 0;

        ASSERT(!(lock_mode & (XFS_ILOCK_PARENT | XFS_ILOCK_RTBITMAP |
                              XFS_ILOCK_RTSUM)));
        ASSERT(xfs_lockdep_subclass_ok(subclass));

        if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) {
                ASSERT(subclass <= XFS_IOLOCK_MAX_SUBCLASS);
                class += subclass << XFS_IOLOCK_SHIFT;
        }

        if (lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) {
                ASSERT(subclass <= XFS_MMAPLOCK_MAX_SUBCLASS);
                class += subclass << XFS_MMAPLOCK_SHIFT;
        }

        if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) {
                ASSERT(subclass <= XFS_ILOCK_MAX_SUBCLASS);
                class += subclass << XFS_ILOCK_SHIFT;
        }

        return (lock_mode & ~XFS_LOCK_SUBCLASS_MASK) | class;
}
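
/*
 * Worked example (illustrative): for the third inode in a multi-inode
 * ilock (subclass 2), xfs_lock_inumorder(XFS_ILOCK_EXCL, 2) evaluates to
 * XFS_ILOCK_EXCL | (2 << XFS_ILOCK_SHIFT), so lockdep sees a distinct
 * subclass for each lock taken by xfs_lock_inodes() below.
 */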

/*
 * The following routine will lock n inodes in exclusive mode.  We assume the
 * caller calls us with the inodes in i_ino order.
 *
 * We need to detect deadlock where an inode that we lock is in the AIL and we
 * start waiting for another inode that is locked by a thread in a long running
 * transaction (such as truncate). This can result in deadlock since the long
 * running trans might need to wait for the inode we just locked in order to
 * push the tail and free space in the log.
 *
 * xfs_lock_inodes() can only be used to lock one type of lock at a time -
 * the iolock, the mmaplock or the ilock, but not more than one at a time. If we
 * lock more than one at a time, lockdep will report false positives saying we
 * have violated locking orders.
 */
static void
xfs_lock_inodes(
        struct xfs_inode        **ips,
        int                     inodes,
        uint                    lock_mode)
{
        int                     attempts = 0, i, j, try_lock;
        struct xfs_log_item     *lp;

        /*
         * Currently supports between 2 and 5 inodes with exclusive locking.  We
         * support an arbitrary depth of locking here, but absolute limits on
         * inodes depend on the type of locking and the limits placed by
         * lockdep annotations in xfs_lock_inumorder.  These are all checked by
         * the asserts.
         */
        ASSERT(ips && inodes >= 2 && inodes <= 5);
        ASSERT(lock_mode & (XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL |
                            XFS_ILOCK_EXCL));
        ASSERT(!(lock_mode & (XFS_IOLOCK_SHARED | XFS_MMAPLOCK_SHARED |
                              XFS_ILOCK_SHARED)));
        ASSERT(!(lock_mode & XFS_MMAPLOCK_EXCL) ||
                inodes <= XFS_MMAPLOCK_MAX_SUBCLASS + 1);
        ASSERT(!(lock_mode & XFS_ILOCK_EXCL) ||
                inodes <= XFS_ILOCK_MAX_SUBCLASS + 1);

        if (lock_mode & XFS_IOLOCK_EXCL) {
                ASSERT(!(lock_mode & (XFS_MMAPLOCK_EXCL | XFS_ILOCK_EXCL)));
        } else if (lock_mode & XFS_MMAPLOCK_EXCL)
                ASSERT(!(lock_mode & XFS_ILOCK_EXCL));

        try_lock = 0;
        i = 0;
again:
        for (; i < inodes; i++) {
                ASSERT(ips[i]);

                if (i && (ips[i] == ips[i - 1]))        /* Already locked */
                        continue;

                /*
                 * If try_lock is not set yet, make sure all locked inodes are
                 * not in the AIL.  If any are, set try_lock to be used later.
                 */
                if (!try_lock) {
                        for (j = (i - 1); j >= 0 && !try_lock; j--) {
                                lp = &ips[j]->i_itemp->ili_item;
                                if (lp && test_bit(XFS_LI_IN_AIL, &lp->li_flags))
                                        try_lock++;
                        }
                }

                /*
                 * If any of the previous locks we have locked is in the AIL,
                 * we must TRY to get the second and subsequent locks. If
                 * we can't get any, we must release all we have
                 * and try again.
                 */
                if (!try_lock) {
                        xfs_ilock(ips[i], xfs_lock_inumorder(lock_mode, i));
                        continue;
                }

                /* try_lock means we have an inode locked that is in the AIL. */
                ASSERT(i != 0);
                if (xfs_ilock_nowait(ips[i], xfs_lock_inumorder(lock_mode, i)))
                        continue;

                /*
                 * Unlock all previous guys and try again.  xfs_iunlock will try
                 * to push the tail if the inode is in the AIL.
                 */
                attempts++;
                for (j = i - 1; j >= 0; j--) {
                        /*
                         * Check to see if we've already unlocked this one.  Not
                         * the first one going back, and the inode ptr is the
                         * same.
                         */
                        if (j != (i - 1) && ips[j] == ips[j + 1])
                                continue;

                        xfs_iunlock(ips[j], lock_mode);
                }

                if ((attempts % 5) == 0) {
                        delay(1); /* Don't just spin the CPU */
                }
                i = 0;
                try_lock = 0;
                goto again;
        }
}
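
/*
 * Usage sketch (hypothetical caller; the rename path locks up to four
 * inodes this way): the array must already be sorted by i_ino, and
 * duplicate pointers are locked only once.
 *
 *        struct xfs_inode        *ips[2] = { dp, ip };  (dp->i_ino < ip->i_ino)
 *        xfs_lock_inodes(ips, 2, XFS_ILOCK_EXCL);
 */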

/*
 * xfs_lock_two_inodes() can only be used to lock one type of lock at a time -
 * the mmaplock or the ilock, but not more than one type at a time. If we lock
 * more than one at a time, lockdep will report false positives saying we have
 * violated locking orders.  The iolock must be double-locked separately since
 * we use i_rwsem for that.  We now support taking one lock EXCL and the other
 * SHARED.
 */
void
xfs_lock_two_inodes(
        struct xfs_inode        *ip0,
        uint                    ip0_mode,
        struct xfs_inode        *ip1,
        uint                    ip1_mode)
{
        struct xfs_inode        *temp;
        uint                    mode_temp;
        int                     attempts = 0;
        struct xfs_log_item     *lp;

        ASSERT(hweight32(ip0_mode) == 1);
        ASSERT(hweight32(ip1_mode) == 1);
        ASSERT(!(ip0_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)));
        ASSERT(!(ip1_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)));
        ASSERT(!(ip0_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) ||
               !(ip0_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)));
        ASSERT(!(ip1_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) ||
               !(ip1_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)));
        ASSERT(!(ip1_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) ||
               !(ip0_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)));
        ASSERT(!(ip0_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) ||
               !(ip1_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)));

        ASSERT(ip0->i_ino != ip1->i_ino);

        if (ip0->i_ino > ip1->i_ino) {
                temp = ip0;
                ip0 = ip1;
                ip1 = temp;
                mode_temp = ip0_mode;
                ip0_mode = ip1_mode;
                ip1_mode = mode_temp;
        }

 again:
        xfs_ilock(ip0, xfs_lock_inumorder(ip0_mode, 0));

        /*
         * If the first lock we have locked is in the AIL, we must TRY to get
         * the second lock. If we can't get it, we must release the first one
         * and try again.
         */
        lp = &ip0->i_itemp->ili_item;
        if (lp && test_bit(XFS_LI_IN_AIL, &lp->li_flags)) {
                if (!xfs_ilock_nowait(ip1, xfs_lock_inumorder(ip1_mode, 1))) {
                        xfs_iunlock(ip0, ip0_mode);
                        if ((++attempts % 5) == 0)
                                delay(1); /* Don't just spin the CPU */
                        goto again;
                }
        } else {
                xfs_ilock(ip1, xfs_lock_inumorder(ip1_mode, 1));
        }
}
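
/*
 * Example from later in this file (xfs_link()): both inodes' ilocks are
 * taken exclusively in one call, and the routine sorts them into i_ino
 * order internally:
 *
 *        xfs_lock_two_inodes(sip, XFS_ILOCK_EXCL, tdp, XFS_ILOCK_EXCL);
 */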

void
__xfs_iflock(
        struct xfs_inode        *ip)
{
        wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT);
        DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT);

        do {
                prepare_to_wait_exclusive(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
                if (xfs_isiflocked(ip))
                        io_schedule();
        } while (!xfs_iflock_nowait(ip));

        finish_wait(wq, &wait.wq_entry);
}

STATIC uint
_xfs_dic2xflags(
        uint16_t                di_flags,
        uint64_t                di_flags2,
        bool                    has_attr)
{
        uint                    flags = 0;

        if (di_flags & XFS_DIFLAG_ANY) {
                if (di_flags & XFS_DIFLAG_REALTIME)
                        flags |= FS_XFLAG_REALTIME;
                if (di_flags & XFS_DIFLAG_PREALLOC)
                        flags |= FS_XFLAG_PREALLOC;
                if (di_flags & XFS_DIFLAG_IMMUTABLE)
                        flags |= FS_XFLAG_IMMUTABLE;
                if (di_flags & XFS_DIFLAG_APPEND)
                        flags |= FS_XFLAG_APPEND;
                if (di_flags & XFS_DIFLAG_SYNC)
                        flags |= FS_XFLAG_SYNC;
                if (di_flags & XFS_DIFLAG_NOATIME)
                        flags |= FS_XFLAG_NOATIME;
                if (di_flags & XFS_DIFLAG_NODUMP)
                        flags |= FS_XFLAG_NODUMP;
                if (di_flags & XFS_DIFLAG_RTINHERIT)
                        flags |= FS_XFLAG_RTINHERIT;
                if (di_flags & XFS_DIFLAG_PROJINHERIT)
                        flags |= FS_XFLAG_PROJINHERIT;
                if (di_flags & XFS_DIFLAG_NOSYMLINKS)
                        flags |= FS_XFLAG_NOSYMLINKS;
                if (di_flags & XFS_DIFLAG_EXTSIZE)
                        flags |= FS_XFLAG_EXTSIZE;
                if (di_flags & XFS_DIFLAG_EXTSZINHERIT)
                        flags |= FS_XFLAG_EXTSZINHERIT;
                if (di_flags & XFS_DIFLAG_NODEFRAG)
                        flags |= FS_XFLAG_NODEFRAG;
                if (di_flags & XFS_DIFLAG_FILESTREAM)
                        flags |= FS_XFLAG_FILESTREAM;
        }

        if (di_flags2 & XFS_DIFLAG2_ANY) {
                if (di_flags2 & XFS_DIFLAG2_DAX)
                        flags |= FS_XFLAG_DAX;
                if (di_flags2 & XFS_DIFLAG2_COWEXTSIZE)
                        flags |= FS_XFLAG_COWEXTSIZE;
        }

        if (has_attr)
                flags |= FS_XFLAG_HASATTR;

        return flags;
}

uint
xfs_ip2xflags(
        struct xfs_inode        *ip)
{
        struct xfs_icdinode     *dic = &ip->i_d;

        return _xfs_dic2xflags(dic->di_flags, dic->di_flags2, XFS_IFORK_Q(ip));
}

/*
 * Looks up an inode from "name". If ci_name is not NULL, then a CI match
 * is allowed, otherwise it has to be an exact match. If a CI match is found,
 * ci_name->name will point to the actual name (caller must free) or
 * will be set to NULL if an exact match is found.
 */
int
xfs_lookup(
        xfs_inode_t             *dp,
        struct xfs_name         *name,
        xfs_inode_t             **ipp,
        struct xfs_name         *ci_name)
{
        xfs_ino_t               inum;
        int                     error;

        trace_xfs_lookup(dp, name);

        if (XFS_FORCED_SHUTDOWN(dp->i_mount))
                return -EIO;

        error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name);
        if (error)
                goto out_unlock;

        error = xfs_iget(dp->i_mount, NULL, inum, 0, 0, ipp);
        if (error)
                goto out_free_name;

        return 0;

out_free_name:
        if (ci_name)
                kmem_free(ci_name->name);
out_unlock:
        *ipp = NULL;
        return error;
}
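
/*
 * Sketch of a caller (hypothetical name): resolve "foo" in directory dp
 * and drop the inode reference when done.
 *
 *        struct xfs_name        xname = { (unsigned char *)"foo", 3,
 *                                         XFS_DIR3_FT_UNKNOWN };
 *        error = xfs_lookup(dp, &xname, &ip, NULL);
 *        if (!error)
 *                xfs_irele(ip);
 */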

/*
 * Allocate an inode on disk and return a copy of its in-core version.
 * The in-core inode is locked exclusively.  Set mode, nlink, and rdev
 * appropriately within the inode.  The uid and gid for the inode are
 * set according to the contents of the given cred structure.
 *
 * Use xfs_dialloc() to allocate the on-disk inode. If xfs_dialloc()
 * has a free inode available, call xfs_iget() to obtain the in-core
 * version of the allocated inode.  Finally, fill in the inode and
 * log its initial contents.  In this case, ialloc_context would be
 * set to NULL.
 *
 * If xfs_dialloc() does not have an available inode, it will replenish
 * its supply by doing an allocation. Since we can only do one
 * allocation within a transaction without deadlocks, we must commit
 * the current transaction before returning the inode itself.
 * In this case, therefore, we will set ialloc_context and return.
 * The caller should then commit the current transaction, start a new
 * transaction, and call xfs_ialloc() again to actually get the inode.
 *
 * To ensure that some other process does not grab the inode that
 * was allocated during the first call to xfs_ialloc(), this routine
 * also returns the [locked] bp pointing to the head of the freelist
 * as ialloc_context.  The caller should hold this buffer across
 * the commit and pass it back into this routine on the second call.
 *
 * If we are allocating quota inodes, we do not have a parent inode
 * to attach to or associate with (i.e. pip == NULL) because they
 * are not linked into the directory structure - they are attached
 * directly to the superblock - and so have no parent.
 */
static int
xfs_ialloc(
        xfs_trans_t     *tp,
        xfs_inode_t     *pip,
        umode_t         mode,
        xfs_nlink_t     nlink,
        dev_t           rdev,
        prid_t          prid,
        xfs_buf_t       **ialloc_context,
        xfs_inode_t     **ipp)
{
        struct xfs_mount *mp = tp->t_mountp;
        xfs_ino_t       ino;
        xfs_inode_t     *ip;
        uint            flags;
        int             error;
        struct timespec64 tv;
        struct inode    *inode;

        /*
         * Call the space management code to pick
         * the on-disk inode to be allocated.
         */
        error = xfs_dialloc(tp, pip ? pip->i_ino : 0, mode,
                            ialloc_context, &ino);
        if (error)
                return error;
        if (*ialloc_context || ino == NULLFSINO) {
                *ipp = NULL;
                return 0;
        }
        ASSERT(*ialloc_context == NULL);

        /*
         * Protect against obviously corrupt allocation btree records. Later
         * xfs_iget checks will catch re-allocation of other active in-memory
         * and on-disk inodes. If we don't catch reallocating the parent inode
         * here we will deadlock in xfs_iget() so we have to do these checks
         * first.
         */
        if ((pip && ino == pip->i_ino) || !xfs_verify_dir_ino(mp, ino)) {
                xfs_alert(mp, "Allocated a known in-use inode 0x%llx!", ino);
                return -EFSCORRUPTED;
        }

        /*
         * Get the in-core inode with the lock held exclusively.
         * This is because we're setting fields here we need
         * to prevent others from looking at until we're done.
         */
        error = xfs_iget(mp, tp, ino, XFS_IGET_CREATE,
                         XFS_ILOCK_EXCL, &ip);
        if (error)
                return error;
        ASSERT(ip != NULL);
        inode = VFS_I(ip);

        /*
         * We always convert v1 inodes to v2 now - we only support filesystems
         * with >= v2 inode capability, so there is no reason for ever leaving
         * an inode in v1 format.
         */
        if (ip->i_d.di_version == 1)
                ip->i_d.di_version = 2;

        inode->i_mode = mode;
        set_nlink(inode, nlink);
        ip->i_d.di_uid = xfs_kuid_to_uid(current_fsuid());
        ip->i_d.di_gid = xfs_kgid_to_gid(current_fsgid());
        inode->i_rdev = rdev;
        xfs_set_projid(ip, prid);

        if (pip && XFS_INHERIT_GID(pip)) {
                ip->i_d.di_gid = pip->i_d.di_gid;
                if ((VFS_I(pip)->i_mode & S_ISGID) && S_ISDIR(mode))
                        inode->i_mode |= S_ISGID;
        }

        /*
         * If the group ID of the new file does not match the effective group
         * ID or one of the supplementary group IDs, the S_ISGID bit is cleared
         * (and only if the irix_sgid_inherit compatibility variable is set).
         */
        if ((irix_sgid_inherit) &&
            (inode->i_mode & S_ISGID) &&
            (!in_group_p(xfs_gid_to_kgid(ip->i_d.di_gid))))
                inode->i_mode &= ~S_ISGID;

        ip->i_d.di_size = 0;
        ip->i_d.di_nextents = 0;
        ASSERT(ip->i_d.di_nblocks == 0);

        tv = current_time(inode);
        inode->i_mtime = tv;
        inode->i_atime = tv;
        inode->i_ctime = tv;

        ip->i_d.di_extsize = 0;
        ip->i_d.di_dmevmask = 0;
        ip->i_d.di_dmstate = 0;
        ip->i_d.di_flags = 0;

        if (ip->i_d.di_version == 3) {
                inode_set_iversion(inode, 1);
                ip->i_d.di_flags2 = 0;
                ip->i_d.di_cowextsize = 0;
                ip->i_d.di_crtime.t_sec = (int32_t)tv.tv_sec;
                ip->i_d.di_crtime.t_nsec = (int32_t)tv.tv_nsec;
        }

        flags = XFS_ILOG_CORE;
        switch (mode & S_IFMT) {
        case S_IFIFO:
        case S_IFCHR:
        case S_IFBLK:
        case S_IFSOCK:
                ip->i_d.di_format = XFS_DINODE_FMT_DEV;
                ip->i_df.if_flags = 0;
                flags |= XFS_ILOG_DEV;
                break;
        case S_IFREG:
        case S_IFDIR:
                if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) {
                        uint            di_flags = 0;

                        if (S_ISDIR(mode)) {
                                if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT)
                                        di_flags |= XFS_DIFLAG_RTINHERIT;
                                if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) {
                                        di_flags |= XFS_DIFLAG_EXTSZINHERIT;
                                        ip->i_d.di_extsize = pip->i_d.di_extsize;
                                }
                                if (pip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
                                        di_flags |= XFS_DIFLAG_PROJINHERIT;
                        } else if (S_ISREG(mode)) {
                                if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT)
                                        di_flags |= XFS_DIFLAG_REALTIME;
                                if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) {
                                        di_flags |= XFS_DIFLAG_EXTSIZE;
                                        ip->i_d.di_extsize = pip->i_d.di_extsize;
                                }
                        }
                        if ((pip->i_d.di_flags & XFS_DIFLAG_NOATIME) &&
                            xfs_inherit_noatime)
                                di_flags |= XFS_DIFLAG_NOATIME;
                        if ((pip->i_d.di_flags & XFS_DIFLAG_NODUMP) &&
                            xfs_inherit_nodump)
                                di_flags |= XFS_DIFLAG_NODUMP;
                        if ((pip->i_d.di_flags & XFS_DIFLAG_SYNC) &&
                            xfs_inherit_sync)
                                di_flags |= XFS_DIFLAG_SYNC;
                        if ((pip->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) &&
                            xfs_inherit_nosymlinks)
                                di_flags |= XFS_DIFLAG_NOSYMLINKS;
                        if ((pip->i_d.di_flags & XFS_DIFLAG_NODEFRAG) &&
                            xfs_inherit_nodefrag)
                                di_flags |= XFS_DIFLAG_NODEFRAG;
                        if (pip->i_d.di_flags & XFS_DIFLAG_FILESTREAM)
                                di_flags |= XFS_DIFLAG_FILESTREAM;

                        ip->i_d.di_flags |= di_flags;
                }
                if (pip &&
                    (pip->i_d.di_flags2 & XFS_DIFLAG2_ANY) &&
                    pip->i_d.di_version == 3 &&
                    ip->i_d.di_version == 3) {
                        uint64_t        di_flags2 = 0;

                        if (pip->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) {
                                di_flags2 |= XFS_DIFLAG2_COWEXTSIZE;
                                ip->i_d.di_cowextsize = pip->i_d.di_cowextsize;
                        }
                        if (pip->i_d.di_flags2 & XFS_DIFLAG2_DAX)
                                di_flags2 |= XFS_DIFLAG2_DAX;

                        ip->i_d.di_flags2 |= di_flags2;
                }
                /* FALLTHROUGH */
        case S_IFLNK:
                ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS;
                ip->i_df.if_flags = XFS_IFEXTENTS;
                ip->i_df.if_bytes = 0;
                ip->i_df.if_u1.if_root = NULL;
                break;
        default:
                ASSERT(0);
        }
        /*
         * Attribute fork settings for new inode.
         */
        ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
        ip->i_d.di_anextents = 0;

        /*
         * Log the new values stuffed into the inode.
         */
        xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
        xfs_trans_log_inode(tp, ip, flags);

        /* now that we have an i_mode we can setup the inode structure */
        xfs_setup_inode(ip);

        *ipp = ip;
        return 0;
}
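
/*
 * Illustrative retry protocol for xfs_ialloc() (a condensed sketch of what
 * xfs_dir_ialloc() below actually implements):
 *
 *        error = xfs_ialloc(tp, dp, mode, nlink, rdev, prid, &agibp, &ip);
 *        if (!error && agibp != NULL) {
 *                xfs_trans_bhold(tp, agibp);      (hold AGI buffer over commit)
 *                error = xfs_trans_roll(&tp);
 *                xfs_trans_bjoin(tp, agibp);
 *                error = xfs_ialloc(tp, dp, mode, nlink, rdev, prid,
 *                                   &agibp, &ip); (second call must succeed)
 *        }
 */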

/*
 * Allocates a new inode from disk and returns a pointer to the
 * incore copy. This routine will internally commit the current
 * transaction and allocate a new one if the Space Manager needed
 * to do an allocation to replenish the inode free-list.
 *
 * This routine is designed to be called from xfs_create and
 * xfs_create_dir.
 */
int
xfs_dir_ialloc(
        xfs_trans_t     **tpp,          /* input: current transaction;
                                           output: may be a new transaction. */
        xfs_inode_t     *dp,            /* directory within which to allocate
                                           the inode. */
        umode_t         mode,
        xfs_nlink_t     nlink,
        dev_t           rdev,
        prid_t          prid,           /* project id */
        xfs_inode_t     **ipp)          /* pointer to inode; it will be
                                           locked. */
{
        xfs_trans_t     *tp;
        xfs_inode_t     *ip;
        xfs_buf_t       *ialloc_context = NULL;
        int             code;
        void            *dqinfo;
        uint            tflags;

        tp = *tpp;
        ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);

        /*
         * xfs_ialloc will return a pointer to an incore inode if
         * the Space Manager has an available inode on the free
         * list. Otherwise, it will do an allocation and replenish
         * the freelist.  Since we can only do one allocation per
         * transaction without deadlocks, we will need to commit the
         * current transaction and start a new one.  We will then
         * need to call xfs_ialloc again to get the inode.
         *
         * If xfs_ialloc did an allocation to replenish the freelist,
         * it returns the bp containing the head of the freelist as
         * ialloc_context. We will hold a lock on it across the
         * transaction commit so that no other process can steal
         * the inode(s) that we've just allocated.
         */
        code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid, &ialloc_context,
                        &ip);

        /*
         * Return an error if we were unable to allocate a new inode.
         * This should only happen if we run out of space on disk or
         * encounter a disk error.
         */
        if (code) {
                *ipp = NULL;
                return code;
        }
        if (!ialloc_context && !ip) {
                *ipp = NULL;
                return -ENOSPC;
        }

        /*
         * If the AGI buffer is non-NULL, then we were unable to get an
         * inode in one operation.  We need to commit the current
         * transaction and call xfs_ialloc() again.  It is guaranteed
         * to succeed the second time.
         */
        if (ialloc_context) {
                /*
                 * Normally, xfs_trans_commit releases all the locks.
                 * We call bhold to hang on to the ialloc_context across
                 * the commit.  Holding this buffer prevents any other
                 * processes from doing any allocations in this
                 * allocation group.
                 */
                xfs_trans_bhold(tp, ialloc_context);

                /*
                 * We want the quota changes to be associated with the next
                 * transaction, NOT this one. So, detach the dqinfo from this
                 * and attach it to the next transaction.
                 */
                dqinfo = NULL;
                tflags = 0;
                if (tp->t_dqinfo) {
                        dqinfo = (void *)tp->t_dqinfo;
                        tp->t_dqinfo = NULL;
                        tflags = tp->t_flags & XFS_TRANS_DQ_DIRTY;
                        tp->t_flags &= ~(XFS_TRANS_DQ_DIRTY);
                }

                code = xfs_trans_roll(&tp);

                /*
                 * Re-attach the quota info that we detached from the
                 * previous transaction.
                 */
                if (dqinfo) {
                        tp->t_dqinfo = dqinfo;
                        tp->t_flags |= tflags;
                }

                if (code) {
                        xfs_buf_relse(ialloc_context);
                        *tpp = tp;
                        *ipp = NULL;
                        return code;
                }
                xfs_trans_bjoin(tp, ialloc_context);

                /*
                 * Call ialloc again. Since we've locked out all
                 * other allocations in this allocation group,
                 * this call should always succeed.
                 */
                code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid,
                                  &ialloc_context, &ip);

                /*
                 * If we get an error at this point, return to the caller
                 * so that the current transaction can be aborted.
                 */
                if (code) {
                        *tpp = tp;
                        *ipp = NULL;
                        return code;
                }
                ASSERT(!ialloc_context && ip);
        }

        *ipp = ip;
        *tpp = tp;

        return 0;
}

/*
 * Decrement the link count on an inode & log the change.  If this causes the
 * link count to go to zero, move the inode to the AGI unlinked list so that it
 * can be freed when the last active reference goes away via xfs_inactive().
 */
static int                      /* error */
xfs_droplink(
        xfs_trans_t *tp,
        xfs_inode_t *ip)
{
        xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);

        drop_nlink(VFS_I(ip));
        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);

        if (VFS_I(ip)->i_nlink)
                return 0;

        return xfs_iunlink(tp, ip);
}

/*
 * Increment the link count on an inode & log the change.
 */
static void
xfs_bumplink(
        xfs_trans_t *tp,
        xfs_inode_t *ip)
{
        xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);

        ASSERT(ip->i_d.di_version > 1);
        inc_nlink(VFS_I(ip));
        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
}

int
xfs_create(
        xfs_inode_t             *dp,
        struct xfs_name         *name,
        umode_t                 mode,
        dev_t                   rdev,
        xfs_inode_t             **ipp)
{
        int                     is_dir = S_ISDIR(mode);
        struct xfs_mount        *mp = dp->i_mount;
        struct xfs_inode        *ip = NULL;
        struct xfs_trans        *tp = NULL;
        int                     error;
        bool                    unlock_dp_on_error = false;
        prid_t                  prid;
        struct xfs_dquot        *udqp = NULL;
        struct xfs_dquot        *gdqp = NULL;
        struct xfs_dquot        *pdqp = NULL;
        struct xfs_trans_res    *tres;
        uint                    resblks;

        trace_xfs_create(dp, name);

        if (XFS_FORCED_SHUTDOWN(mp))
                return -EIO;

        prid = xfs_get_initial_prid(dp);

        /*
         * Make sure that we have allocated dquot(s) on disk.
         */
        error = xfs_qm_vop_dqalloc(dp, xfs_kuid_to_uid(current_fsuid()),
                                        xfs_kgid_to_gid(current_fsgid()), prid,
                                        XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
                                        &udqp, &gdqp, &pdqp);
        if (error)
                return error;

        if (is_dir) {
                resblks = XFS_MKDIR_SPACE_RES(mp, name->len);
                tres = &M_RES(mp)->tr_mkdir;
        } else {
                resblks = XFS_CREATE_SPACE_RES(mp, name->len);
                tres = &M_RES(mp)->tr_create;
        }

        /*
         * Initially assume that the file does not exist and
         * reserve the resources for that case.  If that is not
         * the case we'll drop the one we have and get a more
         * appropriate transaction later.
         */
        error = xfs_trans_alloc(mp, tres, resblks, 0, 0, &tp);
        if (error == -ENOSPC) {
                /* flush outstanding delalloc blocks and retry */
                xfs_flush_inodes(mp);
                error = xfs_trans_alloc(mp, tres, resblks, 0, 0, &tp);
        }
        if (error)
                goto out_release_inode;

        xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
        unlock_dp_on_error = true;

        /*
         * Reserve disk quota and the inode.
         */
        error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp,
                                                pdqp, resblks, 1, 0);
        if (error)
                goto out_trans_cancel;

        /*
         * A newly created regular or special file just has one directory
         * entry pointing to it, but a directory also has the "." entry
         * pointing to itself.
         */
        error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev, prid, &ip);
        if (error)
                goto out_trans_cancel;

        /*
         * Now we join the directory inode to the transaction.  We do not do it
         * earlier because xfs_dir_ialloc might commit the previous transaction
         * (and release all the locks).  An error from here on will result in
         * the transaction cancel unlocking dp so don't do it explicitly in the
         * error path.
         */
        xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
        unlock_dp_on_error = false;

        error = xfs_dir_createname(tp, dp, name, ip->i_ino,
                                   resblks ?
                                        resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
        if (error) {
                ASSERT(error != -ENOSPC);
                goto out_trans_cancel;
        }
        xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
        xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);

        if (is_dir) {
                error = xfs_dir_init(tp, ip, dp);
                if (error)
                        goto out_trans_cancel;

                xfs_bumplink(tp, dp);
        }

        /*
         * If this is a synchronous mount, make sure that the
         * create transaction goes to disk before returning to
         * the user.
         */
        if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
                xfs_trans_set_sync(tp);

        /*
         * Attach the dquot(s) to the inodes and modify them incore.
         * The IDs of the inode cannot have changed, since the new
         * inode has been locked ever since it was created.
         */
        xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);

        error = xfs_trans_commit(tp);
        if (error)
                goto out_release_inode;

        xfs_qm_dqrele(udqp);
        xfs_qm_dqrele(gdqp);
        xfs_qm_dqrele(pdqp);

        *ipp = ip;
        return 0;

 out_trans_cancel:
        xfs_trans_cancel(tp);
 out_release_inode:
        /*
         * Wait until after the current transaction is aborted to finish the
         * setup of the inode and release the inode.  This prevents recursive
         * transactions and deadlocks from xfs_inactive.
         */
        if (ip) {
                xfs_finish_inode_setup(ip);
                xfs_irele(ip);
        }

        xfs_qm_dqrele(udqp);
        xfs_qm_dqrele(gdqp);
        xfs_qm_dqrele(pdqp);

        if (unlock_dp_on_error)
                xfs_iunlock(dp, XFS_ILOCK_EXCL);
        return error;
}

int
xfs_create_tmpfile(
        struct xfs_inode        *dp,
        umode_t                 mode,
        struct xfs_inode        **ipp)
{
        struct xfs_mount        *mp = dp->i_mount;
        struct xfs_inode        *ip = NULL;
        struct xfs_trans        *tp = NULL;
        int                     error;
        prid_t                  prid;
        struct xfs_dquot        *udqp = NULL;
        struct xfs_dquot        *gdqp = NULL;
        struct xfs_dquot        *pdqp = NULL;
        struct xfs_trans_res    *tres;
        uint                    resblks;

        if (XFS_FORCED_SHUTDOWN(mp))
                return -EIO;

        prid = xfs_get_initial_prid(dp);

        /*
         * Make sure that we have allocated dquot(s) on disk.
         */
        error = xfs_qm_vop_dqalloc(dp, xfs_kuid_to_uid(current_fsuid()),
                                xfs_kgid_to_gid(current_fsgid()), prid,
                                XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
                                &udqp, &gdqp, &pdqp);
        if (error)
                return error;

        resblks = XFS_IALLOC_SPACE_RES(mp);
        tres = &M_RES(mp)->tr_create_tmpfile;

        error = xfs_trans_alloc(mp, tres, resblks, 0, 0, &tp);
        if (error)
                goto out_release_inode;

        error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp,
                                                pdqp, resblks, 1, 0);
        if (error)
                goto out_trans_cancel;

        error = xfs_dir_ialloc(&tp, dp, mode, 0, 0, prid, &ip);
        if (error)
                goto out_trans_cancel;

        if (mp->m_flags & XFS_MOUNT_WSYNC)
                xfs_trans_set_sync(tp);

        /*
         * Attach the dquot(s) to the inodes and modify them incore.
         * The IDs of the inode cannot have changed, since the new
         * inode has been locked ever since it was created.
         */
        xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);

        error = xfs_iunlink(tp, ip);
        if (error)
                goto out_trans_cancel;

        error = xfs_trans_commit(tp);
        if (error)
                goto out_release_inode;

        xfs_qm_dqrele(udqp);
        xfs_qm_dqrele(gdqp);
        xfs_qm_dqrele(pdqp);

        *ipp = ip;
        return 0;

 out_trans_cancel:
        xfs_trans_cancel(tp);
 out_release_inode:
        /*
         * Wait until after the current transaction is aborted to finish the
         * setup of the inode and release the inode.  This prevents recursive
         * transactions and deadlocks from xfs_inactive.
         */
        if (ip) {
                xfs_finish_inode_setup(ip);
                xfs_irele(ip);
        }

        xfs_qm_dqrele(udqp);
        xfs_qm_dqrele(gdqp);
        xfs_qm_dqrele(pdqp);

        return error;
}
1374
1375int
1376xfs_link(
1377        xfs_inode_t             *tdp,
1378        xfs_inode_t             *sip,
1379        struct xfs_name         *target_name)
1380{
1381        xfs_mount_t             *mp = tdp->i_mount;
1382        xfs_trans_t             *tp;
1383        int                     error;
1384        int                     resblks;
1385
1386        trace_xfs_link(tdp, target_name);
1387
1388        ASSERT(!S_ISDIR(VFS_I(sip)->i_mode));
1389
1390        if (XFS_FORCED_SHUTDOWN(mp))
1391                return -EIO;
1392
1393        error = xfs_qm_dqattach(sip);
1394        if (error)
1395                goto std_return;
1396
1397        error = xfs_qm_dqattach(tdp);
1398        if (error)
1399                goto std_return;
1400
1401        resblks = XFS_LINK_SPACE_RES(mp, target_name->len);
1402        error = xfs_trans_alloc(mp, &M_RES(mp)->tr_link, resblks, 0, 0, &tp);
1403        if (error == -ENOSPC) {
1404                resblks = 0;
1405                error = xfs_trans_alloc(mp, &M_RES(mp)->tr_link, 0, 0, 0, &tp);
1406        }
1407        if (error)
1408                goto std_return;
1409
1410        xfs_lock_two_inodes(sip, XFS_ILOCK_EXCL, tdp, XFS_ILOCK_EXCL);
1411
1412        xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL);
1413        xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL);
1414
1415        /*
1416         * If we are using project inheritance, we only allow hard link
1417         * creation in our tree when the project IDs are the same; else
1418         * the tree quota mechanism could be circumvented.
1419         */
1420        if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
1421                     (xfs_get_projid(tdp) != xfs_get_projid(sip)))) {
1422                error = -EXDEV;
1423                goto error_return;
1424        }
1425
1426        if (!resblks) {
1427                error = xfs_dir_canenter(tp, tdp, target_name);
1428                if (error)
1429                        goto error_return;
1430        }
1431
1432        /*
1433         * Handle initial link state of O_TMPFILE inode
1434         */
1435        if (VFS_I(sip)->i_nlink == 0) {
1436                error = xfs_iunlink_remove(tp, sip);
1437                if (error)
1438                        goto error_return;
1439        }
1440
1441        error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino,
1442                                   resblks);
1443        if (error)
1444                goto error_return;
1445        xfs_trans_ichgtime(tp, tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
1446        xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
1447
1448        xfs_bumplink(tp, sip);
1449
1450        /*
1451         * If this is a synchronous mount, make sure that the
1452         * link transaction goes to disk before returning to
1453         * the user.
1454         */
1455        if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
1456                xfs_trans_set_sync(tp);
1457
1458        return xfs_trans_commit(tp);
1459
1460 error_return:
1461        xfs_trans_cancel(tp);
1462 std_return:
1463        return error;
1464}
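
/*
 * Illustrative caller sketch (added; not part of the original file): roughly
 * how a VFS ->link operation could drive xfs_link().  The function name and
 * the open-coded xfs_name construction are assumptions for illustration, not
 * taken from xfs_iops.c.
 */
#if 0
static int example_vfs_link(struct dentry *old_dentry, struct inode *dir,
                            struct dentry *dentry)
{
        struct xfs_inode        *sip = XFS_I(d_inode(old_dentry));
        struct xfs_inode        *tdp = XFS_I(dir);
        struct xfs_name         name = {
                .name   = dentry->d_name.name,
                .len    = dentry->d_name.len,
                .type   = XFS_DIR3_FT_REG_FILE, /* assumes a regular file */
        };

        /* xfs_link() handles quota attach, locking and the transaction. */
        return xfs_link(tdp, sip, &name);
}
#endif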
1465
1466/* Clear the reflink flag and the cowblocks tag if possible. */
1467static void
1468xfs_itruncate_clear_reflink_flags(
1469        struct xfs_inode        *ip)
1470{
1471        struct xfs_ifork        *dfork;
1472        struct xfs_ifork        *cfork;
1473
1474        if (!xfs_is_reflink_inode(ip))
1475                return;
1476        dfork = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
1477        cfork = XFS_IFORK_PTR(ip, XFS_COW_FORK);
1478        if (dfork->if_bytes == 0 && cfork->if_bytes == 0)
1479                ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK;
1480        if (cfork->if_bytes == 0)
1481                xfs_inode_clear_cowblocks_tag(ip);
1482}
1483
1484/*
1485 * Free up the underlying blocks past new_size.  The new size must be smaller
1486 * than the current size.  This routine can be used both for the attribute and
1487 * data fork, and does not modify the inode size, which is left to the caller.
1488 *
1489 * The transaction passed to this routine must have made a permanent log
1490 * reservation of at least XFS_ITRUNCATE_LOG_RES.  This routine may commit the
1491 * given transaction and start new ones, so make sure everything involved in
1492 * the transaction is tidy before calling here.  Some transaction (possibly
1493 * a new one) is returned to the caller to be committed.  The incoming one must
1494 * already include the inode, and both inode locks must be held exclusively.
1495 * The inode must also be "held" within the transaction.  On return the inode
1496 * will be "held" within the returned transaction.  This routine does NOT
1497 * require any disk space to be reserved for it within the transaction.
1498 *
1499 * If we get an error, we must return with the inode locked and linked into the
1500 * current transaction. This keeps things simple for the higher level code,
1501 * because it always knows that the inode is locked and held in the transaction
1502 * that returns to it whether errors occur or not.  We don't mark the inode
1503 * dirty on error so that transactions can be easily aborted if possible.
1504 */
1505int
1506xfs_itruncate_extents_flags(
1507        struct xfs_trans        **tpp,
1508        struct xfs_inode        *ip,
1509        int                     whichfork,
1510        xfs_fsize_t             new_size,
1511        int                     flags)
1512{
1513        struct xfs_mount        *mp = ip->i_mount;
1514        struct xfs_trans        *tp = *tpp;
1515        xfs_fileoff_t           first_unmap_block;
1516        xfs_fileoff_t           last_block;
1517        xfs_filblks_t           unmap_len;
1518        int                     error = 0;
1519        int                     done = 0;
1520
1521        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
1522        ASSERT(!atomic_read(&VFS_I(ip)->i_count) ||
1523               xfs_isilocked(ip, XFS_IOLOCK_EXCL));
1524        ASSERT(new_size <= XFS_ISIZE(ip));
1525        ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
1526        ASSERT(ip->i_itemp != NULL);
1527        ASSERT(ip->i_itemp->ili_lock_flags == 0);
1528        ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
1529
1530        trace_xfs_itruncate_extents_start(ip, new_size);
1531
1532        flags |= xfs_bmapi_aflag(whichfork);
1533
1534        /*
1535         * Since it is possible for space to become allocated beyond
1536         * the end of the file (in a crash where the space is allocated
1537         * but the inode size is not yet updated), simply remove any
1538         * blocks which show up between the new EOF and the maximum
1539         * possible file size.  If the first block to be removed is
1540         * beyond the maximum file size (ie it is the same as last_block),
1541         * beyond the maximum file size (i.e. it is the same as last_block),
1542         */
1543        first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size);
1544        last_block = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
1545        if (first_unmap_block == last_block)
1546                return 0;
1547
1548        ASSERT(first_unmap_block < last_block);
1549        unmap_len = last_block - first_unmap_block + 1;
1550        while (!done) {
1551                ASSERT(tp->t_firstblock == NULLFSBLOCK);
1552                error = xfs_bunmapi(tp, ip, first_unmap_block, unmap_len, flags,
1553                                    XFS_ITRUNC_MAX_EXTENTS, &done);
1554                if (error)
1555                        goto out;
1556
1557                /*
1558                 * Duplicate the transaction that has the permanent
1559                 * reservation and commit the old transaction.
1560                 */
1561                error = xfs_defer_finish(&tp);
1562                if (error)
1563                        goto out;
1564
1565                error = xfs_trans_roll_inode(&tp, ip);
1566                if (error)
1567                        goto out;
1568        }
1569
1570        if (whichfork == XFS_DATA_FORK) {
1571                /* Remove all pending CoW reservations. */
1572                error = xfs_reflink_cancel_cow_blocks(ip, &tp,
1573                                first_unmap_block, last_block, true);
1574                if (error)
1575                        goto out;
1576
1577                xfs_itruncate_clear_reflink_flags(ip);
1578        }
1579
1580        /*
1581         * Always re-log the inode so that our permanent transaction can keep
1582         * on rolling it forward in the log.
1583         */
1584        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1585
1586        trace_xfs_itruncate_extents_end(ip, new_size);
1587
1588out:
1589        *tpp = tp;
1590        return error;
1591}
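
/*
 * Minimal sketch (added for illustration) of the calling convention described
 * above; compare xfs_inactive_truncate() below for a real caller.  The
 * function name is hypothetical, and the iolock is assumed to already be
 * held exclusively by the caller.
 */
#if 0
static int example_truncate_down(struct xfs_inode *ip, xfs_fsize_t new_size)
{
        struct xfs_mount        *mp = ip->i_mount;
        struct xfs_trans        *tp;
        int                     error;

        error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
        if (error)
                return error;

        xfs_ilock(ip, XFS_ILOCK_EXCL);
        xfs_trans_ijoin(tp, ip, 0);

        /* May roll @tp; on return the inode is still locked and joined. */
        error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, new_size);
        if (error)
                xfs_trans_cancel(tp);
        else
                error = xfs_trans_commit(tp);
        xfs_iunlock(ip, XFS_ILOCK_EXCL);
        return error;
}
#endif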
1592
1593int
1594xfs_release(
1595        xfs_inode_t     *ip)
1596{
1597        xfs_mount_t     *mp = ip->i_mount;
1598        int             error;
1599
1600        if (!S_ISREG(VFS_I(ip)->i_mode) || (VFS_I(ip)->i_mode == 0))
1601                return 0;
1602
1603        /* If this is a read-only mount, don't do this (would generate I/O) */
1604        if (mp->m_flags & XFS_MOUNT_RDONLY)
1605                return 0;
1606
1607        if (!XFS_FORCED_SHUTDOWN(mp)) {
1608                int truncated;
1609
1610                /*
1611                 * If we previously truncated this file and removed old data
1612                 * in the process, we want to initiate "early" writeout on
1613                 * the last close.  This is an attempt to combat the notorious
1614                 * NULL files problem which is particularly noticeable from a
1615                 * truncate down, buffered (re-)write (delalloc), followed by
1616                 * a crash.  What we are effectively doing here is
1617                 * significantly reducing the time window where we'd otherwise
1618                 * be exposed to that problem.
1619                 */
1620                truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED);
1621                if (truncated) {
1622                        xfs_iflags_clear(ip, XFS_IDIRTY_RELEASE);
1623                        if (ip->i_delayed_blks > 0) {
1624                                error = filemap_flush(VFS_I(ip)->i_mapping);
1625                                if (error)
1626                                        return error;
1627                        }
1628                }
1629        }
1630
1631        if (VFS_I(ip)->i_nlink == 0)
1632                return 0;
1633
1634        if (xfs_can_free_eofblocks(ip, false)) {
1635
1636                /*
1637                 * If the inode is being opened, written and closed
1638                 * frequently and we have delayed allocation blocks outstanding
1639                 * (e.g. streaming writes from the NFS server), truncating the
1640                 * blocks past EOF will cause fragmentation to occur.
1641                 *
1642                 * In this case don't do the truncation, but we have to be
1643                 * careful how we detect this case. Blocks beyond EOF show up as
1644                 * i_delayed_blks even when the inode is clean, so we need to
1645                 * truncate them away first before checking for a dirty release.
1646                 * Hence on the first dirty close we will still remove the
1647                 * speculative allocation, but after that we will leave it in
1648                 * place.
1649                 */
1650                if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE))
1651                        return 0;
1652                /*
1653                 * If we can't get the iolock just skip truncating the blocks
1654                 * past EOF because we could deadlock with the mmap_sem
1655                 * otherwise. We'll get another chance to drop them once the
1656                 * last reference to the inode is dropped, so we'll never leak
1657                 * blocks permanently.
1658                 */
1659                if (xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
1660                        error = xfs_free_eofblocks(ip);
1661                        xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1662                        if (error)
1663                                return error;
1664                }
1665
1666                /* delalloc blocks after truncation means it really is dirty */
1667                if (ip->i_delayed_blks)
1668                        xfs_iflags_set(ip, XFS_IDIRTY_RELEASE);
1669        }
1670        return 0;
1671}
1672
1673/*
1674 * xfs_inactive_truncate
1675 *
1676 * Called to perform a truncate when an inode becomes unlinked.
1677 */
1678STATIC int
1679xfs_inactive_truncate(
1680        struct xfs_inode *ip)
1681{
1682        struct xfs_mount        *mp = ip->i_mount;
1683        struct xfs_trans        *tp;
1684        int                     error;
1685
1686        error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
1687        if (error) {
1688                ASSERT(XFS_FORCED_SHUTDOWN(mp));
1689                return error;
1690        }
1691        xfs_ilock(ip, XFS_ILOCK_EXCL);
1692        xfs_trans_ijoin(tp, ip, 0);
1693
1694        /*
1695         * Log the inode size first to prevent stale data exposure in the event
1696         * of a system crash before the truncate completes. See the related
1697         * comment in xfs_vn_setattr_size() for details.
1698         */
1699        ip->i_d.di_size = 0;
1700        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1701
1702        error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0);
1703        if (error)
1704                goto error_trans_cancel;
1705
1706        ASSERT(ip->i_d.di_nextents == 0);
1707
1708        error = xfs_trans_commit(tp);
1709        if (error)
1710                goto error_unlock;
1711
1712        xfs_iunlock(ip, XFS_ILOCK_EXCL);
1713        return 0;
1714
1715error_trans_cancel:
1716        xfs_trans_cancel(tp);
1717error_unlock:
1718        xfs_iunlock(ip, XFS_ILOCK_EXCL);
1719        return error;
1720}
1721
1722/*
1723 * xfs_inactive_ifree()
1724 *
1725 * Perform the inode free when an inode is unlinked.
1726 */
1727STATIC int
1728xfs_inactive_ifree(
1729        struct xfs_inode *ip)
1730{
1731        struct xfs_mount        *mp = ip->i_mount;
1732        struct xfs_trans        *tp;
1733        int                     error;
1734
1735        /*
1736         * We try to use a per-AG reservation for any block needed by the finobt
1737         * tree, but as the finobt feature predates the per-AG reservation
1738         * support a degraded file system might not have enough space for the
1739         * reservation at mount time.  In that case try to dip into the reserved
1740         * pool and pray.
1741         *
1742         * Send a warning if the reservation does happen to fail, as the inode
1743         * now remains allocated and sits on the unlinked list until the fs is
1744         * repaired.
1745         */
1746        if (unlikely(mp->m_finobt_nores)) {
1747                error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree,
1748                                XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE,
1749                                &tp);
1750        } else {
1751                error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree, 0, 0, 0, &tp);
1752        }
1753        if (error) {
1754                if (error == -ENOSPC) {
1755                        xfs_warn_ratelimited(mp,
1756                        "Failed to remove inode(s) from unlinked list. "
1757                        "Please free space, unmount and run xfs_repair.");
1758                } else {
1759                        ASSERT(XFS_FORCED_SHUTDOWN(mp));
1760                }
1761                return error;
1762        }
1763
1764        xfs_ilock(ip, XFS_ILOCK_EXCL);
1765        xfs_trans_ijoin(tp, ip, 0);
1766
1767        error = xfs_ifree(tp, ip);
1768        if (error) {
1769                /*
1770                 * If we fail to free the inode, shut down.  The transaction
1771                 * cancel might do that for us, but make sure.  Otherwise the
1772                 * inode might be lost for a long time or forever.
1773                 */
1774                if (!XFS_FORCED_SHUTDOWN(mp)) {
1775                        xfs_notice(mp, "%s: xfs_ifree returned error %d",
1776                                __func__, error);
1777                        xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
1778                }
1779                xfs_trans_cancel(tp);
1780                xfs_iunlock(ip, XFS_ILOCK_EXCL);
1781                return error;
1782        }
1783
1784        /*
1785         * Credit the quota account(s). The inode is gone.
1786         */
1787        xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_ICOUNT, -1);
1788
1789        /*
1790         * Just ignore errors at this point.  There is nothing we can do except
1791         * to try to keep going. Make sure it's not a silent error.
1792         */
1793        error = xfs_trans_commit(tp);
1794        if (error)
1795                xfs_notice(mp, "%s: xfs_trans_commit returned error %d",
1796                        __func__, error);
1797
1798        xfs_iunlock(ip, XFS_ILOCK_EXCL);
1799        return 0;
1800}
1801
1802/*
1803 * xfs_inactive
1804 *
1805 * This is called when the reference count for the vnode
1806 * goes to zero.  If the file has been unlinked, then it must
1807 * now be truncated.  Also, we clear all of the read-ahead state
1808 * kept for the inode here since the file is now closed.
1809 */
1810void
1811xfs_inactive(
1812        xfs_inode_t     *ip)
1813{
1814        struct xfs_mount        *mp;
1815        int                     error;
1816        int                     truncate = 0;
1817
1818        /*
1819         * If the inode is already free, then there can be nothing
1820         * to clean up here.
1821         */
1822        if (VFS_I(ip)->i_mode == 0) {
1823                ASSERT(ip->i_df.if_broot_bytes == 0);
1824                return;
1825        }
1826
1827        mp = ip->i_mount;
1828        ASSERT(!xfs_iflags_test(ip, XFS_IRECOVERY));
1829
1830        /* If this is a read-only mount, don't do this (would generate I/O) */
1831        if (mp->m_flags & XFS_MOUNT_RDONLY)
1832                return;
1833
1834        /* Try to clean out the cow blocks if there are any. */
1835        if (xfs_inode_has_cow_data(ip))
1836                xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, true);
1837
1838        if (VFS_I(ip)->i_nlink != 0) {
1839                /*
1840                 * force is true because we are evicting an inode from the
1841                 * cache. Post-eof blocks must be freed, lest we end up with
1842                 * broken free space accounting.
1843                 *
1844                 * Note: don't bother with iolock here since lockdep complains
1845                 * about acquiring it in reclaim context. We have the only
1846                 * reference to the inode at this point anyways.
1847                 */
1848                if (xfs_can_free_eofblocks(ip, true))
1849                        xfs_free_eofblocks(ip);
1850
1851                return;
1852        }
1853
1854        if (S_ISREG(VFS_I(ip)->i_mode) &&
1855            (ip->i_d.di_size != 0 || XFS_ISIZE(ip) != 0 ||
1856             ip->i_d.di_nextents > 0 || ip->i_delayed_blks > 0))
1857                truncate = 1;
1858
1859        error = xfs_qm_dqattach(ip);
1860        if (error)
1861                return;
1862
1863        if (S_ISLNK(VFS_I(ip)->i_mode))
1864                error = xfs_inactive_symlink(ip);
1865        else if (truncate)
1866                error = xfs_inactive_truncate(ip);
1867        if (error)
1868                return;
1869
1870        /*
1871         * If there are attributes associated with the file then blow them away
1872         * now.  The code calls a routine that recursively deconstructs the
1873         * attribute fork.  It also blows away the in-core attribute fork.
1874         */
1875        if (XFS_IFORK_Q(ip)) {
1876                error = xfs_attr_inactive(ip);
1877                if (error)
1878                        return;
1879        }
1880
1881        ASSERT(!ip->i_afp);
1882        ASSERT(ip->i_d.di_anextents == 0);
1883        ASSERT(ip->i_d.di_forkoff == 0);
1884
1885        /*
1886         * Free the inode.
1887         */
1888        error = xfs_inactive_ifree(ip);
1889        if (error)
1890                return;
1891
1892        /*
1893         * Release the dquots held by inode, if any.
1894         */
1895        xfs_qm_dqdetach(ip);
1896}
1897
1898/*
1899 * In-Core Unlinked List Lookups
1900 * =============================
1901 *
1902 * Every inode is supposed to be reachable from some other piece of metadata
1903 * with the exception of the root directory.  Inodes with a connection to a
1904 * file descriptor but not linked from anywhere in the on-disk directory tree
1905 * are collectively known as unlinked inodes, though the filesystem itself
1906 * maintains links to these inodes so that on-disk metadata are consistent.
1907 *
1908 * XFS implements a per-AG on-disk hash table of unlinked inodes.  The AGI
1909 * header contains a number of buckets that point to an inode, and each inode
1910 * record has a pointer to the next inode in the hash chain.  This
1911 * singly-linked list causes scaling problems in the iunlink remove function
1912 * because we must walk that list to find the inode that points to the inode
1913 * being removed from the unlinked hash bucket list.
1914 *
1915 * What if we modelled the unlinked list as a collection of records capturing
1916 * "X.next_unlinked = Y" relations?  If we indexed those records on Y, we'd
1917 * have a fast way to look up unlinked list predecessors, which avoids the
1918 * slow list walk.  That's exactly what we do here (in-core) with a per-AG
1919 * rhashtable.
1920 *
1921 * Because this is a backref cache, we ignore operational failures since the
1922 * iunlink code can fall back to the slow bucket walk.  The only errors that
1923 * should bubble out are for obviously incorrect situations.
1924 *
1925 * All users of the backref cache MUST hold the AGI buffer lock to serialize
1926 * access or have otherwise provided for concurrency control.
1927 */
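
/*
 * Worked example (added for illustration): suppose an AGI bucket points at
 * the on-disk chain A -> B -> C -> NULLAGINO.  The cache then holds the
 * records "A.next_unlinked = B" and "B.next_unlinked = C", indexed on the
 * right-hand side, so that
 *
 *     xfs_iunlink_lookup_backref(pag, B) == A
 *     xfs_iunlink_lookup_backref(pag, C) == B
 *
 * and unlinking B from the middle of the chain finds its predecessor A
 * directly instead of walking the bucket list from the head.
 */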
1928
1929/* Capture a "X.next_unlinked = Y" relationship. */
1930struct xfs_iunlink {
1931        struct rhash_head       iu_rhash_head;
1932        xfs_agino_t             iu_agino;               /* X */
1933        xfs_agino_t             iu_next_unlinked;       /* Y */
1934};
1935
1936/* Unlinked list predecessor lookup hashtable construction */
1937static int
1938xfs_iunlink_obj_cmpfn(
1939        struct rhashtable_compare_arg   *arg,
1940        const void                      *obj)
1941{
1942        const xfs_agino_t               *key = arg->key;
1943        const struct xfs_iunlink        *iu = obj;
1944
1945        if (iu->iu_next_unlinked != *key)
1946                return 1;
1947        return 0;
1948}
1949
1950static const struct rhashtable_params xfs_iunlink_hash_params = {
1951        .min_size               = XFS_AGI_UNLINKED_BUCKETS,
1952        .key_len                = sizeof(xfs_agino_t),
1953        .key_offset             = offsetof(struct xfs_iunlink,
1954                                           iu_next_unlinked),
1955        .head_offset            = offsetof(struct xfs_iunlink, iu_rhash_head),
1956        .automatic_shrinking    = true,
1957        .obj_cmpfn              = xfs_iunlink_obj_cmpfn,
1958};
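
/*
 * Note (added for clarity): the hash key is iu_next_unlinked (Y), not
 * iu_agino (X).  The question the cache answers is "which inode points at
 * me?", so hashing on Y makes that a single lookup.
 */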
1959
1960/*
1961 * Return X, where X.next_unlinked == @agino.  Returns NULLAGINO if no such
1962 * relation is found.
1963 */
1964static xfs_agino_t
1965xfs_iunlink_lookup_backref(
1966        struct xfs_perag        *pag,
1967        xfs_agino_t             agino)
1968{
1969        struct xfs_iunlink      *iu;
1970
1971        iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &agino,
1972                        xfs_iunlink_hash_params);
1973        return iu ? iu->iu_agino : NULLAGINO;
1974}
1975
1976/*
1977 * Take ownership of an iunlink cache entry and insert it into the hash table.
1978 * If successful, the entry will be owned by the cache; if not, it is freed.
1979 * Either way, the caller does not own @iu after this call.
1980 */
1981static int
1982xfs_iunlink_insert_backref(
1983        struct xfs_perag        *pag,
1984        struct xfs_iunlink      *iu)
1985{
1986        int                     error;
1987
1988        error = rhashtable_insert_fast(&pag->pagi_unlinked_hash,
1989                        &iu->iu_rhash_head, xfs_iunlink_hash_params);
1990        /*
1991         * Fail loudly if there already was an entry because that's a sign of
1992         * corruption of in-memory data.  Also fail loudly if we see an error
1993         * code we didn't anticipate from the rhashtable code.  Currently we
1994         * only anticipate ENOMEM.
1995         */
1996        if (error) {
1997                WARN(error != -ENOMEM, "iunlink cache insert error %d", error);
1998                kmem_free(iu);
1999        }
2000        /*
2001         * Absorb any runtime errors that aren't a result of corruption because
2002         * this is a cache and we can always fall back to bucket list scanning.
2003         */
2004        if (error != 0 && error != -EEXIST)
2005                error = 0;
2006        return error;
2007}
2008
2009/* Remember that @prev_agino.next_unlinked = @this_agino. */
2010static int
2011xfs_iunlink_add_backref(
2012        struct xfs_perag        *pag,
2013        xfs_agino_t             prev_agino,
2014        xfs_agino_t             this_agino)
2015{
2016        struct xfs_iunlink      *iu;
2017
2018        if (XFS_TEST_ERROR(false, pag->pag_mount, XFS_ERRTAG_IUNLINK_FALLBACK))
2019                return 0;
2020
2021        iu = kmem_zalloc(sizeof(*iu), KM_SLEEP | KM_NOFS);
2022        iu->iu_agino = prev_agino;
2023        iu->iu_next_unlinked = this_agino;
2024
2025        return xfs_iunlink_insert_backref(pag, iu);
2026}
2027
2028/*
2029 * Replace X.next_unlinked = @agino with X.next_unlinked = @next_unlinked.
2030 * If @next_unlinked is NULLAGINO, we drop the backref and exit.  If there
2031 * wasn't any such entry then we don't bother.
2032 */
2033static int
2034xfs_iunlink_change_backref(
2035        struct xfs_perag        *pag,
2036        xfs_agino_t             agino,
2037        xfs_agino_t             next_unlinked)
2038{
2039        struct xfs_iunlink      *iu;
2040        int                     error;
2041
2042        /* Look up the old entry; if there wasn't one then exit. */
2043        iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &agino,
2044                        xfs_iunlink_hash_params);
2045        if (!iu)
2046                return 0;
2047
2048        /*
2049         * Remove the entry.  This shouldn't ever return an error, but if we
2050         * couldn't remove the old entry we don't want to add it again to the
2051         * hash table, and if the entry disappeared on us then someone's
2052         * violated the locking rules and we need to fail loudly.  Either way
2053         * we cannot remove the inode because internal state is or would have
2054         * been corrupt.
2055         */
2056        error = rhashtable_remove_fast(&pag->pagi_unlinked_hash,
2057                        &iu->iu_rhash_head, xfs_iunlink_hash_params);
2058        if (error)
2059                return error;
2060
2061        /* If there is no new next entry just free our item and return. */
2062        if (next_unlinked == NULLAGINO) {
2063                kmem_free(iu);
2064                return 0;
2065        }
2066
2067        /* Update the entry and re-add it to the hash table. */
2068        iu->iu_next_unlinked = next_unlinked;
2069        return xfs_iunlink_insert_backref(pag, iu);
2070}
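
/*
 * Worked example (added for illustration) for the helpers above, given the
 * chain P -> I -> N with I being unlinked:
 *
 *     xfs_iunlink_change_backref(pag, N, NULLAGINO) deletes the record
 *         "I.next_unlinked = N", since I is leaving the list;
 *     xfs_iunlink_change_backref(pag, I, N) rewrites the record
 *         "P.next_unlinked = I" into "P.next_unlinked = N".
 */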
2071
2072/* Set up the in-core predecessor structures. */
2073int
2074xfs_iunlink_init(
2075        struct xfs_perag        *pag)
2076{
2077        return rhashtable_init(&pag->pagi_unlinked_hash,
2078                        &xfs_iunlink_hash_params);
2079}
2080
2081/* Free the in-core predecessor structures. */
2082static void
2083xfs_iunlink_free_item(
2084        void                    *ptr,
2085        void                    *arg)
2086{
2087        struct xfs_iunlink      *iu = ptr;
2088        bool                    *freed_anything = arg;
2089
2090        *freed_anything = true;
2091        kmem_free(iu);
2092}
2093
2094void
2095xfs_iunlink_destroy(
2096        struct xfs_perag        *pag)
2097{
2098        bool                    freed_anything = false;
2099
2100        rhashtable_free_and_destroy(&pag->pagi_unlinked_hash,
2101                        xfs_iunlink_free_item, &freed_anything);
2102
2103        ASSERT(freed_anything == false || XFS_FORCED_SHUTDOWN(pag->pag_mount));
2104}
2105
2106/*
2107 * Point the AGI unlinked bucket at an inode and log the results.  The caller
2108 * is responsible for validating the old value.
2109 */
2110STATIC int
2111xfs_iunlink_update_bucket(
2112        struct xfs_trans        *tp,
2113        xfs_agnumber_t          agno,
2114        struct xfs_buf          *agibp,
2115        unsigned int            bucket_index,
2116        xfs_agino_t             new_agino)
2117{
2118        struct xfs_agi          *agi = XFS_BUF_TO_AGI(agibp);
2119        xfs_agino_t             old_value;
2120        int                     offset;
2121
2122        ASSERT(xfs_verify_agino_or_null(tp->t_mountp, agno, new_agino));
2123
2124        old_value = be32_to_cpu(agi->agi_unlinked[bucket_index]);
2125        trace_xfs_iunlink_update_bucket(tp->t_mountp, agno, bucket_index,
2126                        old_value, new_agino);
2127
2128        /*
2129         * We should never find the head of the list already set to the value
2130         * passed in because either we're adding or removing ourselves from the
2131         * head of the list.
2132         */
2133        if (old_value == new_agino)
2134                return -EFSCORRUPTED;
2135
2136        agi->agi_unlinked[bucket_index] = cpu_to_be32(new_agino);
2137        offset = offsetof(struct xfs_agi, agi_unlinked) +
2138                        (sizeof(xfs_agino_t) * bucket_index);
2139        xfs_trans_log_buf(tp, agibp, offset, offset + sizeof(xfs_agino_t) - 1);
2140        return 0;
2141}
2142
2143/* Set an on-disk inode's next_unlinked pointer. */
2144STATIC void
2145xfs_iunlink_update_dinode(
2146        struct xfs_trans        *tp,
2147        xfs_agnumber_t          agno,
2148        xfs_agino_t             agino,
2149        struct xfs_buf          *ibp,
2150        struct xfs_dinode       *dip,
2151        struct xfs_imap         *imap,
2152        xfs_agino_t             next_agino)
2153{
2154        struct xfs_mount        *mp = tp->t_mountp;
2155        int                     offset;
2156
2157        ASSERT(xfs_verify_agino_or_null(mp, agno, next_agino));
2158
2159        trace_xfs_iunlink_update_dinode(mp, agno, agino,
2160                        be32_to_cpu(dip->di_next_unlinked), next_agino);
2161
2162        dip->di_next_unlinked = cpu_to_be32(next_agino);
2163        offset = imap->im_boffset +
2164                        offsetof(struct xfs_dinode, di_next_unlinked);
2165
2166        /* need to recalc the inode CRC if appropriate */
2167        xfs_dinode_calc_crc(mp, dip);
2168        xfs_trans_inode_buf(tp, ibp);
2169        xfs_trans_log_buf(tp, ibp, offset, offset + sizeof(xfs_agino_t) - 1);
2170        xfs_inobp_check(mp, ibp);
2171}
2172
2173/* Set an in-core inode's unlinked pointer and return the old value. */
2174STATIC int
2175xfs_iunlink_update_inode(
2176        struct xfs_trans        *tp,
2177        struct xfs_inode        *ip,
2178        xfs_agnumber_t          agno,
2179        xfs_agino_t             next_agino,
2180        xfs_agino_t             *old_next_agino)
2181{
2182        struct xfs_mount        *mp = tp->t_mountp;
2183        struct xfs_dinode       *dip;
2184        struct xfs_buf          *ibp;
2185        xfs_agino_t             old_value;
2186        int                     error;
2187
2188        ASSERT(xfs_verify_agino_or_null(mp, agno, next_agino));
2189
2190        error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp, 0, 0);
2191        if (error)
2192                return error;
2193
2194        /* Make sure the old pointer isn't garbage. */
2195        old_value = be32_to_cpu(dip->di_next_unlinked);
2196        if (!xfs_verify_agino_or_null(mp, agno, old_value)) {
2197                error = -EFSCORRUPTED;
2198                goto out;
2199        }
2200
2201        /*
2202         * Since we're updating a linked list, we should never find that the
2203         * current pointer is the same as the new value, unless we're
2204         * terminating the list.
2205         */
2206        *old_next_agino = old_value;
2207        if (old_value == next_agino) {
2208                if (next_agino != NULLAGINO)
2209                        error = -EFSCORRUPTED;
2210                goto out;
2211        }
2212
2213        /* Ok, update the new pointer. */
2214        xfs_iunlink_update_dinode(tp, agno, XFS_INO_TO_AGINO(mp, ip->i_ino),
2215                        ibp, dip, &ip->i_imap, next_agino);
2216        return 0;
2217out:
2218        xfs_trans_brelse(tp, ibp);
2219        return error;
2220}
2221
2222/*
2223 * This is called when the inode's link count has gone to 0 or we are creating
2224 * a tmpfile via O_TMPFILE.  The inode @ip must have nlink == 0.
2225 *
2226 * We place the on-disk inode on a list in the AGI.  It will be pulled from this
2227 * list when the inode is freed.
2228 */
2229STATIC int
2230xfs_iunlink(
2231        struct xfs_trans        *tp,
2232        struct xfs_inode        *ip)
2233{
2234        struct xfs_mount        *mp = tp->t_mountp;
2235        struct xfs_agi          *agi;
2236        struct xfs_buf          *agibp;
2237        xfs_agino_t             next_agino;
2238        xfs_agnumber_t          agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
2239        xfs_agino_t             agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
2240        short                   bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
2241        int                     error;
2242
2243        ASSERT(VFS_I(ip)->i_nlink == 0);
2244        ASSERT(VFS_I(ip)->i_mode != 0);
2245        trace_xfs_iunlink(ip);
2246
2247        /* Get the agi buffer first.  It ensures lock ordering on the list. */
2248        error = xfs_read_agi(mp, tp, agno, &agibp);
2249        if (error)
2250                return error;
2251        agi = XFS_BUF_TO_AGI(agibp);
2252
2253        /*
2254         * Get the index into the agi hash table for the list this inode will
2255         * go on.  Make sure the pointer isn't garbage and that this inode
2256         * isn't already on the list.
2257         */
2258        next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
2259        if (next_agino == agino ||
2260            !xfs_verify_agino_or_null(mp, agno, next_agino))
2261                return -EFSCORRUPTED;
2262
2263        if (next_agino != NULLAGINO) {
2264                struct xfs_perag        *pag;
2265                xfs_agino_t             old_agino;
2266
2267                /*
2268                 * There is already another inode in the bucket, so point this
2269                 * inode to the current head of the list.
2270                 */
2271                error = xfs_iunlink_update_inode(tp, ip, agno, next_agino,
2272                                &old_agino);
2273                if (error)
2274                        return error;
2275                ASSERT(old_agino == NULLAGINO);
2276
2277                /*
2278                 * agino has been unlinked, add a backref from the next inode
2279                 * back to agino.
2280                 */
2281                pag = xfs_perag_get(mp, agno);
2282                error = xfs_iunlink_add_backref(pag, agino, next_agino);
2283                xfs_perag_put(pag);
2284                if (error)
2285                        return error;
2286        }
2287
2288        /* Point the head of the list at this inode. */
2289        return xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index, agino);
2290}
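
/*
 * Illustrative before/after (added) for xfs_iunlink() above, inserting inode
 * I at the head of bucket b == I's agino % XFS_AGI_UNLINKED_BUCKETS:
 *
 *     before:  agi_unlinked[b] -> H -> ... -> NULLAGINO
 *     after:   agi_unlinked[b] -> I -> H -> ... -> NULLAGINO
 *
 * If a previous head H existed, the backref cache gains the record
 * "I.next_unlinked = H" so a later removal of H can find I without a walk.
 */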
2291
2292/* Return the imap, dinode pointer, and buffer for an inode. */
2293STATIC int
2294xfs_iunlink_map_ino(
2295        struct xfs_trans        *tp,
2296        xfs_agnumber_t          agno,
2297        xfs_agino_t             agino,
2298        struct xfs_imap         *imap,
2299        struct xfs_dinode       **dipp,
2300        struct xfs_buf          **bpp)
2301{
2302        struct xfs_mount        *mp = tp->t_mountp;
2303        int                     error;
2304
2305        imap->im_blkno = 0;
2306        error = xfs_imap(mp, tp, XFS_AGINO_TO_INO(mp, agno, agino), imap, 0);
2307        if (error) {
2308                xfs_warn(mp, "%s: xfs_imap returned error %d.",
2309                                __func__, error);
2310                return error;
2311        }
2312
2313        error = xfs_imap_to_bp(mp, tp, imap, dipp, bpp, 0, 0);
2314        if (error) {
2315                xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.",
2316                                __func__, error);
2317                return error;
2318        }
2319
2320        return 0;
2321}
2322
2323/*
2324 * Walk the unlinked chain from @head_agino until we find the inode that
2325 * points to @target_agino.  Return the inode number, map, dinode pointer,
2326 * and inode cluster buffer of that inode as @agino, @imap, @dipp, and @bpp.
2327 *
2328 * @tp, @pag, @head_agino, and @target_agino are input parameters.
2329 * @agino, @imap, @dipp, and @bpp are all output parameters.
2330 *
2331 * Do not call this function if @target_agino is the head of the list.
2332 */
2333STATIC int
2334xfs_iunlink_map_prev(
2335        struct xfs_trans        *tp,
2336        xfs_agnumber_t          agno,
2337        xfs_agino_t             head_agino,
2338        xfs_agino_t             target_agino,
2339        xfs_agino_t             *agino,
2340        struct xfs_imap         *imap,
2341        struct xfs_dinode       **dipp,
2342        struct xfs_buf          **bpp,
2343        struct xfs_perag        *pag)
2344{
2345        struct xfs_mount        *mp = tp->t_mountp;
2346        xfs_agino_t             next_agino;
2347        int                     error;
2348
2349        ASSERT(head_agino != target_agino);
2350        *bpp = NULL;
2351
2352        /* See if our backref cache can find it faster. */
2353        *agino = xfs_iunlink_lookup_backref(pag, target_agino);
2354        if (*agino != NULLAGINO) {
2355                error = xfs_iunlink_map_ino(tp, agno, *agino, imap, dipp, bpp);
2356                if (error)
2357                        return error;
2358
2359                if (be32_to_cpu((*dipp)->di_next_unlinked) == target_agino)
2360                        return 0;
2361
2362                /*
2363                 * If we get here the cache contents were corrupt, so drop the
2364                 * buffer and fall back to walking the bucket list.
2365                 */
2366                xfs_trans_brelse(tp, *bpp);
2367                *bpp = NULL;
2368                WARN_ON_ONCE(1);
2369        }
2370
2371        trace_xfs_iunlink_map_prev_fallback(mp, agno);
2372
2373        /* Otherwise, walk the entire bucket until we find it. */
2374        next_agino = head_agino;
2375        while (next_agino != target_agino) {
2376                xfs_agino_t     unlinked_agino;
2377
2378                if (*bpp)
2379                        xfs_trans_brelse(tp, *bpp);
2380
2381                *agino = next_agino;
2382                error = xfs_iunlink_map_ino(tp, agno, next_agino, imap, dipp,
2383                                bpp);
2384                if (error)
2385                        return error;
2386
2387                unlinked_agino = be32_to_cpu((*dipp)->di_next_unlinked);
2388                /*
2389                 * Make sure this pointer is valid and isn't an obvious
2390                 * infinite loop.
2391                 */
2392                if (!xfs_verify_agino(mp, agno, unlinked_agino) ||
2393                    next_agino == unlinked_agino) {
2394                        XFS_CORRUPTION_ERROR(__func__,
2395                                        XFS_ERRLEVEL_LOW, mp,
2396                                        *dipp, sizeof(**dipp));
2397                        error = -EFSCORRUPTED;
2398                        return error;
2399                }
2400                next_agino = unlinked_agino;
2401        }
2402
2403        return 0;
2404}
2405
2406/*
2407 * Pull the on-disk inode from the AGI unlinked list.
2408 */
2409STATIC int
2410xfs_iunlink_remove(
2411        struct xfs_trans        *tp,
2412        struct xfs_inode        *ip)
2413{
2414        struct xfs_mount        *mp = tp->t_mountp;
2415        struct xfs_agi          *agi;
2416        struct xfs_buf          *agibp;
2417        struct xfs_buf          *last_ibp;
2418        struct xfs_dinode       *last_dip = NULL;
2419        struct xfs_perag        *pag = NULL;
2420        xfs_agnumber_t          agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
2421        xfs_agino_t             agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
2422        xfs_agino_t             next_agino;
2423        xfs_agino_t             head_agino;
2424        short                   bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
2425        int                     error;
2426
2427        trace_xfs_iunlink_remove(ip);
2428
2429        /* Get the agi buffer first.  It ensures lock ordering on the list. */
2430        error = xfs_read_agi(mp, tp, agno, &agibp);
2431        if (error)
2432                return error;
2433        agi = XFS_BUF_TO_AGI(agibp);
2434
2435        /*
2436         * Get the index into the agi hash table for the list this inode is
2437         * on.  Make sure the head pointer isn't garbage.
2438         */
2439        head_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
2440        if (!xfs_verify_agino(mp, agno, head_agino)) {
2441                XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
2442                                agi, sizeof(*agi));
2443                return -EFSCORRUPTED;
2444        }
2445
2446        /*
2447         * Set our inode's next_unlinked pointer to NULLAGINO and then return
2448         * the old pointer value so that we can update whatever was previous
2449         * to us in the list to point to whatever was next in the list.
2450         */
2451        error = xfs_iunlink_update_inode(tp, ip, agno, NULLAGINO, &next_agino);
2452        if (error)
2453                return error;
2454
2455        /*
2456         * If there was a backref pointing from the next inode back to this
2457         * one, remove it because we've removed this inode from the list.
2458         *
2459         * Later, if this inode was in the middle of the list we'll update
2460         * this inode's backref to point from the next inode.
2461         */
2462        if (next_agino != NULLAGINO) {
2463                pag = xfs_perag_get(mp, agno);
2464                error = xfs_iunlink_change_backref(pag, next_agino,
2465                                NULLAGINO);
2466                if (error)
2467                        goto out;
2468        }
2469
2470        if (head_agino == agino) {
2471                /* Point the head of the list to the next unlinked inode. */
2472                error = xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index,
2473                                next_agino);
2474                if (error)
2475                        goto out;
2476        } else {
2477                struct xfs_imap imap;
2478                xfs_agino_t     prev_agino;
2479
2480                if (!pag)
2481                        pag = xfs_perag_get(mp, agno);
2482
2483                /* We need to search the list for the inode being freed. */
2484                error = xfs_iunlink_map_prev(tp, agno, head_agino, agino,
2485                                &prev_agino, &imap, &last_dip, &last_ibp,
2486                                pag);
2487                if (error)
2488                        goto out;
2489
2490                /* Point the previous inode on the list to the next inode. */
2491                xfs_iunlink_update_dinode(tp, agno, prev_agino, last_ibp,
2492                                last_dip, &imap, next_agino);
2493
2494                /*
2495                 * Now we deal with the backref for this inode.  If this inode
2496                 * pointed at a real inode, change the backref that pointed to
2497                 * us to point to our old next.  If this inode was the end of
2498                 * the list, delete the backref that pointed to us.  Note that
2499                 * change_backref takes care of deleting the backref if
2500                 * next_agino is NULLAGINO.
2501                 */
2502                error = xfs_iunlink_change_backref(pag, agino, next_agino);
2503                if (error)
2504                        goto out;
2505        }
2506
2507out:
2508        if (pag)
2509                xfs_perag_put(pag);
2510        return error;
2511}
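
/*
 * Illustrative summary (added) of xfs_iunlink_remove() above for the chain
 * P -> I -> N, removing I:
 *
 *     head case (no P):   agi_unlinked[b] -> N
 *     middle case:        P.next_unlinked = N, with P found via the backref
 *                         cache or, failing that, a walk from the head
 *
 * In both cases I's own next_unlinked is reset to NULLAGINO first, and the
 * backref records are updated so the cache keeps describing the new chain.
 */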
2512
2513/*
2514 * A big issue when freeing the inode cluster is that we _cannot_ skip any
2515 * inodes that are in memory - they all must be marked stale and attached to
2516 * the cluster buffer.
2517 */
2518STATIC int
2519xfs_ifree_cluster(
2520        xfs_inode_t             *free_ip,
2521        xfs_trans_t             *tp,
2522        struct xfs_icluster     *xic)
2523{
2524        xfs_mount_t             *mp = free_ip->i_mount;
2525        int                     nbufs;
2526        int                     i, j;
2527        int                     ioffset;
2528        xfs_daddr_t             blkno;
2529        xfs_buf_t               *bp;
2530        xfs_inode_t             *ip;
2531        xfs_inode_log_item_t    *iip;
2532        struct xfs_log_item     *lip;
2533        struct xfs_perag        *pag;
2534        struct xfs_ino_geometry *igeo = M_IGEO(mp);
2535        xfs_ino_t               inum;
2536
2537        inum = xic->first_ino;
2538        pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum));
2539        nbufs = igeo->ialloc_blks / igeo->blocks_per_cluster;
2540
2541        for (j = 0; j < nbufs; j++, inum += igeo->inodes_per_cluster) {
2542                /*
2543                 * The allocation bitmap tells us which inodes of the chunk were
2544                 * physically allocated. Skip the cluster if an inode falls into
2545                 * a sparse region.
2546                 */
2547                ioffset = inum - xic->first_ino;
2548                if ((xic->alloc & XFS_INOBT_MASK(ioffset)) == 0) {
2549                        ASSERT(ioffset % igeo->inodes_per_cluster == 0);
2550                        continue;
2551                }
2552
2553                blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum),
2554                                         XFS_INO_TO_AGBNO(mp, inum));
2555
2556                /*
2557                 * We obtain and lock the backing buffer first in the process
2558                 * here, as we have to ensure that any dirty inode that we
2559                 * can't get the flush lock on is attached to the buffer.
2560                 * If we scan the in-memory inodes first, then buffer IO can
2561                 * complete before we get a lock on it, and hence we may fail
2562                 * to mark all the active inodes on the buffer stale.
2563                 */
2564                bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
2565                                        mp->m_bsize * igeo->blocks_per_cluster,
2566                                        XBF_UNMAPPED);
2567
2568                if (!bp) {
                        /* Don't leak the perag reference taken above. */
                        xfs_perag_put(pag);
                        return -ENOMEM;
                }
2570
2571                /*
2572                 * This buffer may not have been correctly initialised as we
2573                 * didn't read it from disk. That's not important because we are
2574                 * only using it to mark the buffer as stale in the log, and to
2575                 * attach stale cached inodes on it. That means it will never be
2576                 * dispatched for IO. If it is, we want to know about it, and we
2577                 * want it to fail. We can achieve this by adding a write
2578                 * verifier to the buffer.
2579                 */
2580                bp->b_ops = &xfs_inode_buf_ops;
2581
2582                /*
2583                 * Walk the inodes already attached to the buffer and mark them
2584                 * stale. These will all have the flush locks held, so an
2585                 * in-memory inode walk can't lock them. By marking them all
2586                 * stale first, we will not attempt to lock them in the loop
2587                 * below as the XFS_ISTALE flag will be set.
2588                 */
2589                list_for_each_entry(lip, &bp->b_li_list, li_bio_list) {
2590                        if (lip->li_type == XFS_LI_INODE) {
2591                                iip = (xfs_inode_log_item_t *)lip;
2592                                ASSERT(iip->ili_logged == 1);
2593                                lip->li_cb = xfs_istale_done;
2594                                xfs_trans_ail_copy_lsn(mp->m_ail,
2595                                                        &iip->ili_flush_lsn,
2596                                                        &iip->ili_item.li_lsn);
2597                                xfs_iflags_set(iip->ili_inode, XFS_ISTALE);
2598                        }
2599                }
2600
2601
2602                /*
2603                 * For each inode in memory attempt to add it to the inode
2604                 * buffer and set it up for being staled on buffer IO
2605                 * completion.  This is safe as we've locked out tail pushing
2606                 * and flushing by locking the buffer.
2607                 *
2608                 * We have already marked every inode that was part of a
2609                 * transaction stale above, which means there is no point in
2610                 * even trying to lock them.
2611                 */
2612                for (i = 0; i < igeo->inodes_per_cluster; i++) {
2613retry:
2614                        rcu_read_lock();
2615                        ip = radix_tree_lookup(&pag->pag_ici_root,
2616                                        XFS_INO_TO_AGINO(mp, (inum + i)));
2617
2618                        /* Inode not in memory, nothing to do */
2619                        if (!ip) {
2620                                rcu_read_unlock();
2621                                continue;
2622                        }
2623
2624                        /*
2625                         * because this is an RCU protected lookup, we could
2626                         * find a recently freed or even reallocated inode
2627                         * during the lookup. We need to check under the
2628                         * i_flags_lock for a valid inode here. Skip it if it
2629                         * is not valid, the wrong inode or stale.
2630                         */
2631                        spin_lock(&ip->i_flags_lock);
2632                        if (ip->i_ino != inum + i ||
2633                            __xfs_iflags_test(ip, XFS_ISTALE)) {
2634                                spin_unlock(&ip->i_flags_lock);
2635                                rcu_read_unlock();
2636                                continue;
2637                        }
2638                        spin_unlock(&ip->i_flags_lock);
2639
2640                        /*
2641                         * Don't try to lock/unlock the current inode, but we
2642                         * _cannot_ skip the other inodes that we did not find
2643                         * in the list attached to the buffer and are not
2644                         * already marked stale. If we can't lock it, back off
2645                         * and retry.
2646                         */
2647                        if (ip != free_ip) {
2648                                if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
2649                                        rcu_read_unlock();
2650                                        delay(1);
2651                                        goto retry;
2652                                }
2653
2654                                /*
2655                                 * Check the inode number again in case we're
2656                                 * racing with freeing in xfs_reclaim_inode().
2657                                 * See the comments in that function for more
2658                                 * information as to why the initial check is
2659                                 * not sufficient.
2660                                 */
2661                                if (ip->i_ino != inum + i) {
2662                                        xfs_iunlock(ip, XFS_ILOCK_EXCL);
2663                                        rcu_read_unlock();
2664                                        continue;
2665                                }
2666                        }
2667                        rcu_read_unlock();
2668
2669                        xfs_iflock(ip);
2670                        xfs_iflags_set(ip, XFS_ISTALE);
2671
2672                        /*
2673                         * we don't need to attach clean inodes or those only
2674                         * with unlogged changes (which we throw away, anyway).
2675                         */
2676                        iip = ip->i_itemp;
2677                        if (!iip || xfs_inode_clean(ip)) {
2678                                ASSERT(ip != free_ip);
2679                                xfs_ifunlock(ip);
2680                                xfs_iunlock(ip, XFS_ILOCK_EXCL);
2681                                continue;
2682                        }
2683
2684                        iip->ili_last_fields = iip->ili_fields;
2685                        iip->ili_fields = 0;
2686                        iip->ili_fsync_fields = 0;
2687                        iip->ili_logged = 1;
2688                        xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
2689                                                &iip->ili_item.li_lsn);
2690
2691                        xfs_buf_attach_iodone(bp, xfs_istale_done,
2692                                                  &iip->ili_item);
2693
2694                        if (ip != free_ip)
2695                                xfs_iunlock(ip, XFS_ILOCK_EXCL);
2696                }
2697
2698                xfs_trans_stale_inode_buf(tp, bp);
2699                xfs_trans_binval(tp, bp);
2700        }
2701
2702        xfs_perag_put(pag);
2703        return 0;
2704}
2705
2706/*
2707 * Free any local-format buffers sitting around before we reset to
2708 * extents format.
2709 */
2710static inline void
2711xfs_ifree_local_data(
2712        struct xfs_inode        *ip,
2713        int                     whichfork)
2714{
2715        struct xfs_ifork        *ifp;
2716
2717        if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL)
2718                return;
2719
2720        ifp = XFS_IFORK_PTR(ip, whichfork);
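        /* Shrinking by if_bytes frees the inline (local) fork data entirely. */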
2721        xfs_idata_realloc(ip, -ifp->if_bytes, whichfork);
2722}
2723
2724/*
2725 * This is called to return an inode to the inode free list.
2726 * The inode should already be truncated to 0 length and have
2727 * no pages associated with it.  This routine also assumes that
2728 * the inode is already a part of the transaction.
2729 *
2730 * The on-disk copy of the inode will have been added to the list
2731 * of unlinked inodes in the AGI. We need to remove the inode from
2732 * that list atomically with respect to freeing it here.
2733 */
2734int
2735xfs_ifree(
2736        struct xfs_trans        *tp,
2737        struct xfs_inode        *ip)
2738{
2739        int                     error;
2740        struct xfs_icluster     xic = { 0 };
2741
2742        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
2743        ASSERT(VFS_I(ip)->i_nlink == 0);
2744        ASSERT(ip->i_d.di_nextents == 0);
2745        ASSERT(ip->i_d.di_anextents == 0);
2746        ASSERT(ip->i_d.di_size == 0 || !S_ISREG(VFS_I(ip)->i_mode));
2747        ASSERT(ip->i_d.di_nblocks == 0);
2748
2749        /*
2750         * Pull the on-disk inode from the AGI unlinked list.
2751         */
2752        error = xfs_iunlink_remove(tp, ip);
2753        if (error)
2754                return error;
2755
2756        error = xfs_difree(tp, ip->i_ino, &xic);
2757        if (error)
2758                return error;
2759
2760        xfs_ifree_local_data(ip, XFS_DATA_FORK);
2761        xfs_ifree_local_data(ip, XFS_ATTR_FORK);
2762
2763        VFS_I(ip)->i_mode = 0;          /* mark incore inode as free */
2764        ip->i_d.di_flags = 0;
2765        ip->i_d.di_flags2 = 0;
2766        ip->i_d.di_dmevmask = 0;
2767        ip->i_d.di_forkoff = 0;         /* mark the attr fork not in use */
2768        ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS;
2769        ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
2770
2771        /* Don't attempt to replay owner changes for a deleted inode */
2772        ip->i_itemp->ili_fields &= ~(XFS_ILOG_AOWNER|XFS_ILOG_DOWNER);
2773
2774        /*
2775         * Bump the generation count so no one will be confused
2776         * by reincarnations of this inode.
2777         */
2778        VFS_I(ip)->i_generation++;
2779        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
2780
2781        if (xic.deleted)
2782                error = xfs_ifree_cluster(ip, tp, &xic);
2783
2784        return error;
2785}
2786
2787/*
2788 * This is called to unpin an inode.  The caller must have the inode locked
2789 * in at least shared mode so that the buffer cannot be subsequently pinned
2790 * once someone is waiting for it to be unpinned.
2791 */
2792static void
2793xfs_iunpin(
2794        struct xfs_inode        *ip)
2795{
2796        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
2797
2798        trace_xfs_inode_unpin_nowait(ip, _RET_IP_);
2799
2800        /* Give the log a push to start the unpinning I/O */
2801        xfs_log_force_lsn(ip->i_mount, ip->i_itemp->ili_last_lsn, 0, NULL);
2803}
2804
2805static void
2806__xfs_iunpin_wait(
2807        struct xfs_inode        *ip)
2808{
2809        wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IPINNED_BIT);
2810        DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IPINNED_BIT);
2811
2812        xfs_iunpin(ip);
2813
2814        do {
2815                prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
2816                if (xfs_ipincount(ip))
2817                        io_schedule();
2818        } while (xfs_ipincount(ip));
2819        finish_wait(wq, &wait.wq_entry);
2820}
2821
2822void
2823xfs_iunpin_wait(
2824        struct xfs_inode        *ip)
2825{
2826        if (xfs_ipincount(ip))
2827                __xfs_iunpin_wait(ip);
2828}
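
/*
 * Typical use, sketched: a writeback path takes the ILOCK (shared is
 * sufficient per the comment above), drains any log pin, and only then
 * flushes the inode:
 *
 *	xfs_ilock(ip, XFS_ILOCK_SHARED);
 *	xfs_iunpin_wait(ip);
 *	... inode is now unpinned and safe to write back ...
 */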
2829
2830/*
2831 * Removing an inode from the namespace involves removing the directory entry
2832 * and dropping the link count on the inode. Removing the directory entry can
2833 * result in locking an AGF (directory blocks were freed) and removing a link
2834 * count can result in placing the inode on an unlinked list which results in
2835 * locking an AGI.
2836 *
2837 * The big problem here is that we have an ordering constraint on AGF and AGI
2838 * locking - inode allocation locks the AGI, then can allocate a new extent for
2839 * new inodes, locking the AGF after the AGI. Similarly, freeing the inode
2840 * removes the inode from the unlinked list, requiring that we lock the AGI
2841 * first, and then freeing the inode can result in an inode chunk being freed
2842 * and hence freeing disk space requiring that we lock an AGF.
2843 *
2844 * Hence the ordering that is imposed by other parts of the code is AGI before
2845 * AGF. This means we cannot remove the directory entry before we drop the inode
2846 * reference count and put it on the unlinked list as this results in a lock
2847 * order of AGF then AGI, and this can deadlock against inode allocation and
2848 * freeing. Therefore we must drop the link counts before we remove the
2849 * directory entry.
2850 *
2851 * This is still safe from a transactional point of view - it is not until we
2852 * get to xfs_defer_finish() that we have the possibility of multiple
2853 * transactions in this operation. Hence as long as we remove the directory
2854 * entry and drop the link count in the first transaction of the remove
2855 * operation, there are no transactional constraints on the ordering here.
2856 */
2857int
2858xfs_remove(
2859        xfs_inode_t             *dp,
2860        struct xfs_name         *name,
2861        xfs_inode_t             *ip)
2862{
2863        xfs_mount_t             *mp = dp->i_mount;
2864        xfs_trans_t             *tp = NULL;
2865        int                     is_dir = S_ISDIR(VFS_I(ip)->i_mode);
2866        int                     error = 0;
2867        uint                    resblks;
2868
2869        trace_xfs_remove(dp, name);
2870
2871        if (XFS_FORCED_SHUTDOWN(mp))
2872                return -EIO;
2873
2874        error = xfs_qm_dqattach(dp);
2875        if (error)
2876                goto std_return;
2877
2878        error = xfs_qm_dqattach(ip);
2879        if (error)
2880                goto std_return;
2881
2882        /*
2883         * Try to get the real space reservation first, allowing
2884         * for directory btree deletion(s) and the possible bmap
2885         * insert(s) they imply.  If we can't get the space
2886         * reservation then we use 0 instead, and avoid the bmap
2887         * btree insert(s) in the directory code by trimming the
2888         * LAST block from the directory whenever a bmap insert
2889         * would otherwise be needed.
2890         */
2891        resblks = XFS_REMOVE_SPACE_RES(mp);
2892        error = xfs_trans_alloc(mp, &M_RES(mp)->tr_remove, resblks, 0, 0, &tp);
2893        if (error == -ENOSPC) {
2894                resblks = 0;
2895                error = xfs_trans_alloc(mp, &M_RES(mp)->tr_remove, 0, 0, 0,
2896                                &tp);
2897        }
2898        if (error) {
2899                ASSERT(error != -ENOSPC);
2900                goto std_return;
2901        }
2902
2903        xfs_lock_two_inodes(dp, XFS_ILOCK_EXCL, ip, XFS_ILOCK_EXCL);
2904
2905        xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
2906        xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
2907
2908        /*
2909         * If we're removing a directory perform some additional validation.
2910         */
2911        if (is_dir) {
2912                ASSERT(VFS_I(ip)->i_nlink >= 2);
2913                if (VFS_I(ip)->i_nlink != 2) {
2914                        error = -ENOTEMPTY;
2915                        goto out_trans_cancel;
2916                }
2917                if (!xfs_dir_isempty(ip)) {
2918                        error = -ENOTEMPTY;
2919                        goto out_trans_cancel;
2920                }
2921
2922                /* Drop the link from ip's "..".  */
2923                error = xfs_droplink(tp, dp);
2924                if (error)
2925                        goto out_trans_cancel;
2926
2927                /* Drop the "." link from ip to self.  */
2928                error = xfs_droplink(tp, ip);
2929                if (error)
2930                        goto out_trans_cancel;
2931        } else {
2932                /*
2933                 * When removing a non-directory we need to log the parent
2934                 * inode here.  For a directory this is done implicitly
2935                 * by the xfs_droplink call for the ".." entry.
2936                 */
2937                xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
2938        }
2939        xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2940
2941        /* Drop the link from dp to ip. */
2942        error = xfs_droplink(tp, ip);
2943        if (error)
2944                goto out_trans_cancel;
2945
2946        error = xfs_dir_removename(tp, dp, name, ip->i_ino, resblks);
2947        if (error) {
2948                ASSERT(error != -ENOENT);
2949                goto out_trans_cancel;
2950        }
2951
2952        /*
2953         * If this is a synchronous mount, make sure that the
2954         * remove transaction goes to disk before returning to
2955         * the user.
2956         */
2957        if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
2958                xfs_trans_set_sync(tp);
2959
2960        error = xfs_trans_commit(tp);
2961        if (error)
2962                goto std_return;
2963
2964        if (is_dir && xfs_inode_is_filestream(ip))
2965                xfs_filestream_deassociate(ip);
2966
2967        return 0;
2968
2969 out_trans_cancel:
2970        xfs_trans_cancel(tp);
2971 std_return:
2972        return error;
2973}
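
/*
 * To make the AGI/AGF ordering described before xfs_remove() concrete,
 * these are the two lock chains that must never run in opposite orders:
 *
 *	inode allocation:	AGI (inode chunk)   ->  AGF (new extent)
 *	inode unlink/free:	AGI (unlinked list) ->  AGF (block free)
 *
 * Dropping the link count (which may take the AGI) before
 * xfs_dir_removename() (which may take an AGF) keeps xfs_remove() in
 * AGI -> AGF order as well.
 */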
2974
2975/*
2976 * Enter all inodes for a rename transaction into a sorted array.
2977 */
2978#define __XFS_SORT_INODES       5
2979STATIC void
2980xfs_sort_for_rename(
2981        struct xfs_inode        *dp1,   /* in: old (source) directory inode */
2982        struct xfs_inode        *dp2,   /* in: new (target) directory inode */
2983        struct xfs_inode        *ip1,   /* in: inode of old entry */
2984        struct xfs_inode        *ip2,   /* in: inode of new entry */
2985        struct xfs_inode        *wip,   /* in: whiteout inode */
2986        struct xfs_inode        **i_tab,/* out: sorted array of inodes */
2987        int                     *num_inodes)  /* in/out: inodes in array */
2988{
2989        int                     i, j;
2990
2991        ASSERT(*num_inodes == __XFS_SORT_INODES);
2992        memset(i_tab, 0, *num_inodes * sizeof(struct xfs_inode *));
2993
2994        /*
2995         * i_tab contains a list of pointers to inodes.  We initialize
2996         * the table here and then sort it; the sorted order is used
2997         * to order the acquisition of the inode locks.
2998         *
2999         * Note that the table may contain duplicates, e.g. dp1 == dp2.
3000         */
3001        i = 0;
3002        i_tab[i++] = dp1;
3003        i_tab[i++] = dp2;
3004        i_tab[i++] = ip1;
3005        if (ip2)
3006                i_tab[i++] = ip2;
3007        if (wip)
3008                i_tab[i++] = wip;
3009        *num_inodes = i;
3010
3011        /*
3012         * Sort the elements via bubble sort.  (Remember, there are at
3013         * most 5 elements to sort, so this is adequate.)
3014         */
3015        for (i = 0; i < *num_inodes; i++) {
3016                for (j = 1; j < *num_inodes; j++) {
3017                        if (i_tab[j]->i_ino < i_tab[j-1]->i_ino) {
3018                                struct xfs_inode *temp = i_tab[j];
3019                                i_tab[j] = i_tab[j-1];
3020                                i_tab[j-1] = temp;
3021                        }
3022                }
3023        }
3024}
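
/*
 * Worked example with hypothetical inode numbers: a same-directory
 * rename over an existing target, with dp1 == dp2 == 17, ip1 == 23,
 * ip2 == 9 and no whiteout, yields i_tab == { 17, 17, 23, 9 } with
 * *num_inodes == 4 before sorting and i_tab == { 9, 17, 17, 23 }
 * afterwards, so locks are always taken in ascending inode number
 * order regardless of the direction of the rename.
 */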
3025
3026static int
3027xfs_finish_rename(
3028        struct xfs_trans        *tp)
3029{
3030        /*
3031         * If this is a synchronous mount, make sure that the rename transaction
3032         * goes to disk before returning to the user.
3033         */
3034        if (tp->t_mountp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
3035                xfs_trans_set_sync(tp);
3036
3037        return xfs_trans_commit(tp);
3038}
3039
3040/*
3041 * xfs_cross_rename()
3042 *
3043 * Responsible for handling the RENAME_EXCHANGE flag in renameat2().
3044 */
3045STATIC int
3046xfs_cross_rename(
3047        struct xfs_trans        *tp,
3048        struct xfs_inode        *dp1,
3049        struct xfs_name         *name1,
3050        struct xfs_inode        *ip1,
3051        struct xfs_inode        *dp2,
3052        struct xfs_name         *name2,
3053        struct xfs_inode        *ip2,
3054        int                     spaceres)
3055{
3056        int             error = 0;
3057        int             ip1_flags = 0;
3058        int             ip2_flags = 0;
3059        int             dp2_flags = 0;
3060
3061        /* Swap inode number for dirent in first parent */
3062        error = xfs_dir_replace(tp, dp1, name1, ip2->i_ino, spaceres);
3063        if (error)
3064                goto out_trans_abort;
3065
3066        /* Swap inode number for dirent in second parent */
3067        error = xfs_dir_replace(tp, dp2, name2, ip1->i_ino, spaceres);
3068        if (error)
3069                goto out_trans_abort;
3070
3071        /*
3072         * If we're renaming one or more directories across different parents,
3073         * update the respective ".." entries (and link counts) to match the new
3074         * parents.
3075         */
3076        if (dp1 != dp2) {
3077                dp2_flags = XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
3078
3079                if (S_ISDIR(VFS_I(ip2)->i_mode)) {
3080                        error = xfs_dir_replace(tp, ip2, &xfs_name_dotdot,
3081                                                dp1->i_ino, spaceres);
3082                        if (error)
3083                                goto out_trans_abort;
3084
3085                        /* transfer ip2 ".." reference to dp1 */
3086                        if (!S_ISDIR(VFS_I(ip1)->i_mode)) {
3087                                error = xfs_droplink(tp, dp2);
3088                                if (error)
3089                                        goto out_trans_abort;
3090                                xfs_bumplink(tp, dp1);
3091                        }
3092
3093                        /*
3094                         * Although ip1 isn't changed here, userspace still
3095                         * needs to see a timestamp update so that
3096                         * applications relying on it (such as backup tools)
3097                         * properly notice the change.
3098                         */
3099                        ip1_flags |= XFS_ICHGTIME_CHG;
3100                        ip2_flags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
3101                }
3102
3103                if (S_ISDIR(VFS_I(ip1)->i_mode)) {
3104                        error = xfs_dir_replace(tp, ip1, &xfs_name_dotdot,
3105                                                dp2->i_ino, spaceres);
3106                        if (error)
3107                                goto out_trans_abort;
3108
3109                        /* transfer ip1 ".." reference to dp2 */
3110                        if (!S_ISDIR(VFS_I(ip2)->i_mode)) {
3111                                error = xfs_droplink(tp, dp1);
3112                                if (error)
3113                                        goto out_trans_abort;
3114                                xfs_bumplink(tp, dp2);
3115                        }
3116
3117                        /*
3118                         * Although ip2 isn't changed here, userspace still
3119                         * needs to see a timestamp update so that
3120                         * applications relying on it (such as backup tools)
3121                         * properly notice the change.
3122                         */
3123                        ip1_flags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
3124                        ip2_flags |= XFS_ICHGTIME_CHG;
3125                }
3126        }
3127
3128        if (ip1_flags) {
3129                xfs_trans_ichgtime(tp, ip1, ip1_flags);
3130                xfs_trans_log_inode(tp, ip1, XFS_ILOG_CORE);
3131        }
3132        if (ip2_flags) {
3133                xfs_trans_ichgtime(tp, ip2, ip2_flags);
3134                xfs_trans_log_inode(tp, ip2, XFS_ILOG_CORE);
3135        }
3136        if (dp2_flags) {
3137                xfs_trans_ichgtime(tp, dp2, dp2_flags);
3138                xfs_trans_log_inode(tp, dp2, XFS_ILOG_CORE);
3139        }
3140        xfs_trans_ichgtime(tp, dp1, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
3141        xfs_trans_log_inode(tp, dp1, XFS_ILOG_CORE);
3142        return xfs_finish_rename(tp);
3143
3144out_trans_abort:
3145        xfs_trans_cancel(tp);
3146        return error;
3147}
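
/*
 * For reference, the userspace operation that reaches xfs_cross_rename()
 * is an atomic exchange via renameat2(2). A minimal sketch (both paths
 * must already exist):
 *
 *	#define _GNU_SOURCE
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *
 *	if (renameat2(AT_FDCWD, "a", AT_FDCWD, "b", RENAME_EXCHANGE) < 0)
 *		perror("renameat2");
 */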
3148
3149/*
3150 * xfs_rename_alloc_whiteout()
3151 *
3152 * Return a referenced, unlinked, unlocked inode that can be used as a
3153 * whiteout in a rename transaction. We use a tmpfile inode here so that if we
3154 * crash between allocating the inode and linking it into the rename
3155 * transaction, recovery will free the inode and we won't leak it.
3156 */
3157static int
3158xfs_rename_alloc_whiteout(
3159        struct xfs_inode        *dp,
3160        struct xfs_inode        **wip)
3161{
3162        struct xfs_inode        *tmpfile;
3163        int                     error;
3164
3165        error = xfs_create_tmpfile(dp, S_IFCHR | WHITEOUT_MODE, &tmpfile);
3166        if (error)
3167                return error;
3168
3169        /*
3170         * Prepare the tmpfile inode as if it were created through the VFS.
3171         * Complete the inode setup and flag it as linkable.  nlink is already
3172         * zero, so we can skip the drop_nlink.
3173         */
3174        xfs_setup_iops(tmpfile);
3175        xfs_finish_inode_setup(tmpfile);
3176        VFS_I(tmpfile)->i_state |= I_LINKABLE;
3177
3178        *wip = tmpfile;
3179        return 0;
3180}
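
/*
 * The whiteout path is driven by RENAME_WHITEOUT in renameat2(2), used
 * notably by overlayfs: the source name is atomically replaced by a
 * char-device whiteout while the source inode moves to the target name.
 * A minimal userspace sketch (paths are hypothetical):
 *
 *	if (renameat2(AT_FDCWD, "upper/x", AT_FDCWD, "upper/y",
 *		      RENAME_WHITEOUT) < 0)
 *		perror("renameat2");
 */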
3181
3182/*
3183 * xfs_rename
3184 */
3185int
3186xfs_rename(
3187        struct xfs_inode        *src_dp,
3188        struct xfs_name         *src_name,
3189        struct xfs_inode        *src_ip,
3190        struct xfs_inode        *target_dp,
3191        struct xfs_name         *target_name,
3192        struct xfs_inode        *target_ip,
3193        unsigned int            flags)
3194{
3195        struct xfs_mount        *mp = src_dp->i_mount;
3196        struct xfs_trans        *tp;
3197        struct xfs_inode        *wip = NULL;            /* whiteout inode */
3198        struct xfs_inode        *inodes[__XFS_SORT_INODES];
3199        int                     num_inodes = __XFS_SORT_INODES;
3200        bool                    new_parent = (src_dp != target_dp);
3201        bool                    src_is_directory = S_ISDIR(VFS_I(src_ip)->i_mode);
3202        int                     spaceres;
3203        int                     error;
3204
3205        trace_xfs_rename(src_dp, target_dp, src_name, target_name);
3206
3207        if ((flags & RENAME_EXCHANGE) && !target_ip)
3208                return -EINVAL;
3209
3210        /*
3211         * If we are doing a whiteout operation, allocate the whiteout inode
3212         * we will be placing at the target and ensure the type is set
3213         * appropriately.
3214         */
3215        if (flags & RENAME_WHITEOUT) {
3216                ASSERT(!(flags & (RENAME_NOREPLACE | RENAME_EXCHANGE)));
3217                error = xfs_rename_alloc_whiteout(target_dp, &wip);
3218                if (error)
3219                        return error;
3220
3221                /* setup target dirent info as whiteout */
3222                src_name->type = XFS_DIR3_FT_CHRDEV;
3223        }
3224
3225        xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip, wip,
3226                                inodes, &num_inodes);
3227
3228        spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len);
3229        error = xfs_trans_alloc(mp, &M_RES(mp)->tr_rename, spaceres, 0, 0, &tp);
3230        if (error == -ENOSPC) {
3231                spaceres = 0;
3232                error = xfs_trans_alloc(mp, &M_RES(mp)->tr_rename, 0, 0, 0,
3233                                &tp);
3234        }
3235        if (error)
3236                goto out_release_wip;
3237
3238        /*
3239         * Attach the dquots to the inodes
3240         */
3241        error = xfs_qm_vop_rename_dqattach(inodes);
3242        if (error)
3243                goto out_trans_cancel;
3244
3245        /*
3246         * Lock all the participating inodes. Depending upon whether
3247         * the target_name exists in the target directory, and
3248         * whether the target directory is the same as the source
3249         * directory, we can lock from 2 to 4 inodes.
3250         */
3251        xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL);
3252
3253        /*
3254         * Join all the inodes to the transaction. From this point on,
3255         * we can rely on either trans_commit or trans_cancel to unlock
3256         * them.
3257         */
3258        xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL);
3259        if (new_parent)
3260                xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL);
3261        xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL);
3262        if (target_ip)
3263                xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL);
3264        if (wip)
3265                xfs_trans_ijoin(tp, wip, XFS_ILOCK_EXCL);
3266
3267        /*
3268         * If we are using project inheritance, we only allow renames
3269         * into our tree when the project IDs are the same; else the
3270         * tree quota mechanism would be circumvented.
3271         */
3272        if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
3273                     (xfs_get_projid(target_dp) != xfs_get_projid(src_ip)))) {
3274                error = -EXDEV;
3275                goto out_trans_cancel;
3276        }
3277
3278        /* RENAME_EXCHANGE is unique from here on. */
3279        if (flags & RENAME_EXCHANGE)
3280                return xfs_cross_rename(tp, src_dp, src_name, src_ip,
3281                                        target_dp, target_name, target_ip,
3282                                        spaceres);
3283
3284        /*
3285         * Set up the target.
3286         */
3287        if (target_ip == NULL) {
3288                /*
3289                 * If there's no space reservation, check the entry will
3290                 * fit before actually inserting it.
3291                 */
3292                if (!spaceres) {
3293                        error = xfs_dir_canenter(tp, target_dp, target_name);
3294                        if (error)
3295                                goto out_trans_cancel;
3296                }
3297                /*
3298                 * If target does not exist and the rename crosses
3299                 * directories, adjust the target directory link count
3300                 * to account for the ".." reference from the new entry.
3301                 */
3302                error = xfs_dir_createname(tp, target_dp, target_name,
3303                                           src_ip->i_ino, spaceres);
3304                if (error)
3305                        goto out_trans_cancel;
3306
3307                xfs_trans_ichgtime(tp, target_dp,
3308                                        XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
3309
3310                if (new_parent && src_is_directory) {
3311                        xfs_bumplink(tp, target_dp);
3312                }
3313        } else { /* target_ip != NULL */
3314                /*
3315                 * If target exists and it's a directory, check that both
3316                 * target and source are directories and that target can be
3317                 * destroyed, or that neither is a directory.
3318                 */
3319                if (S_ISDIR(VFS_I(target_ip)->i_mode)) {
3320                        /*
3321                         * Make sure target dir is empty.
3322                         */
3323                        if (!(xfs_dir_isempty(target_ip)) ||
3324                            (VFS_I(target_ip)->i_nlink > 2)) {
3325                                error = -EEXIST;
3326                                goto out_trans_cancel;
3327                        }
3328                }
3329
3330                /*
3331                 * Link the source inode under the target name.
3332                 * If the source inode is a directory and we are moving
3333                 * it across directories, its ".." entry will be
3334                 * inconsistent until we replace that down below.
3335                 *
3336                 * In case there is already an entry with the same
3337                 * name at the destination directory, remove it first.
3338                 */
3339                error = xfs_dir_replace(tp, target_dp, target_name,
3340                                        src_ip->i_ino, spaceres);
3341                if (error)
3342                        goto out_trans_cancel;
3343
3344                xfs_trans_ichgtime(tp, target_dp,
3345                                        XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
3346
3347                /*
3348                 * Decrement the link count on the target since the target
3349                 * dir no longer points to it.
3350                 */
3351                error = xfs_droplink(tp, target_ip);
3352                if (error)
3353                        goto out_trans_cancel;
3354
3355                if (src_is_directory) {
3356                        /*
3357                         * Drop the link from the old "." entry.
3358                         */
3359                        error = xfs_droplink(tp, target_ip);
3360                        if (error)
3361                                goto out_trans_cancel;
3362                }
3363        } /* target_ip != NULL */
3364
3365        /*
3366         * Remove the source.
3367         */
3368        if (new_parent && src_is_directory) {
3369                /*
3370                 * Rewrite the ".." entry to point to the new
3371                 * directory.
3372                 */
3373                error = xfs_dir_replace(tp, src_ip, &xfs_name_dotdot,
3374                                        target_dp->i_ino, spaceres);
3375                ASSERT(error != -EEXIST);
3376                if (error)
3377                        goto out_trans_cancel;
3378        }
3379
3380        /*
3381         * We always want to hit the ctime on the source inode.
3382         *
3383         * This isn't strictly required by the standards since the source
3384         * inode isn't really being changed, but old unix file systems did
3385         * it and some incremental backup programs won't work without it.
3386         */
3387        xfs_trans_ichgtime(tp, src_ip, XFS_ICHGTIME_CHG);
3388        xfs_trans_log_inode(tp, src_ip, XFS_ILOG_CORE);
3389
3390        /*
3391         * Adjust the link count on src_dp.  This is necessary when
3392         * renaming a directory, either within one parent when
3393         * the target existed, or across two parent directories.
3394         */
3395        if (src_is_directory && (new_parent || target_ip != NULL)) {
3396
3397                /*
3398                 * Decrement link count on src_directory since the
3399                 * entry that's moved no longer points to it.
3400                 */
3401                error = xfs_droplink(tp, src_dp);
3402                if (error)
3403                        goto out_trans_cancel;
3404        }
3405
3406        /*
3407         * For whiteouts, we only need to update the source dirent with the
3408         * inode number of the whiteout inode rather than removing it
3409         * altogether.
3410         */
3411        if (wip) {
3412                error = xfs_dir_replace(tp, src_dp, src_name, wip->i_ino,
3413                                        spaceres);
3414        } else
3415                error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino,
3416                                           spaceres);
3417        if (error)
3418                goto out_trans_cancel;
3419
3420        /*
3421         * For whiteouts, we need to bump the link count on the whiteout inode.
3422         * This means that failures all the way up to this point leave the inode
3423         * on the unlinked list and so cleanup is a simple matter of dropping
3424         * the remaining reference to it. If we fail here after bumping the link
3425         * count, we're shutting down the filesystem so we'll never see the
3426         * intermediate state on disk.
3427         */
3428        if (wip) {
3429                ASSERT(VFS_I(wip)->i_nlink == 0);
3430                xfs_bumplink(tp, wip);
3431                error = xfs_iunlink_remove(tp, wip);
3432                if (error)
3433                        goto out_trans_cancel;
3434                xfs_trans_log_inode(tp, wip, XFS_ILOG_CORE);
3435
3436                /*
3437                 * Now we have a real link, clear the "I'm a tmpfile" state
3438                 * flag from the inode so it doesn't accidentally get misused in
3439                 * future.
3440                 */
3441                VFS_I(wip)->i_state &= ~I_LINKABLE;
3442        }
3443
3444        xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
3445        xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE);
3446        if (new_parent)
3447                xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE);
3448
3449        error = xfs_finish_rename(tp);
3450        if (wip)
3451                xfs_irele(wip);
3452        return error;
3453
3454out_trans_cancel:
3455        xfs_trans_cancel(tp);
3456out_release_wip:
3457        if (wip)
3458                xfs_irele(wip);
3459        return error;
3460}
3461
3462STATIC int
3463xfs_iflush_cluster(
3464        struct xfs_inode        *ip,
3465        struct xfs_buf          *bp)
3466{
3467        struct xfs_mount        *mp = ip->i_mount;
3468        struct xfs_perag        *pag;
3469        unsigned long           first_index, mask;
3470        int                     cilist_size;
3471        struct xfs_inode        **cilist;
3472        struct xfs_inode        *cip;
3473        struct xfs_ino_geometry *igeo = M_IGEO(mp);
3474        int                     nr_found;
3475        int                     clcount = 0;
3476        int                     i;
3477
3478        pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
3479
3480        cilist_size = igeo->inodes_per_cluster * sizeof(struct xfs_inode *);
3481        cilist = kmem_alloc(cilist_size, KM_MAYFAIL|KM_NOFS);
3482        if (!cilist)
3483                goto out_put;
3484
3485        mask = ~(igeo->inodes_per_cluster - 1);
3486        first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask;
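        /*
         * Worked example, assuming a 32-inode cluster: mask == ~31, so
         * an inode with agino 100 maps to first_index == 96 and the
         * lookup below scans aginos 96..127.  This relies on
         * inodes_per_cluster being a power of two.
         */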
3487        rcu_read_lock();
3488        /* really need a gang lookup range call here */
3489        nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)cilist,
3490                                        first_index, igeo->inodes_per_cluster);
3491        if (nr_found == 0)
3492                goto out_free;
3493
3494        for (i = 0; i < nr_found; i++) {
3495                cip = cilist[i];
3496                if (cip == ip)
3497                        continue;
3498
3499                /*
3500                 * because this is an RCU protected lookup, we could find a
3501                 * recently freed or even reallocated inode during the lookup.
3502                 * We need to check under the i_flags_lock for a valid inode
3503                 * here. Skip it if it is not valid or the wrong inode.
3504                 */
3505                spin_lock(&cip->i_flags_lock);
3506                if (!cip->i_ino ||
3507                    __xfs_iflags_test(cip, XFS_ISTALE)) {
3508                        spin_unlock(&cip->i_flags_lock);
3509                        continue;
3510                }
3511
3512                /*
3513                 * Once we fall off the end of the cluster, no point checking
3514                 * any more inodes in the list because they will also all be
3515                 * outside the cluster.
3516                 */
3517                if ((XFS_INO_TO_AGINO(mp, cip->i_ino) & mask) != first_index) {
3518                        spin_unlock(&cip->i_flags_lock);
3519                        break;
3520                }
3521                spin_unlock(&cip->i_flags_lock);
3522
3523                /*
3524                 * Do an un-protected check to see if the inode is dirty and
3525                 * is a candidate for flushing.  These checks will be repeated
3526                 * later after the appropriate locks are acquired.
3527                 */
3528                if (xfs_inode_clean(cip) && xfs_ipincount(cip) == 0)
3529                        continue;
3530
3531                /*
3532                 * Try to get locks.  If any are unavailable or it is pinned,
3533                 * then this inode cannot be flushed and is skipped.
3534                 */
3535
3536                if (!xfs_ilock_nowait(cip, XFS_ILOCK_SHARED))
3537                        continue;
3538                if (!xfs_iflock_nowait(cip)) {
3539                        xfs_iunlock(cip, XFS_ILOCK_SHARED);
3540                        continue;
3541                }
3542                if (xfs_ipincount(cip)) {
3543                        xfs_ifunlock(cip);
3544                        xfs_iunlock(cip, XFS_ILOCK_SHARED);
3545                        continue;
3546                }
3547
3548
3549                /*
3550                 * Check the inode number again, just to be certain we are not
3551                 * racing with freeing in xfs_reclaim_inode(). See the comments
3552                 * in that function for more information as to why the initial
3553                 * check is not sufficient.
3554                 */
3555                if (!cip->i_ino) {
3556                        xfs_ifunlock(cip);
3557                        xfs_iunlock(cip, XFS_ILOCK_SHARED);
3558                        continue;
3559                }
3560
3561                /*
3562                 * arriving here means that this inode can be flushed.  First
3563                 * re-check that it's dirty before flushing.
3564                 */
3565                if (!xfs_inode_clean(cip)) {
3566                        int     error;
3567                        error = xfs_iflush_int(cip, bp);
3568                        if (error) {
3569                                xfs_iunlock(cip, XFS_ILOCK_SHARED);
3570                                goto cluster_corrupt_out;
3571                        }
3572                        clcount++;
3573                } else {
3574                        xfs_ifunlock(cip);
3575                }
3576                xfs_iunlock(cip, XFS_ILOCK_SHARED);
3577        }
3578
3579        if (clcount) {
3580                XFS_STATS_INC(mp, xs_icluster_flushcnt);
3581                XFS_STATS_ADD(mp, xs_icluster_flushinode, clcount);
3582        }
3583
3584out_free:
3585        rcu_read_unlock();
3586        kmem_free(cilist);
3587out_put:
3588        xfs_perag_put(pag);
3589        return 0;
3590
3591
3592cluster_corrupt_out:
3593        /*
3594         * Corruption detected in the clustering loop.  Invalidate the
3595         * inode buffer and shut down the filesystem.
3596         */
3597        rcu_read_unlock();
3598
3599        /*
3600         * We'll always have an inode attached to the buffer for completion
3601         * processing by the time we are called from xfs_iflush(). Hence we
3602         * always need to do IO completion processing to abort the inodes
3603         * attached to the buffer.  Handle them just like the shutdown case
3604         * in xfs_buf_submit().
3605         */
3606        ASSERT(bp->b_iodone);
3607        bp->b_flags |= XBF_ASYNC;
3608        bp->b_flags &= ~XBF_DONE;
3609        xfs_buf_stale(bp);
3610        xfs_buf_ioerror(bp, -EIO);
3611        xfs_buf_ioend(bp);
3612
3613        xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
3614
3615        /* abort the corrupt inode, as it was not attached to the buffer */
3616        xfs_iflush_abort(cip, false);
3617        kmem_free(cilist);
3618        xfs_perag_put(pag);
3619        return -EFSCORRUPTED;
3620}
3621
3622/*
3623 * Flush dirty inode metadata into the backing buffer.
3624 *
3625 * The caller must have the inode lock and the inode flush lock held.  The
3626 * inode lock will still be held upon return to the caller, and the inode
3627 * flush lock will be released after the inode has reached the disk.
3628 *
3629 * The caller must write out the buffer returned in *bpp and release it.
3630 */
3631int
3632xfs_iflush(
3633        struct xfs_inode        *ip,
3634        struct xfs_buf          **bpp)
3635{
3636        struct xfs_mount        *mp = ip->i_mount;
3637        struct xfs_buf          *bp = NULL;
3638        struct xfs_dinode       *dip;
3639        int                     error;
3640
3641        XFS_STATS_INC(mp, xs_iflush_count);
3642
3643        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
3644        ASSERT(xfs_isiflocked(ip));
3645        ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
3646               ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
3647
3648        *bpp = NULL;
3649
3650        xfs_iunpin_wait(ip);
3651
3652        /*
3653         * For stale inodes we cannot rely on the backing buffer remaining
3654         * stale in cache for the remaining life of the stale inode, and so
3655         * xfs_imap_to_bp() below may give us a buffer that no longer
3656         * contains inodes. We have to check this after ensuring the inode
3657         * is unpinned so that it is safe to reclaim the stale inode after
3658         * the flush call.
3659         */
3660        if (xfs_iflags_test(ip, XFS_ISTALE)) {
3661                xfs_ifunlock(ip);
3662                return 0;
3663        }
3664
3665        /*
3666         * This may have been unpinned because the filesystem is shutting
3667         * down forcibly. If that's the case we must not write this inode
3668         * to disk, because the log record didn't make it to disk.
3669         *
3670         * We also have to remove the log item from the AIL in this case,
3671         * as we wait for an empty AIL as part of the unmount process.
3672         */
3673        if (XFS_FORCED_SHUTDOWN(mp)) {
3674                error = -EIO;
3675                goto abort_out;
3676        }
3677
3678        /*
3679         * Get the buffer containing the on-disk inode. We are doing a try-lock
3680         * operation here, so we may get an EAGAIN error. In that case, we
3681         * simply want to return with the inode still dirty.
3682         *
3683         * If we get any other error, we effectively have a corruption situation
3684         * and we cannot flush the inode, so we treat it the same as failing
3685         * xfs_iflush_int().
3686         */
3687        error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &bp, XBF_TRYLOCK,
3688                               0);
3689        if (error == -EAGAIN) {
3690                xfs_ifunlock(ip);
3691                return error;
3692        }
3693        if (error)
3694                goto corrupt_out;
3695
3696        /*
3697         * First flush out the inode that xfs_iflush was called with.
3698         */
3699        error = xfs_iflush_int(ip, bp);
3700        if (error)
3701                goto corrupt_out;
3702
3703        /*
3704         * If the buffer is pinned then push on the log now so we won't
3705         * get stuck waiting in the write for too long.
3706         */
3707        if (xfs_buf_ispinned(bp))
3708                xfs_log_force(mp, 0);
3709
3710        /*
3711         * inode clustering: try to gather other inodes into this write
3712         *
3713         * Note: Any error during clustering will result in the filesystem
3714         * being shut down and completion callbacks run on the cluster buffer.
3715         * As we have already flushed and attached this inode to the buffer,
3716         * it has already been aborted and released by xfs_iflush_cluster() and
3717         * so we have no further error handling to do here.
3718         */
3719        error = xfs_iflush_cluster(ip, bp);
3720        if (error)
3721                return error;
3722
3723        *bpp = bp;
3724        return 0;
3725
3726corrupt_out:
3727        if (bp)
3728                xfs_buf_relse(bp);
3729        xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
3730abort_out:
3731        /* abort the corrupt inode, as it was not attached to the buffer */
3732        xfs_iflush_abort(ip, false);
3733        return error;
3734}
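
/*
 * Callers must write out and release the returned buffer themselves; a
 * sketch of the expected pattern, with error handling elided:
 *
 *	struct xfs_buf	*bp = NULL;
 *
 *	error = xfs_iflush(ip, &bp);
 *	if (!error) {
 *		error = xfs_bwrite(bp);
 *		xfs_buf_relse(bp);
 *	}
 */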
3735
3736/*
3737 * If there are inline format data / attr forks attached to this inode,
3738 * make sure they're not corrupt.
3739 */
3740bool
3741xfs_inode_verify_forks(
3742        struct xfs_inode        *ip)
3743{
3744        struct xfs_ifork        *ifp;
3745        xfs_failaddr_t          fa;
3746
3747        fa = xfs_ifork_verify_data(ip, &xfs_default_ifork_ops);
3748        if (fa) {
3749                ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
3750                xfs_inode_verifier_error(ip, -EFSCORRUPTED, "data fork",
3751                                ifp->if_u1.if_data, ifp->if_bytes, fa);
3752                return false;
3753        }
3754
3755        fa = xfs_ifork_verify_attr(ip, &xfs_default_ifork_ops);
3756        if (fa) {
3757                ifp = XFS_IFORK_PTR(ip, XFS_ATTR_FORK);
3758                xfs_inode_verifier_error(ip, -EFSCORRUPTED, "attr fork",
3759                                ifp ? ifp->if_u1.if_data : NULL,
3760                                ifp ? ifp->if_bytes : 0, fa);
3761                return false;
3762        }
3763        return true;
3764}
3765
3766STATIC int
3767xfs_iflush_int(
3768        struct xfs_inode        *ip,
3769        struct xfs_buf          *bp)
3770{
3771        struct xfs_inode_log_item *iip = ip->i_itemp;
3772        struct xfs_dinode       *dip;
3773        struct xfs_mount        *mp = ip->i_mount;
3774
3775        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
3776        ASSERT(xfs_isiflocked(ip));
3777        ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
3778               ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
3779        ASSERT(iip != NULL && iip->ili_fields != 0);
3780        ASSERT(ip->i_d.di_version > 1);
3781
3782        /* set *dip = inode's place in the buffer */
3783        dip = xfs_buf_offset(bp, ip->i_imap.im_boffset);
3784
3785        if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC),
3786                               mp, XFS_ERRTAG_IFLUSH_1)) {
3787                xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
3788                        "%s: Bad inode %Lu magic number 0x%x, ptr "PTR_FMT,
3789                        __func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip);
3790                goto corrupt_out;
3791        }
3792        if (S_ISREG(VFS_I(ip)->i_mode)) {
3793                if (XFS_TEST_ERROR(
3794                    (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) &&
3795                    (ip->i_d.di_format != XFS_DINODE_FMT_BTREE),
3796                    mp, XFS_ERRTAG_IFLUSH_3)) {
3797                        xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
3798                                "%s: Bad regular inode %Lu, ptr "PTR_FMT,
3799                                __func__, ip->i_ino, ip);
3800                        goto corrupt_out;
3801                }
3802        } else if (S_ISDIR(VFS_I(ip)->i_mode)) {
3803                if (XFS_TEST_ERROR(
3804                    (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) &&
3805                    (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) &&
3806                    (ip->i_d.di_format != XFS_DINODE_FMT_LOCAL),
3807                    mp, XFS_ERRTAG_IFLUSH_4)) {
3808                        xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
3809                                "%s: Bad directory inode %Lu, ptr "PTR_FMT,
3810                                __func__, ip->i_ino, ip);
3811                        goto corrupt_out;
3812                }
3813        }
3814        if (XFS_TEST_ERROR(ip->i_d.di_nextents + ip->i_d.di_anextents >
3815                                ip->i_d.di_nblocks, mp, XFS_ERRTAG_IFLUSH_5)) {
3816                xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
3817                        "%s: detected corrupt incore inode %Lu, "
3818                        "total extents = %d, nblocks = %Ld, ptr "PTR_FMT,
3819                        __func__, ip->i_ino,
3820                        ip->i_d.di_nextents + ip->i_d.di_anextents,
3821                        ip->i_d.di_nblocks, ip);
3822                goto corrupt_out;
3823        }
3824        if (XFS_TEST_ERROR(ip->i_d.di_forkoff > mp->m_sb.sb_inodesize,
3825                                mp, XFS_ERRTAG_IFLUSH_6)) {
3826                xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
3827                        "%s: bad inode %Lu, forkoff 0x%x, ptr "PTR_FMT,
3828                        __func__, ip->i_ino, ip->i_d.di_forkoff, ip);
3829                goto corrupt_out;
3830        }
3831
3832        /*
3833         * Inode item log recovery for v2 inodes is dependent on the
3834         * di_flushiter count for correct sequencing. We bump the flush
3835         * iteration count so we can detect flushes which postdate a log record
3836         * during recovery. This is redundant as we now log every change and
3837         * hence this can't happen, but we still need to do it to ensure
3838         * backwards compatibility with old kernels that predate logging all
3839         * inode changes.
3840         */
3841        if (ip->i_d.di_version < 3)
3842                ip->i_d.di_flushiter++;
3843
3844        /* Check the inline fork data before we write out. */
3845        if (!xfs_inode_verify_forks(ip))
3846                goto corrupt_out;
3847
3848        /*
3849         * Copy the dirty parts of the inode into the on-disk inode.  We always
3850         * copy out the core of the inode, because if the inode is dirty at all
3851         * the core must be.
3852         */
3853        xfs_inode_to_disk(ip, dip, iip->ili_item.li_lsn);
3854
3855        /* Wrap, we never let the log put out DI_MAX_FLUSH */
3856        if (ip->i_d.di_flushiter == DI_MAX_FLUSH)
3857                ip->i_d.di_flushiter = 0;
3858
3859        xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK);
3860        if (XFS_IFORK_Q(ip))
3861                xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK);
3862        xfs_inobp_check(mp, bp);
3863
3864        /*
3865         * We've recorded everything logged in the inode, so we'd like to clear
3866         * the ili_fields bits so we don't log and flush things unnecessarily.
3867         * However, we can't stop logging all this information until the data
3868         * we've copied into the disk buffer is written to disk.  If we did we
3869         * might overwrite the copy of the inode in the log with all the data
3870         * after re-logging only part of it, and in the face of a crash we
3871         * wouldn't have all the data we need to recover.
3872         *
3873         * What we do is move the bits to the ili_last_fields field.  When
3874         * logging the inode, these bits are moved back to the ili_fields field.
3875         * In the xfs_iflush_done() routine we clear ili_last_fields, since we
3876         * know that the information those bits represent is permanently on
3877         * disk.  As long as the flush completes before the inode is logged
3878         * again, then both ili_fields and ili_last_fields will be cleared.
3879         *
3880         * We can play with the ili_fields bits here, because the inode lock
3881         * must be held exclusively in order to set bits there and the flush
3882         * lock protects the ili_last_fields bits.  Set ili_logged so the flush
3883         * done routine can tell whether or not to look in the AIL.  Also, store
3884         * the current LSN of the inode so that xfs_iflush_done() can tell
3885         * whether the item has moved in the AIL.  In order to read the LSN we
3886         * need the AIL lock, because it is a 64 bit value that cannot be read
3887         * atomically.
3888         */
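        /*
         * A rough timeline of the common (uncontended) case, for
         * illustration:
         *
         *	xfs_trans_log_inode():	ili_fields |= XFS_ILOG_CORE
         *	here:			ili_last_fields = ili_fields;
         *				ili_fields = 0;
         *	xfs_iflush_done():	ili_last_fields = 0
         *
         * If the inode is logged again before the flush completes, new
         * bits simply accumulate in ili_fields and drive another flush.
         */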
3889        iip->ili_last_fields = iip->ili_fields;
3890        iip->ili_fields = 0;
3891        iip->ili_fsync_fields = 0;
3892        iip->ili_logged = 1;
3893
3894        xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
3895                                &iip->ili_item.li_lsn);
3896
3897        /*
3898         * Attach the function xfs_iflush_done to the inode's
3899         * buffer.  This will remove the inode from the AIL
3900         * and unlock the inode's flush lock when the inode is
3901         * completely written to disk.
3902         */
3903        xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item);
3904
3905        /* generate the checksum. */
3906        xfs_dinode_calc_crc(mp, dip);
3907
3908        ASSERT(!list_empty(&bp->b_li_list));
3909        ASSERT(bp->b_iodone != NULL);
3910        return 0;
3911
3912corrupt_out:
3913        return -EFSCORRUPTED;
3914}
3915
3916/* Release an inode. */
3917void
3918xfs_irele(
3919        struct xfs_inode        *ip)
3920{
3921        trace_xfs_irele(ip, _RET_IP_);
3922        iput(VFS_I(ip));
3923}
3924