linux/fs/xfs/xfs_inode.c
/*
 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
 * All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write the Free Software Foundation,
 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */
#include <linux/log2.h>

#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_inode.h"
#include "xfs_da_format.h"
#include "xfs_da_btree.h"
#include "xfs_dir2.h"
#include "xfs_attr_sf.h"
#include "xfs_attr.h"
#include "xfs_trans_space.h"
#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_inode_item.h"
#include "xfs_ialloc.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_quota.h"
#include "xfs_filestream.h"
#include "xfs_cksum.h"
#include "xfs_trace.h"
#include "xfs_icache.h"
#include "xfs_symlink.h"
#include "xfs_trans_priv.h"
#include "xfs_log.h"
#include "xfs_bmap_btree.h"
#include "xfs_dir2_priv.h"

kmem_zone_t *xfs_inode_zone;

/*
 * Used in xfs_itruncate_extents().  This is the maximum number of extents
 * freed from a file in a single transaction.
 */
#define XFS_ITRUNC_MAX_EXTENTS  2

STATIC int xfs_iflush_int(struct xfs_inode *, struct xfs_buf *);
STATIC int xfs_iunlink(struct xfs_trans *, struct xfs_inode *);
STATIC int xfs_iunlink_remove(struct xfs_trans *, struct xfs_inode *);

/*
 * Helper function to extract the extent size hint from an inode.
 */
xfs_extlen_t
xfs_get_extsz_hint(
        struct xfs_inode        *ip)
{
        if ((ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE) && ip->i_d.di_extsize)
                return ip->i_d.di_extsize;
        if (XFS_IS_REALTIME_INODE(ip))
                return ip->i_mount->m_sb.sb_rextsize;
        return 0;
}
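
/*
 * Illustrative sketch (not part of the original file): a hypothetical
 * caller would typically round an allocation request up to the hint,
 * treating a zero return as "no alignment required":
 *
 *        xfs_extlen_t    extsz = xfs_get_extsz_hint(ip);
 *
 *        if (extsz)
 *                len = roundup(len, extsz);
 */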

/*
 * These two are wrapper routines around the xfs_ilock() routine used to
 * centralize some grungy code.  They are used in places that wish to lock the
 * inode solely for reading the extents.  The reason these places can't just
 * call xfs_ilock(ip, XFS_ILOCK_SHARED) is that the inode lock also guards the
 * bringing in of the extents from disk for a file in b-tree format.  If the
 * inode is in b-tree format, then we need to lock the inode exclusively until
 * the extents are read in.  Locking it exclusively all the time would limit
 * our parallelism unnecessarily, though.  What we do instead is check to see
 * if the extents have been read in yet, and only lock the inode exclusively
 * if they have not.
 *
 * The functions return a value which should be given to the corresponding
 * xfs_iunlock() call.
 */
uint
xfs_ilock_data_map_shared(
        struct xfs_inode        *ip)
{
        uint                    lock_mode = XFS_ILOCK_SHARED;

        if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE &&
            (ip->i_df.if_flags & XFS_IFEXTENTS) == 0)
                lock_mode = XFS_ILOCK_EXCL;
        xfs_ilock(ip, lock_mode);
        return lock_mode;
}

uint
xfs_ilock_attr_map_shared(
        struct xfs_inode        *ip)
{
        uint                    lock_mode = XFS_ILOCK_SHARED;

        if (ip->i_d.di_aformat == XFS_DINODE_FMT_BTREE &&
            (ip->i_afp->if_flags & XFS_IFEXTENTS) == 0)
                lock_mode = XFS_ILOCK_EXCL;
        xfs_ilock(ip, lock_mode);
        return lock_mode;
}
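
/*
 * Illustrative sketch (not part of the original file): the returned
 * lock mode must be handed back to xfs_iunlock(), since the caller
 * cannot know in advance whether the shared or the exclusive lock was
 * taken:
 *
 *        uint    lock_mode;
 *
 *        lock_mode = xfs_ilock_data_map_shared(ip);
 *        ... read the extent list ...
 *        xfs_iunlock(ip, lock_mode);
 */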

/*
 * The xfs inode contains 3 multi-reader locks: the i_iolock, the i_mmap_lock
 * and the i_lock.  This routine allows various combinations of the locks to be
 * obtained.
 *
 * The 3 locks should always be ordered so that the IO lock is obtained first,
 * the mmap lock second and the ilock last in order to prevent deadlock.
 *
 * Basic locking order:
 *
 * i_iolock -> i_mmap_lock -> page_lock -> i_ilock
 *
 * mmap_sem locking order:
 *
 * i_iolock -> page lock -> mmap_sem
 * mmap_sem -> i_mmap_lock -> page_lock
 *
 * The difference in mmap_sem locking order means that we cannot hold the
 * i_mmap_lock over syscall based read(2)/write(2) based IO. These IO paths can
 * fault in pages during copy in/out (for buffered IO) or require the mmap_sem
 * in get_user_pages() to map the user pages into the kernel address space for
 * direct IO. Similarly the i_iolock cannot be taken inside a page fault because
 * page faults already hold the mmap_sem.
 *
 * Hence to serialise fully against both syscall and mmap based IO, we need to
 * take both the i_iolock and the i_mmap_lock. These locks should *only* be both
 * taken in places where we need to invalidate the page cache in a race
 * free manner (e.g. truncate, hole punch and other extent manipulation
 * functions).
 */
void
xfs_ilock(
        xfs_inode_t             *ip,
        uint                    lock_flags)
{
        trace_xfs_ilock(ip, lock_flags, _RET_IP_);

        /*
         * You can't set both SHARED and EXCL for the same lock,
         * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
         * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
         */
        ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
               (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
        ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) !=
               (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL));
        ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
               (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
        ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0);

        if (lock_flags & XFS_IOLOCK_EXCL)
                mrupdate_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));
        else if (lock_flags & XFS_IOLOCK_SHARED)
                mraccess_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));

        if (lock_flags & XFS_MMAPLOCK_EXCL)
                mrupdate_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags));
        else if (lock_flags & XFS_MMAPLOCK_SHARED)
                mraccess_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags));

        if (lock_flags & XFS_ILOCK_EXCL)
                mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
        else if (lock_flags & XFS_ILOCK_SHARED)
                mraccess_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
}
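
/*
 * Illustrative sketch (not part of the original file): per the locking
 * order documented above, an extent-manipulation path such as truncate
 * or hole punch would take both IO locks to exclude syscall and mmap
 * based IO before invalidating the page cache:
 *
 *        xfs_ilock(ip, XFS_IOLOCK_EXCL);
 *        xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
 *        ... invalidate the page cache and manipulate extents ...
 *        xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
 *        xfs_iunlock(ip, XFS_IOLOCK_EXCL);
 */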

/*
 * This is just like xfs_ilock(), except that the caller
 * is guaranteed not to sleep.  It returns 1 if it gets
 * the requested locks and 0 otherwise.  If the IO lock is
 * obtained but the inode lock cannot be, then the IO lock
 * is dropped before returning.
 *
 * ip -- the inode being locked
 * lock_flags -- this parameter indicates the inode's locks to be locked.
 *       See the comment for xfs_ilock() for a list of valid values.
 */
int
xfs_ilock_nowait(
        xfs_inode_t             *ip,
        uint                    lock_flags)
{
        trace_xfs_ilock_nowait(ip, lock_flags, _RET_IP_);

        /*
         * You can't set both SHARED and EXCL for the same lock,
         * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
         * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
         */
        ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
               (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
        ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) !=
               (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL));
        ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
               (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
        ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0);

        if (lock_flags & XFS_IOLOCK_EXCL) {
                if (!mrtryupdate(&ip->i_iolock))
                        goto out;
        } else if (lock_flags & XFS_IOLOCK_SHARED) {
                if (!mrtryaccess(&ip->i_iolock))
                        goto out;
        }

        if (lock_flags & XFS_MMAPLOCK_EXCL) {
                if (!mrtryupdate(&ip->i_mmaplock))
                        goto out_undo_iolock;
        } else if (lock_flags & XFS_MMAPLOCK_SHARED) {
                if (!mrtryaccess(&ip->i_mmaplock))
                        goto out_undo_iolock;
        }

        if (lock_flags & XFS_ILOCK_EXCL) {
                if (!mrtryupdate(&ip->i_lock))
                        goto out_undo_mmaplock;
        } else if (lock_flags & XFS_ILOCK_SHARED) {
                if (!mrtryaccess(&ip->i_lock))
                        goto out_undo_mmaplock;
        }
        return 1;

out_undo_mmaplock:
        if (lock_flags & XFS_MMAPLOCK_EXCL)
                mrunlock_excl(&ip->i_mmaplock);
        else if (lock_flags & XFS_MMAPLOCK_SHARED)
                mrunlock_shared(&ip->i_mmaplock);
out_undo_iolock:
        if (lock_flags & XFS_IOLOCK_EXCL)
                mrunlock_excl(&ip->i_iolock);
        else if (lock_flags & XFS_IOLOCK_SHARED)
                mrunlock_shared(&ip->i_iolock);
out:
        return 0;
}
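
/*
 * Illustrative sketch (not part of the original file): a hypothetical
 * non-blocking caller falls back to a deferred or blocking path when
 * the locks cannot be obtained immediately:
 *
 *        if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))
 *                return -EAGAIN;
 */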

/*
 * xfs_iunlock() is used to drop the inode locks acquired with
 * xfs_ilock() and xfs_ilock_nowait().  The caller must pass
 * in the flags given to xfs_ilock() or xfs_ilock_nowait() so
 * that we know which locks to drop.
 *
 * ip -- the inode being unlocked
 * lock_flags -- this parameter indicates the inode's locks to be unlocked.
 *       See the comment for xfs_ilock() for a list of valid values
 *       for this parameter.
 */
void
xfs_iunlock(
        xfs_inode_t             *ip,
        uint                    lock_flags)
{
        /*
         * You can't set both SHARED and EXCL for the same lock,
         * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
         * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
         */
        ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
               (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
        ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) !=
               (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL));
        ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
               (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
        ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0);
        ASSERT(lock_flags != 0);

        if (lock_flags & XFS_IOLOCK_EXCL)
                mrunlock_excl(&ip->i_iolock);
        else if (lock_flags & XFS_IOLOCK_SHARED)
                mrunlock_shared(&ip->i_iolock);

        if (lock_flags & XFS_MMAPLOCK_EXCL)
                mrunlock_excl(&ip->i_mmaplock);
        else if (lock_flags & XFS_MMAPLOCK_SHARED)
                mrunlock_shared(&ip->i_mmaplock);

        if (lock_flags & XFS_ILOCK_EXCL)
                mrunlock_excl(&ip->i_lock);
        else if (lock_flags & XFS_ILOCK_SHARED)
                mrunlock_shared(&ip->i_lock);

        trace_xfs_iunlock(ip, lock_flags, _RET_IP_);
}

/*
 * Give up write locks.  The I/O lock cannot be held nested
 * if it is being demoted.
 */
void
xfs_ilock_demote(
        xfs_inode_t             *ip,
        uint                    lock_flags)
{
        ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_MMAPLOCK_EXCL|XFS_ILOCK_EXCL));
        ASSERT((lock_flags &
                ~(XFS_IOLOCK_EXCL|XFS_MMAPLOCK_EXCL|XFS_ILOCK_EXCL)) == 0);

        if (lock_flags & XFS_ILOCK_EXCL)
                mrdemote(&ip->i_lock);
        if (lock_flags & XFS_MMAPLOCK_EXCL)
                mrdemote(&ip->i_mmaplock);
        if (lock_flags & XFS_IOLOCK_EXCL)
                mrdemote(&ip->i_iolock);

        trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_);
}
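
/*
 * Illustrative sketch (not part of the original file): a caller that
 * needs exclusive access only briefly can downgrade to a shared lock
 * without a drop/re-acquire window:
 *
 *        xfs_ilock(ip, XFS_IOLOCK_EXCL);
 *        ... perform the exclusive-only setup ...
 *        xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
 *        ... continue under XFS_IOLOCK_SHARED ...
 *        xfs_iunlock(ip, XFS_IOLOCK_SHARED);
 */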

#if defined(DEBUG) || defined(XFS_WARN)
int
xfs_isilocked(
        xfs_inode_t             *ip,
        uint                    lock_flags)
{
        if (lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) {
                if (!(lock_flags & XFS_ILOCK_SHARED))
                        return !!ip->i_lock.mr_writer;
                return rwsem_is_locked(&ip->i_lock.mr_lock);
        }

        if (lock_flags & (XFS_MMAPLOCK_EXCL|XFS_MMAPLOCK_SHARED)) {
                if (!(lock_flags & XFS_MMAPLOCK_SHARED))
                        return !!ip->i_mmaplock.mr_writer;
                return rwsem_is_locked(&ip->i_mmaplock.mr_lock);
        }

        if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) {
                if (!(lock_flags & XFS_IOLOCK_SHARED))
                        return !!ip->i_iolock.mr_writer;
                return rwsem_is_locked(&ip->i_iolock.mr_lock);
        }

        ASSERT(0);
        return 0;
}
#endif

/*
 * xfs_lockdep_subclass_ok() is only used in an ASSERT, so is only called when
 * DEBUG or XFS_WARN is set. And MAX_LOCKDEP_SUBCLASSES is then only defined
 * when CONFIG_LOCKDEP is set. Hence the complex define below to avoid build
 * errors and warnings.
 */
#if (defined(DEBUG) || defined(XFS_WARN)) && defined(CONFIG_LOCKDEP)
static bool
xfs_lockdep_subclass_ok(
        int subclass)
{
        return subclass < MAX_LOCKDEP_SUBCLASSES;
}
#else
#define xfs_lockdep_subclass_ok(subclass)       (true)
#endif

/*
 * Bump the subclass so xfs_lock_inodes() acquires each lock with a different
 * value. This can be called for any type of inode lock combination, including
 * parent locking. Care must be taken to ensure we don't overrun the subclass
 * storage fields in the class mask we build.
 */
static inline int
xfs_lock_inumorder(int lock_mode, int subclass)
{
        int     class = 0;

        ASSERT(!(lock_mode & (XFS_ILOCK_PARENT | XFS_ILOCK_RTBITMAP |
                              XFS_ILOCK_RTSUM)));
        ASSERT(xfs_lockdep_subclass_ok(subclass));

        if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) {
                ASSERT(subclass <= XFS_IOLOCK_MAX_SUBCLASS);
                ASSERT(xfs_lockdep_subclass_ok(subclass +
                                                XFS_IOLOCK_PARENT_VAL));
                class += subclass << XFS_IOLOCK_SHIFT;
                if (lock_mode & XFS_IOLOCK_PARENT)
                        class += XFS_IOLOCK_PARENT_VAL << XFS_IOLOCK_SHIFT;
        }

        if (lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) {
                ASSERT(subclass <= XFS_MMAPLOCK_MAX_SUBCLASS);
                class += subclass << XFS_MMAPLOCK_SHIFT;
        }

        if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) {
                ASSERT(subclass <= XFS_ILOCK_MAX_SUBCLASS);
                class += subclass << XFS_ILOCK_SHIFT;
        }

        return (lock_mode & ~XFS_LOCK_SUBCLASS_MASK) | class;
}
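
/*
 * Illustrative sketch (not part of the original file): when locking
 * several inodes of the same type, each one gets a distinct lockdep
 * subclass derived from its position in the ascending i_ino order,
 * e.g. inside a loop over an ips[] array:
 *
 *        xfs_ilock(ips[i], xfs_lock_inumorder(XFS_ILOCK_EXCL, i));
 */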

/*
 * The following routine will lock n inodes in exclusive mode.  We assume the
 * caller calls us with the inodes in i_ino order.
 *
 * We need to detect deadlock where an inode that we lock is in the AIL and we
 * start waiting for another inode that is locked by a thread in a long running
 * transaction (such as truncate). This can result in deadlock since the long
 * running trans might need to wait for the inode we just locked in order to
 * push the tail and free space in the log.
 *
 * xfs_lock_inodes() can only be used to lock one type of lock at a time -
 * the iolock, the mmaplock or the ilock, but not more than one at a time. If we
 * lock more than one at a time, lockdep will report false positives saying we
 * have violated locking orders.
 */
static void
xfs_lock_inodes(
        xfs_inode_t     **ips,
        int             inodes,
        uint            lock_mode)
{
        int             attempts = 0, i, j, try_lock;
        xfs_log_item_t  *lp;

        /*
         * Currently supports between 2 and 5 inodes with exclusive locking.  We
         * support an arbitrary depth of locking here, but absolute limits on
         * inodes depend on the type of locking and the limits placed by
         * lockdep annotations in xfs_lock_inumorder.  These are all checked by
         * the asserts.
         */
        ASSERT(ips && inodes >= 2 && inodes <= 5);
        ASSERT(lock_mode & (XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL |
                            XFS_ILOCK_EXCL));
        ASSERT(!(lock_mode & (XFS_IOLOCK_SHARED | XFS_MMAPLOCK_SHARED |
                              XFS_ILOCK_SHARED)));
        ASSERT(!(lock_mode & XFS_IOLOCK_EXCL) ||
                inodes <= XFS_IOLOCK_MAX_SUBCLASS + 1);
        ASSERT(!(lock_mode & XFS_MMAPLOCK_EXCL) ||
                inodes <= XFS_MMAPLOCK_MAX_SUBCLASS + 1);
        ASSERT(!(lock_mode & XFS_ILOCK_EXCL) ||
                inodes <= XFS_ILOCK_MAX_SUBCLASS + 1);

        if (lock_mode & XFS_IOLOCK_EXCL) {
                ASSERT(!(lock_mode & (XFS_MMAPLOCK_EXCL | XFS_ILOCK_EXCL)));
        } else if (lock_mode & XFS_MMAPLOCK_EXCL)
                ASSERT(!(lock_mode & XFS_ILOCK_EXCL));

        try_lock = 0;
        i = 0;
again:
        for (; i < inodes; i++) {
                ASSERT(ips[i]);

                if (i && (ips[i] == ips[i - 1]))        /* Already locked */
                        continue;

                /*
                 * If try_lock is not set yet, make sure all locked inodes are
                 * not in the AIL.  If any are, set try_lock to be used later.
                 */
                if (!try_lock) {
                        for (j = (i - 1); j >= 0 && !try_lock; j--) {
                                lp = (xfs_log_item_t *)ips[j]->i_itemp;
                                if (lp && (lp->li_flags & XFS_LI_IN_AIL))
                                        try_lock++;
                        }
                }

                /*
                 * If any of the previous locks we have locked is in the AIL,
                 * we must TRY to get the second and subsequent locks. If
                 * we can't get any, we must release all we have
                 * and try again.
                 */
                if (!try_lock) {
                        xfs_ilock(ips[i], xfs_lock_inumorder(lock_mode, i));
                        continue;
                }

                /* try_lock means we have an inode locked that is in the AIL. */
                ASSERT(i != 0);
                if (xfs_ilock_nowait(ips[i], xfs_lock_inumorder(lock_mode, i)))
                        continue;

                /*
                 * Unlock all previous guys and try again.  xfs_iunlock will try
                 * to push the tail if the inode is in the AIL.
                 */
                attempts++;
                for (j = i - 1; j >= 0; j--) {
                        /*
                         * Check to see if we've already unlocked this one.  Not
                         * the first one going back, and the inode ptr is the
                         * same.
                         */
                        if (j != (i - 1) && ips[j] == ips[j + 1])
                                continue;

                        xfs_iunlock(ips[j], lock_mode);
                }

                if ((attempts % 5) == 0) {
                        delay(1); /* Don't just spin the CPU */
                }
                i = 0;
                try_lock = 0;
                goto again;
        }
}

/*
 * xfs_lock_two_inodes() can only be used to lock one type of lock at a time -
 * the iolock, the mmaplock or the ilock, but not more than one at a time. If we
 * lock more than one at a time, lockdep will report false positives saying we
 * have violated locking orders.
 */
void
xfs_lock_two_inodes(
        xfs_inode_t             *ip0,
        xfs_inode_t             *ip1,
        uint                    lock_mode)
{
        xfs_inode_t             *temp;
        int                     attempts = 0;
        xfs_log_item_t          *lp;

        if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) {
                ASSERT(!(lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)));
                ASSERT(!(lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)));
        } else if (lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL))
                ASSERT(!(lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)));

        ASSERT(ip0->i_ino != ip1->i_ino);

        if (ip0->i_ino > ip1->i_ino) {
                temp = ip0;
                ip0 = ip1;
                ip1 = temp;
        }

 again:
        xfs_ilock(ip0, xfs_lock_inumorder(lock_mode, 0));

        /*
         * If the first lock we have locked is in the AIL, we must TRY to get
         * the second lock. If we can't get it, we must release the first one
         * and try again.
         */
        lp = (xfs_log_item_t *)ip0->i_itemp;
        if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
                if (!xfs_ilock_nowait(ip1, xfs_lock_inumorder(lock_mode, 1))) {
                        xfs_iunlock(ip0, lock_mode);
                        if ((++attempts % 5) == 0)
                                delay(1); /* Don't just spin the CPU */
                        goto again;
                }
        } else {
                xfs_ilock(ip1, xfs_lock_inumorder(lock_mode, 1));
        }
}
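
/*
 * Illustrative sketch (not part of the original file): callers such as
 * link or rename lock the two inodes involved with a single lock type;
 * the ordering by i_ino is handled internally, so unlock order does not
 * matter:
 *
 *        xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL);
 *        ...
 *        xfs_iunlock(sip, XFS_ILOCK_EXCL);
 *        xfs_iunlock(tdp, XFS_ILOCK_EXCL);
 */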

void
__xfs_iflock(
        struct xfs_inode        *ip)
{
        wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT);
        DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT);

        do {
                prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
                if (xfs_isiflocked(ip))
                        io_schedule();
        } while (!xfs_iflock_nowait(ip));

        finish_wait(wq, &wait.wait);
}
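
/*
 * Illustrative sketch (assumption, not part of the original file):
 * __xfs_iflock() is the slow path behind the xfs_iflock() helper, and a
 * flush-lock round trip pairs it with xfs_ifunlock():
 *
 *        xfs_iflock(ip);
 *        ... flush the inode to its backing buffer ...
 *        xfs_ifunlock(ip);
 */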

STATIC uint
_xfs_dic2xflags(
        uint16_t                di_flags,
        uint64_t                di_flags2,
        bool                    has_attr)
{
        uint                    flags = 0;

        if (di_flags & XFS_DIFLAG_ANY) {
                if (di_flags & XFS_DIFLAG_REALTIME)
                        flags |= XFS_XFLAG_REALTIME;
                if (di_flags & XFS_DIFLAG_PREALLOC)
                        flags |= XFS_XFLAG_PREALLOC;
                if (di_flags & XFS_DIFLAG_IMMUTABLE)
                        flags |= XFS_XFLAG_IMMUTABLE;
                if (di_flags & XFS_DIFLAG_APPEND)
                        flags |= XFS_XFLAG_APPEND;
                if (di_flags & XFS_DIFLAG_SYNC)
                        flags |= XFS_XFLAG_SYNC;
                if (di_flags & XFS_DIFLAG_NOATIME)
                        flags |= XFS_XFLAG_NOATIME;
                if (di_flags & XFS_DIFLAG_NODUMP)
                        flags |= XFS_XFLAG_NODUMP;
                if (di_flags & XFS_DIFLAG_RTINHERIT)
                        flags |= XFS_XFLAG_RTINHERIT;
                if (di_flags & XFS_DIFLAG_PROJINHERIT)
                        flags |= XFS_XFLAG_PROJINHERIT;
                if (di_flags & XFS_DIFLAG_NOSYMLINKS)
                        flags |= XFS_XFLAG_NOSYMLINKS;
                if (di_flags & XFS_DIFLAG_EXTSIZE)
                        flags |= XFS_XFLAG_EXTSIZE;
                if (di_flags & XFS_DIFLAG_EXTSZINHERIT)
                        flags |= XFS_XFLAG_EXTSZINHERIT;
                if (di_flags & XFS_DIFLAG_NODEFRAG)
                        flags |= XFS_XFLAG_NODEFRAG;
                if (di_flags & XFS_DIFLAG_FILESTREAM)
                        flags |= XFS_XFLAG_FILESTREAM;
        }

        if (has_attr)
                flags |= XFS_XFLAG_HASATTR;

        return flags;
}

uint
xfs_ip2xflags(
        struct xfs_inode        *ip)
{
        struct xfs_icdinode     *dic = &ip->i_d;

        return _xfs_dic2xflags(dic->di_flags, dic->di_flags2, XFS_IFORK_Q(ip));
}

/*
 * Looks up an inode from "name". If ci_name is not NULL, then a CI match
 * is allowed, otherwise it has to be an exact match. If a CI match is found,
 * ci_name->name will point to the actual name (caller must free) or
 * will be set to NULL if an exact match is found.
 */
int
xfs_lookup(
        xfs_inode_t             *dp,
        struct xfs_name         *name,
        xfs_inode_t             **ipp,
        struct xfs_name         *ci_name)
{
        xfs_ino_t               inum;
        int                     error;

        trace_xfs_lookup(dp, name);

        if (XFS_FORCED_SHUTDOWN(dp->i_mount))
                return -EIO;

        xfs_ilock(dp, XFS_IOLOCK_SHARED);
        error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name);
        if (error)
                goto out_unlock;

        error = xfs_iget(dp->i_mount, NULL, inum, 0, 0, ipp);
        if (error)
                goto out_free_name;

        xfs_iunlock(dp, XFS_IOLOCK_SHARED);
        return 0;

out_free_name:
        if (ci_name)
                kmem_free(ci_name->name);
out_unlock:
        xfs_iunlock(dp, XFS_IOLOCK_SHARED);
        *ipp = NULL;
        return error;
}
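
/*
 * Illustrative sketch (not part of the original file): a hypothetical
 * case-insensitive lookup caller must free the returned match name when
 * one is reported:
 *
 *        struct xfs_name ci_name;
 *
 *        error = xfs_lookup(dp, name, &ip, &ci_name);
 *        if (!error && ci_name.name)
 *                kmem_free(ci_name.name);
 */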

/*
 * Allocate an inode on disk and return a copy of its in-core version.
 * The in-core inode is locked exclusively.  Set mode, nlink, and rdev
 * appropriately within the inode.  The uid and gid for the inode are
 * set according to the contents of the given cred structure.
 *
 * Use xfs_dialloc() to allocate the on-disk inode. If xfs_dialloc()
 * has a free inode available, call xfs_iget() to obtain the in-core
 * version of the allocated inode.  Finally, fill in the inode and
 * log its initial contents.  In this case, ialloc_context would be
 * set to NULL.
 *
 * If xfs_dialloc() does not have an available inode, it will replenish
 * its supply by doing an allocation. Since we can only do one
 * allocation within a transaction without deadlocks, we must commit
 * the current transaction before returning the inode itself.
 * In this case, therefore, we will set ialloc_context and return.
 * The caller should then commit the current transaction, start a new
 * transaction, and call xfs_ialloc() again to actually get the inode.
 *
 * To ensure that some other process does not grab the inode that
 * was allocated during the first call to xfs_ialloc(), this routine
 * also returns the [locked] bp pointing to the head of the freelist
 * as ialloc_context.  The caller should hold this buffer across
 * the commit and pass it back into this routine on the second call.
 *
 * If we are allocating quota inodes, we do not have a parent inode
 * to attach to or associate with (i.e. pip == NULL) because they
 * are not linked into the directory structure - they are attached
 * directly to the superblock - and so have no parent.
 */
static int
xfs_ialloc(
        xfs_trans_t     *tp,
        xfs_inode_t     *pip,
        umode_t         mode,
        xfs_nlink_t     nlink,
        dev_t           rdev,
        prid_t          prid,
        xfs_buf_t       **ialloc_context,
        xfs_inode_t     **ipp)
{
        struct xfs_mount *mp = tp->t_mountp;
        xfs_ino_t       ino;
        xfs_inode_t     *ip;
        uint            flags;
        int             error;
        struct timespec tv;
        struct inode    *inode;

        /*
         * Call the space management code to pick
         * the on-disk inode to be allocated.
         */
        error = xfs_dialloc(tp, pip ? pip->i_ino : 0, mode,
                            ialloc_context, &ino);
        if (error)
                return error;
        if (*ialloc_context || ino == NULLFSINO) {
                *ipp = NULL;
                return 0;
        }
        ASSERT(*ialloc_context == NULL);

        /*
         * Get the in-core inode with the lock held exclusively.
         * This is because we're setting fields here we need
         * to prevent others from looking at until we're done.
         */
        error = xfs_iget(mp, tp, ino, XFS_IGET_CREATE,
                         XFS_ILOCK_EXCL, &ip);
        if (error)
                return error;
        ASSERT(ip != NULL);
        inode = VFS_I(ip);

        /*
         * We always convert v1 inodes to v2 now - we only support filesystems
         * with >= v2 inode capability, so there is no reason for ever leaving
         * an inode in v1 format.
         */
        if (ip->i_d.di_version == 1)
                ip->i_d.di_version = 2;

        inode->i_mode = mode;
        set_nlink(inode, nlink);
        ip->i_d.di_uid = xfs_kuid_to_uid(current_fsuid());
        ip->i_d.di_gid = xfs_kgid_to_gid(current_fsgid());
        inode->i_rdev = rdev;
        xfs_set_projid(ip, prid);

        if (pip && XFS_INHERIT_GID(pip)) {
                ip->i_d.di_gid = pip->i_d.di_gid;
                if ((VFS_I(pip)->i_mode & S_ISGID) && S_ISDIR(mode))
                        inode->i_mode |= S_ISGID;
        }

        /*
         * If the group ID of the new file does not match the effective group
         * ID or one of the supplementary group IDs, the S_ISGID bit is cleared
         * (and only if the irix_sgid_inherit compatibility variable is set).
         */
        if ((irix_sgid_inherit) &&
            (inode->i_mode & S_ISGID) &&
            (!in_group_p(xfs_gid_to_kgid(ip->i_d.di_gid))))
                inode->i_mode &= ~S_ISGID;

        ip->i_d.di_size = 0;
        ip->i_d.di_nextents = 0;
        ASSERT(ip->i_d.di_nblocks == 0);

        tv = current_fs_time(mp->m_super);
        inode->i_mtime = tv;
        inode->i_atime = tv;
        inode->i_ctime = tv;

        ip->i_d.di_extsize = 0;
        ip->i_d.di_dmevmask = 0;
        ip->i_d.di_dmstate = 0;
        ip->i_d.di_flags = 0;

        if (ip->i_d.di_version == 3) {
                inode->i_version = 1;
                ip->i_d.di_flags2 = 0;
                ip->i_d.di_crtime.t_sec = (int32_t)tv.tv_sec;
                ip->i_d.di_crtime.t_nsec = (int32_t)tv.tv_nsec;
        }

        flags = XFS_ILOG_CORE;
        switch (mode & S_IFMT) {
        case S_IFIFO:
        case S_IFCHR:
        case S_IFBLK:
        case S_IFSOCK:
                ip->i_d.di_format = XFS_DINODE_FMT_DEV;
                ip->i_df.if_flags = 0;
                flags |= XFS_ILOG_DEV;
                break;
        case S_IFREG:
        case S_IFDIR:
                if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) {
                        uint    di_flags = 0;

                        if (S_ISDIR(mode)) {
                                if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT)
                                        di_flags |= XFS_DIFLAG_RTINHERIT;
                                if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) {
                                        di_flags |= XFS_DIFLAG_EXTSZINHERIT;
                                        ip->i_d.di_extsize = pip->i_d.di_extsize;
                                }
                                if (pip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
                                        di_flags |= XFS_DIFLAG_PROJINHERIT;
                        } else if (S_ISREG(mode)) {
                                if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT)
                                        di_flags |= XFS_DIFLAG_REALTIME;
                                if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) {
                                        di_flags |= XFS_DIFLAG_EXTSIZE;
                                        ip->i_d.di_extsize = pip->i_d.di_extsize;
                                }
                        }
                        if ((pip->i_d.di_flags & XFS_DIFLAG_NOATIME) &&
                            xfs_inherit_noatime)
                                di_flags |= XFS_DIFLAG_NOATIME;
                        if ((pip->i_d.di_flags & XFS_DIFLAG_NODUMP) &&
                            xfs_inherit_nodump)
                                di_flags |= XFS_DIFLAG_NODUMP;
                        if ((pip->i_d.di_flags & XFS_DIFLAG_SYNC) &&
                            xfs_inherit_sync)
                                di_flags |= XFS_DIFLAG_SYNC;
                        if ((pip->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) &&
                            xfs_inherit_nosymlinks)
                                di_flags |= XFS_DIFLAG_NOSYMLINKS;
                        if ((pip->i_d.di_flags & XFS_DIFLAG_NODEFRAG) &&
                            xfs_inherit_nodefrag)
                                di_flags |= XFS_DIFLAG_NODEFRAG;
                        if (pip->i_d.di_flags & XFS_DIFLAG_FILESTREAM)
                                di_flags |= XFS_DIFLAG_FILESTREAM;
                        ip->i_d.di_flags |= di_flags;
                }
                /* FALLTHROUGH */
        case S_IFLNK:
                ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS;
                ip->i_df.if_flags = XFS_IFEXTENTS;
                ip->i_df.if_bytes = ip->i_df.if_real_bytes = 0;
                ip->i_df.if_u1.if_root = NULL;
                break;
        default:
                ASSERT(0);
        }
        /*
         * Attribute fork settings for new inode.
         */
        ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
        ip->i_d.di_anextents = 0;

        /*
         * Log the new values stuffed into the inode.
         */
        xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
        xfs_trans_log_inode(tp, ip, flags);

        /* now that we have an i_mode we can setup the inode structure */
        xfs_setup_inode(ip);

        *ipp = ip;
        return 0;
}

/*
 * Allocates a new inode from disk and returns a pointer to the
 * incore copy. This routine will internally commit the current
 * transaction and allocate a new one if the Space Manager needed
 * to do an allocation to replenish the inode free-list.
 *
 * This routine is designed to be called from xfs_create and
 * xfs_create_dir.
 */
int
xfs_dir_ialloc(
        xfs_trans_t     **tpp,          /* input: current transaction;
                                           output: may be a new transaction. */
        xfs_inode_t     *dp,            /* directory within which to allocate
                                           the inode. */
        umode_t         mode,
        xfs_nlink_t     nlink,
        dev_t           rdev,
        prid_t          prid,           /* project id */
        xfs_inode_t     **ipp,          /* pointer to inode; it will be
                                           locked. */
        int             *committed)

{
        xfs_trans_t     *tp;
        xfs_inode_t     *ip;
        xfs_buf_t       *ialloc_context = NULL;
        int             code;
        void            *dqinfo;
        uint            tflags;

        tp = *tpp;
        ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);

        /*
         * xfs_ialloc will return a pointer to an incore inode if
         * the Space Manager has an available inode on the free
         * list. Otherwise, it will do an allocation and replenish
         * the freelist.  Since we can only do one allocation per
         * transaction without deadlocks, we will need to commit the
         * current transaction and start a new one.  We will then
         * need to call xfs_ialloc again to get the inode.
         *
         * If xfs_ialloc did an allocation to replenish the freelist,
         * it returns the bp containing the head of the freelist as
         * ialloc_context. We will hold a lock on it across the
         * transaction commit so that no other process can steal
         * the inode(s) that we've just allocated.
         */
        code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid, &ialloc_context,
                        &ip);

        /*
         * Return an error if we were unable to allocate a new inode.
         * This should only happen if we run out of space on disk or
         * encounter a disk error.
         */
        if (code) {
                *ipp = NULL;
                return code;
        }
        if (!ialloc_context && !ip) {
                *ipp = NULL;
                return -ENOSPC;
        }

        /*
         * If the AGI buffer is non-NULL, then we were unable to get an
         * inode in one operation.  We need to commit the current
         * transaction and call xfs_ialloc() again.  It is guaranteed
         * to succeed the second time.
         */
        if (ialloc_context) {
                /*
                 * Normally, xfs_trans_commit releases all the locks.
                 * We call bhold to hang on to the ialloc_context across
                 * the commit.  Holding this buffer prevents any other
                 * processes from doing any allocations in this
                 * allocation group.
                 */
                xfs_trans_bhold(tp, ialloc_context);

                /*
                 * We want the quota changes to be associated with the next
                 * transaction, NOT this one. So, detach the dqinfo from this
                 * and attach it to the next transaction.
                 */
                dqinfo = NULL;
                tflags = 0;
                if (tp->t_dqinfo) {
                        dqinfo = (void *)tp->t_dqinfo;
                        tp->t_dqinfo = NULL;
                        tflags = tp->t_flags & XFS_TRANS_DQ_DIRTY;
                        tp->t_flags &= ~(XFS_TRANS_DQ_DIRTY);
                }

                code = xfs_trans_roll(&tp);
                if (committed != NULL)
                        *committed = 1;

                /*
                 * Re-attach the quota info that we detached from the
                 * previous transaction.
                 */
                if (dqinfo) {
                        tp->t_dqinfo = dqinfo;
                        tp->t_flags |= tflags;
                }

                if (code) {
                        xfs_buf_relse(ialloc_context);
                        *tpp = tp;
                        *ipp = NULL;
                        return code;
                }
                xfs_trans_bjoin(tp, ialloc_context);

                /*
                 * Call ialloc again. Since we've locked out all
                 * other allocations in this allocation group,
                 * this call should always succeed.
                 */
                code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid,
                                  &ialloc_context, &ip);

                /*
                 * If we get an error at this point, return to the caller
                 * so that the current transaction can be aborted.
                 */
                if (code) {
                        *tpp = tp;
                        *ipp = NULL;
                        return code;
                }
                ASSERT(!ialloc_context && ip);

        } else {
                if (committed != NULL)
                        *committed = 0;
        }

        *ipp = ip;
        *tpp = tp;

        return 0;
}
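
/*
 * Illustrative sketch (not part of the original file): the two-phase
 * allocation protocol described above is driven entirely from inside
 * xfs_dir_ialloc(), so a caller only sees a single call that may roll
 * the transaction behind its back:
 *
 *        error = xfs_dir_ialloc(&tp, dp, mode, 1, rdev, prid, &ip, NULL);
 *        if (error)
 *                goto out_trans_cancel;
 *        ... tp may now be a new transaction; re-join other inodes ...
 *
 * xfs_create() below is the canonical caller.
 */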

/*
 * Decrement the link count on an inode & log the change.  If this causes the
 * link count to go to zero, move the inode to the AGI unlinked list so that it
 * can be freed when the last active reference goes away via xfs_inactive().
 */
static int                      /* error */
xfs_droplink(
        xfs_trans_t *tp,
        xfs_inode_t *ip)
{
        xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);

        drop_nlink(VFS_I(ip));
        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);

        if (VFS_I(ip)->i_nlink)
                return 0;

        return xfs_iunlink(tp, ip);
}

/*
 * Increment the link count on an inode & log the change.
 */
static int
xfs_bumplink(
        xfs_trans_t *tp,
        xfs_inode_t *ip)
{
        xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);

        ASSERT(ip->i_d.di_version > 1);
        inc_nlink(VFS_I(ip));
        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
        return 0;
}

int
xfs_create(
        xfs_inode_t             *dp,
        struct xfs_name         *name,
        umode_t                 mode,
        dev_t                   rdev,
        xfs_inode_t             **ipp)
{
        int                     is_dir = S_ISDIR(mode);
        struct xfs_mount        *mp = dp->i_mount;
        struct xfs_inode        *ip = NULL;
        struct xfs_trans        *tp = NULL;
        int                     error;
        struct xfs_defer_ops    dfops;
        xfs_fsblock_t           first_block;
        bool                    unlock_dp_on_error = false;
        prid_t                  prid;
        struct xfs_dquot        *udqp = NULL;
        struct xfs_dquot        *gdqp = NULL;
        struct xfs_dquot        *pdqp = NULL;
        struct xfs_trans_res    *tres;
        uint                    resblks;

        trace_xfs_create(dp, name);

        if (XFS_FORCED_SHUTDOWN(mp))
                return -EIO;

        prid = xfs_get_initial_prid(dp);

        /*
         * Make sure that we have allocated dquot(s) on disk.
         */
        error = xfs_qm_vop_dqalloc(dp,
                                xfs_kuid_to_uid(current_fsuid()),
                                xfs_kgid_to_gid(current_fsgid()), prid,
                                XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
                                &udqp, &gdqp, &pdqp);
        if (error)
                return error;

        if (is_dir) {
                resblks = XFS_MKDIR_SPACE_RES(mp, name->len);
                tres = &M_RES(mp)->tr_mkdir;
        } else {
                resblks = XFS_CREATE_SPACE_RES(mp, name->len);
                tres = &M_RES(mp)->tr_create;
        }

        /*
         * Initially assume that the file does not exist and
         * reserve the resources for that case.  If that is not
         * the case we'll drop the one we have and get a more
         * appropriate transaction later.
         */
        error = xfs_trans_alloc(mp, tres, resblks, 0, 0, &tp);
        if (error == -ENOSPC) {
                /* flush outstanding delalloc blocks and retry */
                xfs_flush_inodes(mp);
                error = xfs_trans_alloc(mp, tres, resblks, 0, 0, &tp);
        }
        if (error)
                goto out_release_inode;

        xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL |
                      XFS_IOLOCK_PARENT | XFS_ILOCK_PARENT);
        unlock_dp_on_error = true;

        xfs_defer_init(&dfops, &first_block);

        /*
         * Reserve disk quota and the inode.
         */
        error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp,
                                                pdqp, resblks, 1, 0);
        if (error)
                goto out_trans_cancel;

        /*
         * A newly created regular or special file just has one directory
         * entry pointing to it, but a directory also has the "." entry
         * pointing to itself.
         */
        error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev, prid, &ip,
                        NULL);
        if (error)
                goto out_trans_cancel;

        /*
         * Now we join the directory inode to the transaction.  We do not do it
         * earlier because xfs_dir_ialloc might commit the previous transaction
         * (and release all the locks).  An error from here on will result in
         * the transaction cancel unlocking dp so don't do it explicitly in the
         * error path.
         */
        xfs_trans_ijoin(tp, dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
        unlock_dp_on_error = false;

        error = xfs_dir_createname(tp, dp, name, ip->i_ino,
                                        &first_block, &dfops, resblks ?
                                        resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
        if (error) {
                ASSERT(error != -ENOSPC);
                goto out_trans_cancel;
        }
        xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
        xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);

        if (is_dir) {
                error = xfs_dir_init(tp, ip, dp);
                if (error)
                        goto out_bmap_cancel;

                error = xfs_bumplink(tp, dp);
                if (error)
                        goto out_bmap_cancel;
        }

        /*
         * If this is a synchronous mount, make sure that the
         * create transaction goes to disk before returning to
         * the user.
         */
        if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
                xfs_trans_set_sync(tp);

        /*
         * Attach the dquot(s) to the inodes and modify them incore.
         * The IDs of the inode cannot have changed since the new
         * inode has been locked ever since it was created.
         */
        xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);

        error = xfs_defer_finish(&tp, &dfops);
        if (error)
                goto out_bmap_cancel;

        error = xfs_trans_commit(tp);
        if (error)
                goto out_release_inode;

        xfs_qm_dqrele(udqp);
        xfs_qm_dqrele(gdqp);
        xfs_qm_dqrele(pdqp);

        *ipp = ip;
        return 0;

 out_bmap_cancel:
        xfs_defer_cancel(&dfops);
 out_trans_cancel:
        xfs_trans_cancel(tp);
 out_release_inode:
        /*
         * Wait until after the current transaction is aborted to finish the
         * setup of the inode and release the inode.  This prevents recursive
         * transactions and deadlocks from xfs_inactive.
         */
        if (ip) {
                xfs_finish_inode_setup(ip);
                IRELE(ip);
        }

        xfs_qm_dqrele(udqp);
        xfs_qm_dqrele(gdqp);
        xfs_qm_dqrele(pdqp);

        if (unlock_dp_on_error)
                xfs_iunlock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
        return error;
}

int
xfs_create_tmpfile(
        struct xfs_inode        *dp,
        struct dentry           *dentry,
        umode_t                 mode,
        struct xfs_inode        **ipp)
{
        struct xfs_mount        *mp = dp->i_mount;
        struct xfs_inode        *ip = NULL;
        struct xfs_trans        *tp = NULL;
        int                     error;
        prid_t                  prid;
        struct xfs_dquot        *udqp = NULL;
        struct xfs_dquot        *gdqp = NULL;
        struct xfs_dquot        *pdqp = NULL;
        struct xfs_trans_res    *tres;
        uint                    resblks;

        if (XFS_FORCED_SHUTDOWN(mp))
                return -EIO;

        prid = xfs_get_initial_prid(dp);

        /*
         * Make sure that we have allocated dquot(s) on disk.
         */
        error = xfs_qm_vop_dqalloc(dp, xfs_kuid_to_uid(current_fsuid()),
                                xfs_kgid_to_gid(current_fsgid()), prid,
                                XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
                                &udqp, &gdqp, &pdqp);
        if (error)
                return error;

        resblks = XFS_IALLOC_SPACE_RES(mp);
        tres = &M_RES(mp)->tr_create_tmpfile;

        error = xfs_trans_alloc(mp, tres, resblks, 0, 0, &tp);
        if (error)
                goto out_release_inode;

        error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp,
                                                pdqp, resblks, 1, 0);
        if (error)
                goto out_trans_cancel;

        error = xfs_dir_ialloc(&tp, dp, mode, 1, 0, prid, &ip, NULL);
        if (error)
                goto out_trans_cancel;

        if (mp->m_flags & XFS_MOUNT_WSYNC)
                xfs_trans_set_sync(tp);

        /*
         * Attach the dquot(s) to the inodes and modify them incore.
         * The IDs of the inode cannot have changed since the new
         * inode has been locked ever since it was created.
         */
        xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);

        error = xfs_iunlink(tp, ip);
        if (error)
                goto out_trans_cancel;

        error = xfs_trans_commit(tp);
        if (error)
                goto out_release_inode;

        xfs_qm_dqrele(udqp);
        xfs_qm_dqrele(gdqp);
        xfs_qm_dqrele(pdqp);

        *ipp = ip;
        return 0;

 out_trans_cancel:
        xfs_trans_cancel(tp);
 out_release_inode:
        /*
         * Wait until after the current transaction is aborted to finish the
         * setup of the inode and release the inode.  This prevents recursive
         * transactions and deadlocks from xfs_inactive.
         */
        if (ip) {
                xfs_finish_inode_setup(ip);
                IRELE(ip);
        }

        xfs_qm_dqrele(udqp);
        xfs_qm_dqrele(gdqp);
        xfs_qm_dqrele(pdqp);

        return error;
}
1347
1348int
1349xfs_link(
1350        xfs_inode_t             *tdp,
1351        xfs_inode_t             *sip,
1352        struct xfs_name         *target_name)
1353{
1354        xfs_mount_t             *mp = tdp->i_mount;
1355        xfs_trans_t             *tp;
1356        int                     error;
1357        struct xfs_defer_ops    dfops;
1358        xfs_fsblock_t           first_block;
1359        int                     resblks;
1360
1361        trace_xfs_link(tdp, target_name);
1362
1363        ASSERT(!S_ISDIR(VFS_I(sip)->i_mode));
1364
1365        if (XFS_FORCED_SHUTDOWN(mp))
1366                return -EIO;
1367
1368        error = xfs_qm_dqattach(sip, 0);
1369        if (error)
1370                goto std_return;
1371
1372        error = xfs_qm_dqattach(tdp, 0);
1373        if (error)
1374                goto std_return;
1375
1376        resblks = XFS_LINK_SPACE_RES(mp, target_name->len);
1377        error = xfs_trans_alloc(mp, &M_RES(mp)->tr_link, resblks, 0, 0, &tp);
1378        if (error == -ENOSPC) {
1379                resblks = 0;
1380                error = xfs_trans_alloc(mp, &M_RES(mp)->tr_link, 0, 0, 0, &tp);
1381        }
1382        if (error)
1383                goto std_return;
1384
1385        xfs_ilock(tdp, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT);
1386        xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL);
1387
1388        xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL);
1389        xfs_trans_ijoin(tp, tdp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1390
1391        /*
1392         * If we are using project inheritance, we only allow hard link
1393         * creation in our tree when the project IDs are the same; else
1394         * the tree quota mechanism could be circumvented.
1395         */
1396        if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
1397                     (xfs_get_projid(tdp) != xfs_get_projid(sip)))) {
1398                error = -EXDEV;
1399                goto error_return;
1400        }
1401
1402        if (!resblks) {
1403                error = xfs_dir_canenter(tp, tdp, target_name);
1404                if (error)
1405                        goto error_return;
1406        }
1407
1408        xfs_defer_init(&dfops, &first_block);
1409
1410        /*
1411         * Handle initial link state of O_TMPFILE inode
1412         */
1413        if (VFS_I(sip)->i_nlink == 0) {
1414                error = xfs_iunlink_remove(tp, sip);
1415                if (error)
1416                        goto error_return;
1417        }
1418
1419        error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino,
1420                                        &first_block, &dfops, resblks);
1421        if (error)
1422                goto error_return;
1423        xfs_trans_ichgtime(tp, tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
1424        xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
1425
1426        error = xfs_bumplink(tp, sip);
1427        if (error)
1428                goto error_return;
1429
1430        /*
1431         * If this is a synchronous mount, make sure that the
1432         * link transaction goes to disk before returning to
1433         * the user.
1434         */
1435        if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
1436                xfs_trans_set_sync(tp);
1437
1438        error = xfs_defer_finish(&tp, &dfops);
1439        if (error) {
1440                xfs_defer_cancel(&dfops);
1441                goto error_return;
1442        }
1443
1444        return xfs_trans_commit(tp);
1445
1446 error_return:
1447        xfs_trans_cancel(tp);
1448 std_return:
1449        return error;
1450}
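
/*
 * The VFS_I(sip)->i_nlink == 0 case above is what makes the O_TMPFILE
 * "create anonymously, link in later" pattern work: the tmpfile sits on
 * the AGI unlinked list until a name is created for it, at which point
 * xfs_iunlink_remove() pulls it off before the directory entry is added.
 * A minimal user-space sketch of that pattern (illustrative only, not
 * part of this file; mount point and file name are made up):
 *
 *	#define _GNU_SOURCE
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *	#include <unistd.h>
 *
 *	int main(void)
 *	{
 *		char	proc[64];
 *		int	fd;
 *
 *		fd = open("/mnt/xfs", O_TMPFILE | O_WRONLY, 0600);
 *		if (fd < 0)
 *			return 1;
 *		write(fd, "data", 4);	// inode exists, but nlink == 0
 *		snprintf(proc, sizeof(proc), "/proc/self/fd/%d", fd);
 *		// give it a name; the 0 -> 1 transition goes through
 *		// xfs_link() and the nlink == 0 branch above
 *		linkat(AT_FDCWD, proc, AT_FDCWD, "/mnt/xfs/file",
 *		       AT_SYMLINK_FOLLOW);
 *		return close(fd);
 *	}
 */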
1451
1452/*
1453 * Free up the underlying blocks past new_size.  The new size must be smaller
1454 * than the current size.  This routine can be used both for the attribute and
1455 * data fork, and does not modify the inode size, which is left to the caller.
1456 *
1457 * The transaction passed to this routine must have made a permanent log
1458 * reservation of at least XFS_ITRUNCATE_LOG_RES.  This routine may commit the
1459 * given transaction and start new ones, so make sure everything involved in
1460 * the transaction is tidy before calling here.  Some transaction will be
1461 * returned to the caller to be committed.  The incoming transaction must
1462 * already include the inode, and both inode locks must be held exclusively.
1463 * The inode must also be "held" within the transaction.  On return the inode
1464 * will be "held" within the returned transaction.  This routine does NOT
1465 * require any disk space to be reserved for it within the transaction.
1466 *
1467 * If we get an error, we must return with the inode locked and linked into the
1468 * current transaction. This keeps things simple for the higher level code,
1469 * because it always knows that the inode is locked and held in the transaction
1470 * that returns to it whether errors occur or not.  We don't mark the inode
1471 * dirty on error so that transactions can be easily aborted if possible.
1472 */
1473int
1474xfs_itruncate_extents(
1475        struct xfs_trans        **tpp,
1476        struct xfs_inode        *ip,
1477        int                     whichfork,
1478        xfs_fsize_t             new_size)
1479{
1480        struct xfs_mount        *mp = ip->i_mount;
1481        struct xfs_trans        *tp = *tpp;
1482        struct xfs_defer_ops    dfops;
1483        xfs_fsblock_t           first_block;
1484        xfs_fileoff_t           first_unmap_block;
1485        xfs_fileoff_t           last_block;
1486        xfs_filblks_t           unmap_len;
1487        int                     error = 0;
1488        int                     done = 0;
1489
1490        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
1491        ASSERT(!atomic_read(&VFS_I(ip)->i_count) ||
1492               xfs_isilocked(ip, XFS_IOLOCK_EXCL));
1493        ASSERT(new_size <= XFS_ISIZE(ip));
1494        ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
1495        ASSERT(ip->i_itemp != NULL);
1496        ASSERT(ip->i_itemp->ili_lock_flags == 0);
1497        ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
1498
1499        trace_xfs_itruncate_extents_start(ip, new_size);
1500
1501        /*
1502         * Since it is possible for space to become allocated beyond
1503         * the end of the file (in a crash where the space is allocated
1504         * but the inode size is not yet updated), simply remove any
1505         * blocks which show up between the new EOF and the maximum
1506         * possible file size.  If the first block to be removed is
1507         * beyond the maximum file size (i.e. it is the same as last_block),
1508         * then there is nothing to do.
1509         */
1510        first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size);
1511        last_block = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
1512        if (first_unmap_block == last_block)
1513                return 0;
1514
1515        ASSERT(first_unmap_block < last_block);
1516        unmap_len = last_block - first_unmap_block + 1;
1517        while (!done) {
1518                xfs_defer_init(&dfops, &first_block);
1519                error = xfs_bunmapi(tp, ip,
1520                                    first_unmap_block, unmap_len,
1521                                    xfs_bmapi_aflag(whichfork),
1522                                    XFS_ITRUNC_MAX_EXTENTS,
1523                                    &first_block, &dfops,
1524                                    &done);
1525                if (error)
1526                        goto out_bmap_cancel;
1527
1528                /*
1529                 * Duplicate the transaction that has the permanent
1530                 * reservation and commit the old transaction.
1531                 */
1532                error = xfs_defer_finish(&tp, &dfops);
1533                if (error)
1534                        goto out_bmap_cancel;
1535
1536                error = xfs_trans_roll_inode(&tp, ip);
1537                if (error)
1538                        goto out;
1539        }
1540
1541        /*
1542         * Always re-log the inode so that our permanent transaction can keep
1543         * on rolling it forward in the log.
1544         */
1545        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1546
1547        trace_xfs_itruncate_extents_end(ip, new_size);
1548
1549out:
1550        *tpp = tp;
1551        return error;
1552out_bmap_cancel:
1553        /*
1554         * If the bunmapi call encounters an error, return to the caller where
1555         * the transaction can be properly aborted.  We just need to make sure
1556         * we're not holding any resources that we were not when we came in.
1557         */
1558        xfs_defer_cancel(&dfops);
1559        goto out;
1560}
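
/*
 * A hedged sketch of the "roll the permanent reservation" pattern the
 * loop above uses (do_bounded_work() is a made-up stand-in for the
 * xfs_bunmapi()/xfs_defer_finish() step; the real sequence is above):
 *
 *	do {
 *		error = do_bounded_work(tp, ip, &done);
 *		if (error)
 *			break;
 *		// commit tp and get back a fresh transaction that still
 *		// holds the permanent log reservation with ip joined
 *		error = xfs_trans_roll_inode(&tp, ip);
 *	} while (!error && !done);
 */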
1561
1562int
1563xfs_release(
1564        xfs_inode_t     *ip)
1565{
1566        xfs_mount_t     *mp = ip->i_mount;
1567        int             error;
1568
1569        if (!S_ISREG(VFS_I(ip)->i_mode) || (VFS_I(ip)->i_mode == 0))
1570                return 0;
1571
1572        /* If this is a read-only mount, don't do this (would generate I/O) */
1573        if (mp->m_flags & XFS_MOUNT_RDONLY)
1574                return 0;
1575
1576        if (!XFS_FORCED_SHUTDOWN(mp)) {
1577                int truncated;
1578
1579                /*
1580                 * If we previously truncated this file and removed old data
1581                 * in the process, we want to initiate "early" writeout on
1582                 * the last close.  This is an attempt to combat the notorious
1583                 * NULL files problem which is particularly noticeable from a
1584                 * truncate down, buffered (re-)write (delalloc), followed by
1585                 * a crash.  What we are effectively doing here is
1586                 * significantly reducing the time window where we'd otherwise
1587                 * be exposed to that problem.
1588                 */
1589                truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED);
1590                if (truncated) {
1591                        xfs_iflags_clear(ip, XFS_IDIRTY_RELEASE);
1592                        if (ip->i_delayed_blks > 0) {
1593                                error = filemap_flush(VFS_I(ip)->i_mapping);
1594                                if (error)
1595                                        return error;
1596                        }
1597                }
1598        }
1599
1600        if (VFS_I(ip)->i_nlink == 0)
1601                return 0;
1602
1603        if (xfs_can_free_eofblocks(ip, false)) {
1604
1605                /*
1606                 * If the inode is being opened, written and closed
1607                 * frequently and we have delayed allocation blocks
1608                 * outstanding (e.g. streaming writes from the NFS server),
1609                 * truncating the blocks past EOF will cause fragmentation.
1610                 *
1611                 * In this case don't do the truncation, but we have to be
1612                 * careful how we detect this case. Blocks beyond EOF show up as
1613                 * i_delayed_blks even when the inode is clean, so we need to
1614                 * truncate them away first before checking for a dirty release.
1615                 * Hence on the first dirty close we will still remove the
1616                 * speculative allocation, but after that we will leave it in
1617                 * place.
1618                 */
1619                if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE))
1620                        return 0;
1621                /*
1622                 * If we can't get the iolock just skip truncating the blocks
1623                 * past EOF because we could deadlock with the mmap_sem
1624                 * otherwise. We'll get another chance to drop them once the
1625                 * last reference to the inode is dropped, so we'll never leak
1626                 * blocks permanently.
1627                 */
1628                if (xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
1629                        error = xfs_free_eofblocks(ip);
1630                        xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1631                        if (error)
1632                                return error;
1633                }
1634
1635                /* delalloc blocks after truncation means it really is dirty */
1636                if (ip->i_delayed_blks)
1637                        xfs_iflags_set(ip, XFS_IDIRTY_RELEASE);
1638        }
1639        return 0;
1640}
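
/*
 * The "NULL files" exposure described above comes from a truncate-down
 * followed by a buffered rewrite with no fsync.  A hedged user-space
 * fragment showing the window (file name is made up):
 *
 *	int fd = open("/mnt/xfs/config", O_WRONLY);
 *
 *	ftruncate(fd, 0);	// old blocks freed immediately
 *	write(fd, buf, len);	// new data is delalloc, in memory only
 *	close(fd);		// no fsync: a crash here can leave an
 *				// empty file; the early writeout above
 *				// narrows exactly this window
 */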
1641
1642/*
1643 * xfs_inactive_truncate
1644 *
1645 * Called to perform a truncate when an inode becomes unlinked.
1646 */
1647STATIC int
1648xfs_inactive_truncate(
1649        struct xfs_inode *ip)
1650{
1651        struct xfs_mount        *mp = ip->i_mount;
1652        struct xfs_trans        *tp;
1653        int                     error;
1654
1655        error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
1656        if (error) {
1657                ASSERT(XFS_FORCED_SHUTDOWN(mp));
1658                return error;
1659        }
1660
1661        xfs_ilock(ip, XFS_ILOCK_EXCL);
1662        xfs_trans_ijoin(tp, ip, 0);
1663
1664        /*
1665         * Log the inode size first to prevent stale data exposure in the event
1666         * of a system crash before the truncate completes. See the related
1667         * comment in xfs_vn_setattr_size() for details.
1668         */
1669        ip->i_d.di_size = 0;
1670        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1671
1672        error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0);
1673        if (error)
1674                goto error_trans_cancel;
1675
1676        ASSERT(ip->i_d.di_nextents == 0);
1677
1678        error = xfs_trans_commit(tp);
1679        if (error)
1680                goto error_unlock;
1681
1682        xfs_iunlock(ip, XFS_ILOCK_EXCL);
1683        return 0;
1684
1685error_trans_cancel:
1686        xfs_trans_cancel(tp);
1687error_unlock:
1688        xfs_iunlock(ip, XFS_ILOCK_EXCL);
1689        return error;
1690}
1691
1692/*
1693 * xfs_inactive_ifree()
1694 *
1695 * Perform the inode free when an inode is unlinked.
1696 */
1697STATIC int
1698xfs_inactive_ifree(
1699        struct xfs_inode *ip)
1700{
1701        struct xfs_defer_ops    dfops;
1702        xfs_fsblock_t           first_block;
1703        struct xfs_mount        *mp = ip->i_mount;
1704        struct xfs_trans        *tp;
1705        int                     error;
1706
1707        /*
1708         * The ifree transaction might need to allocate blocks for record
1709         * insertion to the finobt. We don't want to fail here at ENOSPC, so
1710         * allow ifree to dip into the reserved block pool if necessary.
1711         *
1712         * Freeing large sets of inodes generally means freeing inode chunks,
1713         * directory and file data blocks, so this should be relatively safe.
1714         * Only under severe circumstances should it be possible to free enough
1715         * inodes to exhaust the reserve block pool via finobt expansion while
1716         * at the same time not creating free space in the filesystem.
1717         *
1718         * Send a warning if the reservation does happen to fail, as the inode
1719         * now remains allocated and sits on the unlinked list until the fs is
1720         * repaired.
1721         */
1722        error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree,
1723                        XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, &tp);
1724        if (error) {
1725                if (error == -ENOSPC) {
1726                        xfs_warn_ratelimited(mp,
1727                        "Failed to remove inode(s) from unlinked list. "
1728                        "Please free space, unmount and run xfs_repair.");
1729                } else {
1730                        ASSERT(XFS_FORCED_SHUTDOWN(mp));
1731                }
1732                return error;
1733        }
1734
1735        xfs_ilock(ip, XFS_ILOCK_EXCL);
1736        xfs_trans_ijoin(tp, ip, 0);
1737
1738        xfs_defer_init(&dfops, &first_block);
1739        error = xfs_ifree(tp, ip, &dfops);
1740        if (error) {
1741                /*
1742                 * If we fail to free the inode, shut down.  The cancel
1743                 * might do that, we need to make sure.  Otherwise the
1744                 * inode might be lost for a long time or forever.
1745                 */
1746                if (!XFS_FORCED_SHUTDOWN(mp)) {
1747                        xfs_notice(mp, "%s: xfs_ifree returned error %d",
1748                                __func__, error);
1749                        xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
1750                }
1751                xfs_trans_cancel(tp);
1752                xfs_iunlock(ip, XFS_ILOCK_EXCL);
1753                return error;
1754        }
1755
1756        /*
1757         * Credit the quota account(s). The inode is gone.
1758         */
1759        xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_ICOUNT, -1);
1760
1761        /*
1762         * Just ignore errors at this point.  There is nothing we can do except
1763         * to try to keep going. Make sure it's not a silent error.
1764         */
1765        error = xfs_defer_finish(&tp, &dfops);
1766        if (error) {
1767                xfs_notice(mp, "%s: xfs_defer_finish returned error %d",
1768                        __func__, error);
1769                xfs_defer_cancel(&dfops);
1770        }
1771        error = xfs_trans_commit(tp);
1772        if (error)
1773                xfs_notice(mp, "%s: xfs_trans_commit returned error %d",
1774                        __func__, error);
1775
1776        xfs_iunlock(ip, XFS_ILOCK_EXCL);
1777        return 0;
1778}
1779
1780/*
1781 * xfs_inactive
1782 *
1783 * This is called when the vnode reference count for the vnode
1784 * goes to zero.  If the file has been unlinked, then it must
1785 * now be truncated.  Also, we clear all of the read-ahead state
1786 * kept for the inode here since the file is now closed.
1787 */
1788void
1789xfs_inactive(
1790        xfs_inode_t     *ip)
1791{
1792        struct xfs_mount        *mp;
1793        int                     error;
1794        int                     truncate = 0;
1795
1796        /*
1797         * If the inode is already free, then there can be nothing
1798         * to clean up here.
1799         */
1800        if (VFS_I(ip)->i_mode == 0) {
1801                ASSERT(ip->i_df.if_real_bytes == 0);
1802                ASSERT(ip->i_df.if_broot_bytes == 0);
1803                return;
1804        }
1805
1806        mp = ip->i_mount;
1807
1808        /* If this is a read-only mount, don't do this (would generate I/O) */
1809        if (mp->m_flags & XFS_MOUNT_RDONLY)
1810                return;
1811
1812        if (VFS_I(ip)->i_nlink != 0) {
1813                /*
1814                 * force is true because we are evicting an inode from the
1815                 * cache. Post-eof blocks must be freed, lest we end up with
1816                 * broken free space accounting.
1817                 *
1818                 * Note: don't bother with iolock here since lockdep complains
1819                 * about acquiring it in reclaim context. We have the only
1820                 * reference to the inode at this point anyways.
1821                 */
1822                if (xfs_can_free_eofblocks(ip, true))
1823                        xfs_free_eofblocks(ip);
1824
1825                return;
1826        }
1827
1828        if (S_ISREG(VFS_I(ip)->i_mode) &&
1829            (ip->i_d.di_size != 0 || XFS_ISIZE(ip) != 0 ||
1830             ip->i_d.di_nextents > 0 || ip->i_delayed_blks > 0))
1831                truncate = 1;
1832
1833        error = xfs_qm_dqattach(ip, 0);
1834        if (error)
1835                return;
1836
1837        if (S_ISLNK(VFS_I(ip)->i_mode))
1838                error = xfs_inactive_symlink(ip);
1839        else if (truncate)
1840                error = xfs_inactive_truncate(ip);
1841        if (error)
1842                return;
1843
1844        /*
1845         * If there are attributes associated with the file then blow them away
1846         * now.  The code calls a routine that recursively deconstructs the
1847         * attribute fork. It also blows away the in-core attribute fork.
1848         */
1849        if (XFS_IFORK_Q(ip)) {
1850                error = xfs_attr_inactive(ip);
1851                if (error)
1852                        return;
1853        }
1854
1855        ASSERT(!ip->i_afp);
1856        ASSERT(ip->i_d.di_anextents == 0);
1857        ASSERT(ip->i_d.di_forkoff == 0);
1858
1859        /*
1860         * Free the inode.
1861         */
1862        error = xfs_inactive_ifree(ip);
1863        if (error)
1864                return;
1865
1866        /*
1867         * Release the dquots held by inode, if any.
1868         */
1869        xfs_qm_dqdetach(ip);
1870}
1871
1872/*
1873 * This is called when the inode's link count goes to 0 or we are creating a
1874 * tmpfile via O_TMPFILE. In the tmpfile case, the link count is dropped to
1875 * zero by the VFS only after we've created the file successfully, so we have
1876 * to add the inode to the unlinked list while its link count is still
1877 * non-zero.
1878 *
1879 * We place the on-disk inode on a list in the AGI.  It will be pulled from this
1880 * list when the inode is freed.
1881 */
1882STATIC int
1883xfs_iunlink(
1884        struct xfs_trans *tp,
1885        struct xfs_inode *ip)
1886{
1887        xfs_mount_t     *mp = tp->t_mountp;
1888        xfs_agi_t       *agi;
1889        xfs_dinode_t    *dip;
1890        xfs_buf_t       *agibp;
1891        xfs_buf_t       *ibp;
1892        xfs_agino_t     agino;
1893        short           bucket_index;
1894        int             offset;
1895        int             error;
1896
1897        ASSERT(VFS_I(ip)->i_mode != 0);
1898
1899        /*
1900         * Get the agi buffer first.  It ensures lock ordering
1901         * on the list.
1902         */
1903        error = xfs_read_agi(mp, tp, XFS_INO_TO_AGNO(mp, ip->i_ino), &agibp);
1904        if (error)
1905                return error;
1906        agi = XFS_BUF_TO_AGI(agibp);
1907
1908        /*
1909         * Get the index into the agi hash table for the
1910         * list this inode will go on.
1911         */
1912        agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
1913        ASSERT(agino != 0);
1914        bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
1915        ASSERT(agi->agi_unlinked[bucket_index]);
1916        ASSERT(be32_to_cpu(agi->agi_unlinked[bucket_index]) != agino);
1917
1918        if (agi->agi_unlinked[bucket_index] != cpu_to_be32(NULLAGINO)) {
1919                /*
1920                 * There is already another inode in the bucket we need
1921                 * to add ourselves to.  Add us at the front of the list.
1922                 * Here we put the head pointer into our next pointer,
1923                 * and then we fall through to point the head at us.
1924                 */
1925                error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
1926                                       0, 0);
1927                if (error)
1928                        return error;
1929
1930                ASSERT(dip->di_next_unlinked == cpu_to_be32(NULLAGINO));
1931                dip->di_next_unlinked = agi->agi_unlinked[bucket_index];
1932                offset = ip->i_imap.im_boffset +
1933                        offsetof(xfs_dinode_t, di_next_unlinked);
1934
1935                /* need to recalc the inode CRC if appropriate */
1936                xfs_dinode_calc_crc(mp, dip);
1937
1938                xfs_trans_inode_buf(tp, ibp);
1939                xfs_trans_log_buf(tp, ibp, offset,
1940                                  (offset + sizeof(xfs_agino_t) - 1));
1941                xfs_inobp_check(mp, ibp);
1942        }
1943
1944        /*
1945         * Point the bucket head pointer at the inode being inserted.
1946         */
1947        ASSERT(agino != 0);
1948        agi->agi_unlinked[bucket_index] = cpu_to_be32(agino);
1949        offset = offsetof(xfs_agi_t, agi_unlinked) +
1950                (sizeof(xfs_agino_t) * bucket_index);
1951        xfs_trans_log_buf(tp, agibp, offset,
1952                          (offset + sizeof(xfs_agino_t) - 1));
1953        return 0;
1954}
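
/*
 * Logically the above is a push onto a singly linked list whose links
 * live in the on-disk inodes (di_next_unlinked) and whose heads live in
 * the AGI buckets.  A toy in-memory model of the same operation
 * (struct node and its fields are made-up stand-ins):
 *
 *	struct node {
 *		unsigned int	agino;
 *		struct node	*next;	// models di_next_unlinked
 *	};
 *
 *	static void unlink_insert(struct node **bucket, struct node *ip)
 *	{
 *		ip->next = *bucket;	// old head becomes our next pointer
 *		*bucket = ip;		// bucket head now points at us
 *	}
 */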
1955
1956/*
1957 * Pull the on-disk inode from the AGI unlinked list.
1958 */
1959STATIC int
1960xfs_iunlink_remove(
1961        xfs_trans_t     *tp,
1962        xfs_inode_t     *ip)
1963{
1964        xfs_ino_t       next_ino;
1965        xfs_mount_t     *mp;
1966        xfs_agi_t       *agi;
1967        xfs_dinode_t    *dip;
1968        xfs_buf_t       *agibp;
1969        xfs_buf_t       *ibp;
1970        xfs_agnumber_t  agno;
1971        xfs_agino_t     agino;
1972        xfs_agino_t     next_agino;
1973        xfs_buf_t       *last_ibp;
1974        xfs_dinode_t    *last_dip = NULL;
1975        short           bucket_index;
1976        int             offset, last_offset = 0;
1977        int             error;
1978
1979        mp = tp->t_mountp;
1980        agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
1981
1982        /*
1983         * Get the agi buffer first.  It ensures lock ordering
1984         * on the list.
1985         */
1986        error = xfs_read_agi(mp, tp, agno, &agibp);
1987        if (error)
1988                return error;
1989
1990        agi = XFS_BUF_TO_AGI(agibp);
1991
1992        /*
1993         * Get the index into the agi hash table for the
1994         * list this inode will go on.
1995         */
1996        agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
1997        ASSERT(agino != 0);
1998        bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
1999        ASSERT(agi->agi_unlinked[bucket_index] != cpu_to_be32(NULLAGINO));
2000        ASSERT(agi->agi_unlinked[bucket_index]);
2001
2002        if (be32_to_cpu(agi->agi_unlinked[bucket_index]) == agino) {
2003                /*
2004                 * We're at the head of the list.  Get the inode's on-disk
2005                 * buffer to see if there is anyone after us on the list.
2006                 * Only modify our next pointer if it is not already NULLAGINO.
2007                 * This saves us the overhead of dealing with the buffer when
2008                 * there is no need to change it.
2009                 */
2010                error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
2011                                       0, 0);
2012                if (error) {
2013                        xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.",
2014                                __func__, error);
2015                        return error;
2016                }
2017                next_agino = be32_to_cpu(dip->di_next_unlinked);
2018                ASSERT(next_agino != 0);
2019                if (next_agino != NULLAGINO) {
2020                        dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
2021                        offset = ip->i_imap.im_boffset +
2022                                offsetof(xfs_dinode_t, di_next_unlinked);
2023
2024                        /* need to recalc the inode CRC if appropriate */
2025                        xfs_dinode_calc_crc(mp, dip);
2026
2027                        xfs_trans_inode_buf(tp, ibp);
2028                        xfs_trans_log_buf(tp, ibp, offset,
2029                                          (offset + sizeof(xfs_agino_t) - 1));
2030                        xfs_inobp_check(mp, ibp);
2031                } else {
2032                        xfs_trans_brelse(tp, ibp);
2033                }
2034                /*
2035                 * Point the bucket head pointer at the next inode.
2036                 */
2037                ASSERT(next_agino != 0);
2038                ASSERT(next_agino != agino);
2039                agi->agi_unlinked[bucket_index] = cpu_to_be32(next_agino);
2040                offset = offsetof(xfs_agi_t, agi_unlinked) +
2041                        (sizeof(xfs_agino_t) * bucket_index);
2042                xfs_trans_log_buf(tp, agibp, offset,
2043                                  (offset + sizeof(xfs_agino_t) - 1));
2044        } else {
2045                /*
2046                 * We need to search the list for the inode being freed.
2047                 */
2048                next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
2049                last_ibp = NULL;
2050                while (next_agino != agino) {
2051                        struct xfs_imap imap;
2052
2053                        if (last_ibp)
2054                                xfs_trans_brelse(tp, last_ibp);
2055
2056                        imap.im_blkno = 0;
2057                        next_ino = XFS_AGINO_TO_INO(mp, agno, next_agino);
2058
2059                        error = xfs_imap(mp, tp, next_ino, &imap, 0);
2060                        if (error) {
2061                                xfs_warn(mp,
2062        "%s: xfs_imap returned error %d.",
2063                                         __func__, error);
2064                                return error;
2065                        }
2066
2067                        error = xfs_imap_to_bp(mp, tp, &imap, &last_dip,
2068                                               &last_ibp, 0, 0);
2069                        if (error) {
2070                                xfs_warn(mp,
2071        "%s: xfs_imap_to_bp returned error %d.",
2072                                        __func__, error);
2073                                return error;
2074                        }
2075
2076                        last_offset = imap.im_boffset;
2077                        next_agino = be32_to_cpu(last_dip->di_next_unlinked);
2078                        ASSERT(next_agino != NULLAGINO);
2079                        ASSERT(next_agino != 0);
2080                }
2081
2082                /*
2083                 * Now last_ibp points to the buffer previous to us on the
2084                 * unlinked list.  Pull us from the list.
2085                 */
2086                error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
2087                                       0, 0);
2088                if (error) {
2089                        xfs_warn(mp, "%s: xfs_imap_to_bp(2) returned error %d.",
2090                                __func__, error);
2091                        return error;
2092                }
2093                next_agino = be32_to_cpu(dip->di_next_unlinked);
2094                ASSERT(next_agino != 0);
2095                ASSERT(next_agino != agino);
2096                if (next_agino != NULLAGINO) {
2097                        dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
2098                        offset = ip->i_imap.im_boffset +
2099                                offsetof(xfs_dinode_t, di_next_unlinked);
2100
2101                        /* need to recalc the inode CRC if appropriate */
2102                        xfs_dinode_calc_crc(mp, dip);
2103
2104                        xfs_trans_inode_buf(tp, ibp);
2105                        xfs_trans_log_buf(tp, ibp, offset,
2106                                          (offset + sizeof(xfs_agino_t) - 1));
2107                        xfs_inobp_check(mp, ibp);
2108                } else {
2109                        xfs_trans_brelse(tp, ibp);
2110                }
2111                /*
2112                 * Point the previous inode on the list to the next inode.
2113                 */
2114                last_dip->di_next_unlinked = cpu_to_be32(next_agino);
2115                ASSERT(next_agino != 0);
2116                offset = last_offset + offsetof(xfs_dinode_t, di_next_unlinked);
2117
2118                /* need to recalc the inode CRC if appropriate */
2119                xfs_dinode_calc_crc(mp, last_dip);
2120
2121                xfs_trans_inode_buf(tp, last_ibp);
2122                xfs_trans_log_buf(tp, last_ibp, offset,
2123                                  (offset + sizeof(xfs_agino_t) - 1));
2124                xfs_inobp_check(mp, last_ibp);
2125        }
2126        return 0;
2127}
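
/*
 * Continuing the toy model from the sketch after xfs_iunlink(): removal
 * is plain singly-linked-list deletion, with the two branches above
 * corresponding to the head case and the interior-search case:
 *
 *	static void unlink_remove(struct node **bucket, struct node *ip)
 *	{
 *		struct node *prev;
 *
 *		if (*bucket == ip) {		// we are the bucket head
 *			*bucket = ip->next;
 *		} else {			// walk to our predecessor
 *			prev = *bucket;
 *			while (prev->next != ip)
 *				prev = prev->next;
 *			prev->next = ip->next;	// splice us out
 *		}
 *		ip->next = NULL;		// models NULLAGINO
 *	}
 */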
2128
2129/*
2130 * A big issue when freeing the inode cluster is that we _cannot_ skip any
2131 * inodes that are in memory - they all must be marked stale and attached to
2132 * the cluster buffer.
2133 */
2134STATIC int
2135xfs_ifree_cluster(
2136        xfs_inode_t             *free_ip,
2137        xfs_trans_t             *tp,
2138        struct xfs_icluster     *xic)
2139{
2140        xfs_mount_t             *mp = free_ip->i_mount;
2141        int                     blks_per_cluster;
2142        int                     inodes_per_cluster;
2143        int                     nbufs;
2144        int                     i, j;
2145        int                     ioffset;
2146        xfs_daddr_t             blkno;
2147        xfs_buf_t               *bp;
2148        xfs_inode_t             *ip;
2149        xfs_inode_log_item_t    *iip;
2150        xfs_log_item_t          *lip;
2151        struct xfs_perag        *pag;
2152        xfs_ino_t               inum;
2153
2154        inum = xic->first_ino;
2155        pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum));
2156        blks_per_cluster = xfs_icluster_size_fsb(mp);
2157        inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog;
2158        nbufs = mp->m_ialloc_blks / blks_per_cluster;
2159
2160        for (j = 0; j < nbufs; j++, inum += inodes_per_cluster) {
2161                /*
2162                 * The allocation bitmap tells us which inodes of the chunk were
2163                 * physically allocated. Skip the cluster if an inode falls into
2164                 * a sparse region.
2165                 */
2166                ioffset = inum - xic->first_ino;
2167                if ((xic->alloc & XFS_INOBT_MASK(ioffset)) == 0) {
2168                        ASSERT(do_mod(ioffset, inodes_per_cluster) == 0);
2169                        continue;
2170                }
2171
2172                blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum),
2173                                         XFS_INO_TO_AGBNO(mp, inum));
2174
2175                /*
2176                 * We obtain and lock the backing buffer first in the process
2177                 * here, as we have to ensure that any dirty inode that we
2178                 * can't get the flush lock on is attached to the buffer.
2179                 * If we scan the in-memory inodes first, then buffer IO can
2180                 * complete before we get a lock on it, and hence we may fail
2181                 * to mark all the active inodes on the buffer stale.
2182                 */
2183                bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
2184                                        mp->m_bsize * blks_per_cluster,
2185                                        XBF_UNMAPPED);
2186
2187                if (!bp)
2188                        return -ENOMEM;
2189
2190                /*
2191                 * This buffer may not have been correctly initialised as we
2192                 * didn't read it from disk. That's not important because we are
2193                 * only using it to mark the buffer as stale in the log, and to
2194                 * attach stale cached inodes on it. That means it will never be
2195                 * dispatched for IO. If it is, we want to know about it, and we
2196                 * want it to fail. We can achieve this by adding a write
2197                 * verifier to the buffer.
2198                 */
2199                bp->b_ops = &xfs_inode_buf_ops;
2200
2201                /*
2202                 * Walk the inodes already attached to the buffer and mark them
2203                 * stale. These will all have the flush locks held, so an
2204                 * in-memory inode walk can't lock them. By marking them all
2205                 * stale first, we will not attempt to lock them in the loop
2206                 * below as the XFS_ISTALE flag will be set.
2207                 */
2208                lip = bp->b_fspriv;
2209                while (lip) {
2210                        if (lip->li_type == XFS_LI_INODE) {
2211                                iip = (xfs_inode_log_item_t *)lip;
2212                                ASSERT(iip->ili_logged == 1);
2213                                lip->li_cb = xfs_istale_done;
2214                                xfs_trans_ail_copy_lsn(mp->m_ail,
2215                                                        &iip->ili_flush_lsn,
2216                                                        &iip->ili_item.li_lsn);
2217                                xfs_iflags_set(iip->ili_inode, XFS_ISTALE);
2218                        }
2219                        lip = lip->li_bio_list;
2220                }
2221
2222
2223                /*
2224                 * For each inode in memory attempt to add it to the inode
2225                 * buffer and set it up for being staled on buffer IO
2226                 * completion.  This is safe as we've locked out tail pushing
2227                 * and flushing by locking the buffer.
2228                 *
2229                 * We have already marked every inode that was part of a
2230                 * transaction stale above, which means there is no point in
2231                 * even trying to lock them.
2232                 */
2233                for (i = 0; i < inodes_per_cluster; i++) {
2234retry:
2235                        rcu_read_lock();
2236                        ip = radix_tree_lookup(&pag->pag_ici_root,
2237                                        XFS_INO_TO_AGINO(mp, (inum + i)));
2238
2239                        /* Inode not in memory, nothing to do */
2240                        if (!ip) {
2241                                rcu_read_unlock();
2242                                continue;
2243                        }
2244
2245                        /*
2246                         * because this is an RCU protected lookup, we could
2247                         * find a recently freed or even reallocated inode
2248                         * during the lookup. We need to check under the
2249                         * i_flags_lock for a valid inode here. Skip it if it
2250                         * is not valid, the wrong inode or stale.
2251                         */
2252                        spin_lock(&ip->i_flags_lock);
2253                        if (ip->i_ino != inum + i ||
2254                            __xfs_iflags_test(ip, XFS_ISTALE)) {
2255                                spin_unlock(&ip->i_flags_lock);
2256                                rcu_read_unlock();
2257                                continue;
2258                        }
2259                        spin_unlock(&ip->i_flags_lock);
2260
2261                        /*
2262                         * Don't try to lock/unlock the current inode, but we
2263                         * _cannot_ skip the other inodes that we did not find
2264                         * in the list attached to the buffer and are not
2265                         * already marked stale. If we can't lock it, back off
2266                         * and retry.
2267                         */
2268                        if (ip != free_ip) {
2269                                if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
2270                                        rcu_read_unlock();
2271                                        delay(1);
2272                                        goto retry;
2273                                }
2274
2275                                /*
2276                                 * Check the inode number again in case we're
2277                                 * racing with freeing in xfs_reclaim_inode().
2278                                 * See the comments in that function for more
2279                                 * information as to why the initial check is
2280                                 * not sufficient.
2281                                 */
2282                                if (ip->i_ino != inum + i) {
2283                                        xfs_iunlock(ip, XFS_ILOCK_EXCL);
2284                                        rcu_read_unlock();
2285                                        continue;
2286                                }
2287                        }
2288                        rcu_read_unlock();
2289
2290                        xfs_iflock(ip);
2291                        xfs_iflags_set(ip, XFS_ISTALE);
2292
2293                        /*
2294                         * we don't need to attach clean inodes or those only
2295                         * with unlogged changes (which we throw away, anyway).
2296                         */
2297                        iip = ip->i_itemp;
2298                        if (!iip || xfs_inode_clean(ip)) {
2299                                ASSERT(ip != free_ip);
2300                                xfs_ifunlock(ip);
2301                                xfs_iunlock(ip, XFS_ILOCK_EXCL);
2302                                continue;
2303                        }
2304
2305                        iip->ili_last_fields = iip->ili_fields;
2306                        iip->ili_fields = 0;
2307                        iip->ili_fsync_fields = 0;
2308                        iip->ili_logged = 1;
2309                        xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
2310                                                &iip->ili_item.li_lsn);
2311
2312                        xfs_buf_attach_iodone(bp, xfs_istale_done,
2313                                                  &iip->ili_item);
2314
2315                        if (ip != free_ip)
2316                                xfs_iunlock(ip, XFS_ILOCK_EXCL);
2317                }
2318
2319                xfs_trans_stale_inode_buf(tp, bp);
2320                xfs_trans_binval(tp, bp);
2321        }
2322
2323        xfs_perag_put(pag);
2324        return 0;
2325}
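
/*
 * The in-memory inode walk above uses the standard RCU "lookup, then
 * revalidate under a lock" idiom.  A hedged generic sketch of its shape
 * (names are made up; compare with the i_flags_lock checks above):
 *
 *	rcu_read_lock();
 *	obj = radix_tree_lookup(&tree, key);
 *	if (obj) {
 *		spin_lock(&obj->lock);
 *		// the RCU lookup can race with free/reuse, so confirm
 *		// the object is still the one we were looking for
 *		if (obj->key != key || obj->stale) {
 *			spin_unlock(&obj->lock);	// lost a race, skip it
 *			obj = NULL;
 *		} else {
 *			spin_unlock(&obj->lock);
 *		}
 *	}
 *	rcu_read_unlock();
 */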
2326
2327/*
2328 * Free any local-format buffers sitting around before we reset to
2329 * extents format.
2330 */
2331static inline void
2332xfs_ifree_local_data(
2333        struct xfs_inode        *ip,
2334        int                     whichfork)
2335{
2336        struct xfs_ifork        *ifp;
2337
2338        if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL)
2339                return;
2340
2341        ifp = XFS_IFORK_PTR(ip, whichfork);
2342        xfs_idata_realloc(ip, -ifp->if_bytes, whichfork);
2343}
2344
2345/*
2346 * This is called to return an inode to the inode free list.
2347 * The inode should already be truncated to 0 length and have
2348 * no pages associated with it.  This routine also assumes that
2349 * the inode is already a part of the transaction.
2350 *
2351 * The on-disk copy of the inode will have been added to the list
2352 * of unlinked inodes in the AGI. We need to remove the inode from
2353 * that list atomically with respect to freeing it here.
2354 */
2355int
2356xfs_ifree(
2357        xfs_trans_t     *tp,
2358        xfs_inode_t     *ip,
2359        struct xfs_defer_ops    *dfops)
2360{
2361        int                     error;
2362        struct xfs_icluster     xic = { 0 };
2363
2364        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
2365        ASSERT(VFS_I(ip)->i_nlink == 0);
2366        ASSERT(ip->i_d.di_nextents == 0);
2367        ASSERT(ip->i_d.di_anextents == 0);
2368        ASSERT(ip->i_d.di_size == 0 || !S_ISREG(VFS_I(ip)->i_mode));
2369        ASSERT(ip->i_d.di_nblocks == 0);
2370
2371        /*
2372         * Pull the on-disk inode from the AGI unlinked list.
2373         */
2374        error = xfs_iunlink_remove(tp, ip);
2375        if (error)
2376                return error;
2377
2378        error = xfs_difree(tp, ip->i_ino, dfops, &xic);
2379        if (error)
2380                return error;
2381
2382        xfs_ifree_local_data(ip, XFS_DATA_FORK);
2383        xfs_ifree_local_data(ip, XFS_ATTR_FORK);
2384
2385        VFS_I(ip)->i_mode = 0;          /* mark incore inode as free */
2386        ip->i_d.di_flags = 0;
2387        ip->i_d.di_dmevmask = 0;
2388        ip->i_d.di_forkoff = 0;         /* mark the attr fork not in use */
2389        ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS;
2390        ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
2391        /*
2392         * Bump the generation count so no one will be confused
2393         * by reincarnations of this inode.
2394         */
2395        VFS_I(ip)->i_generation++;
2396        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
2397
2398        if (xic.deleted)
2399                error = xfs_ifree_cluster(ip, tp, &xic);
2400
2401        return error;
2402}
2403
2404/*
2405 * This is called to unpin an inode.  The caller must have the inode locked
2406 * in at least shared mode so that the buffer cannot be subsequently pinned
2407 * once someone is waiting for it to be unpinned.
2408 */
2409static void
2410xfs_iunpin(
2411        struct xfs_inode        *ip)
2412{
2413        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
2414
2415        trace_xfs_inode_unpin_nowait(ip, _RET_IP_);
2416
2417        /* Give the log a push to start the unpinning I/O */
2418        xfs_log_force_lsn(ip->i_mount, ip->i_itemp->ili_last_lsn, 0);
2419
2420}
2421
2422static void
2423__xfs_iunpin_wait(
2424        struct xfs_inode        *ip)
2425{
2426        wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IPINNED_BIT);
2427        DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IPINNED_BIT);
2428
2429        xfs_iunpin(ip);
2430
2431        do {
2432                prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
2433                if (xfs_ipincount(ip))
2434                        io_schedule();
2435        } while (xfs_ipincount(ip));
2436        finish_wait(wq, &wait.wait);
2437}
2438
2439void
2440xfs_iunpin_wait(
2441        struct xfs_inode        *ip)
2442{
2443        if (xfs_ipincount(ip))
2444                __xfs_iunpin_wait(ip);
2445}
2446
2447/*
2448 * Removing an inode from the namespace involves removing the directory entry
2449 * and dropping the link count on the inode. Removing the directory entry can
2450 * result in locking an AGF (directory blocks were freed) and removing a link
2451 * count can result in placing the inode on an unlinked list which results in
2452 * locking an AGI.
2453 *
2454 * The big problem here is that we have an ordering constraint on AGF and AGI
2455 * locking - inode allocation locks the AGI, then can allocate a new extent for
2456 * new inodes, locking the AGF after the AGI. Similarly, freeing the inode
2457 * removes the inode from the unlinked list, requiring that we lock the AGI
2458 * first, and then freeing the inode can result in an inode chunk being freed
2459 * and hence freeing disk space requiring that we lock an AGF.
2460 *
2461 * Hence the ordering that is imposed by other parts of the code is AGI before
2462 * AGF. This means we cannot remove the directory entry before we drop the inode
2463 * reference count and put it on the unlinked list as this results in a lock
2464 * order of AGF then AGI, and this can deadlock against inode allocation and
2465 * freeing. Therefore we must drop the link counts before we remove the
2466 * directory entry.
2467 *
2468 * This is still safe from a transactional point of view - it is not until we
2469 * get to xfs_defer_finish() that we have the possibility of multiple
2470 * transactions in this operation. Hence as long as we remove the directory
2471 * entry and drop the link count in the first transaction of the remove
2472 * operation, there are no transactional constraints on the ordering here.
2473 */
2474int
2475xfs_remove(
2476        xfs_inode_t             *dp,
2477        struct xfs_name         *name,
2478        xfs_inode_t             *ip)
2479{
2480        xfs_mount_t             *mp = dp->i_mount;
2481        xfs_trans_t             *tp = NULL;
2482        int                     is_dir = S_ISDIR(VFS_I(ip)->i_mode);
2483        int                     error = 0;
2484        struct xfs_defer_ops    dfops;
2485        xfs_fsblock_t           first_block;
2486        uint                    resblks;
2487
2488        trace_xfs_remove(dp, name);
2489
2490        if (XFS_FORCED_SHUTDOWN(mp))
2491                return -EIO;
2492
2493        error = xfs_qm_dqattach(dp, 0);
2494        if (error)
2495                goto std_return;
2496
2497        error = xfs_qm_dqattach(ip, 0);
2498        if (error)
2499                goto std_return;
2500
2501        /*
2502         * We try to get the real space reservation first,
2503         * allowing for directory btree deletion(s) implying
2504         * possible bmap insert(s).  If we can't get the space
2505         * reservation then we use 0 instead, and avoid the bmap
2506         * btree insert(s) in the directory code by, if the bmap
2507         * insert tries to happen, instead trimming the LAST
2508         * block from the directory.
2509         */
2510        resblks = XFS_REMOVE_SPACE_RES(mp);
2511        error = xfs_trans_alloc(mp, &M_RES(mp)->tr_remove, resblks, 0, 0, &tp);
2512        if (error == -ENOSPC) {
2513                resblks = 0;
2514                error = xfs_trans_alloc(mp, &M_RES(mp)->tr_remove, 0, 0, 0,
2515                                &tp);
2516        }
2517        if (error) {
2518                ASSERT(error != -ENOSPC);
2519                goto std_return;
2520        }
2521
2522        xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT);
2523        xfs_lock_two_inodes(dp, ip, XFS_ILOCK_EXCL);
2524
2525        xfs_trans_ijoin(tp, dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
2526        xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
2527
2528        /*
2529         * If we're removing a directory perform some additional validation.
2530         */
2531        if (is_dir) {
2532                ASSERT(VFS_I(ip)->i_nlink >= 2);
2533                if (VFS_I(ip)->i_nlink != 2) {
2534                        error = -ENOTEMPTY;
2535                        goto out_trans_cancel;
2536                }
2537                if (!xfs_dir_isempty(ip)) {
2538                        error = -ENOTEMPTY;
2539                        goto out_trans_cancel;
2540                }
2541
2542                /* Drop the link from ip's "..".  */
2543                error = xfs_droplink(tp, dp);
2544                if (error)
2545                        goto out_trans_cancel;
2546
2547                /* Drop the "." link from ip to self.  */
2548                error = xfs_droplink(tp, ip);
2549                if (error)
2550                        goto out_trans_cancel;
2551        } else {
2552                /*
2553                 * When removing a non-directory we need to log the parent
2554                 * inode here.  For a directory this is done implicitly
2555                 * by the xfs_droplink call for the ".." entry.
2556                 */
2557                xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
2558        }
2559        xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2560
2561        /* Drop the link from dp to ip. */
2562        error = xfs_droplink(tp, ip);
2563        if (error)
2564                goto out_trans_cancel;
2565
2566        xfs_defer_init(&dfops, &first_block);
2567        error = xfs_dir_removename(tp, dp, name, ip->i_ino,
2568                                        &first_block, &dfops, resblks);
2569        if (error) {
2570                ASSERT(error != -ENOENT);
2571                goto out_bmap_cancel;
2572        }
2573
2574        /*
2575         * If this is a synchronous mount, make sure that the
2576         * remove transaction goes to disk before returning to
2577         * the user.
2578         */
2579        if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
2580                xfs_trans_set_sync(tp);
2581
2582        error = xfs_defer_finish(&tp, &dfops);
2583        if (error)
2584                goto out_bmap_cancel;
2585
2586        error = xfs_trans_commit(tp);
2587        if (error)
2588                goto std_return;
2589
2590        if (is_dir && xfs_inode_is_filestream(ip))
2591                xfs_filestream_deassociate(ip);
2592
2593        return 0;
2594
2595 out_bmap_cancel:
2596        xfs_defer_cancel(&dfops);
2597 out_trans_cancel:
2598        xfs_trans_cancel(tp);
2599 std_return:
2600        return error;
2601}
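
/*
 * The AGI-before-AGF rule that dictates the ordering above is the usual
 * "one global lock order" discipline.  A hedged stand-alone sketch of
 * the same idea with two pthread mutexes (not XFS code; names made up):
 *
 *	#include <pthread.h>
 *
 *	static pthread_mutex_t agi_lock = PTHREAD_MUTEX_INITIALIZER;
 *	static pthread_mutex_t agf_lock = PTHREAD_MUTEX_INITIALIZER;
 *
 *	// Every path takes agi_lock first and agf_lock second.  If any
 *	// path took them in the reverse order, two threads could each
 *	// hold one lock while waiting on the other (ABBA deadlock);
 *	// that is why the link count is dropped (AGI) before the
 *	// directory entry is removed (AGF) above.
 *	static void free_inode_then_space(void)
 *	{
 *		pthread_mutex_lock(&agi_lock);
 *		pthread_mutex_lock(&agf_lock);
 *		// ... do the work ...
 *		pthread_mutex_unlock(&agf_lock);
 *		pthread_mutex_unlock(&agi_lock);
 *	}
 */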
2602
2603/*
2604 * Enter all inodes for a rename transaction into a sorted array.
2605 */
2606#define __XFS_SORT_INODES       5
2607STATIC void
2608xfs_sort_for_rename(
2609        struct xfs_inode        *dp1,   /* in: old (source) directory inode */
2610        struct xfs_inode        *dp2,   /* in: new (target) directory inode */
2611        struct xfs_inode        *ip1,   /* in: inode of old entry */
2612        struct xfs_inode        *ip2,   /* in: inode of new entry */
2613        struct xfs_inode        *wip,   /* in: whiteout inode */
2614        struct xfs_inode        **i_tab,/* out: sorted array of inodes */
2615        int                     *num_inodes)  /* in/out: inodes in array */
2616{
2617        int                     i, j;
2618
2619        ASSERT(*num_inodes == __XFS_SORT_INODES);
2620        memset(i_tab, 0, *num_inodes * sizeof(struct xfs_inode *));
2621
2622        /*
2623         * i_tab contains a list of pointers to inodes.  We initialize
2624         * the table here & we'll sort it.  We will then use it to
2625         * order the acquisition of the inode locks.
2626         *
2627         * Note that the table may contain duplicates.  e.g., dp1 == dp2.
2628         */
2629        i = 0;
2630        i_tab[i++] = dp1;
2631        i_tab[i++] = dp2;
2632        i_tab[i++] = ip1;
2633        if (ip2)
2634                i_tab[i++] = ip2;
2635        if (wip)
2636                i_tab[i++] = wip;
2637        *num_inodes = i;
2638
2639        /*
2640         * Sort the elements via bubble sort.  (Remember, there are at
2641         * most 5 elements to sort, so this is adequate.)
2642         */
2643        for (i = 0; i < *num_inodes; i++) {
2644                for (j = 1; j < *num_inodes; j++) {
2645                        if (i_tab[j]->i_ino < i_tab[j-1]->i_ino) {
2646                                struct xfs_inode *temp = i_tab[j];
2647                                i_tab[j] = i_tab[j-1];
2648                                i_tab[j-1] = temp;
2649                        }
2650                }
2651        }
2652}
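
/*
 * A hedged sketch of how the sorted table is meant to be consumed: lock
 * each inode in ascending inode-number order, skipping duplicates (the
 * real caller uses a locking helper for this; the loop below is only
 * illustrative):
 *
 *	struct xfs_inode	*inodes[__XFS_SORT_INODES];
 *	int			num_inodes = __XFS_SORT_INODES;
 *	int			i;
 *
 *	xfs_sort_for_rename(dp1, dp2, ip1, ip2, wip, inodes, &num_inodes);
 *	for (i = 0; i < num_inodes; i++) {
 *		if (i > 0 && inodes[i] == inodes[i - 1])
 *			continue;	// duplicate (e.g. dp1 == dp2)
 *		xfs_ilock(inodes[i], XFS_ILOCK_EXCL);
 *	}
 */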
2653
2654static int
2655xfs_finish_rename(
2656        struct xfs_trans        *tp,
2657        struct xfs_defer_ops    *dfops)
2658{
2659        int                     error;
2660
2661        /*
2662         * If this is a synchronous mount, make sure that the rename transaction
2663         * goes to disk before returning to the user.
2664         */
2665        if (tp->t_mountp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
2666                xfs_trans_set_sync(tp);
2667
2668        error = xfs_defer_finish(&tp, dfops);
2669        if (error) {
2670                xfs_defer_cancel(dfops);
2671                xfs_trans_cancel(tp);
2672                return error;
2673        }
2674
2675        return xfs_trans_commit(tp);
2676}
2677
2678/*
2679 * xfs_cross_rename()
2680 *
2681 * Responsible for handling the RENAME_EXCHANGE flag in the renameat2() syscall.
2682 */
2683STATIC int
2684xfs_cross_rename(
2685        struct xfs_trans        *tp,
2686        struct xfs_inode        *dp1,
2687        struct xfs_name         *name1,
2688        struct xfs_inode        *ip1,
2689        struct xfs_inode        *dp2,
2690        struct xfs_name         *name2,
2691        struct xfs_inode        *ip2,
2692        struct xfs_defer_ops    *dfops,
2693        xfs_fsblock_t           *first_block,
2694        int                     spaceres)
2695{
2696        int             error = 0;
2697        int             ip1_flags = 0;
2698        int             ip2_flags = 0;
2699        int             dp2_flags = 0;
2700
2701        /* Swap inode number for dirent in first parent */
2702        error = xfs_dir_replace(tp, dp1, name1,
2703                                ip2->i_ino,
2704                                first_block, dfops, spaceres);
2705        if (error)
2706                goto out_trans_abort;
2707
2708        /* Swap inode number for dirent in second parent */
2709        error = xfs_dir_replace(tp, dp2, name2,
2710                                ip1->i_ino,
2711                                first_block, dfops, spaceres);
2712        if (error)
2713                goto out_trans_abort;
2714
2715        /*
2716         * If we're renaming one or more directories across different parents,
2717         * update the respective ".." entries (and link counts) to match the new
2718         * parents.
2719         */
2720        if (dp1 != dp2) {
2721                dp2_flags = XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
2722
2723                if (S_ISDIR(VFS_I(ip2)->i_mode)) {
2724                        error = xfs_dir_replace(tp, ip2, &xfs_name_dotdot,
2725                                                dp1->i_ino, first_block,
2726                                                dfops, spaceres);
2727                        if (error)
2728                                goto out_trans_abort;
2729
2730                        /* transfer ip2 ".." reference to dp1 */
2731                        if (!S_ISDIR(VFS_I(ip1)->i_mode)) {
2732                                error = xfs_droplink(tp, dp2);
2733                                if (error)
2734                                        goto out_trans_abort;
2735                                error = xfs_bumplink(tp, dp1);
2736                                if (error)
2737                                        goto out_trans_abort;
2738                        }
2739
2740                        /*
2741                         * Although ip1 isn't changed here, userspace needs
2742                         * to be notified of the change, so that applications
2743                         * relying on it (like backup tools) will properly
2744                         * detect the change.
2745                         */
2746                        ip1_flags |= XFS_ICHGTIME_CHG;
2747                        ip2_flags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
2748                }
2749
2750                if (S_ISDIR(VFS_I(ip1)->i_mode)) {
2751                        error = xfs_dir_replace(tp, ip1, &xfs_name_dotdot,
2752                                                dp2->i_ino, first_block,
2753                                                dfops, spaceres);
2754                        if (error)
2755                                goto out_trans_abort;
2756
2757                        /* transfer ip1 ".." reference to dp2 */
2758                        if (!S_ISDIR(VFS_I(ip2)->i_mode)) {
2759                                error = xfs_droplink(tp, dp1);
2760                                if (error)
2761                                        goto out_trans_abort;
2762                                error = xfs_bumplink(tp, dp2);
2763                                if (error)
2764                                        goto out_trans_abort;
2765                        }
2766
2767                        /*
2768                         * Although ip2 isn't changed here, userspace needs
2769                         * to be notified of the change, so that applications
2770                         * relying on it (like backup tools) will properly
2771                         * detect the change.
2772                         */
2773                        ip1_flags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
2774                        ip2_flags |= XFS_ICHGTIME_CHG;
2775                }
2776        }
2777
2778        if (ip1_flags) {
2779                xfs_trans_ichgtime(tp, ip1, ip1_flags);
2780                xfs_trans_log_inode(tp, ip1, XFS_ILOG_CORE);
2781        }
2782        if (ip2_flags) {
2783                xfs_trans_ichgtime(tp, ip2, ip2_flags);
2784                xfs_trans_log_inode(tp, ip2, XFS_ILOG_CORE);
2785        }
2786        if (dp2_flags) {
2787                xfs_trans_ichgtime(tp, dp2, dp2_flags);
2788                xfs_trans_log_inode(tp, dp2, XFS_ILOG_CORE);
2789        }
2790        xfs_trans_ichgtime(tp, dp1, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2791        xfs_trans_log_inode(tp, dp1, XFS_ILOG_CORE);
2792        return xfs_finish_rename(tp, dfops);
2793
2794out_trans_abort:
2795        xfs_defer_cancel(dfops);
2796        xfs_trans_cancel(tp);
2797        return error;
2798}
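
/*
 * Illustrative sketch (not kernel code): the ".." accounting performed by
 * xfs_cross_rename() above, reduced to plain arithmetic.  It assumes a
 * hypothetical exchange of a directory (ip2, child of dp2) with a regular
 * file (ip1, child of dp1) across two different parents.
 */
#if 0	/* example only -- never compiled into the kernel */
#include <stdio.h>

int main(void)
{
        /* dp1 has no subdirectories; dp2 has one (ip2) pointing back at it */
        int dp1_nlink = 2, dp2_nlink = 3;

        /* ip2's ".." now names dp1, so the back-reference moves parents */
        dp2_nlink--;            /* xfs_droplink(tp, dp2) */
        dp1_nlink++;            /* xfs_bumplink(tp, dp1) */

        printf("dp1 %d, dp2 %d\n", dp1_nlink, dp2_nlink);      /* 3, 2 */
        return 0;
}
#endif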
2799
2800/*
2801 * xfs_rename_alloc_whiteout()
2802 *
2803 * Return a referenced, unlinked, unlocked inode that can be used as a
2804 * whiteout in a rename transaction. We use a tmpfile inode here so that if we
2805 * crash between allocating the inode and linking it into the rename
2806 * transaction, recovery will free the inode and we won't leak it.
2807 */
2808static int
2809xfs_rename_alloc_whiteout(
2810        struct xfs_inode        *dp,
2811        struct xfs_inode        **wip)
2812{
2813        struct xfs_inode        *tmpfile;
2814        int                     error;
2815
2816        error = xfs_create_tmpfile(dp, NULL, S_IFCHR | WHITEOUT_MODE, &tmpfile);
2817        if (error)
2818                return error;
2819
2820        /*
2821         * Prepare the tmpfile inode as if it were created through the VFS.
2822         * Otherwise, the link increment paths will complain about nlink 0->1.
2823         * Drop the link count as done by d_tmpfile(), complete the inode setup
2824         * and flag it as linkable.
2825         */
2826        drop_nlink(VFS_I(tmpfile));
2827        xfs_setup_iops(tmpfile);
2828        xfs_finish_inode_setup(tmpfile);
2829        VFS_I(tmpfile)->i_state |= I_LINKABLE;
2830
2831        *wip = tmpfile;
2832        return 0;
2833}
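
/*
 * Illustrative sketch (not kernel code): whiteouts are requested from
 * userspace via renameat2() with RENAME_WHITEOUT, most prominently by
 * overlayfs.  A minimal caller, assuming glibc 2.28+ for the wrapper and
 * CAP_MKNOD in the calling process; the path names are placeholders.
 */
#if 0	/* example only -- never compiled into the kernel */
#define _GNU_SOURCE
#include <fcntl.h>      /* AT_FDCWD */
#include <stdio.h>      /* renameat2(), RENAME_WHITEOUT, perror() */

int main(void)
{
        /* move the name, leaving a char-device whiteout at the old one */
        if (renameat2(AT_FDCWD, "upper/file", AT_FDCWD, "upper/moved",
                      RENAME_WHITEOUT) != 0)
                perror("renameat2");
        return 0;
}
#endif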
2834
2835/*
2836 * xfs_rename
2837 */
2838int
2839xfs_rename(
2840        struct xfs_inode        *src_dp,
2841        struct xfs_name         *src_name,
2842        struct xfs_inode        *src_ip,
2843        struct xfs_inode        *target_dp,
2844        struct xfs_name         *target_name,
2845        struct xfs_inode        *target_ip,
2846        unsigned int            flags)
2847{
2848        struct xfs_mount        *mp = src_dp->i_mount;
2849        struct xfs_trans        *tp;
2850        struct xfs_defer_ops    dfops;
2851        xfs_fsblock_t           first_block;
2852        struct xfs_inode        *wip = NULL;            /* whiteout inode */
2853        struct xfs_inode        *inodes[__XFS_SORT_INODES];
2854        int                     num_inodes = __XFS_SORT_INODES;
2855        bool                    new_parent = (src_dp != target_dp);
2856        bool                    src_is_directory = S_ISDIR(VFS_I(src_ip)->i_mode);
2857        int                     spaceres;
2858        int                     error;
2859
2860        trace_xfs_rename(src_dp, target_dp, src_name, target_name);
2861
2862        if ((flags & RENAME_EXCHANGE) && !target_ip)
2863                return -EINVAL;
2864
2865        /*
2866         * If we are doing a whiteout operation, allocate the whiteout inode
2867         * we will be placing at the target and ensure the type is set
2868         * appropriately.
2869         */
2870        if (flags & RENAME_WHITEOUT) {
2871                ASSERT(!(flags & (RENAME_NOREPLACE | RENAME_EXCHANGE)));
2872                error = xfs_rename_alloc_whiteout(target_dp, &wip);
2873                if (error)
2874                        return error;
2875
2876                /* setup target dirent info as whiteout */
2877                src_name->type = XFS_DIR3_FT_CHRDEV;
2878        }
2879
2880        xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip, wip,
2881                                inodes, &num_inodes);
2882
2883        spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len);
2884        error = xfs_trans_alloc(mp, &M_RES(mp)->tr_rename, spaceres, 0, 0, &tp);
2885        if (error == -ENOSPC) {
2886                spaceres = 0;
2887                error = xfs_trans_alloc(mp, &M_RES(mp)->tr_rename, 0, 0, 0,
2888                                &tp);
2889        }
2890        if (error)
2891                goto out_release_wip;
2892
2893        /*
2894         * Attach the dquots to the inodes
2895         */
2896        error = xfs_qm_vop_rename_dqattach(inodes);
2897        if (error)
2898                goto out_trans_cancel;
2899
2900        /*
2901         * Lock all the participating inodes. Depending upon whether
2902         * the target_name exists in the target directory, and
2903         * whether the target directory is the same as the source
2904         * directory, we can lock from 2 to 4 inodes.
2905         */
2906        if (!new_parent)
2907                xfs_ilock(src_dp, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT);
2908        else
2909                xfs_lock_two_inodes(src_dp, target_dp,
2910                                    XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT);
2911
2912        xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL);
2913
2914        /*
2915         * Join all the inodes to the transaction. From this point on,
2916         * we can rely on either trans_commit or trans_cancel to unlock
2917         * them.
2918         */
2919        xfs_trans_ijoin(tp, src_dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
2920        if (new_parent)
2921                xfs_trans_ijoin(tp, target_dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
2922        xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL);
2923        if (target_ip)
2924                xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL);
2925        if (wip)
2926                xfs_trans_ijoin(tp, wip, XFS_ILOCK_EXCL);
2927
2928        /*
2929         * If we are using project inheritance, we only allow renames
2930         * into our tree when the project IDs are the same; else the
2931         * tree quota mechanism would be circumvented.
2932         */
2933        if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
2934                     (xfs_get_projid(target_dp) != xfs_get_projid(src_ip)))) {
2935                error = -EXDEV;
2936                goto out_trans_cancel;
2937        }
2938
2939        xfs_defer_init(&dfops, &first_block);
2940
2941        /* RENAME_EXCHANGE is unique from here on. */
2942        if (flags & RENAME_EXCHANGE)
2943                return xfs_cross_rename(tp, src_dp, src_name, src_ip,
2944                                        target_dp, target_name, target_ip,
2945                                        &dfops, &first_block, spaceres);
2946
2947        /*
2948         * Set up the target.
2949         */
2950        if (target_ip == NULL) {
2951                /*
2952                 * If there's no space reservation, check the entry will
2953                 * fit before actually inserting it.
2954                 */
2955                if (!spaceres) {
2956                        error = xfs_dir_canenter(tp, target_dp, target_name);
2957                        if (error)
2958                                goto out_trans_cancel;
2959                }
2960                /*
2961                 * If target does not exist and the rename crosses
2962                 * directories, adjust the target directory link count
2963                 * to account for the ".." reference from the new entry.
2964                 */
2965                error = xfs_dir_createname(tp, target_dp, target_name,
2966                                                src_ip->i_ino, &first_block,
2967                                                &dfops, spaceres);
2968                if (error)
2969                        goto out_bmap_cancel;
2970
2971                xfs_trans_ichgtime(tp, target_dp,
2972                                        XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2973
2974                if (new_parent && src_is_directory) {
2975                        error = xfs_bumplink(tp, target_dp);
2976                        if (error)
2977                                goto out_bmap_cancel;
2978                }
2979        } else { /* target_ip != NULL */
2980                /*
2981                 * If target exists and it's a directory, check that both
2982                 * target and source are directories and that target can be
2983                 * destroyed, or that neither is a directory.
2984                 */
2985                if (S_ISDIR(VFS_I(target_ip)->i_mode)) {
2986                        /*
2987                         * Make sure target dir is empty.
2988                         */
2989                        if (!(xfs_dir_isempty(target_ip)) ||
2990                            (VFS_I(target_ip)->i_nlink > 2)) {
2991                                error = -EEXIST;
2992                                goto out_trans_cancel;
2993                        }
2994                }
2995
2996                /*
2997                 * Link the source inode under the target name.
2998                 * If the source inode is a directory and we are moving
2999                 * it across directories, its ".." entry will be
3000                 * inconsistent until we replace that down below.
3001                 *
3002                 * In case there is already an entry with the same
3003                 * name at the destination directory, remove it first.
3004                 */
3005                error = xfs_dir_replace(tp, target_dp, target_name,
3006                                        src_ip->i_ino,
3007                                        &first_block, &dfops, spaceres);
3008                if (error)
3009                        goto out_bmap_cancel;
3010
3011                xfs_trans_ichgtime(tp, target_dp,
3012                                        XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
3013
3014                /*
3015                 * Decrement the link count on the target since the target
3016                 * dir no longer points to it.
3017                 */
3018                error = xfs_droplink(tp, target_ip);
3019                if (error)
3020                        goto out_bmap_cancel;
3021
3022                if (src_is_directory) {
3023                        /*
3024                         * Drop the link from the old "." entry.
3025                         */
3026                        error = xfs_droplink(tp, target_ip);
3027                        if (error)
3028                                goto out_bmap_cancel;
3029                }
3030        } /* target_ip != NULL */
3031
3032        /*
3033         * Remove the source.
3034         */
3035        if (new_parent && src_is_directory) {
3036                /*
3037                 * Rewrite the ".." entry to point to the new
3038                 * directory.
3039                 */
3040                error = xfs_dir_replace(tp, src_ip, &xfs_name_dotdot,
3041                                        target_dp->i_ino,
3042                                        &first_block, &dfops, spaceres);
3043                ASSERT(error != -EEXIST);
3044                if (error)
3045                        goto out_bmap_cancel;
3046        }
3047
3048        /*
3049         * We always want to hit the ctime on the source inode.
3050         *
3051         * This isn't strictly required by the standards since the source
3052         * inode isn't really being changed, but old unix file systems did
3053         * it and some incremental backup programs won't work without it.
3054         */
3055        xfs_trans_ichgtime(tp, src_ip, XFS_ICHGTIME_CHG);
3056        xfs_trans_log_inode(tp, src_ip, XFS_ILOG_CORE);
3057
3058        /*
3059         * Adjust the link count on src_dp.  This is necessary when
3060         * renaming a directory, either within one parent when
3061         * the target existed, or across two parent directories.
3062         */
3063        if (src_is_directory && (new_parent || target_ip != NULL)) {
3064
3065                /*
3066                 * Decrement link count on src_directory since the
3067                 * entry that's moved no longer points to it.
3068                 */
3069                error = xfs_droplink(tp, src_dp);
3070                if (error)
3071                        goto out_bmap_cancel;
3072        }
3073
3074        /*
3075         * For whiteouts, we only need to update the source dirent with the
3076         * inode number of the whiteout inode rather than removing it
3077         * altogether.
3078         */
3079        if (wip) {
3080                error = xfs_dir_replace(tp, src_dp, src_name, wip->i_ino,
3081                                        &first_block, &dfops, spaceres);
3082        } else
3083                error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino,
3084                                           &first_block, &dfops, spaceres);
3085        if (error)
3086                goto out_bmap_cancel;
3087
3088        /*
3089         * For whiteouts, we need to bump the link count on the whiteout inode.
3090         * This means that failures all the way up to this point leave the inode
3091         * on the unlinked list and so cleanup is a simple matter of dropping
3092         * the remaining reference to it. If we fail here after bumping the link
3093         * count, we're shutting down the filesystem so we'll never see the
3094         * intermediate state on disk.
3095         */
3096        if (wip) {
3097                ASSERT(VFS_I(wip)->i_nlink == 0);
3098                error = xfs_bumplink(tp, wip);
3099                if (error)
3100                        goto out_bmap_cancel;
3101                error = xfs_iunlink_remove(tp, wip);
3102                if (error)
3103                        goto out_bmap_cancel;
3104                xfs_trans_log_inode(tp, wip, XFS_ILOG_CORE);
3105
3106                /*
3107                 * Now we have a real link, clear the "I'm a tmpfile" state
3108                 * flag from the inode so it doesn't accidentally get misused in
3109                 * future.
3110         * the future.
3111                VFS_I(wip)->i_state &= ~I_LINKABLE;
3112        }
3113
3114        xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
3115        xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE);
3116        if (new_parent)
3117                xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE);
3118
3119        error = xfs_finish_rename(tp, &dfops);
3120        if (wip)
3121                IRELE(wip);
3122        return error;
3123
3124out_bmap_cancel:
3125        xfs_defer_cancel(&dfops);
3126out_trans_cancel:
3127        xfs_trans_cancel(tp);
3128out_release_wip:
3129        if (wip)
3130                IRELE(wip);
3131        return error;
3132}
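
/*
 * Illustrative sketch (not kernel code): the flag combinations dispatched
 * by xfs_rename() above map onto renameat2() from userspace.  A minimal
 * demonstration, assuming glibc 2.28+ and placeholder path names:
 */
#if 0	/* example only -- never compiled into the kernel */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>

int main(void)
{
        /* fail with EEXIST instead of replacing an existing target */
        if (renameat2(AT_FDCWD, "src", AT_FDCWD, "dst",
                      RENAME_NOREPLACE) != 0)
                perror("RENAME_NOREPLACE");

        /* atomically swap two existing names (the xfs_cross_rename() path) */
        if (renameat2(AT_FDCWD, "src", AT_FDCWD, "dst",
                      RENAME_EXCHANGE) != 0)
                perror("RENAME_EXCHANGE");
        return 0;
}
#endif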
3133
3134STATIC int
3135xfs_iflush_cluster(
3136        struct xfs_inode        *ip,
3137        struct xfs_buf          *bp)
3138{
3139        struct xfs_mount        *mp = ip->i_mount;
3140        struct xfs_perag        *pag;
3141        unsigned long           first_index, mask;
3142        unsigned long           inodes_per_cluster;
3143        int                     cilist_size;
3144        struct xfs_inode        **cilist;
3145        struct xfs_inode        *cip;
3146        int                     nr_found;
3147        int                     clcount = 0;
3148        int                     bufwasdelwri;
3149        int                     i;
3150
3151        pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
3152
3153        inodes_per_cluster = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog;
3154        cilist_size = inodes_per_cluster * sizeof(xfs_inode_t *);
3155        cilist = kmem_alloc(cilist_size, KM_MAYFAIL|KM_NOFS);
3156        if (!cilist)
3157                goto out_put;
3158
3159        mask = ~(inodes_per_cluster - 1);
3160        first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask;
3161        rcu_read_lock();
3162        /* really need a gang lookup range call here */
3163        nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)cilist,
3164                                        first_index, inodes_per_cluster);
3165        if (nr_found == 0)
3166                goto out_free;
3167
3168        for (i = 0; i < nr_found; i++) {
3169                cip = cilist[i];
3170                if (cip == ip)
3171                        continue;
3172
3173                /*
3174                 * because this is an RCU protected lookup, we could find a
3175                 * recently freed or even reallocated inode during the lookup.
3176                 * We need to check under the i_flags_lock for a valid inode
3177                 * here. Skip it if it is not valid or the wrong inode.
3178                 */
3179                spin_lock(&cip->i_flags_lock);
3180                if (!cip->i_ino ||
3181                    __xfs_iflags_test(cip, XFS_ISTALE)) {
3182                        spin_unlock(&cip->i_flags_lock);
3183                        continue;
3184                }
3185
3186                /*
3187                 * Once we fall off the end of the cluster, no point checking
3188                 * any more inodes in the list because they will also all be
3189                 * outside the cluster.
3190                 */
3191                if ((XFS_INO_TO_AGINO(mp, cip->i_ino) & mask) != first_index) {
3192                        spin_unlock(&cip->i_flags_lock);
3193                        break;
3194                }
3195                spin_unlock(&cip->i_flags_lock);
3196
3197                /*
3198                 * Do an un-protected check to see if the inode is dirty and
3199                 * is a candidate for flushing.  These checks will be repeated
3200                 * later after the appropriate locks are acquired.
3201                 */
3202                if (xfs_inode_clean(cip) && xfs_ipincount(cip) == 0)
3203                        continue;
3204
3205                /*
3206                 * Try to get locks.  If any are unavailable or it is pinned,
3207                 * then this inode cannot be flushed and is skipped.
3208                 */
3209
3210                if (!xfs_ilock_nowait(cip, XFS_ILOCK_SHARED))
3211                        continue;
3212                if (!xfs_iflock_nowait(cip)) {
3213                        xfs_iunlock(cip, XFS_ILOCK_SHARED);
3214                        continue;
3215                }
3216                if (xfs_ipincount(cip)) {
3217                        xfs_ifunlock(cip);
3218                        xfs_iunlock(cip, XFS_ILOCK_SHARED);
3219                        continue;
3220                }
3221
3223                /*
3224                 * Check the inode number again, just to be certain we are not
3225                 * racing with freeing in xfs_reclaim_inode(). See the comments
3226                 * in that function for more information as to why the initial
3227                 * check is not sufficient.
3228                 */
3229                if (!cip->i_ino) {
3230                        xfs_ifunlock(cip);
3231                        xfs_iunlock(cip, XFS_ILOCK_SHARED);
3232                        continue;
3233                }
3234
3235                /*
3236                 * arriving here means that this inode can be flushed.  First
3237                 * re-check that it's dirty before flushing.
3238                 */
3239                if (!xfs_inode_clean(cip)) {
3240                        int     error;
3241                        error = xfs_iflush_int(cip, bp);
3242                        if (error) {
3243                                xfs_iunlock(cip, XFS_ILOCK_SHARED);
3244                                goto cluster_corrupt_out;
3245                        }
3246                        clcount++;
3247                } else {
3248                        xfs_ifunlock(cip);
3249                }
3250                xfs_iunlock(cip, XFS_ILOCK_SHARED);
3251        }
3252
3253        if (clcount) {
3254                XFS_STATS_INC(mp, xs_icluster_flushcnt);
3255                XFS_STATS_ADD(mp, xs_icluster_flushinode, clcount);
3256        }
3257
3258out_free:
3259        rcu_read_unlock();
3260        kmem_free(cilist);
3261out_put:
3262        xfs_perag_put(pag);
3263        return 0;
3264
3265
3266cluster_corrupt_out:
3267        /*
3268         * Corruption detected in the clustering loop.  Invalidate the
3269         * inode buffer and shut down the filesystem.
3270         */
3271        rcu_read_unlock();
3272        /*
3273         * Clean up the buffer.  If it was delwri, just release it --
3274         * brelse can handle it with no problems.  If not, shut down the
3275         * filesystem before releasing the buffer.
3276         */
3277        bufwasdelwri = (bp->b_flags & _XBF_DELWRI_Q);
3278        if (bufwasdelwri)
3279                xfs_buf_relse(bp);
3280
3281        xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
3282
3283        if (!bufwasdelwri) {
3284                /*
3285                 * Just like incore_relse: if we have b_iodone functions,
3286                 * mark the buffer as an error and call them.  Otherwise
3287                 * mark it as stale and brelse.
3288                 */
3289                if (bp->b_iodone) {
3290                        bp->b_flags &= ~XBF_DONE;
3291                        xfs_buf_stale(bp);
3292                        xfs_buf_ioerror(bp, -EIO);
3293                        xfs_buf_ioend(bp);
3294                } else {
3295                        xfs_buf_stale(bp);
3296                        xfs_buf_relse(bp);
3297                }
3298        }
3299
3300        /*
3301         * Unlocks the flush lock
3302         */
3303        xfs_iflush_abort(cip, false);
3304        kmem_free(cilist);
3305        xfs_perag_put(pag);
3306        return -EFSCORRUPTED;
3307}
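
/*
 * Illustrative sketch (not kernel code): the cluster index arithmetic used
 * in xfs_iflush_cluster() above, with an assumed geometry of 8192-byte
 * inode clusters holding 256-byte inodes (so sb_inodelog == 8).
 */
#if 0	/* example only -- never compiled into the kernel */
#include <stdio.h>

int main(void)
{
        unsigned long inode_cluster_size = 8192;        /* assumed */
        unsigned long inodelog = 8;                     /* log2(256) */
        unsigned long inodes_per_cluster = inode_cluster_size >> inodelog;
        unsigned long mask = ~(inodes_per_cluster - 1);
        unsigned long agino = 100;      /* hypothetical AG-relative inode */

        /* 32 inodes per cluster; 100 & ~31 == 96, the cluster's first inode */
        printf("first_index = %lu\n", agino & mask);
        return 0;
}
#endif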
3308
3309/*
3310 * Flush dirty inode metadata into the backing buffer.
3311 *
3312 * The caller must have the inode lock and the inode flush lock held.  The
3313 * inode lock will still be held upon return to the caller, and the inode
3314 * flush lock will be released after the inode has reached the disk.
3315 *
3316 * The caller must write out the buffer returned in *bpp and release it.
3317 */
3318int
3319xfs_iflush(
3320        struct xfs_inode        *ip,
3321        struct xfs_buf          **bpp)
3322{
3323        struct xfs_mount        *mp = ip->i_mount;
3324        struct xfs_buf          *bp = NULL;
3325        struct xfs_dinode       *dip;
3326        int                     error;
3327
3328        XFS_STATS_INC(mp, xs_iflush_count);
3329
3330        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
3331        ASSERT(xfs_isiflocked(ip));
3332        ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
3333               ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
3334
3335        *bpp = NULL;
3336
3337        xfs_iunpin_wait(ip);
3338
3339        /*
3340         * For stale inodes we cannot rely on the backing buffer remaining
3341         * stale in cache for the remaining life of the stale inode and so
3342         * xfs_imap_to_bp() below may give us a buffer that no longer contains
3343         * inodes. We have to check this after ensuring the inode is
3344         * unpinned so that it is safe to reclaim the stale inode after the
3345         * flush call.
3346         */
3347        if (xfs_iflags_test(ip, XFS_ISTALE)) {
3348                xfs_ifunlock(ip);
3349                return 0;
3350        }
3351
3352        /*
3353         * This may have been unpinned because the filesystem is shutting
3354         * down forcibly. If that's the case we must not write this inode
3355         * to disk, because the log record didn't make it to disk.
3356         *
3357         * We also have to remove the log item from the AIL in this case,
3358         * as we wait for an empty AIL as part of the unmount process.
3359         */
3360        if (XFS_FORCED_SHUTDOWN(mp)) {
3361                error = -EIO;
3362                goto abort_out;
3363        }
3364
3365        /*
3366         * Get the buffer containing the on-disk inode. We are doing a try-lock
3367         * operation here, so we may get an EAGAIN error. In that case, we
3368         * simply want to return with the inode still dirty.
3369         *
3370         * If we get any other error, we effectively have a corruption situation
3371         * and we cannot flush the inode, so we treat it the same as failing
3372         * xfs_iflush_int().
3373         */
3374        error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &bp, XBF_TRYLOCK,
3375                               0);
3376        if (error == -EAGAIN) {
3377                xfs_ifunlock(ip);
3378                return error;
3379        }
3380        if (error)
3381                goto corrupt_out;
3382
3383        /*
3384         * First flush out the inode that xfs_iflush was called with.
3385         */
3386        error = xfs_iflush_int(ip, bp);
3387        if (error)
3388                goto corrupt_out;
3389
3390        /*
3391         * If the buffer is pinned then push on the log now so we won't
3392         * get stuck waiting in the write for too long.
3393         */
3394        if (xfs_buf_ispinned(bp))
3395                xfs_log_force(mp, 0);
3396
3397        /*
3398         * inode clustering:
3399         * see if other inodes can be gathered into this write
3400         */
3401        error = xfs_iflush_cluster(ip, bp);
3402        if (error)
3403                goto cluster_corrupt_out;
3404
3405        *bpp = bp;
3406        return 0;
3407
3408corrupt_out:
3409        if (bp)
3410                xfs_buf_relse(bp);
3411        xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
3412cluster_corrupt_out:
3413        error = -EFSCORRUPTED;
3414abort_out:
3415        /*
3416         * Unlocks the flush lock
3417         */
3418        xfs_iflush_abort(ip, false);
3419        return error;
3420}
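
/*
 * Illustrative sketch (a fragment, not compilable on its own): the caller
 * contract for xfs_iflush() above, modeled loosely on the inode log item
 * push path.  "ip" and "buffer_list" stand in for the caller's state.
 */
#if 0	/* example only */
        struct xfs_buf  *bp;
        int             error;

        error = xfs_iflush(ip, &bp);
        if (!error) {
                /* queue the returned buffer for delayed write-out ... */
                xfs_buf_delwri_queue(bp, buffer_list);
                /* ... and drop our reference; delwri submission writes it */
                xfs_buf_relse(bp);
        }
#endif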
3421
3422STATIC int
3423xfs_iflush_int(
3424        struct xfs_inode        *ip,
3425        struct xfs_buf          *bp)
3426{
3427        struct xfs_inode_log_item *iip = ip->i_itemp;
3428        struct xfs_dinode       *dip;
3429        struct xfs_mount        *mp = ip->i_mount;
3430
3431        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
3432        ASSERT(xfs_isiflocked(ip));
3433        ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
3434               ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
3435        ASSERT(iip != NULL && iip->ili_fields != 0);
3436        ASSERT(ip->i_d.di_version > 1);
3437
3438        /* set *dip = inode's place in the buffer */
3439        dip = xfs_buf_offset(bp, ip->i_imap.im_boffset);
3440
3441        if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC),
3442                               mp, XFS_ERRTAG_IFLUSH_1)) {
3443                xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
3444                        "%s: Bad inode %Lu magic number 0x%x, ptr 0x%p",
3445                        __func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip);
3446                goto corrupt_out;
3447        }
3448        if (S_ISREG(VFS_I(ip)->i_mode)) {
3449                if (XFS_TEST_ERROR(
3450                    (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) &&
3451                    (ip->i_d.di_format != XFS_DINODE_FMT_BTREE),
3452                    mp, XFS_ERRTAG_IFLUSH_3)) {
3453                        xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
3454                                "%s: Bad regular inode %Lu, ptr 0x%p",
3455                                __func__, ip->i_ino, ip);
3456                        goto corrupt_out;
3457                }
3458        } else if (S_ISDIR(VFS_I(ip)->i_mode)) {
3459                if (XFS_TEST_ERROR(
3460                    (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) &&
3461                    (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) &&
3462                    (ip->i_d.di_format != XFS_DINODE_FMT_LOCAL),
3463                    mp, XFS_ERRTAG_IFLUSH_4)) {
3464                        xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
3465                                "%s: Bad directory inode %Lu, ptr 0x%p",
3466                                __func__, ip->i_ino, ip);
3467                        goto corrupt_out;
3468                }
3469        }
3470        if (XFS_TEST_ERROR(ip->i_d.di_nextents + ip->i_d.di_anextents >
3471                                ip->i_d.di_nblocks, mp, XFS_ERRTAG_IFLUSH_5)) {
3472                xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
3473                        "%s: detected corrupt incore inode %Lu, "
3474                        "total extents = %d, nblocks = %Ld, ptr 0x%p",
3475                        __func__, ip->i_ino,
3476                        ip->i_d.di_nextents + ip->i_d.di_anextents,
3477                        ip->i_d.di_nblocks, ip);
3478                goto corrupt_out;
3479        }
3480        if (XFS_TEST_ERROR(ip->i_d.di_forkoff > mp->m_sb.sb_inodesize,
3481                                mp, XFS_ERRTAG_IFLUSH_6)) {
3482                xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
3483                        "%s: bad inode %Lu, forkoff 0x%x, ptr 0x%p",
3484                        __func__, ip->i_ino, ip->i_d.di_forkoff, ip);
3485                goto corrupt_out;
3486        }
3487
3488        /*
3489         * Inode item log recovery for v2 inodes is dependent on the
3490         * di_flushiter count for correct sequencing. We bump the flush
3491         * iteration count so we can detect flushes which postdate a log record
3492         * during recovery. This is redundant as we now log every change and
3493         * hence this can't happen, but we still need to do it to ensure
3494         * backwards compatibility with old kernels that predate logging all
3495         * inode changes.
3496         */
3497        if (ip->i_d.di_version < 3)
3498                ip->i_d.di_flushiter++;
3499
3500        /* Check the inline directory data. */
3501        if (S_ISDIR(VFS_I(ip)->i_mode) &&
3502            ip->i_d.di_format == XFS_DINODE_FMT_LOCAL &&
3503            xfs_dir2_sf_verify(ip))
3504                goto corrupt_out;
3505
3506        /*
3507         * Copy the dirty parts of the inode into the on-disk inode.  We always
3508         * copy out the core of the inode, because if the inode is dirty at all
3509         * the core must be.
3510         */
3511        xfs_inode_to_disk(ip, dip, iip->ili_item.li_lsn);
3512
3513        /* Wrap, we never let the log put out DI_MAX_FLUSH */
3514        if (ip->i_d.di_flushiter == DI_MAX_FLUSH)
3515                ip->i_d.di_flushiter = 0;
3516
3517        xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK);
3518        if (XFS_IFORK_Q(ip))
3519                xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK);
3520        xfs_inobp_check(mp, bp);
3521
3522        /*
3523         * We've recorded everything logged in the inode, so we'd like to clear
3524         * the ili_fields bits so we don't log and flush things unnecessarily.
3525         * However, we can't stop logging all this information until the data
3526         * we've copied into the disk buffer is written to disk.  If we did we
3527         * might overwrite the copy of the inode in the log with all the data
3528         * after re-logging only part of it, and in the face of a crash we
3529         * wouldn't have all the data we need to recover.
3530         *
3531         * What we do is move the bits to the ili_last_fields field.  When
3532         * logging the inode, these bits are moved back to the ili_fields field.
3533         * In the xfs_iflush_done() routine we clear ili_last_fields, since we
3534         * know that the information those bits represent is permanently on
3535         * disk.  As long as the flush completes before the inode is logged
3536         * again, then both ili_fields and ili_last_fields will be cleared.
3537         *
3538         * We can play with the ili_fields bits here, because the inode lock
3539         * must be held exclusively in order to set bits there and the flush
3540         * lock protects the ili_last_fields bits.  Set ili_logged so the flush
3541         * done routine can tell whether or not to look in the AIL.  Also, store
3542         * the current LSN of the inode so that we can tell whether the item has
3543         * moved in the AIL from xfs_iflush_done().  In order to read the lsn we
3544         * need the AIL lock, because it is a 64 bit value that cannot be read
3545         * atomically.
3546         */
3547        iip->ili_last_fields = iip->ili_fields;
3548        iip->ili_fields = 0;
3549        iip->ili_fsync_fields = 0;
3550        iip->ili_logged = 1;
3551
3552        xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
3553                                &iip->ili_item.li_lsn);
3554
3555        /*
3556         * Attach the function xfs_iflush_done to the inode's
3557         * buffer.  This will remove the inode from the AIL
3558         * and unlock the inode's flush lock when the inode is
3559         * completely written to disk.
3560         */
3561        xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item);
3562
3563        /* generate the checksum. */
3564        xfs_dinode_calc_crc(mp, dip);
3565
3566        ASSERT(bp->b_fspriv != NULL);
3567        ASSERT(bp->b_iodone != NULL);
3568        return 0;
3569
3570corrupt_out:
3571        return -EFSCORRUPTED;
3572}
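
/*
 * Illustrative sketch (not kernel code): the ili_fields/ili_last_fields
 * handoff described in xfs_iflush_int() above, reduced to two plain
 * variables and a hypothetical dirty-bit value.
 */
#if 0	/* example only -- never compiled into the kernel */
#include <stdio.h>

int main(void)
{
        unsigned int ili_fields = 0x5;  /* bits logged since the last flush */
        unsigned int ili_last_fields = 0;

        /* xfs_iflush_int(): hand the bits over to the in-flight flush */
        ili_last_fields = ili_fields;
        ili_fields = 0;

        /* relogging the inode while the flush is in flight moves them back */
        ili_fields |= ili_last_fields;

        /* xfs_iflush_done(): the flushed bits are now permanently on disk */
        ili_last_fields = 0;

        printf("fields 0x%x, last 0x%x\n", ili_fields, ili_last_fields);
        return 0;
}
#endif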
3573