linux/fs/xfs/xfs_icache.c
   1/*
   2 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
   3 * All Rights Reserved.
   4 *
   5 * This program is free software; you can redistribute it and/or
   6 * modify it under the terms of the GNU General Public License as
   7 * published by the Free Software Foundation.
   8 *
   9 * This program is distributed in the hope that it would be useful,
  10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12 * GNU General Public License for more details.
  13 *
  14 * You should have received a copy of the GNU General Public License
  15 * along with this program; if not, write the Free Software Foundation,
  16 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  17 */
  18#include "xfs.h"
  19#include "xfs_fs.h"
  20#include "xfs_format.h"
  21#include "xfs_log_format.h"
  22#include "xfs_trans_resv.h"
  23#include "xfs_sb.h"
  24#include "xfs_mount.h"
  25#include "xfs_inode.h"
  26#include "xfs_error.h"
  27#include "xfs_trans.h"
  28#include "xfs_trans_priv.h"
  29#include "xfs_inode_item.h"
  30#include "xfs_quota.h"
  31#include "xfs_trace.h"
  32#include "xfs_icache.h"
  33#include "xfs_bmap_util.h"
  34#include "xfs_dquot_item.h"
  35#include "xfs_dquot.h"
  36
  37#include <linux/kthread.h>
  38#include <linux/freezer.h>
  39
  40STATIC void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp,
  41                                struct xfs_perag *pag, struct xfs_inode *ip);
  42
  43/*
  44 * Allocate and initialise an xfs_inode.
  45 */
  46struct xfs_inode *
  47xfs_inode_alloc(
  48        struct xfs_mount        *mp,
  49        xfs_ino_t               ino)
  50{
  51        struct xfs_inode        *ip;
  52
  53        /*
   54         * If this didn't occur in transaction context, we could use
   55         * KM_MAYFAIL and return NULL here on ENOMEM. The code below
   56         * is set up to handle a NULL return anyway.
  57         */
  58        ip = kmem_zone_alloc(xfs_inode_zone, KM_SLEEP);
  59        if (!ip)
  60                return NULL;
  61        if (inode_init_always(mp->m_super, VFS_I(ip))) {
  62                kmem_zone_free(xfs_inode_zone, ip);
  63                return NULL;
  64        }
  65
  66        XFS_STATS_INC(vn_active);
  67        ASSERT(atomic_read(&ip->i_pincount) == 0);
  68        ASSERT(!spin_is_locked(&ip->i_flags_lock));
  69        ASSERT(!xfs_isiflocked(ip));
  70        ASSERT(ip->i_ino == 0);
  71
  72        mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
  73
  74        /* initialise the xfs inode */
  75        ip->i_ino = ino;
  76        ip->i_mount = mp;
  77        memset(&ip->i_imap, 0, sizeof(struct xfs_imap));
  78        ip->i_afp = NULL;
  79        memset(&ip->i_df, 0, sizeof(xfs_ifork_t));
  80        ip->i_flags = 0;
  81        ip->i_delayed_blks = 0;
  82        memset(&ip->i_d, 0, sizeof(xfs_icdinode_t));
  83
  84        return ip;
  85}
  86
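/*
 * RCU callback that performs the final free of the struct xfs_inode once the
 * grace period has expired, so that concurrent RCU-protected cache lookups
 * can never dereference freed memory.
 */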
  87STATIC void
  88xfs_inode_free_callback(
  89        struct rcu_head         *head)
  90{
  91        struct inode            *inode = container_of(head, struct inode, i_rcu);
  92        struct xfs_inode        *ip = XFS_I(inode);
  93
  94        kmem_zone_free(xfs_inode_zone, ip);
  95}
  96
  97void
  98xfs_inode_free(
  99        struct xfs_inode        *ip)
 100{
 101        switch (ip->i_d.di_mode & S_IFMT) {
 102        case S_IFREG:
 103        case S_IFDIR:
 104        case S_IFLNK:
 105                xfs_idestroy_fork(ip, XFS_DATA_FORK);
 106                break;
 107        }
 108
 109        if (ip->i_afp)
 110                xfs_idestroy_fork(ip, XFS_ATTR_FORK);
 111
 112        if (ip->i_itemp) {
 113                ASSERT(!(ip->i_itemp->ili_item.li_flags & XFS_LI_IN_AIL));
 114                xfs_inode_item_destroy(ip);
 115                ip->i_itemp = NULL;
 116        }
 117
 118        /*
 119         * Because we use RCU freeing we need to ensure the inode always
 120         * appears to be reclaimed with an invalid inode number when in the
 121         * free state. The ip->i_flags_lock provides the barrier against lookup
 122         * races.
 123         */
 124        spin_lock(&ip->i_flags_lock);
 125        ip->i_flags = XFS_IRECLAIM;
 126        ip->i_ino = 0;
 127        spin_unlock(&ip->i_flags_lock);
 128
 129        /* asserts to verify all state is correct here */
 130        ASSERT(atomic_read(&ip->i_pincount) == 0);
 131        ASSERT(!xfs_isiflocked(ip));
 132        XFS_STATS_DEC(vn_active);
 133
 134        call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
 135}
 136
 137/*
  138 * Check the validity of the inode we just found in the cache
 139 */
 140static int
 141xfs_iget_cache_hit(
 142        struct xfs_perag        *pag,
 143        struct xfs_inode        *ip,
 144        xfs_ino_t               ino,
 145        int                     flags,
 146        int                     lock_flags) __releases(RCU)
 147{
 148        struct inode            *inode = VFS_I(ip);
 149        struct xfs_mount        *mp = ip->i_mount;
 150        int                     error;
 151
 152        /*
 153         * check for re-use of an inode within an RCU grace period due to the
 154         * radix tree nodes not being updated yet. We monitor for this by
 155         * setting the inode number to zero before freeing the inode structure.
 156         * If the inode has been reallocated and set up, then the inode number
 157         * will not match, so check for that, too.
 158         */
 159        spin_lock(&ip->i_flags_lock);
 160        if (ip->i_ino != ino) {
 161                trace_xfs_iget_skip(ip);
 162                XFS_STATS_INC(xs_ig_frecycle);
 163                error = -EAGAIN;
 164                goto out_error;
 165        }
 166
 167
 168        /*
 169         * If we are racing with another cache hit that is currently
 170         * instantiating this inode or currently recycling it out of
  171         * reclaimable state, wait for the initialisation to complete
 172         * before continuing.
 173         *
 174         * XXX(hch): eventually we should do something equivalent to
 175         *           wait_on_inode to wait for these flags to be cleared
 176         *           instead of polling for it.
 177         */
 178        if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) {
 179                trace_xfs_iget_skip(ip);
 180                XFS_STATS_INC(xs_ig_frecycle);
 181                error = -EAGAIN;
 182                goto out_error;
 183        }
 184
 185        /*
 186         * If lookup is racing with unlink return an error immediately.
 187         */
 188        if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
 189                error = -ENOENT;
 190                goto out_error;
 191        }
 192
 193        /*
 194         * If IRECLAIMABLE is set, we've torn down the VFS inode already.
  195         * We need to carefully get it back into a usable state.
 196         */
 197        if (ip->i_flags & XFS_IRECLAIMABLE) {
 198                trace_xfs_iget_reclaim(ip);
 199
 200                /*
 201                 * We need to set XFS_IRECLAIM to prevent xfs_reclaim_inode
 202                 * from stomping over us while we recycle the inode.  We can't
 203                 * clear the radix tree reclaimable tag yet as it requires
 204                 * pag_ici_lock to be held exclusive.
 205                 */
 206                ip->i_flags |= XFS_IRECLAIM;
 207
 208                spin_unlock(&ip->i_flags_lock);
 209                rcu_read_unlock();
 210
 211                error = inode_init_always(mp->m_super, inode);
 212                if (error) {
 213                        /*
 214                         * Re-initializing the inode failed, and we are in deep
 215                         * trouble.  Try to re-add it to the reclaim list.
 216                         */
 217                        rcu_read_lock();
 218                        spin_lock(&ip->i_flags_lock);
 219
 220                        ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM);
 221                        ASSERT(ip->i_flags & XFS_IRECLAIMABLE);
 222                        trace_xfs_iget_reclaim_fail(ip);
 223                        goto out_error;
 224                }
 225
 226                spin_lock(&pag->pag_ici_lock);
 227                spin_lock(&ip->i_flags_lock);
 228
 229                /*
 230                 * Clear the per-lifetime state in the inode as we are now
 231                 * effectively a new inode and need to return to the initial
 232                 * state before reuse occurs.
 233                 */
 234                ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS;
 235                ip->i_flags |= XFS_INEW;
 236                __xfs_inode_clear_reclaim_tag(mp, pag, ip);
 237                inode->i_state = I_NEW;
 238
 239                ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
 240                mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
 241
 242                spin_unlock(&ip->i_flags_lock);
 243                spin_unlock(&pag->pag_ici_lock);
 244        } else {
 245                /* If the VFS inode is being torn down, pause and try again. */
 246                if (!igrab(inode)) {
 247                        trace_xfs_iget_skip(ip);
 248                        error = -EAGAIN;
 249                        goto out_error;
 250                }
 251
 252                /* We've got a live one. */
 253                spin_unlock(&ip->i_flags_lock);
 254                rcu_read_unlock();
 255                trace_xfs_iget_hit(ip);
 256        }
 257
 258        if (lock_flags != 0)
 259                xfs_ilock(ip, lock_flags);
 260
 261        xfs_iflags_clear(ip, XFS_ISTALE | XFS_IDONTCACHE);
 262        XFS_STATS_INC(xs_ig_found);
 263
 264        return 0;
 265
 266out_error:
 267        spin_unlock(&ip->i_flags_lock);
 268        rcu_read_unlock();
 269        return error;
 270}
 271
 272
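/*
 * Cache miss: allocate a new in-core inode, read it in from disk and insert
 * it into the per-AG radix tree with XFS_INEW set so that concurrent RCU
 * lookups see it as still under construction.
 */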
 273static int
 274xfs_iget_cache_miss(
 275        struct xfs_mount        *mp,
 276        struct xfs_perag        *pag,
 277        xfs_trans_t             *tp,
 278        xfs_ino_t               ino,
 279        struct xfs_inode        **ipp,
 280        int                     flags,
 281        int                     lock_flags)
 282{
 283        struct xfs_inode        *ip;
 284        int                     error;
 285        xfs_agino_t             agino = XFS_INO_TO_AGINO(mp, ino);
 286        int                     iflags;
 287
 288        ip = xfs_inode_alloc(mp, ino);
 289        if (!ip)
 290                return -ENOMEM;
 291
 292        error = xfs_iread(mp, tp, ip, flags);
 293        if (error)
 294                goto out_destroy;
 295
 296        trace_xfs_iget_miss(ip);
 297
 298        if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) {
 299                error = -ENOENT;
 300                goto out_destroy;
 301        }
 302
 303        /*
 304         * Preload the radix tree so we can insert safely under the
 305         * write spinlock. Note that we cannot sleep inside the preload
 306         * region. Since we can be called from transaction context, don't
 307         * recurse into the file system.
 308         */
 309        if (radix_tree_preload(GFP_NOFS)) {
 310                error = -EAGAIN;
 311                goto out_destroy;
 312        }
 313
 314        /*
 315         * Because the inode hasn't been added to the radix-tree yet it can't
 316         * be found by another thread, so we can do the non-sleeping lock here.
 317         */
 318        if (lock_flags) {
 319                if (!xfs_ilock_nowait(ip, lock_flags))
 320                        BUG();
 321        }
 322
 323        /*
 324         * These values must be set before inserting the inode into the radix
 325         * tree as the moment it is inserted a concurrent lookup (allowed by the
 326         * RCU locking mechanism) can find it and that lookup must see that this
 327         * is an inode currently under construction (i.e. that XFS_INEW is set).
 328         * The ip->i_flags_lock that protects the XFS_INEW flag forms the
 329         * memory barrier that ensures this detection works correctly at lookup
 330         * time.
 331         */
 332        iflags = XFS_INEW;
 333        if (flags & XFS_IGET_DONTCACHE)
 334                iflags |= XFS_IDONTCACHE;
 335        ip->i_udquot = NULL;
 336        ip->i_gdquot = NULL;
 337        ip->i_pdquot = NULL;
 338        xfs_iflags_set(ip, iflags);
 339
 340        /* insert the new inode */
 341        spin_lock(&pag->pag_ici_lock);
 342        error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
 343        if (unlikely(error)) {
 344                WARN_ON(error != -EEXIST);
 345                XFS_STATS_INC(xs_ig_dup);
 346                error = -EAGAIN;
 347                goto out_preload_end;
 348        }
 349        spin_unlock(&pag->pag_ici_lock);
 350        radix_tree_preload_end();
 351
 352        *ipp = ip;
 353        return 0;
 354
 355out_preload_end:
 356        spin_unlock(&pag->pag_ici_lock);
 357        radix_tree_preload_end();
 358        if (lock_flags)
 359                xfs_iunlock(ip, lock_flags);
 360out_destroy:
 361        __destroy_inode(VFS_I(ip));
 362        xfs_inode_free(ip);
 363        return error;
 364}
 365
 366/*
 367 * Look up an inode by number in the given file system.
 368 * The inode is looked up in the cache held in each AG.
 369 * If the inode is found in the cache, initialise the vfs inode
 370 * if necessary.
 371 *
 372 * If it is not in core, read it in from the file system's device,
 373 * add it to the cache and initialise the vfs inode.
 374 *
 375 * The inode is locked according to the value of the lock_flags parameter.
 376 * This flag parameter indicates how and if the inode's IO lock and inode lock
 377 * should be taken.
 378 *
 379 * mp -- the mount point structure for the current file system.  It points
  380 *       to the per-AG inode caches.
 381 * tp -- a pointer to the current transaction if there is one.  This is
 382 *       simply passed through to the xfs_iread() call.
 383 * ino -- the number of the inode desired.  This is the unique identifier
 384 *        within the file system for the inode being requested.
 385 * lock_flags -- flags indicating how to lock the inode.  See the comment
 386 *               for xfs_ilock() for a list of valid values.
 387 */
 388int
 389xfs_iget(
 390        xfs_mount_t     *mp,
 391        xfs_trans_t     *tp,
 392        xfs_ino_t       ino,
 393        uint            flags,
 394        uint            lock_flags,
 395        xfs_inode_t     **ipp)
 396{
 397        xfs_inode_t     *ip;
 398        int             error;
 399        xfs_perag_t     *pag;
 400        xfs_agino_t     agino;
 401
 402        /*
 403         * xfs_reclaim_inode() uses the ILOCK to ensure an inode
 404         * doesn't get freed while it's being referenced during a
 405         * radix tree traversal here.  It assumes this function
  406 * acquires only the ILOCK (and therefore it has no need to
 407         * involve the IOLOCK in this synchronization).
 408         */
 409        ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0);
 410
 411        /* reject inode numbers outside existing AGs */
 412        if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
 413                return -EINVAL;
 414
 415        XFS_STATS_INC(xs_ig_attempts);
 416
 417        /* get the perag structure and ensure that it's inode capable */
 418        pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
 419        agino = XFS_INO_TO_AGINO(mp, ino);
 420
 421again:
 422        error = 0;
 423        rcu_read_lock();
 424        ip = radix_tree_lookup(&pag->pag_ici_root, agino);
 425
 426        if (ip) {
 427                error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags);
 428                if (error)
 429                        goto out_error_or_again;
 430        } else {
 431                rcu_read_unlock();
 432                XFS_STATS_INC(xs_ig_missed);
 433
 434                error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,
 435                                                        flags, lock_flags);
 436                if (error)
 437                        goto out_error_or_again;
 438        }
 439        xfs_perag_put(pag);
 440
 441        *ipp = ip;
 442
 443        /*
  444         * If we have a real type for an on-disk inode, we can set up the inode
 445         * now.  If it's a new inode being created, xfs_ialloc will handle it.
 446         */
 447        if (xfs_iflags_test(ip, XFS_INEW) && ip->i_d.di_mode != 0)
 448                xfs_setup_existing_inode(ip);
 449        return 0;
 450
 451out_error_or_again:
 452        if (error == -EAGAIN) {
 453                delay(1);
 454                goto again;
 455        }
 456        xfs_perag_put(pag);
 457        return error;
 458}
 459
 460/*
 461 * The inode lookup is done in batches to keep the amount of lock traffic and
  462 * radix tree lookups to a minimum. The batch size is a trade-off between
 463 * lookup reduction and stack usage. This is in the reclaim path, so we can't
 464 * be too greedy.
 465 */
 466#define XFS_LOOKUP_BATCH        32
 467
 468STATIC int
 469xfs_inode_ag_walk_grab(
 470        struct xfs_inode        *ip)
 471{
 472        struct inode            *inode = VFS_I(ip);
 473
 474        ASSERT(rcu_read_lock_held());
 475
 476        /*
 477         * check for stale RCU freed inode
 478         *
 479         * If the inode has been reallocated, it doesn't matter if it's not in
 480         * the AG we are walking - we are walking for writeback, so if it
 481         * passes all the "valid inode" checks and is dirty, then we'll write
  482 * it back anyway.  If it has been reallocated and is still being
 483         * initialised, the XFS_INEW check below will catch it.
 484         */
 485        spin_lock(&ip->i_flags_lock);
 486        if (!ip->i_ino)
 487                goto out_unlock_noent;
 488
 489        /* avoid new or reclaimable inodes. Leave for reclaim code to flush */
 490        if (__xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
 491                goto out_unlock_noent;
 492        spin_unlock(&ip->i_flags_lock);
 493
 494        /* nothing to sync during shutdown */
 495        if (XFS_FORCED_SHUTDOWN(ip->i_mount))
 496                return -EFSCORRUPTED;
 497
  498        /* If we can't grab the inode, it must be on its way to reclaim. */
 499        if (!igrab(inode))
 500                return -ENOENT;
 501
 502        /* inode is valid */
 503        return 0;
 504
 505out_unlock_noent:
 506        spin_unlock(&ip->i_flags_lock);
 507        return -ENOENT;
 508}
 509
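/*
 * Walk all in-core inodes in an AG (optionally restricted to a radix tree
 * tag), grabbing them in batches under RCU and calling @execute on each one.
 * Inodes that return -EAGAIN are counted as skipped and the whole walk is
 * retried after a short delay.
 */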
 510STATIC int
 511xfs_inode_ag_walk(
 512        struct xfs_mount        *mp,
 513        struct xfs_perag        *pag,
 514        int                     (*execute)(struct xfs_inode *ip, int flags,
 515                                           void *args),
 516        int                     flags,
 517        void                    *args,
 518        int                     tag)
 519{
 520        uint32_t                first_index;
 521        int                     last_error = 0;
 522        int                     skipped;
 523        int                     done;
 524        int                     nr_found;
 525
 526restart:
 527        done = 0;
 528        skipped = 0;
 529        first_index = 0;
 530        nr_found = 0;
 531        do {
 532                struct xfs_inode *batch[XFS_LOOKUP_BATCH];
 533                int             error = 0;
 534                int             i;
 535
 536                rcu_read_lock();
 537
 538                if (tag == -1)
 539                        nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
 540                                        (void **)batch, first_index,
 541                                        XFS_LOOKUP_BATCH);
 542                else
 543                        nr_found = radix_tree_gang_lookup_tag(
 544                                        &pag->pag_ici_root,
 545                                        (void **) batch, first_index,
 546                                        XFS_LOOKUP_BATCH, tag);
 547
 548                if (!nr_found) {
 549                        rcu_read_unlock();
 550                        break;
 551                }
 552
 553                /*
  554                 * Grab the inodes before we drop the lock. If we found
  555                 * nothing, nr_found == 0 and the loop will be skipped.
 556                 */
 557                for (i = 0; i < nr_found; i++) {
 558                        struct xfs_inode *ip = batch[i];
 559
 560                        if (done || xfs_inode_ag_walk_grab(ip))
 561                                batch[i] = NULL;
 562
 563                        /*
 564                         * Update the index for the next lookup. Catch
 565                         * overflows into the next AG range which can occur if
 566                         * we have inodes in the last block of the AG and we
 567                         * are currently pointing to the last inode.
 568                         *
 569                         * Because we may see inodes that are from the wrong AG
 570                         * due to RCU freeing and reallocation, only update the
  571                         * index if it lies in this AG. It was a race that led
 572                         * us to see this inode, so another lookup from the
 573                         * same index will not find it again.
 574                         */
 575                        if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno)
 576                                continue;
 577                        first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
 578                        if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
 579                                done = 1;
 580                }
 581
 582                /* unlock now we've grabbed the inodes. */
 583                rcu_read_unlock();
 584
 585                for (i = 0; i < nr_found; i++) {
 586                        if (!batch[i])
 587                                continue;
 588                        error = execute(batch[i], flags, args);
 589                        IRELE(batch[i]);
 590                        if (error == -EAGAIN) {
 591                                skipped++;
 592                                continue;
 593                        }
 594                        if (error && last_error != -EFSCORRUPTED)
 595                                last_error = error;
 596                }
 597
 598                /* bail out if the filesystem is corrupted.  */
 599                if (error == -EFSCORRUPTED)
 600                        break;
 601
 602                cond_resched();
 603
 604        } while (nr_found && !done);
 605
 606        if (skipped) {
 607                delay(1);
 608                goto restart;
 609        }
 610        return last_error;
 611}
 612
 613/*
 614 * Background scanning to trim post-EOF preallocated space. This is queued
 615 * based on the 'speculative_prealloc_lifetime' tunable (5m by default).
 616 */
 617STATIC void
 618xfs_queue_eofblocks(
 619        struct xfs_mount *mp)
 620{
 621        rcu_read_lock();
 622        if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_EOFBLOCKS_TAG))
 623                queue_delayed_work(mp->m_eofblocks_workqueue,
 624                                   &mp->m_eofblocks_work,
 625                                   msecs_to_jiffies(xfs_eofb_secs * 1000));
 626        rcu_read_unlock();
 627}
 628
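/*
 * Background worker that trims post-EOF speculative preallocations across
 * the whole filesystem and then requeues itself for the next period.
 */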
 629void
 630xfs_eofblocks_worker(
 631        struct work_struct *work)
 632{
 633        struct xfs_mount *mp = container_of(to_delayed_work(work),
 634                                struct xfs_mount, m_eofblocks_work);
 635        xfs_icache_free_eofblocks(mp, NULL);
 636        xfs_queue_eofblocks(mp);
 637}
 638
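/*
 * Iterate over all AGs and run @execute on the cached inodes in each one,
 * stopping early only if the filesystem is found to be corrupted.
 */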
 639int
 640xfs_inode_ag_iterator(
 641        struct xfs_mount        *mp,
 642        int                     (*execute)(struct xfs_inode *ip, int flags,
 643                                           void *args),
 644        int                     flags,
 645        void                    *args)
 646{
 647        struct xfs_perag        *pag;
 648        int                     error = 0;
 649        int                     last_error = 0;
 650        xfs_agnumber_t          ag;
 651
 652        ag = 0;
 653        while ((pag = xfs_perag_get(mp, ag))) {
 654                ag = pag->pag_agno + 1;
 655                error = xfs_inode_ag_walk(mp, pag, execute, flags, args, -1);
 656                xfs_perag_put(pag);
 657                if (error) {
 658                        last_error = error;
 659                        if (error == -EFSCORRUPTED)
 660                                break;
 661                }
 662        }
 663        return last_error;
 664}
 665
 666int
 667xfs_inode_ag_iterator_tag(
 668        struct xfs_mount        *mp,
 669        int                     (*execute)(struct xfs_inode *ip, int flags,
 670                                           void *args),
 671        int                     flags,
 672        void                    *args,
 673        int                     tag)
 674{
 675        struct xfs_perag        *pag;
 676        int                     error = 0;
 677        int                     last_error = 0;
 678        xfs_agnumber_t          ag;
 679
 680        ag = 0;
 681        while ((pag = xfs_perag_get_tag(mp, ag, tag))) {
 682                ag = pag->pag_agno + 1;
 683                error = xfs_inode_ag_walk(mp, pag, execute, flags, args, tag);
 684                xfs_perag_put(pag);
 685                if (error) {
 686                        last_error = error;
 687                        if (error == -EFSCORRUPTED)
 688                                break;
 689                }
 690        }
 691        return last_error;
 692}
 693
 694/*
 695 * Queue a new inode reclaim pass if there are reclaimable inodes and there
 696 * isn't a reclaim pass already in progress. By default it runs every 5s based
  697 * on the xfs periodic sync default of 30s. Perhaps this should have its own
 698 * tunable, but that can be done if this method proves to be ineffective or too
 699 * aggressive.
 700 */
 701static void
 702xfs_reclaim_work_queue(
 703        struct xfs_mount        *mp)
 704{
 705
 706        rcu_read_lock();
 707        if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
 708                queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work,
 709                        msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
 710        }
 711        rcu_read_unlock();
 712}
 713
 714/*
 715 * This is a fast pass over the inode cache to try to get reclaim moving on as
 716 * many inodes as possible in a short period of time. It kicks itself every few
 717 * seconds, as well as being kicked by the inode cache shrinker when memory
 718 * goes low. It scans as quickly as possible avoiding locked inodes or those
 719 * already being flushed, and once done schedules a future pass.
 720 */
 721void
 722xfs_reclaim_worker(
 723        struct work_struct *work)
 724{
 725        struct xfs_mount *mp = container_of(to_delayed_work(work),
 726                                        struct xfs_mount, m_reclaim_work);
 727
 728        xfs_reclaim_inodes(mp, SYNC_TRYLOCK);
 729        xfs_reclaim_work_queue(mp);
 730}
 731
 732static void
 733__xfs_inode_set_reclaim_tag(
 734        struct xfs_perag        *pag,
 735        struct xfs_inode        *ip)
 736{
 737        radix_tree_tag_set(&pag->pag_ici_root,
 738                           XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
 739                           XFS_ICI_RECLAIM_TAG);
 740
 741        if (!pag->pag_ici_reclaimable) {
 742                /* propagate the reclaim tag up into the perag radix tree */
 743                spin_lock(&ip->i_mount->m_perag_lock);
 744                radix_tree_tag_set(&ip->i_mount->m_perag_tree,
 745                                XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
 746                                XFS_ICI_RECLAIM_TAG);
 747                spin_unlock(&ip->i_mount->m_perag_lock);
 748
 749                /* schedule periodic background inode reclaim */
 750                xfs_reclaim_work_queue(ip->i_mount);
 751
 752                trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno,
 753                                                        -1, _RET_IP_);
 754        }
 755        pag->pag_ici_reclaimable++;
 756}
 757
 758/*
 759 * We set the inode flag atomically with the radix tree tag.
 760 * Once we get tag lookups on the radix tree, this inode flag
 761 * can go away.
 762 */
 763void
 764xfs_inode_set_reclaim_tag(
 765        xfs_inode_t     *ip)
 766{
 767        struct xfs_mount *mp = ip->i_mount;
 768        struct xfs_perag *pag;
 769
 770        pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
 771        spin_lock(&pag->pag_ici_lock);
 772        spin_lock(&ip->i_flags_lock);
 773        __xfs_inode_set_reclaim_tag(pag, ip);
 774        __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
 775        spin_unlock(&ip->i_flags_lock);
 776        spin_unlock(&pag->pag_ici_lock);
 777        xfs_perag_put(pag);
 778}
 779
 780STATIC void
 781__xfs_inode_clear_reclaim(
 782        xfs_perag_t     *pag,
 783        xfs_inode_t     *ip)
 784{
 785        pag->pag_ici_reclaimable--;
 786        if (!pag->pag_ici_reclaimable) {
 787                /* clear the reclaim tag from the perag radix tree */
 788                spin_lock(&ip->i_mount->m_perag_lock);
 789                radix_tree_tag_clear(&ip->i_mount->m_perag_tree,
 790                                XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
 791                                XFS_ICI_RECLAIM_TAG);
 792                spin_unlock(&ip->i_mount->m_perag_lock);
 793                trace_xfs_perag_clear_reclaim(ip->i_mount, pag->pag_agno,
 794                                                        -1, _RET_IP_);
 795        }
 796}
 797
 798STATIC void
 799__xfs_inode_clear_reclaim_tag(
 800        xfs_mount_t     *mp,
 801        xfs_perag_t     *pag,
 802        xfs_inode_t     *ip)
 803{
 804        radix_tree_tag_clear(&pag->pag_ici_root,
 805                        XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
 806        __xfs_inode_clear_reclaim(pag, ip);
 807}
 808
 809/*
 810 * Grab the inode for reclaim exclusively.
 811 * Return 0 if we grabbed it, non-zero otherwise.
 812 */
 813STATIC int
 814xfs_reclaim_inode_grab(
 815        struct xfs_inode        *ip,
 816        int                     flags)
 817{
 818        ASSERT(rcu_read_lock_held());
 819
 820        /* quick check for stale RCU freed inode */
 821        if (!ip->i_ino)
 822                return 1;
 823
 824        /*
 825         * If we are asked for non-blocking operation, do unlocked checks to
 826         * see if the inode already is being flushed or in reclaim to avoid
 827         * lock traffic.
 828         */
 829        if ((flags & SYNC_TRYLOCK) &&
 830            __xfs_iflags_test(ip, XFS_IFLOCK | XFS_IRECLAIM))
 831                return 1;
 832
 833        /*
 834         * The radix tree lock here protects a thread in xfs_iget from racing
 835         * with us starting reclaim on the inode.  Once we have the
 836         * XFS_IRECLAIM flag set it will not touch us.
 837         *
 838         * Due to RCU lookup, we may find inodes that have been freed and only
 839         * have XFS_IRECLAIM set.  Indeed, we may see reallocated inodes that
  840 * aren't candidates for reclaim at all, so we must check that
 841         * XFS_IRECLAIMABLE is set first before proceeding to reclaim.
 842         */
 843        spin_lock(&ip->i_flags_lock);
 844        if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) ||
 845            __xfs_iflags_test(ip, XFS_IRECLAIM)) {
 846                /* not a reclaim candidate. */
 847                spin_unlock(&ip->i_flags_lock);
 848                return 1;
 849        }
 850        __xfs_iflags_set(ip, XFS_IRECLAIM);
 851        spin_unlock(&ip->i_flags_lock);
 852        return 0;
 853}
 854
 855/*
 856 * Inodes in different states need to be treated differently. The following
 857 * table lists the inode states and the reclaim actions necessary:
 858 *
 859 *      inode state          iflush ret         required action
 860 *      ---------------      ----------         ---------------
 861 *      bad                     -               reclaim
 862 *      shutdown                EIO             unpin and reclaim
 863 *      clean, unpinned         0               reclaim
 864 *      stale, unpinned         0               reclaim
 865 *      clean, pinned(*)        0               requeue
 866 *      stale, pinned           EAGAIN          requeue
 867 *      dirty, async            -               requeue
 868 *      dirty, sync             0               reclaim
 869 *
 870 * (*) dgc: I don't think the clean, pinned state is possible but it gets
 871 * handled anyway given the order of checks implemented.
 872 *
 873 * Also, because we get the flush lock first, we know that any inode that has
 874 * been flushed delwri has had the flush completed by the time we check that
 875 * the inode is clean.
 876 *
 877 * Note that because the inode is flushed delayed write by AIL pushing, the
 878 * flush lock may already be held here and waiting on it can result in very
 879 * long latencies.  Hence for sync reclaims, where we wait on the flush lock,
 880 * the caller should push the AIL first before trying to reclaim inodes to
  881 * minimise the amount of time spent waiting.  For background reclaim, we only
 882 * bother to reclaim clean inodes anyway.
 883 *
 884 * Hence the order of actions after gaining the locks should be:
 885 *      bad             => reclaim
 886 *      shutdown        => unpin and reclaim
 887 *      pinned, async   => requeue
 888 *      pinned, sync    => unpin
 889 *      stale           => reclaim
 890 *      clean           => reclaim
 891 *      dirty, async    => requeue
 892 *      dirty, sync     => flush, wait and reclaim
 893 */
 894STATIC int
 895xfs_reclaim_inode(
 896        struct xfs_inode        *ip,
 897        struct xfs_perag        *pag,
 898        int                     sync_mode)
 899{
 900        struct xfs_buf          *bp = NULL;
 901        int                     error;
 902
 903restart:
 904        error = 0;
 905        xfs_ilock(ip, XFS_ILOCK_EXCL);
 906        if (!xfs_iflock_nowait(ip)) {
 907                if (!(sync_mode & SYNC_WAIT))
 908                        goto out;
 909                xfs_iflock(ip);
 910        }
 911
 912        if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
 913                xfs_iunpin_wait(ip);
 914                xfs_iflush_abort(ip, false);
 915                goto reclaim;
 916        }
 917        if (xfs_ipincount(ip)) {
 918                if (!(sync_mode & SYNC_WAIT))
 919                        goto out_ifunlock;
 920                xfs_iunpin_wait(ip);
 921        }
 922        if (xfs_iflags_test(ip, XFS_ISTALE))
 923                goto reclaim;
 924        if (xfs_inode_clean(ip))
 925                goto reclaim;
 926
 927        /*
 928         * Never flush out dirty data during non-blocking reclaim, as it would
 929         * just contend with AIL pushing trying to do the same job.
 930         */
 931        if (!(sync_mode & SYNC_WAIT))
 932                goto out_ifunlock;
 933
 934        /*
 935         * Now we have an inode that needs flushing.
 936         *
 937         * Note that xfs_iflush will never block on the inode buffer lock, as
 938         * xfs_ifree_cluster() can lock the inode buffer before it locks the
 939         * ip->i_lock, and we are doing the exact opposite here.  As a result,
 940         * doing a blocking xfs_imap_to_bp() to get the cluster buffer would
 941         * result in an ABBA deadlock with xfs_ifree_cluster().
 942         *
  943 * As xfs_ifree_cluster() must gather all inodes that are active in the
 944         * cache to mark them stale, if we hit this case we don't actually want
 945         * to do IO here - we want the inode marked stale so we can simply
  946 * reclaim it.  Hence if we get an EAGAIN error here, just unlock the
 947         * inode, back off and try again.  Hopefully the next pass through will
 948         * see the stale flag set on the inode.
 949         */
 950        error = xfs_iflush(ip, &bp);
 951        if (error == -EAGAIN) {
 952                xfs_iunlock(ip, XFS_ILOCK_EXCL);
 953                /* backoff longer than in xfs_ifree_cluster */
 954                delay(2);
 955                goto restart;
 956        }
 957
 958        if (!error) {
 959                error = xfs_bwrite(bp);
 960                xfs_buf_relse(bp);
 961        }
 962
 963        xfs_iflock(ip);
 964reclaim:
 965        xfs_ifunlock(ip);
 966        xfs_iunlock(ip, XFS_ILOCK_EXCL);
 967
 968        XFS_STATS_INC(xs_ig_reclaims);
 969        /*
 970         * Remove the inode from the per-AG radix tree.
 971         *
 972         * Because radix_tree_delete won't complain even if the item was never
  973 * added to the tree, assert that it has been there before to catch
  974 * problems with the inode lifetime early on.
 975         */
 976        spin_lock(&pag->pag_ici_lock);
 977        if (!radix_tree_delete(&pag->pag_ici_root,
 978                                XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino)))
 979                ASSERT(0);
 980        __xfs_inode_clear_reclaim(pag, ip);
 981        spin_unlock(&pag->pag_ici_lock);
 982
 983        /*
 984         * Here we do an (almost) spurious inode lock in order to coordinate
 985         * with inode cache radix tree lookups.  This is because the lookup
 986         * can reference the inodes in the cache without taking references.
 987         *
 988         * We make that OK here by ensuring that we wait until the inode is
 989         * unlocked after the lookup before we go ahead and free it.
 990         */
 991        xfs_ilock(ip, XFS_ILOCK_EXCL);
 992        xfs_qm_dqdetach(ip);
 993        xfs_iunlock(ip, XFS_ILOCK_EXCL);
 994
 995        xfs_inode_free(ip);
 996        return error;
 997
 998out_ifunlock:
 999        xfs_ifunlock(ip);
1000out:
1001        xfs_iflags_clear(ip, XFS_IRECLAIM);
1002        xfs_iunlock(ip, XFS_ILOCK_EXCL);
1003        /*
1004         * We could return -EAGAIN here to make reclaim rescan the inode tree in
1005         * a short while. However, this just burns CPU time scanning the tree
1006         * waiting for IO to complete and the reclaim work never goes back to
1007         * the idle state. Instead, return 0 to let the next scheduled
1008         * background reclaim attempt to reclaim the inode again.
1009         */
1010        return 0;
1011}
1012
1013/*
1014 * Walk the AGs and reclaim the inodes in them. Even if the filesystem is
1015 * corrupted, we still want to try to reclaim all the inodes. If we don't,
 1016 * then a shutdown during the filesystem unmount reclaim walk would leak all the
1017 * unreclaimed inodes.
1018 */
1019STATIC int
1020xfs_reclaim_inodes_ag(
1021        struct xfs_mount        *mp,
1022        int                     flags,
1023        int                     *nr_to_scan)
1024{
1025        struct xfs_perag        *pag;
1026        int                     error = 0;
1027        int                     last_error = 0;
1028        xfs_agnumber_t          ag;
1029        int                     trylock = flags & SYNC_TRYLOCK;
1030        int                     skipped;
1031
1032restart:
1033        ag = 0;
1034        skipped = 0;
1035        while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
1036                unsigned long   first_index = 0;
1037                int             done = 0;
1038                int             nr_found = 0;
1039
1040                ag = pag->pag_agno + 1;
1041
1042                if (trylock) {
1043                        if (!mutex_trylock(&pag->pag_ici_reclaim_lock)) {
1044                                skipped++;
1045                                xfs_perag_put(pag);
1046                                continue;
1047                        }
1048                        first_index = pag->pag_ici_reclaim_cursor;
1049                } else
1050                        mutex_lock(&pag->pag_ici_reclaim_lock);
1051
1052                do {
1053                        struct xfs_inode *batch[XFS_LOOKUP_BATCH];
1054                        int     i;
1055
1056                        rcu_read_lock();
1057                        nr_found = radix_tree_gang_lookup_tag(
1058                                        &pag->pag_ici_root,
1059                                        (void **)batch, first_index,
1060                                        XFS_LOOKUP_BATCH,
1061                                        XFS_ICI_RECLAIM_TAG);
1062                        if (!nr_found) {
1063                                done = 1;
1064                                rcu_read_unlock();
1065                                break;
1066                        }
1067
1068                        /*
 1069                         * Grab the inodes before we drop the lock. If we found
 1070                         * nothing, nr_found == 0 and the loop will be skipped.
1071                         */
1072                        for (i = 0; i < nr_found; i++) {
1073                                struct xfs_inode *ip = batch[i];
1074
1075                                if (done || xfs_reclaim_inode_grab(ip, flags))
1076                                        batch[i] = NULL;
1077
1078                                /*
1079                                 * Update the index for the next lookup. Catch
1080                                 * overflows into the next AG range which can
1081                                 * occur if we have inodes in the last block of
1082                                 * the AG and we are currently pointing to the
1083                                 * last inode.
1084                                 *
1085                                 * Because we may see inodes that are from the
1086                                 * wrong AG due to RCU freeing and
1087                                 * reallocation, only update the index if it
 1088                                 * lies in this AG. It was a race that led us
1089                                 * to see this inode, so another lookup from
1090                                 * the same index will not find it again.
1091                                 */
1092                                if (XFS_INO_TO_AGNO(mp, ip->i_ino) !=
1093                                                                pag->pag_agno)
1094                                        continue;
1095                                first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
1096                                if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
1097                                        done = 1;
1098                        }
1099
1100                        /* unlock now we've grabbed the inodes. */
1101                        rcu_read_unlock();
1102
1103                        for (i = 0; i < nr_found; i++) {
1104                                if (!batch[i])
1105                                        continue;
1106                                error = xfs_reclaim_inode(batch[i], pag, flags);
1107                                if (error && last_error != -EFSCORRUPTED)
1108                                        last_error = error;
1109                        }
1110
1111                        *nr_to_scan -= XFS_LOOKUP_BATCH;
1112
1113                        cond_resched();
1114
1115                } while (nr_found && !done && *nr_to_scan > 0);
1116
1117                if (trylock && !done)
1118                        pag->pag_ici_reclaim_cursor = first_index;
1119                else
1120                        pag->pag_ici_reclaim_cursor = 0;
1121                mutex_unlock(&pag->pag_ici_reclaim_lock);
1122                xfs_perag_put(pag);
1123        }
1124
1125        /*
 1126 * If we skipped any AG, and we still have scan count remaining, do
 1127 * another pass, this time using blocking reclaim semantics (i.e.
1128         * waiting on the reclaim locks and ignoring the reclaim cursors). This
 1129 * ensures that when we get more reclaimers than AGs we block rather
1130         * than spin trying to execute reclaim.
1131         */
1132        if (skipped && (flags & SYNC_WAIT) && *nr_to_scan > 0) {
1133                trylock = 0;
1134                goto restart;
1135        }
1136        return last_error;
1137}
1138
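/*
 * Reclaim cached inodes across all AGs with an effectively unlimited scan
 * count.
 */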
1139int
1140xfs_reclaim_inodes(
1141        xfs_mount_t     *mp,
1142        int             mode)
1143{
1144        int             nr_to_scan = INT_MAX;
1145
1146        return xfs_reclaim_inodes_ag(mp, mode, &nr_to_scan);
1147}
1148
1149/*
1150 * Scan a certain number of inodes for reclaim.
1151 *
1152 * When called we make sure that there is a background (fast) inode reclaim in
 1153 * progress, while we throttle the speed of reclaim by doing synchronous
1154 * reclaim of inodes. That means if we come across dirty inodes, we wait for
1155 * them to be cleaned, which we hope will not be very long due to the
1156 * background walker having already kicked the IO off on those dirty inodes.
1157 */
1158long
1159xfs_reclaim_inodes_nr(
1160        struct xfs_mount        *mp,
1161        int                     nr_to_scan)
1162{
1163        /* kick background reclaimer and push the AIL */
1164        xfs_reclaim_work_queue(mp);
1165        xfs_ail_push_all(mp->m_ail);
1166
1167        return xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, &nr_to_scan);
1168}
1169
1170/*
1171 * Return the number of reclaimable inodes in the filesystem for
1172 * the shrinker to determine how much to reclaim.
1173 */
1174int
1175xfs_reclaim_inodes_count(
1176        struct xfs_mount        *mp)
1177{
1178        struct xfs_perag        *pag;
1179        xfs_agnumber_t          ag = 0;
1180        int                     reclaimable = 0;
1181
1182        while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
1183                ag = pag->pag_agno + 1;
1184                reclaimable += pag->pag_ici_reclaimable;
1185                xfs_perag_put(pag);
1186        }
1187        return reclaimable;
1188}
1189
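/*
 * Match an inode against the uid/gid/project id filters in an eofblocks
 * request. Every filter that is set must match; see
 * xfs_inode_match_id_union() below for the any-of variant.
 */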
1190STATIC int
1191xfs_inode_match_id(
1192        struct xfs_inode        *ip,
1193        struct xfs_eofblocks    *eofb)
1194{
1195        if ((eofb->eof_flags & XFS_EOF_FLAGS_UID) &&
1196            !uid_eq(VFS_I(ip)->i_uid, eofb->eof_uid))
1197                return 0;
1198
1199        if ((eofb->eof_flags & XFS_EOF_FLAGS_GID) &&
1200            !gid_eq(VFS_I(ip)->i_gid, eofb->eof_gid))
1201                return 0;
1202
1203        if ((eofb->eof_flags & XFS_EOF_FLAGS_PRID) &&
1204            xfs_get_projid(ip) != eofb->eof_prid)
1205                return 0;
1206
1207        return 1;
1208}
1209
1210/*
1211 * A union-based inode filtering algorithm. Process the inode if any of the
1212 * criteria match. This is for global/internal scans only.
1213 */
1214STATIC int
1215xfs_inode_match_id_union(
1216        struct xfs_inode        *ip,
1217        struct xfs_eofblocks    *eofb)
1218{
1219        if ((eofb->eof_flags & XFS_EOF_FLAGS_UID) &&
1220            uid_eq(VFS_I(ip)->i_uid, eofb->eof_uid))
1221                return 1;
1222
1223        if ((eofb->eof_flags & XFS_EOF_FLAGS_GID) &&
1224            gid_eq(VFS_I(ip)->i_gid, eofb->eof_gid))
1225                return 1;
1226
1227        if ((eofb->eof_flags & XFS_EOF_FLAGS_PRID) &&
1228            xfs_get_projid(ip) == eofb->eof_prid)
1229                return 1;
1230
1231        return 0;
1232}
1233
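/*
 * Per-inode callback for the eofblocks scans: free post-EOF preallocated
 * blocks on inodes that pass the id and size filters, skipping inodes with
 * dirty mappings unless this is a blocking (SYNC_WAIT) scan.
 */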
1234STATIC int
1235xfs_inode_free_eofblocks(
1236        struct xfs_inode        *ip,
1237        int                     flags,
1238        void                    *args)
1239{
1240        int ret;
1241        struct xfs_eofblocks *eofb = args;
1242        bool need_iolock = true;
1243        int match;
1244
1245        ASSERT(!eofb || (eofb && eofb->eof_scan_owner != 0));
1246
1247        if (!xfs_can_free_eofblocks(ip, false)) {
1248                /* inode could be preallocated or append-only */
1249                trace_xfs_inode_free_eofblocks_invalid(ip);
1250                xfs_inode_clear_eofblocks_tag(ip);
1251                return 0;
1252        }
1253
1254        /*
1255         * If the mapping is dirty the operation can block and wait for some
1256         * time. Unless we are waiting, skip it.
1257         */
1258        if (!(flags & SYNC_WAIT) &&
1259            mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY))
1260                return 0;
1261
1262        if (eofb) {
1263                if (eofb->eof_flags & XFS_EOF_FLAGS_UNION)
1264                        match = xfs_inode_match_id_union(ip, eofb);
1265                else
1266                        match = xfs_inode_match_id(ip, eofb);
1267                if (!match)
1268                        return 0;
1269
1270                /* skip the inode if the file size is too small */
1271                if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE &&
1272                    XFS_ISIZE(ip) < eofb->eof_min_file_size)
1273                        return 0;
1274
1275                /*
1276                 * A scan owner implies we already hold the iolock. Skip it in
1277                 * xfs_free_eofblocks() to avoid deadlock. This also eliminates
1278                 * the possibility of EAGAIN being returned.
1279                 */
1280                if (eofb->eof_scan_owner == ip->i_ino)
1281                        need_iolock = false;
1282        }
1283
1284        ret = xfs_free_eofblocks(ip->i_mount, ip, need_iolock);
1285
1286        /* don't revisit the inode if we're not waiting */
1287        if (ret == -EAGAIN && !(flags & SYNC_WAIT))
1288                ret = 0;
1289
1290        return ret;
1291}
1292
1293int
1294xfs_icache_free_eofblocks(
1295        struct xfs_mount        *mp,
1296        struct xfs_eofblocks    *eofb)
1297{
1298        int flags = SYNC_TRYLOCK;
1299
1300        if (eofb && (eofb->eof_flags & XFS_EOF_FLAGS_SYNC))
1301                flags = SYNC_WAIT;
1302
1303        return xfs_inode_ag_iterator_tag(mp, xfs_inode_free_eofblocks, flags,
1304                                         eofb, XFS_ICI_EOFBLOCKS_TAG);
1305}
1306
1307/*
1308 * Run eofblocks scans on the quotas applicable to the inode. For inodes with
1309 * multiple quotas, we don't know exactly which quota caused an allocation
1310 * failure. We make a best effort by including each quota under low free space
1311 * conditions (less than 1% free space) in the scan.
1312 */
1313int
1314xfs_inode_free_quota_eofblocks(
1315        struct xfs_inode *ip)
1316{
1317        int scan = 0;
1318        struct xfs_eofblocks eofb = {0};
1319        struct xfs_dquot *dq;
1320
1321        ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
1322
1323        /*
1324         * Set the scan owner to avoid a potential livelock. Otherwise, the scan
1325         * can repeatedly trylock on the inode we're currently processing. We
1326         * run a sync scan to increase effectiveness and use the union filter to
1327         * cover all applicable quotas in a single scan.
1328         */
1329        eofb.eof_scan_owner = ip->i_ino;
1330        eofb.eof_flags = XFS_EOF_FLAGS_UNION|XFS_EOF_FLAGS_SYNC;
1331
1332        if (XFS_IS_UQUOTA_ENFORCED(ip->i_mount)) {
1333                dq = xfs_inode_dquot(ip, XFS_DQ_USER);
1334                if (dq && xfs_dquot_lowsp(dq)) {
1335                        eofb.eof_uid = VFS_I(ip)->i_uid;
1336                        eofb.eof_flags |= XFS_EOF_FLAGS_UID;
1337                        scan = 1;
1338                }
1339        }
1340
1341        if (XFS_IS_GQUOTA_ENFORCED(ip->i_mount)) {
1342                dq = xfs_inode_dquot(ip, XFS_DQ_GROUP);
1343                if (dq && xfs_dquot_lowsp(dq)) {
1344                        eofb.eof_gid = VFS_I(ip)->i_gid;
1345                        eofb.eof_flags |= XFS_EOF_FLAGS_GID;
1346                        scan = 1;
1347                }
1348        }
1349
1350        if (scan)
1351                xfs_icache_free_eofblocks(ip->i_mount, &eofb);
1352
1353        return scan;
1354}
1355
1356void
1357xfs_inode_set_eofblocks_tag(
1358        xfs_inode_t     *ip)
1359{
1360        struct xfs_mount *mp = ip->i_mount;
1361        struct xfs_perag *pag;
1362        int tagged;
1363
1364        pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
1365        spin_lock(&pag->pag_ici_lock);
1366        trace_xfs_inode_set_eofblocks_tag(ip);
1367
1368        tagged = radix_tree_tagged(&pag->pag_ici_root,
1369                                   XFS_ICI_EOFBLOCKS_TAG);
1370        radix_tree_tag_set(&pag->pag_ici_root,
1371                           XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
1372                           XFS_ICI_EOFBLOCKS_TAG);
1373        if (!tagged) {
1374                /* propagate the eofblocks tag up into the perag radix tree */
1375                spin_lock(&ip->i_mount->m_perag_lock);
1376                radix_tree_tag_set(&ip->i_mount->m_perag_tree,
1377                                   XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
1378                                   XFS_ICI_EOFBLOCKS_TAG);
1379                spin_unlock(&ip->i_mount->m_perag_lock);
1380
1381                /* kick off background trimming */
1382                xfs_queue_eofblocks(ip->i_mount);
1383
1384                trace_xfs_perag_set_eofblocks(ip->i_mount, pag->pag_agno,
1385                                              -1, _RET_IP_);
1386        }
1387
1388        spin_unlock(&pag->pag_ici_lock);
1389        xfs_perag_put(pag);
1390}
1391
1392void
1393xfs_inode_clear_eofblocks_tag(
1394        xfs_inode_t     *ip)
1395{
1396        struct xfs_mount *mp = ip->i_mount;
1397        struct xfs_perag *pag;
1398
1399        pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
1400        spin_lock(&pag->pag_ici_lock);
1401        trace_xfs_inode_clear_eofblocks_tag(ip);
1402
1403        radix_tree_tag_clear(&pag->pag_ici_root,
1404                             XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
1405                             XFS_ICI_EOFBLOCKS_TAG);
1406        if (!radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_EOFBLOCKS_TAG)) {
1407                /* clear the eofblocks tag from the perag radix tree */
1408                spin_lock(&ip->i_mount->m_perag_lock);
1409                radix_tree_tag_clear(&ip->i_mount->m_perag_tree,
1410                                     XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
1411                                     XFS_ICI_EOFBLOCKS_TAG);
1412                spin_unlock(&ip->i_mount->m_perag_lock);
1413                trace_xfs_perag_clear_eofblocks(ip->i_mount, pag->pag_agno,
1414                                               -1, _RET_IP_);
1415        }
1416
1417        spin_unlock(&pag->pag_ici_lock);
1418        xfs_perag_put(pag);
1419}
1420
1421