linux/fs/xfs/xfs_icache.c
   1/*
   2 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
   3 * All Rights Reserved.
   4 *
   5 * This program is free software; you can redistribute it and/or
   6 * modify it under the terms of the GNU General Public License as
   7 * published by the Free Software Foundation.
   8 *
   9 * This program is distributed in the hope that it would be useful,
  10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12 * GNU General Public License for more details.
  13 *
  14 * You should have received a copy of the GNU General Public License
  15 * along with this program; if not, write the Free Software Foundation,
  16 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  17 */
  18#include "xfs.h"
  19#include "xfs_fs.h"
  20#include "xfs_format.h"
  21#include "xfs_log_format.h"
  22#include "xfs_trans_resv.h"
  23#include "xfs_sb.h"
  24#include "xfs_mount.h"
  25#include "xfs_inode.h"
  26#include "xfs_error.h"
  27#include "xfs_trans.h"
  28#include "xfs_trans_priv.h"
  29#include "xfs_inode_item.h"
  30#include "xfs_quota.h"
  31#include "xfs_trace.h"
  32#include "xfs_icache.h"
  33#include "xfs_bmap_util.h"
  34#include "xfs_dquot_item.h"
  35#include "xfs_dquot.h"
  36
  37#include <linux/kthread.h>
  38#include <linux/freezer.h>
  39
  40STATIC void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp,
  41                                struct xfs_perag *pag, struct xfs_inode *ip);
  42
  43/*
  44 * Allocate and initialise an xfs_inode.
  45 */
  46struct xfs_inode *
  47xfs_inode_alloc(
  48        struct xfs_mount        *mp,
  49        xfs_ino_t               ino)
  50{
  51        struct xfs_inode        *ip;
  52
  53        /*
   54         * If this didn't occur in transactions, we could use
   55         * KM_MAYFAIL and return NULL here on ENOMEM. The code is
   56         * set up to do this anyway.
  57         */
  58        ip = kmem_zone_alloc(xfs_inode_zone, KM_SLEEP);
  59        if (!ip)
  60                return NULL;
  61        if (inode_init_always(mp->m_super, VFS_I(ip))) {
  62                kmem_zone_free(xfs_inode_zone, ip);
  63                return NULL;
  64        }
  65
  66        XFS_STATS_INC(vn_active);
  67        ASSERT(atomic_read(&ip->i_pincount) == 0);
  68        ASSERT(!spin_is_locked(&ip->i_flags_lock));
  69        ASSERT(!xfs_isiflocked(ip));
  70        ASSERT(ip->i_ino == 0);
  71
  72        mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
  73
  74        /* initialise the xfs inode */
  75        ip->i_ino = ino;
  76        ip->i_mount = mp;
  77        memset(&ip->i_imap, 0, sizeof(struct xfs_imap));
  78        ip->i_afp = NULL;
  79        memset(&ip->i_df, 0, sizeof(xfs_ifork_t));
  80        ip->i_flags = 0;
  81        ip->i_delayed_blks = 0;
  82        memset(&ip->i_d, 0, sizeof(xfs_icdinode_t));
  83
  84        return ip;
  85}
  86
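    /*
     * RCU callback that returns the inode memory to the zone once the grace
     * period started by xfs_inode_free() has expired.
     */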
  87STATIC void
  88xfs_inode_free_callback(
  89        struct rcu_head         *head)
  90{
  91        struct inode            *inode = container_of(head, struct inode, i_rcu);
  92        struct xfs_inode        *ip = XFS_I(inode);
  93
  94        kmem_zone_free(xfs_inode_zone, ip);
  95}
  96
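    /*
     * Tear down the inode forks and the inode log item, mark the inode as
     * reclaimed with an invalid inode number under the i_flags_lock so racing
     * RCU lookups skip it, and hand the memory back via an RCU callback.
     */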
  97void
  98xfs_inode_free(
  99        struct xfs_inode        *ip)
 100{
 101        switch (ip->i_d.di_mode & S_IFMT) {
 102        case S_IFREG:
 103        case S_IFDIR:
 104        case S_IFLNK:
 105                xfs_idestroy_fork(ip, XFS_DATA_FORK);
 106                break;
 107        }
 108
 109        if (ip->i_afp)
 110                xfs_idestroy_fork(ip, XFS_ATTR_FORK);
 111
 112        if (ip->i_itemp) {
 113                ASSERT(!(ip->i_itemp->ili_item.li_flags & XFS_LI_IN_AIL));
 114                xfs_inode_item_destroy(ip);
 115                ip->i_itemp = NULL;
 116        }
 117
 118        /*
 119         * Because we use RCU freeing we need to ensure the inode always
 120         * appears to be reclaimed with an invalid inode number when in the
 121         * free state. The ip->i_flags_lock provides the barrier against lookup
 122         * races.
 123         */
 124        spin_lock(&ip->i_flags_lock);
 125        ip->i_flags = XFS_IRECLAIM;
 126        ip->i_ino = 0;
 127        spin_unlock(&ip->i_flags_lock);
 128
 129        /* asserts to verify all state is correct here */
 130        ASSERT(atomic_read(&ip->i_pincount) == 0);
 131        ASSERT(!xfs_isiflocked(ip));
 132        XFS_STATS_DEC(vn_active);
 133
 134        call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
 135}
 136
 137/*
 138 * Check the validity of the inode we just found in the cache
 139 */
 140static int
 141xfs_iget_cache_hit(
 142        struct xfs_perag        *pag,
 143        struct xfs_inode        *ip,
 144        xfs_ino_t               ino,
 145        int                     flags,
 146        int                     lock_flags) __releases(RCU)
 147{
 148        struct inode            *inode = VFS_I(ip);
 149        struct xfs_mount        *mp = ip->i_mount;
 150        int                     error;
 151
 152        /*
 153         * check for re-use of an inode within an RCU grace period due to the
 154         * radix tree nodes not being updated yet. We monitor for this by
 155         * setting the inode number to zero before freeing the inode structure.
 156         * If the inode has been reallocated and set up, then the inode number
 157         * will not match, so check for that, too.
 158         */
 159        spin_lock(&ip->i_flags_lock);
 160        if (ip->i_ino != ino) {
 161                trace_xfs_iget_skip(ip);
 162                XFS_STATS_INC(xs_ig_frecycle);
 163                error = -EAGAIN;
 164                goto out_error;
 165        }
 166
 167
 168        /*
 169         * If we are racing with another cache hit that is currently
 170         * instantiating this inode or currently recycling it out of
  171         * reclaimable state, wait for the initialisation to complete
 172         * before continuing.
 173         *
 174         * XXX(hch): eventually we should do something equivalent to
 175         *           wait_on_inode to wait for these flags to be cleared
 176         *           instead of polling for it.
 177         */
 178        if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) {
 179                trace_xfs_iget_skip(ip);
 180                XFS_STATS_INC(xs_ig_frecycle);
 181                error = -EAGAIN;
 182                goto out_error;
 183        }
 184
 185        /*
 186         * If lookup is racing with unlink, return an error immediately.
 187         */
 188        if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
 189                error = -ENOENT;
 190                goto out_error;
 191        }
 192
 193        /*
 194         * If IRECLAIMABLE is set, we've torn down the VFS inode already.
 195         * We need to carefully get it back into a usable state.
 196         */
 197        if (ip->i_flags & XFS_IRECLAIMABLE) {
 198                trace_xfs_iget_reclaim(ip);
 199
 200                /*
 201                 * We need to set XFS_IRECLAIM to prevent xfs_reclaim_inode
 202                 * from stomping over us while we recycle the inode.  We can't
 203                 * clear the radix tree reclaimable tag yet as it requires
 204                 * pag_ici_lock to be held exclusive.
 205                 */
 206                ip->i_flags |= XFS_IRECLAIM;
 207
 208                spin_unlock(&ip->i_flags_lock);
 209                rcu_read_unlock();
 210
 211                error = inode_init_always(mp->m_super, inode);
 212                if (error) {
 213                        /*
 214                         * Re-initializing the inode failed, and we are in deep
 215                         * trouble.  Try to re-add it to the reclaim list.
 216                         */
 217                        rcu_read_lock();
 218                        spin_lock(&ip->i_flags_lock);
 219
 220                        ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM);
 221                        ASSERT(ip->i_flags & XFS_IRECLAIMABLE);
 222                        trace_xfs_iget_reclaim_fail(ip);
 223                        goto out_error;
 224                }
 225
 226                spin_lock(&pag->pag_ici_lock);
 227                spin_lock(&ip->i_flags_lock);
 228
 229                /*
 230                 * Clear the per-lifetime state in the inode as we are now
 231                 * effectively a new inode and need to return to the initial
 232                 * state before reuse occurs.
 233                 */
 234                ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS;
 235                ip->i_flags |= XFS_INEW;
 236                __xfs_inode_clear_reclaim_tag(mp, pag, ip);
 237                inode->i_state = I_NEW;
 238
 239                ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
 240                mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
 241
 242                spin_unlock(&ip->i_flags_lock);
 243                spin_unlock(&pag->pag_ici_lock);
 244        } else {
 245                /* If the VFS inode is being torn down, pause and try again. */
 246                if (!igrab(inode)) {
 247                        trace_xfs_iget_skip(ip);
 248                        error = -EAGAIN;
 249                        goto out_error;
 250                }
 251
 252                /* We've got a live one. */
 253                spin_unlock(&ip->i_flags_lock);
 254                rcu_read_unlock();
 255                trace_xfs_iget_hit(ip);
 256        }
 257
 258        if (lock_flags != 0)
 259                xfs_ilock(ip, lock_flags);
 260
 261        xfs_iflags_clear(ip, XFS_ISTALE | XFS_IDONTCACHE);
 262        XFS_STATS_INC(xs_ig_found);
 263
 264        return 0;
 265
 266out_error:
 267        spin_unlock(&ip->i_flags_lock);
 268        rcu_read_unlock();
 269        return error;
 270}
 271
 272
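    /*
     * Cache miss: allocate a new in-core inode, read it in from disk and
     * insert it into the per-AG radix tree with XFS_INEW set so that
     * concurrent lookups see an inode that is still under construction.
     */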
 273static int
 274xfs_iget_cache_miss(
 275        struct xfs_mount        *mp,
 276        struct xfs_perag        *pag,
 277        xfs_trans_t             *tp,
 278        xfs_ino_t               ino,
 279        struct xfs_inode        **ipp,
 280        int                     flags,
 281        int                     lock_flags)
 282{
 283        struct xfs_inode        *ip;
 284        int                     error;
 285        xfs_agino_t             agino = XFS_INO_TO_AGINO(mp, ino);
 286        int                     iflags;
 287
 288        ip = xfs_inode_alloc(mp, ino);
 289        if (!ip)
 290                return -ENOMEM;
 291
 292        error = xfs_iread(mp, tp, ip, flags);
 293        if (error)
 294                goto out_destroy;
 295
 296        trace_xfs_iget_miss(ip);
 297
 298        if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) {
 299                error = -ENOENT;
 300                goto out_destroy;
 301        }
 302
 303        /*
 304         * Preload the radix tree so we can insert safely under the
 305         * write spinlock. Note that we cannot sleep inside the preload
 306         * region. Since we can be called from transaction context, don't
 307         * recurse into the file system.
 308         */
 309        if (radix_tree_preload(GFP_NOFS)) {
 310                error = -EAGAIN;
 311                goto out_destroy;
 312        }
 313
 314        /*
 315         * Because the inode hasn't been added to the radix-tree yet it can't
 316         * be found by another thread, so we can do the non-sleeping lock here.
 317         */
 318        if (lock_flags) {
 319                if (!xfs_ilock_nowait(ip, lock_flags))
 320                        BUG();
 321        }
 322
 323        /*
 324         * These values must be set before inserting the inode into the radix
 325         * tree as the moment it is inserted a concurrent lookup (allowed by the
 326         * RCU locking mechanism) can find it and that lookup must see that this
 327         * is an inode currently under construction (i.e. that XFS_INEW is set).
 328         * The ip->i_flags_lock that protects the XFS_INEW flag forms the
 329         * memory barrier that ensures this detection works correctly at lookup
 330         * time.
 331         */
 332        iflags = XFS_INEW;
 333        if (flags & XFS_IGET_DONTCACHE)
 334                iflags |= XFS_IDONTCACHE;
 335        ip->i_udquot = NULL;
 336        ip->i_gdquot = NULL;
 337        ip->i_pdquot = NULL;
 338        xfs_iflags_set(ip, iflags);
 339
 340        /* insert the new inode */
 341        spin_lock(&pag->pag_ici_lock);
 342        error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
 343        if (unlikely(error)) {
 344                WARN_ON(error != -EEXIST);
 345                XFS_STATS_INC(xs_ig_dup);
 346                error = -EAGAIN;
 347                goto out_preload_end;
 348        }
 349        spin_unlock(&pag->pag_ici_lock);
 350        radix_tree_preload_end();
 351
 352        *ipp = ip;
 353        return 0;
 354
 355out_preload_end:
 356        spin_unlock(&pag->pag_ici_lock);
 357        radix_tree_preload_end();
 358        if (lock_flags)
 359                xfs_iunlock(ip, lock_flags);
 360out_destroy:
 361        __destroy_inode(VFS_I(ip));
 362        xfs_inode_free(ip);
 363        return error;
 364}
 365
 366/*
 367 * Look up an inode by number in the given file system.
 368 * The inode is looked up in the cache held in each AG.
 369 * If the inode is found in the cache, initialise the vfs inode
 370 * if necessary.
 371 *
 372 * If it is not in core, read it in from the file system's device,
 373 * add it to the cache and initialise the vfs inode.
 374 *
 375 * The inode is locked according to the value of the lock_flags parameter.
 376 * This flag parameter indicates how and if the inode's IO lock and inode lock
 377 * should be taken.
 378 *
 379 * mp -- the mount point structure for the current file system.  It points
 380 *       to the per-AG inode caches.
 381 * tp -- a pointer to the current transaction if there is one.  This is
 382 *       simply passed through to the xfs_iread() call.
 383 * ino -- the number of the inode desired.  This is the unique identifier
 384 *        within the file system for the inode being requested.
 385 * lock_flags -- flags indicating how to lock the inode.  See the comment
 386 *               for xfs_ilock() for a list of valid values.
 387 */
 388int
 389xfs_iget(
 390        xfs_mount_t     *mp,
 391        xfs_trans_t     *tp,
 392        xfs_ino_t       ino,
 393        uint            flags,
 394        uint            lock_flags,
 395        xfs_inode_t     **ipp)
 396{
 397        xfs_inode_t     *ip;
 398        int             error;
 399        xfs_perag_t     *pag;
 400        xfs_agino_t     agino;
 401
 402        /*
 403         * xfs_reclaim_inode() uses the ILOCK to ensure an inode
 404         * doesn't get freed while it's being referenced during a
 405         * radix tree traversal here.  It assumes this function
 406         * acquires only the ILOCK (and therefore it has no need to
 407         * involve the IOLOCK in this synchronization).
 408         */
 409        ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0);
 410
 411        /* reject inode numbers outside existing AGs */
 412        if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
 413                return -EINVAL;
 414
 415        /* get the perag structure and ensure that it's inode capable */
 416        pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
 417        agino = XFS_INO_TO_AGINO(mp, ino);
 418
 419again:
 420        error = 0;
 421        rcu_read_lock();
 422        ip = radix_tree_lookup(&pag->pag_ici_root, agino);
 423
 424        if (ip) {
 425                error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags);
 426                if (error)
 427                        goto out_error_or_again;
 428        } else {
 429                rcu_read_unlock();
 430                XFS_STATS_INC(xs_ig_missed);
 431
 432                error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,
 433                                                        flags, lock_flags);
 434                if (error)
 435                        goto out_error_or_again;
 436        }
 437        xfs_perag_put(pag);
 438
 439        *ipp = ip;
 440
 441        /*
 442         * If we have a real type for an on-disk inode, we can set ops(&unlock)
 443         * now.  If it's a new inode being created, xfs_ialloc will handle it.
 444         */
 445        if (xfs_iflags_test(ip, XFS_INEW) && ip->i_d.di_mode != 0)
 446                xfs_setup_inode(ip);
 447        return 0;
 448
 449out_error_or_again:
 450        if (error == -EAGAIN) {
 451                delay(1);
 452                goto again;
 453        }
 454        xfs_perag_put(pag);
 455        return error;
 456}
 457
 458/*
 459 * The inode lookup is done in batches to keep the amount of lock traffic and
 460 * radix tree lookups to a minimum. The batch size is a trade-off between
 461 * lookup reduction and stack usage. This is in the reclaim path, so we can't
 462 * be too greedy.
 463 */
 464#define XFS_LOOKUP_BATCH        32
 465
 466STATIC int
 467xfs_inode_ag_walk_grab(
 468        struct xfs_inode        *ip)
 469{
 470        struct inode            *inode = VFS_I(ip);
 471
 472        ASSERT(rcu_read_lock_held());
 473
 474        /*
 475         * check for stale RCU freed inode
 476         *
 477         * If the inode has been reallocated, it doesn't matter if it's not in
 478         * the AG we are walking - we are walking for writeback, so if it
 479         * passes all the "valid inode" checks and is dirty, then we'll write
 480         * it back anyway.  If it has been reallocated and still being
 481         * it back anyway.  If it has been reallocated and is still being
 482         */
 483        spin_lock(&ip->i_flags_lock);
 484        if (!ip->i_ino)
 485                goto out_unlock_noent;
 486
 487        /* avoid new or reclaimable inodes. Leave for reclaim code to flush */
 488        if (__xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
 489                goto out_unlock_noent;
 490        spin_unlock(&ip->i_flags_lock);
 491
 492        /* nothing to sync during shutdown */
 493        if (XFS_FORCED_SHUTDOWN(ip->i_mount))
 494                return -EFSCORRUPTED;
 495
 496        /* If we can't grab the inode, it must be on its way to reclaim. */
 497        if (!igrab(inode))
 498                return -ENOENT;
 499
 500        /* inode is valid */
 501        return 0;
 502
 503out_unlock_noent:
 504        spin_unlock(&ip->i_flags_lock);
 505        return -ENOENT;
 506}
 507
 508STATIC int
 509xfs_inode_ag_walk(
 510        struct xfs_mount        *mp,
 511        struct xfs_perag        *pag,
 512        int                     (*execute)(struct xfs_inode *ip, int flags,
 513                                           void *args),
 514        int                     flags,
 515        void                    *args,
 516        int                     tag)
 517{
 518        uint32_t                first_index;
 519        int                     last_error = 0;
 520        int                     skipped;
 521        int                     done;
 522        int                     nr_found;
 523
 524restart:
 525        done = 0;
 526        skipped = 0;
 527        first_index = 0;
 528        nr_found = 0;
 529        do {
 530                struct xfs_inode *batch[XFS_LOOKUP_BATCH];
 531                int             error = 0;
 532                int             i;
 533
 534                rcu_read_lock();
 535
 536                if (tag == -1)
 537                        nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
 538                                        (void **)batch, first_index,
 539                                        XFS_LOOKUP_BATCH);
 540                else
 541                        nr_found = radix_tree_gang_lookup_tag(
 542                                        &pag->pag_ici_root,
 543                                        (void **) batch, first_index,
 544                                        XFS_LOOKUP_BATCH, tag);
 545
 546                if (!nr_found) {
 547                        rcu_read_unlock();
 548                        break;
 549                }
 550
 551                /*
 552                 * Grab the inodes before we drop the lock. If we found
 553                 * nothing, nr_found == 0 and the loop will be skipped.
 554                 */
 555                for (i = 0; i < nr_found; i++) {
 556                        struct xfs_inode *ip = batch[i];
 557
 558                        if (done || xfs_inode_ag_walk_grab(ip))
 559                                batch[i] = NULL;
 560
 561                        /*
 562                         * Update the index for the next lookup. Catch
 563                         * overflows into the next AG range which can occur if
 564                         * we have inodes in the last block of the AG and we
 565                         * are currently pointing to the last inode.
 566                         *
 567                         * Because we may see inodes that are from the wrong AG
 568                         * due to RCU freeing and reallocation, only update the
 569                         * index if it lies in this AG. It was a race that led
 570                         * us to see this inode, so another lookup from the
 571                         * same index will not find it again.
 572                         */
 573                        if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno)
 574                                continue;
 575                        first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
 576                        if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
 577                                done = 1;
 578                }
 579
 580                /* unlock now that we've grabbed the inodes. */
 581                rcu_read_unlock();
 582
 583                for (i = 0; i < nr_found; i++) {
 584                        if (!batch[i])
 585                                continue;
 586                        error = execute(batch[i], flags, args);
 587                        IRELE(batch[i]);
 588                        if (error == -EAGAIN) {
 589                                skipped++;
 590                                continue;
 591                        }
 592                        if (error && last_error != -EFSCORRUPTED)
 593                                last_error = error;
 594                }
 595
 596                /* bail out if the filesystem is corrupted.  */
 597                if (error == -EFSCORRUPTED)
 598                        break;
 599
 600                cond_resched();
 601
 602        } while (nr_found && !done);
 603
 604        if (skipped) {
 605                delay(1);
 606                goto restart;
 607        }
 608        return last_error;
 609}
 610
 611/*
 612 * Background scanning to trim post-EOF preallocated space. This is queued
 613 * based on the 'speculative_prealloc_lifetime' tunable (5m by default).
 614 */
 615STATIC void
 616xfs_queue_eofblocks(
 617        struct xfs_mount *mp)
 618{
 619        rcu_read_lock();
 620        if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_EOFBLOCKS_TAG))
 621                queue_delayed_work(mp->m_eofblocks_workqueue,
 622                                   &mp->m_eofblocks_work,
 623                                   msecs_to_jiffies(xfs_eofb_secs * 1000));
 624        rcu_read_unlock();
 625}
 626
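    /*
     * Background worker that trims post-EOF blocks from all tagged inodes and
     * requeues itself while tagged inodes remain.
     */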
 627void
 628xfs_eofblocks_worker(
 629        struct work_struct *work)
 630{
 631        struct xfs_mount *mp = container_of(to_delayed_work(work),
 632                                struct xfs_mount, m_eofblocks_work);
 633        xfs_icache_free_eofblocks(mp, NULL);
 634        xfs_queue_eofblocks(mp);
 635}
 636
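    /*
     * Walk every AG in the filesystem and call the execute callback on each
     * cached inode that can be grabbed.
     */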
 637int
 638xfs_inode_ag_iterator(
 639        struct xfs_mount        *mp,
 640        int                     (*execute)(struct xfs_inode *ip, int flags,
 641                                           void *args),
 642        int                     flags,
 643        void                    *args)
 644{
 645        struct xfs_perag        *pag;
 646        int                     error = 0;
 647        int                     last_error = 0;
 648        xfs_agnumber_t          ag;
 649
 650        ag = 0;
 651        while ((pag = xfs_perag_get(mp, ag))) {
 652                ag = pag->pag_agno + 1;
 653                error = xfs_inode_ag_walk(mp, pag, execute, flags, args, -1);
 654                xfs_perag_put(pag);
 655                if (error) {
 656                        last_error = error;
 657                        if (error == -EFSCORRUPTED)
 658                                break;
 659                }
 660        }
 661        return last_error;
 662}
 663
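    /*
     * As xfs_inode_ag_iterator(), but only visit AGs and inodes carrying the
     * given radix tree tag.
     */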
 664int
 665xfs_inode_ag_iterator_tag(
 666        struct xfs_mount        *mp,
 667        int                     (*execute)(struct xfs_inode *ip, int flags,
 668                                           void *args),
 669        int                     flags,
 670        void                    *args,
 671        int                     tag)
 672{
 673        struct xfs_perag        *pag;
 674        int                     error = 0;
 675        int                     last_error = 0;
 676        xfs_agnumber_t          ag;
 677
 678        ag = 0;
 679        while ((pag = xfs_perag_get_tag(mp, ag, tag))) {
 680                ag = pag->pag_agno + 1;
 681                error = xfs_inode_ag_walk(mp, pag, execute, flags, args, tag);
 682                xfs_perag_put(pag);
 683                if (error) {
 684                        last_error = error;
 685                        if (error == -EFSCORRUPTED)
 686                                break;
 687                }
 688        }
 689        return last_error;
 690}
 691
 692/*
 693 * Queue a new inode reclaim pass if there are reclaimable inodes and there
 694 * isn't a reclaim pass already in progress. By default it runs every 5s based
 695 * on the xfs periodic sync default of 30s. Perhaps this should have its own
 696 * tunable, but that can be done if this method proves to be ineffective or too
 697 * aggressive.
 698 */
 699static void
 700xfs_reclaim_work_queue(
 701        struct xfs_mount        *mp)
 702{
 703
 704        rcu_read_lock();
 705        if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
 706                queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work,
 707                        msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
 708        }
 709        rcu_read_unlock();
 710}
 711
 712/*
 713 * This is a fast pass over the inode cache to try to get reclaim moving on as
 714 * many inodes as possible in a short period of time. It kicks itself every few
 715 * seconds, as well as being kicked by the inode cache shrinker when memory
 716 * goes low. It scans as quickly as possible avoiding locked inodes or those
 717 * already being flushed, and once done schedules a future pass.
 718 */
 719void
 720xfs_reclaim_worker(
 721        struct work_struct *work)
 722{
 723        struct xfs_mount *mp = container_of(to_delayed_work(work),
 724                                        struct xfs_mount, m_reclaim_work);
 725
 726        xfs_reclaim_inodes(mp, SYNC_TRYLOCK);
 727        xfs_reclaim_work_queue(mp);
 728}
 729
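    /*
     * Tag the inode as reclaimable in the per-AG radix tree.  The first
     * reclaimable inode in an AG also propagates the tag into the per-mount
     * perag tree and schedules the background reclaim worker.
     */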
 730static void
 731__xfs_inode_set_reclaim_tag(
 732        struct xfs_perag        *pag,
 733        struct xfs_inode        *ip)
 734{
 735        radix_tree_tag_set(&pag->pag_ici_root,
 736                           XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
 737                           XFS_ICI_RECLAIM_TAG);
 738
 739        if (!pag->pag_ici_reclaimable) {
 740                /* propagate the reclaim tag up into the perag radix tree */
 741                spin_lock(&ip->i_mount->m_perag_lock);
 742                radix_tree_tag_set(&ip->i_mount->m_perag_tree,
 743                                XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
 744                                XFS_ICI_RECLAIM_TAG);
 745                spin_unlock(&ip->i_mount->m_perag_lock);
 746
 747                /* schedule periodic background inode reclaim */
 748                xfs_reclaim_work_queue(ip->i_mount);
 749
 750                trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno,
 751                                                        -1, _RET_IP_);
 752        }
 753        pag->pag_ici_reclaimable++;
 754}
 755
 756/*
 757 * We set the inode flag atomically with the radix tree tag.
 758 * Once we get tag lookups on the radix tree, this inode flag
 759 * can go away.
 760 */
 761void
 762xfs_inode_set_reclaim_tag(
 763        xfs_inode_t     *ip)
 764{
 765        struct xfs_mount *mp = ip->i_mount;
 766        struct xfs_perag *pag;
 767
 768        pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
 769        spin_lock(&pag->pag_ici_lock);
 770        spin_lock(&ip->i_flags_lock);
 771        __xfs_inode_set_reclaim_tag(pag, ip);
 772        __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
 773        spin_unlock(&ip->i_flags_lock);
 774        spin_unlock(&pag->pag_ici_lock);
 775        xfs_perag_put(pag);
 776}
 777
 778STATIC void
 779__xfs_inode_clear_reclaim(
 780        xfs_perag_t     *pag,
 781        xfs_inode_t     *ip)
 782{
 783        pag->pag_ici_reclaimable--;
 784        if (!pag->pag_ici_reclaimable) {
 785                /* clear the reclaim tag from the perag radix tree */
 786                spin_lock(&ip->i_mount->m_perag_lock);
 787                radix_tree_tag_clear(&ip->i_mount->m_perag_tree,
 788                                XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
 789                                XFS_ICI_RECLAIM_TAG);
 790                spin_unlock(&ip->i_mount->m_perag_lock);
 791                trace_xfs_perag_clear_reclaim(ip->i_mount, pag->pag_agno,
 792                                                        -1, _RET_IP_);
 793        }
 794}
 795
 796STATIC void
 797__xfs_inode_clear_reclaim_tag(
 798        xfs_mount_t     *mp,
 799        xfs_perag_t     *pag,
 800        xfs_inode_t     *ip)
 801{
 802        radix_tree_tag_clear(&pag->pag_ici_root,
 803                        XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
 804        __xfs_inode_clear_reclaim(pag, ip);
 805}
 806
 807/*
 808 * Grab the inode for reclaim exclusively.
 809 * Return 0 if we grabbed it, non-zero otherwise.
 810 */
 811STATIC int
 812xfs_reclaim_inode_grab(
 813        struct xfs_inode        *ip,
 814        int                     flags)
 815{
 816        ASSERT(rcu_read_lock_held());
 817
 818        /* quick check for stale RCU freed inode */
 819        if (!ip->i_ino)
 820                return 1;
 821
 822        /*
 823         * If we are asked for non-blocking operation, do unlocked checks to
 824         * see if the inode is already being flushed or in reclaim to avoid
 825         * lock traffic.
 826         */
 827        if ((flags & SYNC_TRYLOCK) &&
 828            __xfs_iflags_test(ip, XFS_IFLOCK | XFS_IRECLAIM))
 829                return 1;
 830
 831        /*
 832         * The radix tree lock here protects a thread in xfs_iget from racing
 833         * with us starting reclaim on the inode.  Once we have the
 834         * XFS_IRECLAIM flag set it will not touch us.
 835         *
 836         * Due to RCU lookup, we may find inodes that have been freed and only
 837         * have XFS_IRECLAIM set.  Indeed, we may see reallocated inodes that
 838         * aren't candidates for reclaim at all, so we must check that
 839         * XFS_IRECLAIMABLE is set first before proceeding to reclaim.
 840         */
 841        spin_lock(&ip->i_flags_lock);
 842        if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) ||
 843            __xfs_iflags_test(ip, XFS_IRECLAIM)) {
 844                /* not a reclaim candidate. */
 845                spin_unlock(&ip->i_flags_lock);
 846                return 1;
 847        }
 848        __xfs_iflags_set(ip, XFS_IRECLAIM);
 849        spin_unlock(&ip->i_flags_lock);
 850        return 0;
 851}
 852
 853/*
 854 * Inodes in different states need to be treated differently. The following
 855 * table lists the inode states and the reclaim actions necessary:
 856 *
 857 *      inode state          iflush ret         required action
 858 *      ---------------      ----------         ---------------
 859 *      bad                     -               reclaim
 860 *      shutdown                EIO             unpin and reclaim
 861 *      clean, unpinned         0               reclaim
 862 *      stale, unpinned         0               reclaim
 863 *      clean, pinned(*)        0               requeue
 864 *      stale, pinned           EAGAIN          requeue
 865 *      dirty, async            -               requeue
 866 *      dirty, sync             0               reclaim
 867 *
 868 * (*) dgc: I don't think the clean, pinned state is possible but it gets
 869 * handled anyway given the order of checks implemented.
 870 *
 871 * Also, because we get the flush lock first, we know that any inode that has
 872 * been flushed delwri has had the flush completed by the time we check that
 873 * the inode is clean.
 874 *
 875 * Note that because the inode is flushed delayed write by AIL pushing, the
 876 * flush lock may already be held here and waiting on it can result in very
 877 * long latencies.  Hence for sync reclaims, where we wait on the flush lock,
 878 * the caller should push the AIL first before trying to reclaim inodes to
 879 * minimise the amount of time spent waiting.  For background reclaim, we only
 880 * bother to reclaim clean inodes anyway.
 881 *
 882 * Hence the order of actions after gaining the locks should be:
 883 *      bad             => reclaim
 884 *      shutdown        => unpin and reclaim
 885 *      pinned, async   => requeue
 886 *      pinned, sync    => unpin
 887 *      stale           => reclaim
 888 *      clean           => reclaim
 889 *      dirty, async    => requeue
 890 *      dirty, sync     => flush, wait and reclaim
 891 */
 892STATIC int
 893xfs_reclaim_inode(
 894        struct xfs_inode        *ip,
 895        struct xfs_perag        *pag,
 896        int                     sync_mode)
 897{
 898        struct xfs_buf          *bp = NULL;
 899        int                     error;
 900
 901restart:
 902        error = 0;
 903        xfs_ilock(ip, XFS_ILOCK_EXCL);
 904        if (!xfs_iflock_nowait(ip)) {
 905                if (!(sync_mode & SYNC_WAIT))
 906                        goto out;
 907                xfs_iflock(ip);
 908        }
 909
 910        if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
 911                xfs_iunpin_wait(ip);
 912                xfs_iflush_abort(ip, false);
 913                goto reclaim;
 914        }
 915        if (xfs_ipincount(ip)) {
 916                if (!(sync_mode & SYNC_WAIT))
 917                        goto out_ifunlock;
 918                xfs_iunpin_wait(ip);
 919        }
 920        if (xfs_iflags_test(ip, XFS_ISTALE))
 921                goto reclaim;
 922        if (xfs_inode_clean(ip))
 923                goto reclaim;
 924
 925        /*
 926         * Never flush out dirty data during non-blocking reclaim, as it would
 927         * just contend with AIL pushing trying to do the same job.
 928         */
 929        if (!(sync_mode & SYNC_WAIT))
 930                goto out_ifunlock;
 931
 932        /*
 933         * Now we have an inode that needs flushing.
 934         *
 935         * Note that xfs_iflush will never block on the inode buffer lock, as
 936         * xfs_ifree_cluster() can lock the inode buffer before it locks the
 937         * ip->i_lock, and we are doing the exact opposite here.  As a result,
 938         * doing a blocking xfs_imap_to_bp() to get the cluster buffer would
 939         * result in an ABBA deadlock with xfs_ifree_cluster().
 940         *
 941         * As xfs_ifree_cluster() must gather all inodes that are active in the
 942         * cache to mark them stale, if we hit this case we don't actually want
 943         * to do IO here - we want the inode marked stale so we can simply
 944         * reclaim it.  Hence if we get an EAGAIN error here, just unlock the
 945         * inode, back off and try again.  Hopefully the next pass through will
 946         * see the stale flag set on the inode.
 947         */
 948        error = xfs_iflush(ip, &bp);
 949        if (error == -EAGAIN) {
 950                xfs_iunlock(ip, XFS_ILOCK_EXCL);
 951                /* backoff longer than in xfs_ifree_cluster */
 952                delay(2);
 953                goto restart;
 954        }
 955
 956        if (!error) {
 957                error = xfs_bwrite(bp);
 958                xfs_buf_relse(bp);
 959        }
 960
 961        xfs_iflock(ip);
 962reclaim:
 963        xfs_ifunlock(ip);
 964        xfs_iunlock(ip, XFS_ILOCK_EXCL);
 965
 966        XFS_STATS_INC(xs_ig_reclaims);
 967        /*
 968         * Remove the inode from the per-AG radix tree.
 969         *
 970         * Because radix_tree_delete won't complain even if the item was never
 971         * added to the tree, assert that it's been there before to catch
 972         * problems with the inode lifetime early on.
 973         */
 974        spin_lock(&pag->pag_ici_lock);
 975        if (!radix_tree_delete(&pag->pag_ici_root,
 976                                XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino)))
 977                ASSERT(0);
 978        __xfs_inode_clear_reclaim(pag, ip);
 979        spin_unlock(&pag->pag_ici_lock);
 980
 981        /*
 982         * Here we do an (almost) spurious inode lock in order to coordinate
 983         * with inode cache radix tree lookups.  This is because the lookup
 984         * can reference the inodes in the cache without taking references.
 985         *
 986         * We make that OK here by ensuring that we wait until the inode is
 987         * unlocked after the lookup before we go ahead and free it.
 988         */
 989        xfs_ilock(ip, XFS_ILOCK_EXCL);
 990        xfs_qm_dqdetach(ip);
 991        xfs_iunlock(ip, XFS_ILOCK_EXCL);
 992
 993        xfs_inode_free(ip);
 994        return error;
 995
 996out_ifunlock:
 997        xfs_ifunlock(ip);
 998out:
 999        xfs_iflags_clear(ip, XFS_IRECLAIM);
1000        xfs_iunlock(ip, XFS_ILOCK_EXCL);
1001        /*
1002         * We could return -EAGAIN here to make reclaim rescan the inode tree in
1003         * a short while. However, this just burns CPU time scanning the tree
1004         * waiting for IO to complete and the reclaim work never goes back to
1005         * the idle state. Instead, return 0 to let the next scheduled
1006         * background reclaim attempt to reclaim the inode again.
1007         */
1008        return 0;
1009}
1010
1011/*
1012 * Walk the AGs and reclaim the inodes in them. Even if the filesystem is
1013 * corrupted, we still want to try to reclaim all the inodes. If we don't,
1014 * then a shutdown during the filesystem unmount reclaim walk will leak all the
1015 * unreclaimed inodes.
1016 */
1017STATIC int
1018xfs_reclaim_inodes_ag(
1019        struct xfs_mount        *mp,
1020        int                     flags,
1021        int                     *nr_to_scan)
1022{
1023        struct xfs_perag        *pag;
1024        int                     error = 0;
1025        int                     last_error = 0;
1026        xfs_agnumber_t          ag;
1027        int                     trylock = flags & SYNC_TRYLOCK;
1028        int                     skipped;
1029
1030restart:
1031        ag = 0;
1032        skipped = 0;
1033        while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
1034                unsigned long   first_index = 0;
1035                int             done = 0;
1036                int             nr_found = 0;
1037
1038                ag = pag->pag_agno + 1;
1039
1040                if (trylock) {
1041                        if (!mutex_trylock(&pag->pag_ici_reclaim_lock)) {
1042                                skipped++;
1043                                xfs_perag_put(pag);
1044                                continue;
1045                        }
1046                        first_index = pag->pag_ici_reclaim_cursor;
1047                } else
1048                        mutex_lock(&pag->pag_ici_reclaim_lock);
1049
1050                do {
1051                        struct xfs_inode *batch[XFS_LOOKUP_BATCH];
1052                        int     i;
1053
1054                        rcu_read_lock();
1055                        nr_found = radix_tree_gang_lookup_tag(
1056                                        &pag->pag_ici_root,
1057                                        (void **)batch, first_index,
1058                                        XFS_LOOKUP_BATCH,
1059                                        XFS_ICI_RECLAIM_TAG);
1060                        if (!nr_found) {
1061                                done = 1;
1062                                rcu_read_unlock();
1063                                break;
1064                        }
1065
1066                        /*
1067                         * Grab the inodes before we drop the lock. If we found
1068                         * nothing, nr_found == 0 and the loop will be skipped.
1069                         */
1070                        for (i = 0; i < nr_found; i++) {
1071                                struct xfs_inode *ip = batch[i];
1072
1073                                if (done || xfs_reclaim_inode_grab(ip, flags))
1074                                        batch[i] = NULL;
1075
1076                                /*
1077                                 * Update the index for the next lookup. Catch
1078                                 * overflows into the next AG range which can
1079                                 * occur if we have inodes in the last block of
1080                                 * the AG and we are currently pointing to the
1081                                 * last inode.
1082                                 *
1083                                 * Because we may see inodes that are from the
1084                                 * wrong AG due to RCU freeing and
1085                                 * reallocation, only update the index if it
1086                                 * lies in this AG. It was a race that led us
1087                                 * to see this inode, so another lookup from
1088                                 * the same index will not find it again.
1089                                 */
1090                                if (XFS_INO_TO_AGNO(mp, ip->i_ino) !=
1091                                                                pag->pag_agno)
1092                                        continue;
1093                                first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
1094                                if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
1095                                        done = 1;
1096                        }
1097
1098                        /* unlock now that we've grabbed the inodes. */
1099                        rcu_read_unlock();
1100
1101                        for (i = 0; i < nr_found; i++) {
1102                                if (!batch[i])
1103                                        continue;
1104                                error = xfs_reclaim_inode(batch[i], pag, flags);
1105                                if (error && last_error != -EFSCORRUPTED)
1106                                        last_error = error;
1107                        }
1108
1109                        *nr_to_scan -= XFS_LOOKUP_BATCH;
1110
1111                        cond_resched();
1112
1113                } while (nr_found && !done && *nr_to_scan > 0);
1114
1115                if (trylock && !done)
1116                        pag->pag_ici_reclaim_cursor = first_index;
1117                else
1118                        pag->pag_ici_reclaim_cursor = 0;
1119                mutex_unlock(&pag->pag_ici_reclaim_lock);
1120                xfs_perag_put(pag);
1121        }
1122
1123        /*
1124         * If we skipped any AG, and we still have scan count remaining, do
1125         * another pass this time using blocking reclaim semantics (i.e.
1126         * waiting on the reclaim locks and ignoring the reclaim cursors). This
1127         * ensures that when we get more reclaimers than AGs we block rather
1128         * than spin trying to execute reclaim.
1129         */
1130        if (skipped && (flags & SYNC_WAIT) && *nr_to_scan > 0) {
1131                trylock = 0;
1132                goto restart;
1133        }
1134        return last_error;
1135}
1136
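    /*
     * Reclaim every reclaimable inode in the filesystem using the given sync
     * mode.
     */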
1137int
1138xfs_reclaim_inodes(
1139        xfs_mount_t     *mp,
1140        int             mode)
1141{
1142        int             nr_to_scan = INT_MAX;
1143
1144        return xfs_reclaim_inodes_ag(mp, mode, &nr_to_scan);
1145}
1146
1147/*
1148 * Scan a certain number of inodes for reclaim.
1149 *
1150 * When called, we make sure that there is a background (fast) inode reclaim in
1151 * progress, while we throttle the speed of reclaim by doing synchronous
1152 * reclaim of inodes. That means if we come across dirty inodes, we wait for
1153 * them to be cleaned, which we hope will not be very long due to the
1154 * background walker having already kicked the IO off on those dirty inodes.
1155 */
1156long
1157xfs_reclaim_inodes_nr(
1158        struct xfs_mount        *mp,
1159        int                     nr_to_scan)
1160{
1161        /* kick background reclaimer and push the AIL */
1162        xfs_reclaim_work_queue(mp);
1163        xfs_ail_push_all(mp->m_ail);
1164
1165        return xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, &nr_to_scan);
1166}
1167
1168/*
1169 * Return the number of reclaimable inodes in the filesystem for
1170 * the shrinker to determine how much to reclaim.
1171 */
1172int
1173xfs_reclaim_inodes_count(
1174        struct xfs_mount        *mp)
1175{
1176        struct xfs_perag        *pag;
1177        xfs_agnumber_t          ag = 0;
1178        int                     reclaimable = 0;
1179
1180        while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
1181                ag = pag->pag_agno + 1;
1182                reclaimable += pag->pag_ici_reclaimable;
1183                xfs_perag_put(pag);
1184        }
1185        return reclaimable;
1186}
1187
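    /*
     * An intersection-based inode filter: the inode must match every ID
     * criterion (uid, gid, project id) set in the eofblocks control structure.
     */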
1188STATIC int
1189xfs_inode_match_id(
1190        struct xfs_inode        *ip,
1191        struct xfs_eofblocks    *eofb)
1192{
1193        if ((eofb->eof_flags & XFS_EOF_FLAGS_UID) &&
1194            !uid_eq(VFS_I(ip)->i_uid, eofb->eof_uid))
1195                return 0;
1196
1197        if ((eofb->eof_flags & XFS_EOF_FLAGS_GID) &&
1198            !gid_eq(VFS_I(ip)->i_gid, eofb->eof_gid))
1199                return 0;
1200
1201        if ((eofb->eof_flags & XFS_EOF_FLAGS_PRID) &&
1202            xfs_get_projid(ip) != eofb->eof_prid)
1203                return 0;
1204
1205        return 1;
1206}
1207
1208/*
1209 * A union-based inode filtering algorithm. Process the inode if any of the
1210 * criteria match. This is for global/internal scans only.
1211 */
1212STATIC int
1213xfs_inode_match_id_union(
1214        struct xfs_inode        *ip,
1215        struct xfs_eofblocks    *eofb)
1216{
1217        if ((eofb->eof_flags & XFS_EOF_FLAGS_UID) &&
1218            uid_eq(VFS_I(ip)->i_uid, eofb->eof_uid))
1219                return 1;
1220
1221        if ((eofb->eof_flags & XFS_EOF_FLAGS_GID) &&
1222            gid_eq(VFS_I(ip)->i_gid, eofb->eof_gid))
1223                return 1;
1224
1225        if ((eofb->eof_flags & XFS_EOF_FLAGS_PRID) &&
1226            xfs_get_projid(ip) == eofb->eof_prid)
1227                return 1;
1228
1229        return 0;
1230}
1231
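    /*
     * Per-inode callback for the eofblocks scans: apply the ID and size
     * filters and, if the inode qualifies, free its post-EOF preallocated
     * blocks.
     */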
1232STATIC int
1233xfs_inode_free_eofblocks(
1234        struct xfs_inode        *ip,
1235        int                     flags,
1236        void                    *args)
1237{
1238        int ret;
1239        struct xfs_eofblocks *eofb = args;
1240        bool need_iolock = true;
1241        int match;
1242
1243        ASSERT(!eofb || (eofb && eofb->eof_scan_owner != 0));
1244
1245        if (!xfs_can_free_eofblocks(ip, false)) {
1246                /* inode could be preallocated or append-only */
1247                trace_xfs_inode_free_eofblocks_invalid(ip);
1248                xfs_inode_clear_eofblocks_tag(ip);
1249                return 0;
1250        }
1251
1252        /*
1253         * If the mapping is dirty the operation can block and wait for some
1254         * time. Unless we are waiting, skip it.
1255         */
1256        if (!(flags & SYNC_WAIT) &&
1257            mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY))
1258                return 0;
1259
1260        if (eofb) {
1261                if (eofb->eof_flags & XFS_EOF_FLAGS_UNION)
1262                        match = xfs_inode_match_id_union(ip, eofb);
1263                else
1264                        match = xfs_inode_match_id(ip, eofb);
1265                if (!match)
1266                        return 0;
1267
1268                /* skip the inode if the file size is too small */
1269                if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE &&
1270                    XFS_ISIZE(ip) < eofb->eof_min_file_size)
1271                        return 0;
1272
1273                /*
1274                 * A scan owner implies we already hold the iolock. Skip it in
1275                 * xfs_free_eofblocks() to avoid deadlock. This also eliminates
1276                 * the possibility of EAGAIN being returned.
1277                 */
1278                if (eofb->eof_scan_owner == ip->i_ino)
1279                        need_iolock = false;
1280        }
1281
1282        ret = xfs_free_eofblocks(ip->i_mount, ip, need_iolock);
1283
1284        /* don't revisit the inode if we're not waiting */
1285        if (ret == -EAGAIN && !(flags & SYNC_WAIT))
1286                ret = 0;
1287
1288        return ret;
1289}
1290
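    /*
     * Free post-EOF blocks on all inodes tagged with XFS_ICI_EOFBLOCKS_TAG,
     * honouring the optional filter controls in eofb.
     */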
1291int
1292xfs_icache_free_eofblocks(
1293        struct xfs_mount        *mp,
1294        struct xfs_eofblocks    *eofb)
1295{
1296        int flags = SYNC_TRYLOCK;
1297
1298        if (eofb && (eofb->eof_flags & XFS_EOF_FLAGS_SYNC))
1299                flags = SYNC_WAIT;
1300
1301        return xfs_inode_ag_iterator_tag(mp, xfs_inode_free_eofblocks, flags,
1302                                         eofb, XFS_ICI_EOFBLOCKS_TAG);
1303}
1304
1305/*
1306 * Run eofblocks scans on the quotas applicable to the inode. For inodes with
1307 * multiple quotas, we don't know exactly which quota caused an allocation
1308 * failure. We make a best effort by including each quota under low free space
1309 * conditions (less than 1% free space) in the scan.
1310 */
1311int
1312xfs_inode_free_quota_eofblocks(
1313        struct xfs_inode *ip)
1314{
1315        int scan = 0;
1316        struct xfs_eofblocks eofb = {0};
1317        struct xfs_dquot *dq;
1318
1319        ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
1320
1321        /*
1322         * Set the scan owner to avoid a potential livelock. Otherwise, the scan
1323         * can repeatedly trylock on the inode we're currently processing. We
1324         * run a sync scan to increase effectiveness and use the union filter to
1325         * cover all applicable quotas in a single scan.
1326         */
1327        eofb.eof_scan_owner = ip->i_ino;
1328        eofb.eof_flags = XFS_EOF_FLAGS_UNION|XFS_EOF_FLAGS_SYNC;
1329
1330        if (XFS_IS_UQUOTA_ENFORCED(ip->i_mount)) {
1331                dq = xfs_inode_dquot(ip, XFS_DQ_USER);
1332                if (dq && xfs_dquot_lowsp(dq)) {
1333                        eofb.eof_uid = VFS_I(ip)->i_uid;
1334                        eofb.eof_flags |= XFS_EOF_FLAGS_UID;
1335                        scan = 1;
1336                }
1337        }
1338
1339        if (XFS_IS_GQUOTA_ENFORCED(ip->i_mount)) {
1340                dq = xfs_inode_dquot(ip, XFS_DQ_GROUP);
1341                if (dq && xfs_dquot_lowsp(dq)) {
1342                        eofb.eof_gid = VFS_I(ip)->i_gid;
1343                        eofb.eof_flags |= XFS_EOF_FLAGS_GID;
1344                        scan = 1;
1345                }
1346        }
1347
1348        if (scan)
1349                xfs_icache_free_eofblocks(ip->i_mount, &eofb);
1350
1351        return scan;
1352}
1353
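    /*
     * Tag the inode in the per-AG radix tree as having speculative post-EOF
     * preallocation.  The first tagged inode in an AG also propagates the tag
     * into the per-mount perag tree and kicks off background trimming.
     */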
1354void
1355xfs_inode_set_eofblocks_tag(
1356        xfs_inode_t     *ip)
1357{
1358        struct xfs_mount *mp = ip->i_mount;
1359        struct xfs_perag *pag;
1360        int tagged;
1361
1362        pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
1363        spin_lock(&pag->pag_ici_lock);
1364        trace_xfs_inode_set_eofblocks_tag(ip);
1365
1366        tagged = radix_tree_tagged(&pag->pag_ici_root,
1367                                   XFS_ICI_EOFBLOCKS_TAG);
1368        radix_tree_tag_set(&pag->pag_ici_root,
1369                           XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
1370                           XFS_ICI_EOFBLOCKS_TAG);
1371        if (!tagged) {
1372                /* propagate the eofblocks tag up into the perag radix tree */
1373                spin_lock(&ip->i_mount->m_perag_lock);
1374                radix_tree_tag_set(&ip->i_mount->m_perag_tree,
1375                                   XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
1376                                   XFS_ICI_EOFBLOCKS_TAG);
1377                spin_unlock(&ip->i_mount->m_perag_lock);
1378
1379                /* kick off background trimming */
1380                xfs_queue_eofblocks(ip->i_mount);
1381
1382                trace_xfs_perag_set_eofblocks(ip->i_mount, pag->pag_agno,
1383                                              -1, _RET_IP_);
1384        }
1385
1386        spin_unlock(&pag->pag_ici_lock);
1387        xfs_perag_put(pag);
1388}
1389
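    /*
     * Clear the eofblocks tag for the inode, dropping the tag from the
     * per-mount perag tree when this was the last tagged inode in the AG.
     */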
1390void
1391xfs_inode_clear_eofblocks_tag(
1392        xfs_inode_t     *ip)
1393{
1394        struct xfs_mount *mp = ip->i_mount;
1395        struct xfs_perag *pag;
1396
1397        pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
1398        spin_lock(&pag->pag_ici_lock);
1399        trace_xfs_inode_clear_eofblocks_tag(ip);
1400
1401        radix_tree_tag_clear(&pag->pag_ici_root,
1402                             XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
1403                             XFS_ICI_EOFBLOCKS_TAG);
1404        if (!radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_EOFBLOCKS_TAG)) {
1405                /* clear the eofblocks tag from the perag radix tree */
1406                spin_lock(&ip->i_mount->m_perag_lock);
1407                radix_tree_tag_clear(&ip->i_mount->m_perag_tree,
1408                                     XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
1409                                     XFS_ICI_EOFBLOCKS_TAG);
1410                spin_unlock(&ip->i_mount->m_perag_lock);
1411                trace_xfs_perag_clear_eofblocks(ip->i_mount, pag->pag_agno,
1412                                               -1, _RET_IP_);
1413        }
1414
1415        spin_unlock(&pag->pag_ici_lock);
1416        xfs_perag_put(pag);
1417}
1418
1419