linux/fs/xfs/xfs_icache.c
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
   4 * All Rights Reserved.
   5 */
   6#include "xfs.h"
   7#include "xfs_fs.h"
   8#include "xfs_shared.h"
   9#include "xfs_format.h"
  10#include "xfs_log_format.h"
  11#include "xfs_trans_resv.h"
  12#include "xfs_mount.h"
  13#include "xfs_inode.h"
  14#include "xfs_trans.h"
  15#include "xfs_trans_priv.h"
  16#include "xfs_inode_item.h"
  17#include "xfs_quota.h"
  18#include "xfs_trace.h"
  19#include "xfs_icache.h"
  20#include "xfs_bmap_util.h"
  21#include "xfs_dquot_item.h"
  22#include "xfs_dquot.h"
  23#include "xfs_reflink.h"
  24#include "xfs_ialloc.h"
  25#include "xfs_ag.h"
  26
  27#include <linux/iversion.h>
  28
  29/* Radix tree tags for incore inode tree. */
  30
  31/* inode is to be reclaimed */
  32#define XFS_ICI_RECLAIM_TAG     0
  33/* Inode has speculative preallocations (posteof or cow) to clean. */
  34#define XFS_ICI_BLOCKGC_TAG     1
  35
  36/*
  37 * The goal for walking incore inodes.  These can correspond with incore inode
  38 * radix tree tags when convenient.  Avoid existing XFS_IWALK namespace.
  39 */
  40enum xfs_icwalk_goal {
  41        /* Goals directly associated with tagged inodes. */
  42        XFS_ICWALK_BLOCKGC      = XFS_ICI_BLOCKGC_TAG,
  43        XFS_ICWALK_RECLAIM      = XFS_ICI_RECLAIM_TAG,
  44};
  45
  46static int xfs_icwalk(struct xfs_mount *mp,
  47                enum xfs_icwalk_goal goal, struct xfs_icwalk *icw);
  48static int xfs_icwalk_ag(struct xfs_perag *pag,
  49                enum xfs_icwalk_goal goal, struct xfs_icwalk *icw);
  50
  51/*
  52 * Private inode cache walk flags for struct xfs_icwalk.  Must not
  53 * coincide with XFS_ICWALK_FLAGS_VALID.
  54 */
  55
  56/* Stop scanning after icw_scan_limit inodes. */
  57#define XFS_ICWALK_FLAG_SCAN_LIMIT      (1U << 28)
  58
  59#define XFS_ICWALK_FLAG_RECLAIM_SICK    (1U << 27)
  60#define XFS_ICWALK_FLAG_UNION           (1U << 26) /* union filter algorithm */
  61
  62#define XFS_ICWALK_PRIVATE_FLAGS        (XFS_ICWALK_FLAG_SCAN_LIMIT | \
  63                                         XFS_ICWALK_FLAG_RECLAIM_SICK | \
  64                                         XFS_ICWALK_FLAG_UNION)
  65
  66/*
  67 * Allocate and initialise an xfs_inode.
  68 */
  69struct xfs_inode *
  70xfs_inode_alloc(
  71        struct xfs_mount        *mp,
  72        xfs_ino_t               ino)
  73{
  74        struct xfs_inode        *ip;
  75
  76        /*
  77         * XXX: If this didn't occur in transactions, we could drop GFP_NOFAIL
  78         * and return NULL here on ENOMEM.
  79         */
  80        ip = kmem_cache_alloc(xfs_inode_zone, GFP_KERNEL | __GFP_NOFAIL);
  81
  82        if (inode_init_always(mp->m_super, VFS_I(ip))) {
  83                kmem_cache_free(xfs_inode_zone, ip);
  84                return NULL;
  85        }
  86
  87        /* VFS doesn't initialise i_mode or i_state! */
  88        VFS_I(ip)->i_mode = 0;
  89        VFS_I(ip)->i_state = 0;
  90
  91        XFS_STATS_INC(mp, vn_active);
  92        ASSERT(atomic_read(&ip->i_pincount) == 0);
  93        ASSERT(ip->i_ino == 0);
  94
  95        /* initialise the xfs inode */
  96        ip->i_ino = ino;
  97        ip->i_mount = mp;
  98        memset(&ip->i_imap, 0, sizeof(struct xfs_imap));
  99        ip->i_afp = NULL;
 100        ip->i_cowfp = NULL;
 101        memset(&ip->i_df, 0, sizeof(ip->i_df));
 102        ip->i_flags = 0;
 103        ip->i_delayed_blks = 0;
 104        ip->i_diflags2 = mp->m_ino_geo.new_diflags2;
 105        ip->i_nblocks = 0;
 106        ip->i_forkoff = 0;
 107        ip->i_sick = 0;
 108        ip->i_checked = 0;
 109        INIT_WORK(&ip->i_ioend_work, xfs_end_io);
 110        INIT_LIST_HEAD(&ip->i_ioend_list);
 111        spin_lock_init(&ip->i_ioend_lock);
 112
 113        return ip;
 114}
 115
 116STATIC void
 117xfs_inode_free_callback(
 118        struct rcu_head         *head)
 119{
 120        struct inode            *inode = container_of(head, struct inode, i_rcu);
 121        struct xfs_inode        *ip = XFS_I(inode);
 122
 123        switch (VFS_I(ip)->i_mode & S_IFMT) {
 124        case S_IFREG:
 125        case S_IFDIR:
 126        case S_IFLNK:
 127                xfs_idestroy_fork(&ip->i_df);
 128                break;
 129        }
 130
 131        if (ip->i_afp) {
 132                xfs_idestroy_fork(ip->i_afp);
 133                kmem_cache_free(xfs_ifork_zone, ip->i_afp);
 134        }
 135        if (ip->i_cowfp) {
 136                xfs_idestroy_fork(ip->i_cowfp);
 137                kmem_cache_free(xfs_ifork_zone, ip->i_cowfp);
 138        }
 139        if (ip->i_itemp) {
 140                ASSERT(!test_bit(XFS_LI_IN_AIL,
 141                                 &ip->i_itemp->ili_item.li_flags));
 142                xfs_inode_item_destroy(ip);
 143                ip->i_itemp = NULL;
 144        }
 145
 146        kmem_cache_free(xfs_inode_zone, ip);
 147}
 148
 149static void
 150__xfs_inode_free(
 151        struct xfs_inode        *ip)
 152{
 153        /* asserts to verify all state is correct here */
 154        ASSERT(atomic_read(&ip->i_pincount) == 0);
 155        ASSERT(!ip->i_itemp || list_empty(&ip->i_itemp->ili_item.li_bio_list));
 156        XFS_STATS_DEC(ip->i_mount, vn_active);
 157
 158        call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
 159}
 160
 161void
 162xfs_inode_free(
 163        struct xfs_inode        *ip)
 164{
 165        ASSERT(!xfs_iflags_test(ip, XFS_IFLUSHING));
 166
 167        /*
 168         * Because we use RCU freeing we need to ensure the inode always
 169         * appears to be reclaimed with an invalid inode number when in the
 170         * free state. The ip->i_flags_lock provides the barrier against lookup
 171         * races.
 172         */
 173        spin_lock(&ip->i_flags_lock);
 174        ip->i_flags = XFS_IRECLAIM;
 175        ip->i_ino = 0;
 176        spin_unlock(&ip->i_flags_lock);
 177
 178        __xfs_inode_free(ip);
 179}
 180
 181/*
 182 * Queue background inode reclaim work if there are reclaimable inodes and there
 183 * isn't reclaim work already scheduled or in progress.
 184 */
 185static void
 186xfs_reclaim_work_queue(
 187        struct xfs_mount        *mp)
 188{
 189
 190        rcu_read_lock();
 191        if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
 192                queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work,
 193                        msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
 194        }
 195        rcu_read_unlock();
 196}
 197
 198/*
 199 * Background scanning to trim preallocated space. This is queued based on the
 200 * 'speculative_prealloc_lifetime' tunable (5m by default).
 201 */
 202static inline void
 203xfs_blockgc_queue(
 204        struct xfs_perag        *pag)
 205{
 206        struct xfs_mount        *mp = pag->pag_mount;
 207
 208        if (!xfs_is_blockgc_enabled(mp))
 209                return;
 210
 211        rcu_read_lock();
 212        if (radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_BLOCKGC_TAG))
 213                queue_delayed_work(pag->pag_mount->m_blockgc_wq,
 214                                   &pag->pag_blockgc_work,
 215                                   msecs_to_jiffies(xfs_blockgc_secs * 1000));
 216        rcu_read_unlock();
 217}
 218
 219/* Set a tag on both the AG incore inode tree and the AG radix tree. */
 220static void
 221xfs_perag_set_inode_tag(
 222        struct xfs_perag        *pag,
 223        xfs_agino_t             agino,
 224        unsigned int            tag)
 225{
 226        struct xfs_mount        *mp = pag->pag_mount;
 227        bool                    was_tagged;
 228
 229        lockdep_assert_held(&pag->pag_ici_lock);
 230
 231        was_tagged = radix_tree_tagged(&pag->pag_ici_root, tag);
 232        radix_tree_tag_set(&pag->pag_ici_root, agino, tag);
 233
 234        if (tag == XFS_ICI_RECLAIM_TAG)
 235                pag->pag_ici_reclaimable++;
 236
 237        if (was_tagged)
 238                return;
 239
 240        /* propagate the tag up into the perag radix tree */
 241        spin_lock(&mp->m_perag_lock);
 242        radix_tree_tag_set(&mp->m_perag_tree, pag->pag_agno, tag);
 243        spin_unlock(&mp->m_perag_lock);
 244
 245        /* start background work */
 246        switch (tag) {
 247        case XFS_ICI_RECLAIM_TAG:
 248                xfs_reclaim_work_queue(mp);
 249                break;
 250        case XFS_ICI_BLOCKGC_TAG:
 251                xfs_blockgc_queue(pag);
 252                break;
 253        }
 254
 255        trace_xfs_perag_set_inode_tag(mp, pag->pag_agno, tag, _RET_IP_);
 256}
 257
 258/* Clear a tag on both the AG incore inode tree and the AG radix tree. */
 259static void
 260xfs_perag_clear_inode_tag(
 261        struct xfs_perag        *pag,
 262        xfs_agino_t             agino,
 263        unsigned int            tag)
 264{
 265        struct xfs_mount        *mp = pag->pag_mount;
 266
 267        lockdep_assert_held(&pag->pag_ici_lock);
 268
 269        /*
 270         * Reclaim can signal (with a null agino) that it cleared its own tag
 271         * by removing the inode from the radix tree.
 272         */
 273        if (agino != NULLAGINO)
 274                radix_tree_tag_clear(&pag->pag_ici_root, agino, tag);
 275        else
 276                ASSERT(tag == XFS_ICI_RECLAIM_TAG);
 277
 278        if (tag == XFS_ICI_RECLAIM_TAG)
 279                pag->pag_ici_reclaimable--;
 280
 281        if (radix_tree_tagged(&pag->pag_ici_root, tag))
 282                return;
 283
 284        /* clear the tag from the perag radix tree */
 285        spin_lock(&mp->m_perag_lock);
 286        radix_tree_tag_clear(&mp->m_perag_tree, pag->pag_agno, tag);
 287        spin_unlock(&mp->m_perag_lock);
 288
 289        trace_xfs_perag_clear_inode_tag(mp, pag->pag_agno, tag, _RET_IP_);
 290}
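
    /*
     * Editor's illustration (not part of the upstream file): both tag helpers
     * above expect pag_ici_lock to be held, so a typical call site (compare
     * xfs_blockgc_set_iflag() later in this file) looks roughly like:
     *
     *        spin_lock(&pag->pag_ici_lock);
     *        xfs_perag_set_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino),
     *                        XFS_ICI_BLOCKGC_TAG);
     *        spin_unlock(&pag->pag_ici_lock);
     */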
 291
 292static inline void
 293xfs_inew_wait(
 294        struct xfs_inode        *ip)
 295{
 296        wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_INEW_BIT);
 297        DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_INEW_BIT);
 298
 299        do {
 300                prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
 301                if (!xfs_iflags_test(ip, XFS_INEW))
 302                        break;
 303                schedule();
 304        } while (true);
 305        finish_wait(wq, &wait.wq_entry);
 306}
 307
 308/*
 309 * When we recycle a reclaimable inode, we need to re-initialise the VFS inode
 310 * part of the structure. This is made more complex by the fact we store
 311 * information about the on-disk values in the VFS inode and so we can't just
 312 * overwrite the values unconditionally. Hence we save the parameters we
 313 * need to retain across reinitialisation, and rewrite them into the VFS inode
 314 * after reinitialisation even if it fails.
 315 */
 316static int
 317xfs_reinit_inode(
 318        struct xfs_mount        *mp,
 319        struct inode            *inode)
 320{
 321        int                     error;
 322        uint32_t                nlink = inode->i_nlink;
 323        uint32_t                generation = inode->i_generation;
 324        uint64_t                version = inode_peek_iversion(inode);
 325        umode_t                 mode = inode->i_mode;
 326        dev_t                   dev = inode->i_rdev;
 327        kuid_t                  uid = inode->i_uid;
 328        kgid_t                  gid = inode->i_gid;
 329
 330        error = inode_init_always(mp->m_super, inode);
 331
 332        set_nlink(inode, nlink);
 333        inode->i_generation = generation;
 334        inode_set_iversion_queried(inode, version);
 335        inode->i_mode = mode;
 336        inode->i_rdev = dev;
 337        inode->i_uid = uid;
 338        inode->i_gid = gid;
 339        return error;
 340}
 341
 342/*
 343 * Carefully nudge an inode whose VFS state has been torn down back into a
 344 * usable state.  Drops the i_flags_lock and the rcu read lock.
 345 */
 346static int
 347xfs_iget_recycle(
 348        struct xfs_perag        *pag,
 349        struct xfs_inode        *ip) __releases(&ip->i_flags_lock)
 350{
 351        struct xfs_mount        *mp = ip->i_mount;
 352        struct inode            *inode = VFS_I(ip);
 353        int                     error;
 354
 355        trace_xfs_iget_recycle(ip);
 356
 357        /*
 358         * We need to make it look like the inode is being reclaimed to prevent
 359         * the actual reclaim workers from stomping over us while we recycle
 360         * the inode.  We can't clear the radix tree tag yet as it requires
 361         * pag_ici_lock to be held exclusive.
 362         */
 363        ip->i_flags |= XFS_IRECLAIM;
 364
 365        spin_unlock(&ip->i_flags_lock);
 366        rcu_read_unlock();
 367
 368        ASSERT(!rwsem_is_locked(&inode->i_rwsem));
 369        error = xfs_reinit_inode(mp, inode);
 370        if (error) {
 371                bool    wake;
 372
 373                /*
 374                 * Re-initializing the inode failed, and we are in deep
 375                 * trouble.  Try to re-add it to the reclaim list.
 376                 */
 377                rcu_read_lock();
 378                spin_lock(&ip->i_flags_lock);
 379                wake = !!__xfs_iflags_test(ip, XFS_INEW);
 380                ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM);
 381                if (wake)
 382                        wake_up_bit(&ip->i_flags, __XFS_INEW_BIT);
 383                ASSERT(ip->i_flags & XFS_IRECLAIMABLE);
 384                spin_unlock(&ip->i_flags_lock);
 385                rcu_read_unlock();
 386
 387                trace_xfs_iget_recycle_fail(ip);
 388                return error;
 389        }
 390
 391        spin_lock(&pag->pag_ici_lock);
 392        spin_lock(&ip->i_flags_lock);
 393
 394        /*
 395         * Clear the per-lifetime state in the inode as we are now effectively
 396         * a new inode and need to return to the initial state before reuse
 397         * occurs.
 398         */
 399        ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS;
 400        ip->i_flags |= XFS_INEW;
 401        xfs_perag_clear_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino),
 402                        XFS_ICI_RECLAIM_TAG);
 403        inode->i_state = I_NEW;
 404        spin_unlock(&ip->i_flags_lock);
 405        spin_unlock(&pag->pag_ici_lock);
 406
 407        return 0;
 408}
 409
 410/*
 411 * If we are allocating a new inode, then check what was returned is
 412 * actually a free, empty inode. If we are not allocating an inode,
 413 * then check we didn't find a free inode.
 414 *
 415 * Returns:
 416 *      0               if the inode free state matches the lookup context
 417 *      -ENOENT         if the inode is free and we are not allocating
 418 *      -EFSCORRUPTED   if there is any state mismatch at all
 419 */
 420static int
 421xfs_iget_check_free_state(
 422        struct xfs_inode        *ip,
 423        int                     flags)
 424{
 425        if (flags & XFS_IGET_CREATE) {
 426                /* should be a free inode */
 427                if (VFS_I(ip)->i_mode != 0) {
 428                        xfs_warn(ip->i_mount,
 429"Corruption detected! Free inode 0x%llx not marked free! (mode 0x%x)",
 430                                ip->i_ino, VFS_I(ip)->i_mode);
 431                        return -EFSCORRUPTED;
 432                }
 433
 434                if (ip->i_nblocks != 0) {
 435                        xfs_warn(ip->i_mount,
 436"Corruption detected! Free inode 0x%llx has blocks allocated!",
 437                                ip->i_ino);
 438                        return -EFSCORRUPTED;
 439                }
 440                return 0;
 441        }
 442
 443        /* should be an allocated inode */
 444        if (VFS_I(ip)->i_mode == 0)
 445                return -ENOENT;
 446
 447        return 0;
 448}
 449
 450/* Make all pending inactivation work start immediately. */
 451static void
 452xfs_inodegc_queue_all(
 453        struct xfs_mount        *mp)
 454{
 455        struct xfs_inodegc      *gc;
 456        int                     cpu;
 457
 458        for_each_online_cpu(cpu) {
 459                gc = per_cpu_ptr(mp->m_inodegc, cpu);
 460                if (!llist_empty(&gc->list))
 461                        queue_work_on(cpu, mp->m_inodegc_wq, &gc->work);
 462        }
 463}
 464
 465/*
 466 * Check the validity of the inode we just found in the cache
 467 */
 468static int
 469xfs_iget_cache_hit(
 470        struct xfs_perag        *pag,
 471        struct xfs_inode        *ip,
 472        xfs_ino_t               ino,
 473        int                     flags,
 474        int                     lock_flags) __releases(RCU)
 475{
 476        struct inode            *inode = VFS_I(ip);
 477        struct xfs_mount        *mp = ip->i_mount;
 478        int                     error;
 479
 480        /*
 481         * check for re-use of an inode within an RCU grace period due to the
 482         * radix tree nodes not being updated yet. We monitor for this by
 483         * setting the inode number to zero before freeing the inode structure.
 484         * If the inode has been reallocated and set up, then the inode number
 485         * will not match, so check for that, too.
 486         */
 487        spin_lock(&ip->i_flags_lock);
 488        if (ip->i_ino != ino)
 489                goto out_skip;
 490
 491        /*
 492         * If we are racing with another cache hit that is currently
 493         * instantiating this inode or currently recycling it out of
 494         * reclaimable state, wait for the initialisation to complete
 495         * before continuing.
 496         *
 497         * If we're racing with the inactivation worker we also want to wait.
 498         * If we're creating a new file, it's possible that the worker
 499         * previously marked the inode as free on disk but hasn't finished
 500         * updating the incore state yet.  The AGI buffer will be dirty and
 501         * locked to the icreate transaction, so a synchronous push of the
 502         * inodegc workers would result in deadlock.  For a regular iget, the
 503         * worker is running already, so we might as well wait.
 504         *
 505         * XXX(hch): eventually we should do something equivalent to
 506         *           wait_on_inode to wait for these flags to be cleared
 507         *           instead of polling for it.
 508         */
 509        if (ip->i_flags & (XFS_INEW | XFS_IRECLAIM | XFS_INACTIVATING))
 510                goto out_skip;
 511
 512        if (ip->i_flags & XFS_NEED_INACTIVE) {
 513                /* Unlinked inodes cannot be re-grabbed. */
 514                if (VFS_I(ip)->i_nlink == 0) {
 515                        error = -ENOENT;
 516                        goto out_error;
 517                }
 518                goto out_inodegc_flush;
 519        }
 520
 521        /*
 522         * Check the inode free state is valid. This also detects lookup
 523         * racing with unlinks.
 524         */
 525        error = xfs_iget_check_free_state(ip, flags);
 526        if (error)
 527                goto out_error;
 528
 529        /* Skip inodes that have no vfs state. */
 530        if ((flags & XFS_IGET_INCORE) &&
 531            (ip->i_flags & XFS_IRECLAIMABLE))
 532                goto out_skip;
 533
 534        /* The inode fits the selection criteria; process it. */
 535        if (ip->i_flags & XFS_IRECLAIMABLE) {
 536                /* Drops i_flags_lock and RCU read lock. */
 537                error = xfs_iget_recycle(pag, ip);
 538                if (error)
 539                        return error;
 540        } else {
 541                /* If the VFS inode is being torn down, pause and try again. */
 542                if (!igrab(inode))
 543                        goto out_skip;
 544
 545                /* We've got a live one. */
 546                spin_unlock(&ip->i_flags_lock);
 547                rcu_read_unlock();
 548                trace_xfs_iget_hit(ip);
 549        }
 550
 551        if (lock_flags != 0)
 552                xfs_ilock(ip, lock_flags);
 553
 554        if (!(flags & XFS_IGET_INCORE))
 555                xfs_iflags_clear(ip, XFS_ISTALE);
 556        XFS_STATS_INC(mp, xs_ig_found);
 557
 558        return 0;
 559
 560out_skip:
 561        trace_xfs_iget_skip(ip);
 562        XFS_STATS_INC(mp, xs_ig_frecycle);
 563        error = -EAGAIN;
 564out_error:
 565        spin_unlock(&ip->i_flags_lock);
 566        rcu_read_unlock();
 567        return error;
 568
 569out_inodegc_flush:
 570        spin_unlock(&ip->i_flags_lock);
 571        rcu_read_unlock();
 572        /*
 573         * Do not wait for the workers, because the caller could hold an AGI
 574         * buffer lock.  We're just going to sleep in a loop anyway.
 575         */
 576        if (xfs_is_inodegc_enabled(mp))
 577                xfs_inodegc_queue_all(mp);
 578        return -EAGAIN;
 579}
 580
 581static int
 582xfs_iget_cache_miss(
 583        struct xfs_mount        *mp,
 584        struct xfs_perag        *pag,
 585        xfs_trans_t             *tp,
 586        xfs_ino_t               ino,
 587        struct xfs_inode        **ipp,
 588        int                     flags,
 589        int                     lock_flags)
 590{
 591        struct xfs_inode        *ip;
 592        int                     error;
 593        xfs_agino_t             agino = XFS_INO_TO_AGINO(mp, ino);
 594        int                     iflags;
 595
 596        ip = xfs_inode_alloc(mp, ino);
 597        if (!ip)
 598                return -ENOMEM;
 599
 600        error = xfs_imap(mp, tp, ip->i_ino, &ip->i_imap, flags);
 601        if (error)
 602                goto out_destroy;
 603
 604        /*
 605         * For version 5 superblocks, if we are initialising a new inode and we
 606         * are not utilising the XFS_FEAT_IKEEP inode cluster mode, we can
 607         * simply build the new inode core with a random generation number.
 608         *
 609         * For version 4 (and older) superblocks, log recovery is dependent on
 610         * the i_flushiter field being initialised from the current on-disk
 611         * value and hence we must also read the inode off disk even when
 612         * initializing new inodes.
 613         */
 614        if (xfs_has_v3inodes(mp) &&
 615            (flags & XFS_IGET_CREATE) && !xfs_has_ikeep(mp)) {
 616                VFS_I(ip)->i_generation = prandom_u32();
 617        } else {
 618                struct xfs_buf          *bp;
 619
 620                error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp);
 621                if (error)
 622                        goto out_destroy;
 623
 624                error = xfs_inode_from_disk(ip,
 625                                xfs_buf_offset(bp, ip->i_imap.im_boffset));
 626                if (!error)
 627                        xfs_buf_set_ref(bp, XFS_INO_REF);
 628                xfs_trans_brelse(tp, bp);
 629
 630                if (error)
 631                        goto out_destroy;
 632        }
 633
 634        trace_xfs_iget_miss(ip);
 635
 636        /*
 637         * Check the inode free state is valid. This also detects lookup
 638         * racing with unlinks.
 639         */
 640        error = xfs_iget_check_free_state(ip, flags);
 641        if (error)
 642                goto out_destroy;
 643
 644        /*
 645         * Preload the radix tree so we can insert safely under the
 646         * write spinlock. Note that we cannot sleep inside the preload
 647         * region. Since we can be called from transaction context, don't
 648         * recurse into the file system.
 649         */
 650        if (radix_tree_preload(GFP_NOFS)) {
 651                error = -EAGAIN;
 652                goto out_destroy;
 653        }
 654
 655        /*
 656         * Because the inode hasn't been added to the radix-tree yet it can't
 657         * be found by another thread, so we can do the non-sleeping lock here.
 658         */
 659        if (lock_flags) {
 660                if (!xfs_ilock_nowait(ip, lock_flags))
 661                        BUG();
 662        }
 663
 664        /*
 665         * These values must be set before inserting the inode into the radix
 666         * tree because, the moment it is inserted, a concurrent lookup (allowed
 667         * by the RCU locking mechanism) can find it, and that lookup must see that
 668         * this is an inode currently under construction (i.e. that XFS_INEW is set).
 669         * The ip->i_flags_lock that protects the XFS_INEW flag forms the
 670         * memory barrier that ensures this detection works correctly at lookup
 671         * time.
 672         */
 673        iflags = XFS_INEW;
 674        if (flags & XFS_IGET_DONTCACHE)
 675                d_mark_dontcache(VFS_I(ip));
 676        ip->i_udquot = NULL;
 677        ip->i_gdquot = NULL;
 678        ip->i_pdquot = NULL;
 679        xfs_iflags_set(ip, iflags);
 680
 681        /* insert the new inode */
 682        spin_lock(&pag->pag_ici_lock);
 683        error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
 684        if (unlikely(error)) {
 685                WARN_ON(error != -EEXIST);
 686                XFS_STATS_INC(mp, xs_ig_dup);
 687                error = -EAGAIN;
 688                goto out_preload_end;
 689        }
 690        spin_unlock(&pag->pag_ici_lock);
 691        radix_tree_preload_end();
 692
 693        *ipp = ip;
 694        return 0;
 695
 696out_preload_end:
 697        spin_unlock(&pag->pag_ici_lock);
 698        radix_tree_preload_end();
 699        if (lock_flags)
 700                xfs_iunlock(ip, lock_flags);
 701out_destroy:
 702        __destroy_inode(VFS_I(ip));
 703        xfs_inode_free(ip);
 704        return error;
 705}
 706
 707/*
 708 * Look up an inode by number in the given file system.  The inode is looked up
 709 * in the cache held in each AG.  If the inode is found in the cache, initialise
 710 * the vfs inode if necessary.
 711 *
 712 * If it is not in core, read it in from the file system's device, add it to the
 713 * cache and initialise the vfs inode.
 714 *
 715 * The inode is locked according to the value of the lock_flags parameter.
 716 * Inode lookup is only done during metadata operations and not as part of the
 717 * data IO path. Hence we only allow locking of the XFS_ILOCK during lookup.
 718 */
 719int
 720xfs_iget(
 721        struct xfs_mount        *mp,
 722        struct xfs_trans        *tp,
 723        xfs_ino_t               ino,
 724        uint                    flags,
 725        uint                    lock_flags,
 726        struct xfs_inode        **ipp)
 727{
 728        struct xfs_inode        *ip;
 729        struct xfs_perag        *pag;
 730        xfs_agino_t             agino;
 731        int                     error;
 732
 733        ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0);
 734
 735        /* reject inode numbers outside existing AGs */
 736        if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
 737                return -EINVAL;
 738
 739        XFS_STATS_INC(mp, xs_ig_attempts);
 740
 741        /* get the perag structure and ensure that it's inode capable */
 742        pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
 743        agino = XFS_INO_TO_AGINO(mp, ino);
 744
 745again:
 746        error = 0;
 747        rcu_read_lock();
 748        ip = radix_tree_lookup(&pag->pag_ici_root, agino);
 749
 750        if (ip) {
 751                error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags);
 752                if (error)
 753                        goto out_error_or_again;
 754        } else {
 755                rcu_read_unlock();
 756                if (flags & XFS_IGET_INCORE) {
 757                        error = -ENODATA;
 758                        goto out_error_or_again;
 759                }
 760                XFS_STATS_INC(mp, xs_ig_missed);
 761
 762                error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,
 763                                                        flags, lock_flags);
 764                if (error)
 765                        goto out_error_or_again;
 766        }
 767        xfs_perag_put(pag);
 768
 769        *ipp = ip;
 770
 771        /*
 772         * If we have a real type for an on-disk inode, we can setup the inode
 773         * now.  If it's a new inode being created, xfs_ialloc will handle it.
 774         */
 775        if (xfs_iflags_test(ip, XFS_INEW) && VFS_I(ip)->i_mode != 0)
 776                xfs_setup_existing_inode(ip);
 777        return 0;
 778
 779out_error_or_again:
 780        if (!(flags & XFS_IGET_INCORE) && error == -EAGAIN) {
 781                delay(1);
 782                goto again;
 783        }
 784        xfs_perag_put(pag);
 785        return error;
 786}
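
    /*
     * Editor's illustration (not part of the upstream file): a typical
     * metadata-path caller of xfs_iget() looks roughly like the sketch below.
     * Only XFS_ILOCK flags may be passed in lock_flags, and the reference is
     * dropped with xfs_irele() once the caller is done with the inode:
     *
     *        struct xfs_inode        *ip;
     *        int                     error;
     *
     *        error = xfs_iget(mp, tp, ino, 0, XFS_ILOCK_EXCL, &ip);
     *        if (error)
     *                return error;
     *        ...operate on ip with the ILOCK held...
     *        xfs_iunlock(ip, XFS_ILOCK_EXCL);
     *        xfs_irele(ip);
     */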
 787
 788/*
 789 * "Is this a cached inode that's also allocated?"
 790 *
 791 * Look up an inode by number in the given file system.  If the inode is
 792 * in cache and isn't in purgatory, return 1 if the inode is allocated
 793 * and 0 if it is not.  For all other cases (not in cache, being torn
 794 * down, etc.), return a negative error code.
 795 *
 796 * The caller has to prevent inode allocation and freeing activity,
 797 * presumably by locking the AGI buffer.   This is to ensure that an
 798 * inode cannot transition from allocated to freed until the caller is
 799 * ready to allow that.  If the inode is in an intermediate state (new,
 800 * reclaimable, or being reclaimed), -EAGAIN will be returned; if the
 801 * inode is not in the cache, -ENOENT will be returned.  The caller must
 802 * deal with these scenarios appropriately.
 803 *
 804 * This is a specialized use case for the online scrubber; if you're
 805 * reading this, you probably want xfs_iget.
 806 */
 807int
 808xfs_icache_inode_is_allocated(
 809        struct xfs_mount        *mp,
 810        struct xfs_trans        *tp,
 811        xfs_ino_t               ino,
 812        bool                    *inuse)
 813{
 814        struct xfs_inode        *ip;
 815        int                     error;
 816
 817        error = xfs_iget(mp, tp, ino, XFS_IGET_INCORE, 0, &ip);
 818        if (error)
 819                return error;
 820
 821        *inuse = !!(VFS_I(ip)->i_mode);
 822        xfs_irele(ip);
 823        return 0;
 824}
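
    /*
     * Editor's illustration (not part of the upstream file): with the AGI
     * buffer locked to freeze inode allocation state, a scrub-style caller
     * might use the helper above roughly as follows:
     *
     *        bool        inuse;
     *        int         error;
     *
     *        error = xfs_icache_inode_is_allocated(mp, tp, ino, &inuse);
     *        if (error == -ENOENT || error == -EAGAIN)
     *                ...fall back to reading the on-disk inode cluster...
     *        else if (error)
     *                return error;
     */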
 825
 826/*
 827 * Grab the inode for reclaim exclusively.
 828 *
 829 * We have found this inode via a lookup under RCU, so the inode may have
 830 * already been freed, or it may be in the process of being recycled by
 831 * xfs_iget(). In both cases, the inode will have XFS_IRECLAIM set. If the inode
 832 * has been fully recycled by the time we get the i_flags_lock, XFS_IRECLAIMABLE
 833 * will not be set. Hence we need to check for both these flag conditions to
 834 * avoid inodes that are no longer reclaim candidates.
 835 *
 836 * Note: checking for other state flags here, under the i_flags_lock or not, is
 837 * racy and should be avoided. Those races should be resolved only after we have
 838 * ensured that we are able to reclaim this inode and the world can see that we
 839 * are going to reclaim it.
 840 *
 841 * Return true if we grabbed it, false otherwise.
 842 */
 843static bool
 844xfs_reclaim_igrab(
 845        struct xfs_inode        *ip,
 846        struct xfs_icwalk       *icw)
 847{
 848        ASSERT(rcu_read_lock_held());
 849
 850        spin_lock(&ip->i_flags_lock);
 851        if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) ||
 852            __xfs_iflags_test(ip, XFS_IRECLAIM)) {
 853                /* not a reclaim candidate. */
 854                spin_unlock(&ip->i_flags_lock);
 855                return false;
 856        }
 857
 858        /* Don't reclaim a sick inode unless the caller asked for it. */
 859        if (ip->i_sick &&
 860            (!icw || !(icw->icw_flags & XFS_ICWALK_FLAG_RECLAIM_SICK))) {
 861                spin_unlock(&ip->i_flags_lock);
 862                return false;
 863        }
 864
 865        __xfs_iflags_set(ip, XFS_IRECLAIM);
 866        spin_unlock(&ip->i_flags_lock);
 867        return true;
 868}
 869
 870/*
 871 * Inode reclaim is non-blocking, so the default action if progress cannot be
 872 * made is to "requeue" the inode for reclaim by unlocking it and clearing the
 873 * XFS_IRECLAIM flag.  If we are in a shutdown state, we don't care about
 874 * blocking anymore and hence we can wait for the inode to be unpinned so
 875 * that we can reclaim it.
 876 *
 877 * We do no IO here - if callers require inodes to be cleaned they must push the
 878 * AIL first to trigger writeback of dirty inodes.  This enables writeback to be
 879 * done in the background in a non-blocking manner, and enables memory reclaim
 880 * to make progress without blocking.
 881 */
 882static void
 883xfs_reclaim_inode(
 884        struct xfs_inode        *ip,
 885        struct xfs_perag        *pag)
 886{
 887        xfs_ino_t               ino = ip->i_ino; /* for radix_tree_delete */
 888
 889        if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))
 890                goto out;
 891        if (xfs_iflags_test_and_set(ip, XFS_IFLUSHING))
 892                goto out_iunlock;
 893
 894        if (xfs_is_shutdown(ip->i_mount)) {
 895                xfs_iunpin_wait(ip);
 896                xfs_iflush_abort(ip);
 897                goto reclaim;
 898        }
 899        if (xfs_ipincount(ip))
 900                goto out_clear_flush;
 901        if (!xfs_inode_clean(ip))
 902                goto out_clear_flush;
 903
 904        xfs_iflags_clear(ip, XFS_IFLUSHING);
 905reclaim:
 906        trace_xfs_inode_reclaiming(ip);
 907
 908        /*
 909         * Because we use RCU freeing we need to ensure the inode always appears
 910         * to be reclaimed with an invalid inode number when in the free state.
 911         * We do this as early as possible under the ILOCK so that
 912         * xfs_iflush_cluster() and xfs_ifree_cluster() can be guaranteed to
 913         * detect races with us here. By doing this, we guarantee that once
 914         * xfs_iflush_cluster() or xfs_ifree_cluster() has locked XFS_ILOCK that
 915         * it will see either a valid inode that will serialise correctly, or it
 916         * will see an invalid inode that it can skip.
 917         */
 918        spin_lock(&ip->i_flags_lock);
 919        ip->i_flags = XFS_IRECLAIM;
 920        ip->i_ino = 0;
 921        ip->i_sick = 0;
 922        ip->i_checked = 0;
 923        spin_unlock(&ip->i_flags_lock);
 924
 925        xfs_iunlock(ip, XFS_ILOCK_EXCL);
 926
 927        XFS_STATS_INC(ip->i_mount, xs_ig_reclaims);
 928        /*
 929         * Remove the inode from the per-AG radix tree.
 930         *
 931         * Because radix_tree_delete won't complain even if the item was never
 932         * added to the tree, assert that it's been there before to catch
 933         * problems with the inode lifetime early on.
 934         */
 935        spin_lock(&pag->pag_ici_lock);
 936        if (!radix_tree_delete(&pag->pag_ici_root,
 937                                XFS_INO_TO_AGINO(ip->i_mount, ino)))
 938                ASSERT(0);
 939        xfs_perag_clear_inode_tag(pag, NULLAGINO, XFS_ICI_RECLAIM_TAG);
 940        spin_unlock(&pag->pag_ici_lock);
 941
 942        /*
 943         * Here we do an (almost) spurious inode lock in order to coordinate
 944         * with inode cache radix tree lookups.  This is because the lookup
 945         * can reference the inodes in the cache without taking references.
 946         *
 947         * We make that OK here by ensuring that we wait until the inode is
 948         * unlocked after the lookup before we go ahead and free it.
 949         */
 950        xfs_ilock(ip, XFS_ILOCK_EXCL);
 951        ASSERT(!ip->i_udquot && !ip->i_gdquot && !ip->i_pdquot);
 952        xfs_iunlock(ip, XFS_ILOCK_EXCL);
 953        ASSERT(xfs_inode_clean(ip));
 954
 955        __xfs_inode_free(ip);
 956        return;
 957
 958out_clear_flush:
 959        xfs_iflags_clear(ip, XFS_IFLUSHING);
 960out_iunlock:
 961        xfs_iunlock(ip, XFS_ILOCK_EXCL);
 962out:
 963        xfs_iflags_clear(ip, XFS_IRECLAIM);
 964}
 965
 966/* Reclaim sick inodes if we're unmounting or the fs went down. */
 967static inline bool
 968xfs_want_reclaim_sick(
 969        struct xfs_mount        *mp)
 970{
 971        return xfs_is_unmounting(mp) || xfs_has_norecovery(mp) ||
 972               xfs_is_shutdown(mp);
 973}
 974
 975void
 976xfs_reclaim_inodes(
 977        struct xfs_mount        *mp)
 978{
 979        struct xfs_icwalk       icw = {
 980                .icw_flags      = 0,
 981        };
 982
 983        if (xfs_want_reclaim_sick(mp))
 984                icw.icw_flags |= XFS_ICWALK_FLAG_RECLAIM_SICK;
 985
 986        while (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
 987                xfs_ail_push_all_sync(mp->m_ail);
 988                xfs_icwalk(mp, XFS_ICWALK_RECLAIM, &icw);
 989        }
 990}
 991
 992/*
 993 * The shrinker infrastructure determines how many inodes we should scan for
 994 * reclaim. We want as many clean inodes ready to reclaim as possible, so we
 995 * push the AIL here. We also want to proactively free up memory if we can to
 996 * minimise the amount of work memory reclaim has to do so we kick the
 997 * background reclaim if it isn't already scheduled.
 998 */
 999long
1000xfs_reclaim_inodes_nr(
1001        struct xfs_mount        *mp,
1002        unsigned long           nr_to_scan)
1003{
1004        struct xfs_icwalk       icw = {
1005                .icw_flags      = XFS_ICWALK_FLAG_SCAN_LIMIT,
1006                .icw_scan_limit = min_t(unsigned long, LONG_MAX, nr_to_scan),
1007        };
1008
1009        if (xfs_want_reclaim_sick(mp))
1010                icw.icw_flags |= XFS_ICWALK_FLAG_RECLAIM_SICK;
1011
1012        /* kick background reclaimer and push the AIL */
1013        xfs_reclaim_work_queue(mp);
1014        xfs_ail_push_all(mp->m_ail);
1015
1016        xfs_icwalk(mp, XFS_ICWALK_RECLAIM, &icw);
1017        return 0;
1018}
1019
1020/*
1021 * Return the number of reclaimable inodes in the filesystem for
1022 * the shrinker to determine how much to reclaim.
1023 */
1024long
1025xfs_reclaim_inodes_count(
1026        struct xfs_mount        *mp)
1027{
1028        struct xfs_perag        *pag;
1029        xfs_agnumber_t          ag = 0;
1030        long                    reclaimable = 0;
1031
1032        while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
1033                ag = pag->pag_agno + 1;
1034                reclaimable += pag->pag_ici_reclaimable;
1035                xfs_perag_put(pag);
1036        }
1037        return reclaimable;
1038}
1039
1040STATIC bool
1041xfs_icwalk_match_id(
1042        struct xfs_inode        *ip,
1043        struct xfs_icwalk       *icw)
1044{
1045        if ((icw->icw_flags & XFS_ICWALK_FLAG_UID) &&
1046            !uid_eq(VFS_I(ip)->i_uid, icw->icw_uid))
1047                return false;
1048
1049        if ((icw->icw_flags & XFS_ICWALK_FLAG_GID) &&
1050            !gid_eq(VFS_I(ip)->i_gid, icw->icw_gid))
1051                return false;
1052
1053        if ((icw->icw_flags & XFS_ICWALK_FLAG_PRID) &&
1054            ip->i_projid != icw->icw_prid)
1055                return false;
1056
1057        return true;
1058}
1059
1060/*
1061 * A union-based inode filtering algorithm. Process the inode if any of the
1062 * criteria match. This is for global/internal scans only.
1063 */
1064STATIC bool
1065xfs_icwalk_match_id_union(
1066        struct xfs_inode        *ip,
1067        struct xfs_icwalk       *icw)
1068{
1069        if ((icw->icw_flags & XFS_ICWALK_FLAG_UID) &&
1070            uid_eq(VFS_I(ip)->i_uid, icw->icw_uid))
1071                return true;
1072
1073        if ((icw->icw_flags & XFS_ICWALK_FLAG_GID) &&
1074            gid_eq(VFS_I(ip)->i_gid, icw->icw_gid))
1075                return true;
1076
1077        if ((icw->icw_flags & XFS_ICWALK_FLAG_PRID) &&
1078            ip->i_projid == icw->icw_prid)
1079                return true;
1080
1081        return false;
1082}
1083
1084/*
1085 * Is this inode @ip eligible for eof/cow block reclamation, given some
1086 * filtering parameters @icw?  The inode is eligible if @icw is null or
1087 * if the predicate functions match.
1088 */
1089static bool
1090xfs_icwalk_match(
1091        struct xfs_inode        *ip,
1092        struct xfs_icwalk       *icw)
1093{
1094        bool                    match;
1095
1096        if (!icw)
1097                return true;
1098
1099        if (icw->icw_flags & XFS_ICWALK_FLAG_UNION)
1100                match = xfs_icwalk_match_id_union(ip, icw);
1101        else
1102                match = xfs_icwalk_match_id(ip, icw);
1103        if (!match)
1104                return false;
1105
1106        /* skip the inode if the file size is too small */
1107        if ((icw->icw_flags & XFS_ICWALK_FLAG_MINFILESIZE) &&
1108            XFS_ISIZE(ip) < icw->icw_min_file_size)
1109                return false;
1110
1111        return true;
1112}
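
    /*
     * Editor's illustration (not part of the upstream file): an intersection
     * filter that only matches inodes owned by a given uid (a caller-provided
     * kuid_t here) and at least 1MiB in size would be described like this;
     * OR-ing in XFS_ICWALK_FLAG_UNION switches to the match-any behaviour
     * above:
     *
     *        struct xfs_icwalk        icw = {
     *                .icw_flags              = XFS_ICWALK_FLAG_UID |
     *                                          XFS_ICWALK_FLAG_MINFILESIZE,
     *                .icw_uid                = uid,
     *                .icw_min_file_size      = SZ_1M,
     *        };
     */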
1113
1114/*
1115 * This is a fast pass over the inode cache to try to get reclaim moving on as
1116 * many inodes as possible in a short period of time. It kicks itself every few
1117 * seconds, as well as being kicked by the inode cache shrinker when memory
1118 * goes low.
1119 */
1120void
1121xfs_reclaim_worker(
1122        struct work_struct *work)
1123{
1124        struct xfs_mount *mp = container_of(to_delayed_work(work),
1125                                        struct xfs_mount, m_reclaim_work);
1126
1127        xfs_icwalk(mp, XFS_ICWALK_RECLAIM, NULL);
1128        xfs_reclaim_work_queue(mp);
1129}
1130
1131STATIC int
1132xfs_inode_free_eofblocks(
1133        struct xfs_inode        *ip,
1134        struct xfs_icwalk       *icw,
1135        unsigned int            *lockflags)
1136{
1137        bool                    wait;
1138
1139        wait = icw && (icw->icw_flags & XFS_ICWALK_FLAG_SYNC);
1140
1141        if (!xfs_iflags_test(ip, XFS_IEOFBLOCKS))
1142                return 0;
1143
1144        /*
1145         * If the mapping is dirty the operation can block and wait for some
1146         * time. Unless we are waiting, skip it.
1147         */
1148        if (!wait && mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY))
1149                return 0;
1150
1151        if (!xfs_icwalk_match(ip, icw))
1152                return 0;
1153
1154        /*
1155         * If the caller is waiting, return -EAGAIN to keep the background
1156         * scanner moving and revisit the inode in a subsequent pass.
1157         */
1158        if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
1159                if (wait)
1160                        return -EAGAIN;
1161                return 0;
1162        }
1163        *lockflags |= XFS_IOLOCK_EXCL;
1164
1165        if (xfs_can_free_eofblocks(ip, false))
1166                return xfs_free_eofblocks(ip);
1167
1168        /* inode could be preallocated or append-only */
1169        trace_xfs_inode_free_eofblocks_invalid(ip);
1170        xfs_inode_clear_eofblocks_tag(ip);
1171        return 0;
1172}
1173
1174static void
1175xfs_blockgc_set_iflag(
1176        struct xfs_inode        *ip,
1177        unsigned long           iflag)
1178{
1179        struct xfs_mount        *mp = ip->i_mount;
1180        struct xfs_perag        *pag;
1181
1182        ASSERT((iflag & ~(XFS_IEOFBLOCKS | XFS_ICOWBLOCKS)) == 0);
1183
1184        /*
1185         * Don't bother locking the AG and looking up in the radix trees
1186         * if we already know that we have the tag set.
1187         */
1188        if (ip->i_flags & iflag)
1189                return;
1190        spin_lock(&ip->i_flags_lock);
1191        ip->i_flags |= iflag;
1192        spin_unlock(&ip->i_flags_lock);
1193
1194        pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
1195        spin_lock(&pag->pag_ici_lock);
1196
1197        xfs_perag_set_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino),
1198                        XFS_ICI_BLOCKGC_TAG);
1199
1200        spin_unlock(&pag->pag_ici_lock);
1201        xfs_perag_put(pag);
1202}
1203
1204void
1205xfs_inode_set_eofblocks_tag(
1206        xfs_inode_t     *ip)
1207{
1208        trace_xfs_inode_set_eofblocks_tag(ip);
1209        return xfs_blockgc_set_iflag(ip, XFS_IEOFBLOCKS);
1210}
1211
1212static void
1213xfs_blockgc_clear_iflag(
1214        struct xfs_inode        *ip,
1215        unsigned long           iflag)
1216{
1217        struct xfs_mount        *mp = ip->i_mount;
1218        struct xfs_perag        *pag;
1219        bool                    clear_tag;
1220
1221        ASSERT((iflag & ~(XFS_IEOFBLOCKS | XFS_ICOWBLOCKS)) == 0);
1222
1223        spin_lock(&ip->i_flags_lock);
1224        ip->i_flags &= ~iflag;
1225        clear_tag = (ip->i_flags & (XFS_IEOFBLOCKS | XFS_ICOWBLOCKS)) == 0;
1226        spin_unlock(&ip->i_flags_lock);
1227
1228        if (!clear_tag)
1229                return;
1230
1231        pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
1232        spin_lock(&pag->pag_ici_lock);
1233
1234        xfs_perag_clear_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino),
1235                        XFS_ICI_BLOCKGC_TAG);
1236
1237        spin_unlock(&pag->pag_ici_lock);
1238        xfs_perag_put(pag);
1239}
1240
1241void
1242xfs_inode_clear_eofblocks_tag(
1243        xfs_inode_t     *ip)
1244{
1245        trace_xfs_inode_clear_eofblocks_tag(ip);
1246        return xfs_blockgc_clear_iflag(ip, XFS_IEOFBLOCKS);
1247}
1248
1249/*
1250 * Set ourselves up to free CoW blocks from this file.  If it's already clean
1251 * then we can bail out quickly, but otherwise we must back off if the file
1252 * is undergoing some kind of write.
1253 */
1254static bool
1255xfs_prep_free_cowblocks(
1256        struct xfs_inode        *ip)
1257{
1258        /*
1259         * Just clear the tag if we have an empty cow fork or none at all. It's
1260         * possible the inode was fully unshared since it was originally tagged.
1261         */
1262        if (!xfs_inode_has_cow_data(ip)) {
1263                trace_xfs_inode_free_cowblocks_invalid(ip);
1264                xfs_inode_clear_cowblocks_tag(ip);
1265                return false;
1266        }
1267
1268        /*
1269         * If the mapping is dirty or under writeback we cannot touch the
1270         * CoW fork.  Leave it alone if we're in the midst of a directio.
1271         */
1272        if ((VFS_I(ip)->i_state & I_DIRTY_PAGES) ||
1273            mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY) ||
1274            mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_WRITEBACK) ||
1275            atomic_read(&VFS_I(ip)->i_dio_count))
1276                return false;
1277
1278        return true;
1279}
1280
1281/*
1282 * Automatic CoW Reservation Freeing
1283 *
1284 * These functions automatically garbage collect leftover CoW reservations
1285 * that were made on behalf of a cowextsize hint when we start to run out
1286 * of quota or when the reservations sit around for too long.  If the file
1287 * has dirty pages or is undergoing writeback, its CoW reservations will
1288 * be retained.
1289 *
1290 * The actual garbage collection piggybacks off the same code that runs
1291 * the speculative EOF preallocation garbage collector.
1292 */
1293STATIC int
1294xfs_inode_free_cowblocks(
1295        struct xfs_inode        *ip,
1296        struct xfs_icwalk       *icw,
1297        unsigned int            *lockflags)
1298{
1299        bool                    wait;
1300        int                     ret = 0;
1301
1302        wait = icw && (icw->icw_flags & XFS_ICWALK_FLAG_SYNC);
1303
1304        if (!xfs_iflags_test(ip, XFS_ICOWBLOCKS))
1305                return 0;
1306
1307        if (!xfs_prep_free_cowblocks(ip))
1308                return 0;
1309
1310        if (!xfs_icwalk_match(ip, icw))
1311                return 0;
1312
1313        /*
1314         * If the caller is waiting, return -EAGAIN to keep the background
1315         * scanner moving and revisit the inode in a subsequent pass.
1316         */
1317        if (!(*lockflags & XFS_IOLOCK_EXCL) &&
1318            !xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
1319                if (wait)
1320                        return -EAGAIN;
1321                return 0;
1322        }
1323        *lockflags |= XFS_IOLOCK_EXCL;
1324
1325        if (!xfs_ilock_nowait(ip, XFS_MMAPLOCK_EXCL)) {
1326                if (wait)
1327                        return -EAGAIN;
1328                return 0;
1329        }
1330        *lockflags |= XFS_MMAPLOCK_EXCL;
1331
1332        /*
1333         * Check again, nobody else should be able to dirty blocks or change
1334         * the reflink iflag now that we have the first two locks held.
1335         */
1336        if (xfs_prep_free_cowblocks(ip))
1337                ret = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, false);
1338        return ret;
1339}
1340
1341void
1342xfs_inode_set_cowblocks_tag(
1343        xfs_inode_t     *ip)
1344{
1345        trace_xfs_inode_set_cowblocks_tag(ip);
1346        return xfs_blockgc_set_iflag(ip, XFS_ICOWBLOCKS);
1347}
1348
1349void
1350xfs_inode_clear_cowblocks_tag(
1351        xfs_inode_t     *ip)
1352{
1353        trace_xfs_inode_clear_cowblocks_tag(ip);
1354        return xfs_blockgc_clear_iflag(ip, XFS_ICOWBLOCKS);
1355}
1356
1357/* Disable post-EOF and CoW block auto-reclamation. */
1358void
1359xfs_blockgc_stop(
1360        struct xfs_mount        *mp)
1361{
1362        struct xfs_perag        *pag;
1363        xfs_agnumber_t          agno;
1364
1365        if (!xfs_clear_blockgc_enabled(mp))
1366                return;
1367
1368        for_each_perag(mp, agno, pag)
1369                cancel_delayed_work_sync(&pag->pag_blockgc_work);
1370        trace_xfs_blockgc_stop(mp, __return_address);
1371}
1372
1373/* Enable post-EOF and CoW block auto-reclamation. */
1374void
1375xfs_blockgc_start(
1376        struct xfs_mount        *mp)
1377{
1378        struct xfs_perag        *pag;
1379        xfs_agnumber_t          agno;
1380
1381        if (xfs_set_blockgc_enabled(mp))
1382                return;
1383
1384        trace_xfs_blockgc_start(mp, __return_address);
1385        for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG)
1386                xfs_blockgc_queue(pag);
1387}
1388
1389/* Don't try to run block gc on an inode that's in any of these states. */
1390#define XFS_BLOCKGC_NOGRAB_IFLAGS       (XFS_INEW | \
1391                                         XFS_NEED_INACTIVE | \
1392                                         XFS_INACTIVATING | \
1393                                         XFS_IRECLAIMABLE | \
1394                                         XFS_IRECLAIM)
1395/*
1396 * Decide if the given @ip is eligible for garbage collection of speculative
1397 * preallocations, and grab it if so.  Returns true if it's ready to go or
1398 * false if we should just ignore it.
1399 */
1400static bool
1401xfs_blockgc_igrab(
1402        struct xfs_inode        *ip)
1403{
1404        struct inode            *inode = VFS_I(ip);
1405
1406        ASSERT(rcu_read_lock_held());
1407
1408        /* Check for stale RCU freed inode */
1409        spin_lock(&ip->i_flags_lock);
1410        if (!ip->i_ino)
1411                goto out_unlock_noent;
1412
1413        if (ip->i_flags & XFS_BLOCKGC_NOGRAB_IFLAGS)
1414                goto out_unlock_noent;
1415        spin_unlock(&ip->i_flags_lock);
1416
1417        /* nothing to sync during shutdown */
1418        if (xfs_is_shutdown(ip->i_mount))
1419                return false;
1420
1421        /* If we can't grab the inode, it must be on its way to reclaim. */
1422        if (!igrab(inode))
1423                return false;
1424
1425        /* inode is valid */
1426        return true;
1427
1428out_unlock_noent:
1429        spin_unlock(&ip->i_flags_lock);
1430        return false;
1431}
1432
1433/* Scan one incore inode for block preallocations that we can remove. */
1434static int
1435xfs_blockgc_scan_inode(
1436        struct xfs_inode        *ip,
1437        struct xfs_icwalk       *icw)
1438{
1439        unsigned int            lockflags = 0;
1440        int                     error;
1441
1442        error = xfs_inode_free_eofblocks(ip, icw, &lockflags);
1443        if (error)
1444                goto unlock;
1445
1446        error = xfs_inode_free_cowblocks(ip, icw, &lockflags);
1447unlock:
1448        if (lockflags)
1449                xfs_iunlock(ip, lockflags);
1450        xfs_irele(ip);
1451        return error;
1452}
1453
1454/* Background worker that trims preallocated space. */
1455void
1456xfs_blockgc_worker(
1457        struct work_struct      *work)
1458{
1459        struct xfs_perag        *pag = container_of(to_delayed_work(work),
1460                                        struct xfs_perag, pag_blockgc_work);
1461        struct xfs_mount        *mp = pag->pag_mount;
1462        int                     error;
1463
1464        trace_xfs_blockgc_worker(mp, __return_address);
1465
1466        error = xfs_icwalk_ag(pag, XFS_ICWALK_BLOCKGC, NULL);
1467        if (error)
1468                xfs_info(mp, "AG %u preallocation gc worker failed, err=%d",
1469                                pag->pag_agno, error);
1470        xfs_blockgc_queue(pag);
1471}
1472
1473/*
1474 * Try to free space in the filesystem by purging inactive inodes, eofblocks
1475 * and cowblocks.
1476 */
1477int
1478xfs_blockgc_free_space(
1479        struct xfs_mount        *mp,
1480        struct xfs_icwalk       *icw)
1481{
1482        int                     error;
1483
1484        trace_xfs_blockgc_free_space(mp, icw, _RET_IP_);
1485
1486        error = xfs_icwalk(mp, XFS_ICWALK_BLOCKGC, icw);
1487        if (error)
1488                return error;
1489
1490        xfs_inodegc_flush(mp);
1491        return 0;
1492}
1493
1494/*
1495 * Reclaim all the free space that we can by scheduling the background blockgc
1496 * and inodegc workers immediately and waiting for them all to clear.
1497 */
1498void
1499xfs_blockgc_flush_all(
1500        struct xfs_mount        *mp)
1501{
1502        struct xfs_perag        *pag;
1503        xfs_agnumber_t          agno;
1504
1505        trace_xfs_blockgc_flush_all(mp, __return_address);
1506
1507        /*
1508         * For each blockgc worker, move its queue time up to now.  If it
1509         * wasn't queued, it will not be requeued.  Then flush whatever's
1510         * left.
1511         */
1512        for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG)
1513                mod_delayed_work(pag->pag_mount->m_blockgc_wq,
1514                                &pag->pag_blockgc_work, 0);
1515
1516        for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG)
1517                flush_delayed_work(&pag->pag_blockgc_work);
1518
1519        xfs_inodegc_flush(mp);
1520}
1521
1522/*
1523 * Run cow/eofblocks scans on the supplied dquots.  We don't know exactly which
1524 * quota caused an allocation failure, so we make a best effort by including
1525 * each quota under low free space conditions (less than 1% free space) in the
1526 * scan.
1527 *
1528 * Callers must not hold any inode's ILOCK.  If requesting a synchronous scan
1529 * (XFS_ICWALK_FLAG_SYNC), the caller also must not hold any inode's IOLOCK or
1530 * MMAPLOCK.
1531 */
1532int
1533xfs_blockgc_free_dquots(
1534        struct xfs_mount        *mp,
1535        struct xfs_dquot        *udqp,
1536        struct xfs_dquot        *gdqp,
1537        struct xfs_dquot        *pdqp,
1538        unsigned int            iwalk_flags)
1539{
1540        struct xfs_icwalk       icw = {0};
1541        bool                    do_work = false;
1542
1543        if (!udqp && !gdqp && !pdqp)
1544                return 0;
1545
1546        /*
1547         * Run a scan to free blocks using the union filter to cover all
1548         * applicable quotas in a single scan.
1549         */
1550        icw.icw_flags = XFS_ICWALK_FLAG_UNION | iwalk_flags;
1551
1552        if (XFS_IS_UQUOTA_ENFORCED(mp) && udqp && xfs_dquot_lowsp(udqp)) {
1553                icw.icw_uid = make_kuid(mp->m_super->s_user_ns, udqp->q_id);
1554                icw.icw_flags |= XFS_ICWALK_FLAG_UID;
1555                do_work = true;
1556        }
1557
1558        if (XFS_IS_GQUOTA_ENFORCED(mp) && gdqp && xfs_dquot_lowsp(gdqp)) {
1559                icw.icw_gid = make_kgid(mp->m_super->s_user_ns, gdqp->q_id);
1560                icw.icw_flags |= XFS_ICWALK_FLAG_GID;
1561                do_work = true;
1562        }
1563
1564        if (XFS_IS_PQUOTA_ENFORCED(mp) && pdqp && xfs_dquot_lowsp(pdqp)) {
1565                icw.icw_prid = pdqp->q_id;
1566                icw.icw_flags |= XFS_ICWALK_FLAG_PRID;
1567                do_work = true;
1568        }
1569
1570        if (!do_work)
1571                return 0;
1572
1573        return xfs_blockgc_free_space(mp, &icw);
1574}
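/*
 * Editor's illustrative sketch: the intended caller pattern for the function
 * above, loosely modelled on the transaction/quota reservation retry loops
 * elsewhere in XFS -- on an EDQUOT/ENOSPC failure, run one blockgc pass over
 * the quotas involved and retry exactly once.  The helper name and the
 * stand-in reservation step are hypothetical.
 */
static inline int
xfs_example_reserve_with_blockgc_retry(
	struct xfs_mount	*mp,
	struct xfs_dquot	*udqp,
	struct xfs_dquot	*gdqp,
	struct xfs_dquot	*pdqp)
{
	bool			retried = false;
	int			error;

retry:
	error = -EDQUOT;	/* stand-in for a real quota reservation */
	if ((error == -EDQUOT || error == -ENOSPC) && !retried) {
		/* Free speculative preallocations charged to these quotas. */
		xfs_blockgc_free_dquots(mp, udqp, gdqp, pdqp, 0);
		retried = true;
		goto retry;
	}
	return error;
}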
1575
1576/* Run cow/eofblocks scans on the quotas attached to the inode. */
1577int
1578xfs_blockgc_free_quota(
1579        struct xfs_inode        *ip,
1580        unsigned int            iwalk_flags)
1581{
1582        return xfs_blockgc_free_dquots(ip->i_mount,
1583                        xfs_inode_dquot(ip, XFS_DQTYPE_USER),
1584                        xfs_inode_dquot(ip, XFS_DQTYPE_GROUP),
1585                        xfs_inode_dquot(ip, XFS_DQTYPE_PROJ), iwalk_flags);
1586}
1587
1588/* XFS Inode Cache Walking Code */
1589
1590/*
1591 * The inode lookup is done in batches to keep the amount of lock traffic and
1592 * radix tree lookups to a minimum. The batch size is a trade off between
1593 * lookup reduction and stack usage. This is in the reclaim path, so we can't
1594 * be too greedy.
1595 */
1596#define XFS_LOOKUP_BATCH        32
1597
1598
1599/*
1600 * Decide if we want to grab this inode in anticipation of doing work towards
1601 * the goal.
1602 */
1603static inline bool
1604xfs_icwalk_igrab(
1605        enum xfs_icwalk_goal    goal,
1606        struct xfs_inode        *ip,
1607        struct xfs_icwalk       *icw)
1608{
1609        switch (goal) {
1610        case XFS_ICWALK_BLOCKGC:
1611                return xfs_blockgc_igrab(ip);
1612        case XFS_ICWALK_RECLAIM:
1613                return xfs_reclaim_igrab(ip, icw);
1614        default:
1615                return false;
1616        }
1617}
1618
1619/*
1620 * Process an inode.  Each processing function must handle any state changes
1621 * made by the icwalk igrab function.  Return -EAGAIN to skip an inode.
1622 */
1623static inline int
1624xfs_icwalk_process_inode(
1625        enum xfs_icwalk_goal    goal,
1626        struct xfs_inode        *ip,
1627        struct xfs_perag        *pag,
1628        struct xfs_icwalk       *icw)
1629{
1630        int                     error = 0;
1631
1632        switch (goal) {
1633        case XFS_ICWALK_BLOCKGC:
1634                error = xfs_blockgc_scan_inode(ip, icw);
1635                break;
1636        case XFS_ICWALK_RECLAIM:
1637                xfs_reclaim_inode(ip, pag);
1638                break;
1639        }
1640        return error;
1641}
1642
1643/*
1644 * For a given per-AG structure @pag and a goal, grab qualifying inodes and
1645 * process them in some manner.
1646 */
1647static int
1648xfs_icwalk_ag(
1649        struct xfs_perag        *pag,
1650        enum xfs_icwalk_goal    goal,
1651        struct xfs_icwalk       *icw)
1652{
1653        struct xfs_mount        *mp = pag->pag_mount;
1654        uint32_t                first_index;
1655        int                     last_error = 0;
1656        int                     skipped;
1657        bool                    done;
1658        int                     nr_found;
1659
1660restart:
1661        done = false;
1662        skipped = 0;
1663        if (goal == XFS_ICWALK_RECLAIM)
1664                first_index = READ_ONCE(pag->pag_ici_reclaim_cursor);
1665        else
1666                first_index = 0;
1667        nr_found = 0;
1668        do {
1669                struct xfs_inode *batch[XFS_LOOKUP_BATCH];
1670                int             error = 0;
1671                int             i;
1672
1673                rcu_read_lock();
1674
1675                nr_found = radix_tree_gang_lookup_tag(&pag->pag_ici_root,
1676                                (void **) batch, first_index,
1677                                XFS_LOOKUP_BATCH, goal);
1678                if (!nr_found) {
1679                        done = true;
1680                        rcu_read_unlock();
1681                        break;
1682                }
1683
1684                /*
1685                 * Grab the inodes before we drop the lock. If we found
1686                 * nothing, nr_found == 0 and the loop will be skipped.
1687                 */
1688                for (i = 0; i < nr_found; i++) {
1689                        struct xfs_inode *ip = batch[i];
1690
1691                        if (done || !xfs_icwalk_igrab(goal, ip, icw))
1692                                batch[i] = NULL;
1693
1694                        /*
1695                         * Update the index for the next lookup. Catch
1696                         * overflows into the next AG range which can occur if
1697                         * we have inodes in the last block of the AG and we
1698                         * are currently pointing to the last inode.
1699                         *
1700                         * Because we may see inodes that are from the wrong AG
1701                         * due to RCU freeing and reallocation, only update the
1702                         * index if it lies in this AG. It was a race that led
1703                         * us to see this inode, so another lookup from the
1704                         * same index will not find it again.
1705                         */
1706                        if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno)
1707                                continue;
1708                        first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
1709                        if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
1710                                done = true;
1711                }
1712
1713                /* unlock now we've grabbed the inodes. */
1714                rcu_read_unlock();
1715
1716                for (i = 0; i < nr_found; i++) {
1717                        if (!batch[i])
1718                                continue;
1719                        error = xfs_icwalk_process_inode(goal, batch[i], pag,
1720                                        icw);
1721                        if (error == -EAGAIN) {
1722                                skipped++;
1723                                continue;
1724                        }
1725                        if (error && last_error != -EFSCORRUPTED)
1726                                last_error = error;
1727                }
1728
1729                /* bail out if the filesystem is corrupted.  */
1730                if (error == -EFSCORRUPTED)
1731                        break;
1732
1733                cond_resched();
1734
1735                if (icw && (icw->icw_flags & XFS_ICWALK_FLAG_SCAN_LIMIT)) {
1736                        icw->icw_scan_limit -= XFS_LOOKUP_BATCH;
1737                        if (icw->icw_scan_limit <= 0)
1738                                break;
1739                }
1740        } while (nr_found && !done);
1741
1742        if (goal == XFS_ICWALK_RECLAIM) {
1743                if (done)
1744                        first_index = 0;
1745                WRITE_ONCE(pag->pag_ici_reclaim_cursor, first_index);
1746        }
1747
1748        if (skipped) {
1749                delay(1);
1750                goto restart;
1751        }
1752        return last_error;
1753}
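/*
 * Editor's worked example for the cursor overflow check in xfs_icwalk_ag():
 * XFS_INO_TO_AGINO() keeps only the low "inode number within the AG" bits.
 * Assuming, say, 20 agino bits, the last possible inode of an AG has agino
 * 0xfffff; ip->i_ino + 1 then belongs to the next AG, its agino wraps to 0,
 * and 0 < 0xfffff trips the "done" test so the walk stops instead of
 * continuing with a cursor that no longer lies in this AG.
 */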
1754
1755/* Walk all incore inodes to achieve a given goal. */
1756static int
1757xfs_icwalk(
1758        struct xfs_mount        *mp,
1759        enum xfs_icwalk_goal    goal,
1760        struct xfs_icwalk       *icw)
1761{
1762        struct xfs_perag        *pag;
1763        int                     error = 0;
1764        int                     last_error = 0;
1765        xfs_agnumber_t          agno;
1766
1767        for_each_perag_tag(mp, agno, pag, goal) {
1768                error = xfs_icwalk_ag(pag, goal, icw);
1769                if (error) {
1770                        last_error = error;
1771                        if (error == -EFSCORRUPTED) {
1772                                xfs_perag_put(pag);
1773                                break;
1774                        }
1775                }
1776        }
1777        return last_error;
1778        BUILD_BUG_ON(XFS_ICWALK_PRIVATE_FLAGS & XFS_ICWALK_FLAGS_VALID);
1779}
1780
1781#ifdef DEBUG
1782static void
1783xfs_check_delalloc(
1784        struct xfs_inode        *ip,
1785        int                     whichfork)
1786{
1787        struct xfs_ifork        *ifp = XFS_IFORK_PTR(ip, whichfork);
1788        struct xfs_bmbt_irec    got;
1789        struct xfs_iext_cursor  icur;
1790
1791        if (!ifp || !xfs_iext_lookup_extent(ip, ifp, 0, &icur, &got))
1792                return;
1793        do {
1794                if (isnullstartblock(got.br_startblock)) {
1795                        xfs_warn(ip->i_mount,
1796        "ino %llx %s fork has delalloc extent at [0x%llx:0x%llx]",
1797                                ip->i_ino,
1798                                whichfork == XFS_DATA_FORK ? "data" : "cow",
1799                                got.br_startoff, got.br_blockcount);
1800                }
1801        } while (xfs_iext_next_extent(ifp, &icur, &got));
1802}
1803#else
1804#define xfs_check_delalloc(ip, whichfork)       do { } while (0)
1805#endif
1806
1807/* Schedule the inode for reclaim. */
1808static void
1809xfs_inodegc_set_reclaimable(
1810        struct xfs_inode        *ip)
1811{
1812        struct xfs_mount        *mp = ip->i_mount;
1813        struct xfs_perag        *pag;
1814
1815        if (!xfs_is_shutdown(mp) && ip->i_delayed_blks) {
1816                xfs_check_delalloc(ip, XFS_DATA_FORK);
1817                xfs_check_delalloc(ip, XFS_COW_FORK);
1818                ASSERT(0);
1819        }
1820
1821        pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
1822        spin_lock(&pag->pag_ici_lock);
1823        spin_lock(&ip->i_flags_lock);
1824
1825        trace_xfs_inode_set_reclaimable(ip);
1826        ip->i_flags &= ~(XFS_NEED_INACTIVE | XFS_INACTIVATING);
1827        ip->i_flags |= XFS_IRECLAIMABLE;
1828        xfs_perag_set_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino),
1829                        XFS_ICI_RECLAIM_TAG);
1830
1831        spin_unlock(&ip->i_flags_lock);
1832        spin_unlock(&pag->pag_ici_lock);
1833        xfs_perag_put(pag);
1834}
1835
1836/*
1837 * Free all speculative preallocations and possibly even the inode itself.
1838 * This is the last chance to make changes to an otherwise unreferenced file
1839 * before incore reclamation happens.
1840 */
1841static void
1842xfs_inodegc_inactivate(
1843        struct xfs_inode        *ip)
1844{
1845        trace_xfs_inode_inactivating(ip);
1846        xfs_inactive(ip);
1847        xfs_inodegc_set_reclaimable(ip);
1848}
1849
1850void
1851xfs_inodegc_worker(
1852        struct work_struct      *work)
1853{
1854        struct xfs_inodegc      *gc = container_of(work, struct xfs_inodegc,
1855                                                        work);
1856        struct llist_node       *node = llist_del_all(&gc->list);
1857        struct xfs_inode        *ip, *n;
1858
1859        WRITE_ONCE(gc->items, 0);
1860
1861        if (!node)
1862                return;
1863
1864        ip = llist_entry(node, struct xfs_inode, i_gclist);
1865        trace_xfs_inodegc_worker(ip->i_mount, READ_ONCE(gc->shrinker_hits));
1866
1867        WRITE_ONCE(gc->shrinker_hits, 0);
1868        llist_for_each_entry_safe(ip, n, node, i_gclist) {
1869                xfs_iflags_set(ip, XFS_INACTIVATING);
1870                xfs_inodegc_inactivate(ip);
1871        }
1872}
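/*
 * Editor's note: the worker above relies on the lock-free llist hand-off
 * pattern -- producers push with llist_add(), the consumer claims the whole
 * chain at once with llist_del_all() and then walks it privately.  A minimal
 * generic sketch (hypothetical names; assumes <linux/llist.h> and
 * <linux/slab.h>):
 */
struct example_gc_item {
	struct llist_node	node;
};

static LLIST_HEAD(example_gc_list);

static inline void example_gc_push(struct example_gc_item *item)
{
	/* Safe from any context; no locks taken. */
	llist_add(&item->node, &example_gc_list);
}

static inline void example_gc_drain(void)
{
	struct llist_node	*first = llist_del_all(&example_gc_list);
	struct example_gc_item	*item, *n;

	/* The detached chain now belongs solely to this thread. */
	llist_for_each_entry_safe(item, n, first, node)
		kfree(item);
}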
1873
1874/*
1875 * Force all currently queued inode inactivation work to run immediately, and
1876 * wait for the work to finish. Two passes: queue all the work in the first
1877 * pass, then wait for it in the second.
1878 */
1879void
1880xfs_inodegc_flush(
1881        struct xfs_mount        *mp)
1882{
1883        struct xfs_inodegc      *gc;
1884        int                     cpu;
1885
1886        if (!xfs_is_inodegc_enabled(mp))
1887                return;
1888
1889        trace_xfs_inodegc_flush(mp, __return_address);
1890
1891        xfs_inodegc_queue_all(mp);
1892
1893        for_each_online_cpu(cpu) {
1894                gc = per_cpu_ptr(mp->m_inodegc, cpu);
1895                flush_work(&gc->work);
1896        }
1897}
1898
1899/*
1900 * Flush all the pending work and then disable the inode inactivation background
1901 * workers and wait for them to stop.
1902 */
1903void
1904xfs_inodegc_stop(
1905        struct xfs_mount        *mp)
1906{
1907        struct xfs_inodegc      *gc;
1908        int                     cpu;
1909
1910        if (!xfs_clear_inodegc_enabled(mp))
1911                return;
1912
1913        xfs_inodegc_queue_all(mp);
1914
1915        for_each_online_cpu(cpu) {
1916                gc = per_cpu_ptr(mp->m_inodegc, cpu);
1917                cancel_work_sync(&gc->work);
1918        }
1919        trace_xfs_inodegc_stop(mp, __return_address);
1920}
1921
1922/*
1923 * Enable the inode inactivation background workers and schedule deferred inode
1924 * inactivation work if there is any.
1925 */
1926void
1927xfs_inodegc_start(
1928        struct xfs_mount        *mp)
1929{
1930        if (xfs_set_inodegc_enabled(mp))
1931                return;
1932
1933        trace_xfs_inodegc_start(mp, __return_address);
1934        xfs_inodegc_queue_all(mp);
1935}
1936
1937#ifdef CONFIG_XFS_RT
1938static inline bool
1939xfs_inodegc_want_queue_rt_file(
1940        struct xfs_inode        *ip)
1941{
1942        struct xfs_mount        *mp = ip->i_mount;
1943        uint64_t                freertx;
1944
1945        if (!XFS_IS_REALTIME_INODE(ip))
1946                return false;
1947
1948        freertx = READ_ONCE(mp->m_sb.sb_frextents);
1949        return freertx < mp->m_low_rtexts[XFS_LOWSP_5_PCNT];
1950}
1951#else
1952# define xfs_inodegc_want_queue_rt_file(ip)     (false)
1953#endif /* CONFIG_XFS_RT */
1954
1955/*
1956 * Schedule the inactivation worker when:
1957 *
1958 *  - We've accumulated more than one inode cluster buffer's worth of inodes.
1959 *  - There is less than 5% free space left.
 *  - It's a realtime file and the rt volume is low on free extents.
1960 *  - Any of the quotas for this inode are near an enforcement limit.
1961 */
1962static inline bool
1963xfs_inodegc_want_queue_work(
1964        struct xfs_inode        *ip,
1965        unsigned int            items)
1966{
1967        struct xfs_mount        *mp = ip->i_mount;
1968
1969        if (items > mp->m_ino_geo.inodes_per_cluster)
1970                return true;
1971
1972        if (__percpu_counter_compare(&mp->m_fdblocks,
1973                                mp->m_low_space[XFS_LOWSP_5_PCNT],
1974                                XFS_FDBLOCKS_BATCH) < 0)
1975                return true;
1976
1977        if (xfs_inodegc_want_queue_rt_file(ip))
1978                return true;
1979
1980        if (xfs_inode_near_dquot_enforcement(ip, XFS_DQTYPE_USER))
1981                return true;
1982
1983        if (xfs_inode_near_dquot_enforcement(ip, XFS_DQTYPE_GROUP))
1984                return true;
1985
1986        if (xfs_inode_near_dquot_enforcement(ip, XFS_DQTYPE_PROJ))
1987                return true;
1988
1989        return false;
1990}
1991
1992/*
1993 * Upper bound on the number of inodes that each per-cpu inodegc queue can
1994 * hold for inactivation at any given time, to avoid monopolizing the workqueue.
1995 */
1996#define XFS_INODEGC_MAX_BACKLOG         (4 * XFS_INODES_PER_CHUNK)
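/*
 * Editor's note: XFS_INODES_PER_CHUNK is 64 (one bit per inode in the 64-bit
 * inobt free mask), so this works out to a cap of 256 queued inodes on each
 * per-cpu queue before xfs_inodegc_want_flush_work() below starts throttling
 * callers.
 */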
1997
1998/*
1999 * Make the frontend wait for inactivations when:
2000 *
2001 *  - Memory shrinkers queued the inactivation worker and it hasn't finished.
2002 *  - The queue depth exceeds the maximum allowable percpu backlog.
2003 *
2004 * Note: If the current thread is running a transaction, we don't ever want to
2005 * wait for other transactions because that could introduce a deadlock.
2006 */
2007static inline bool
2008xfs_inodegc_want_flush_work(
2009        struct xfs_inode        *ip,
2010        unsigned int            items,
2011        unsigned int            shrinker_hits)
2012{
2013        if (current->journal_info)
2014                return false;
2015
2016        if (shrinker_hits > 0)
2017                return true;
2018
2019        if (items > XFS_INODEGC_MAX_BACKLOG)
2020                return true;
2021
2022        return false;
2023}
2024
2025/*
2026 * Queue a background inactivation worker if there are inodes that need to be
2027 * inactivated and higher level xfs code hasn't disabled the background
2028 * workers.
2029 */
2030static void
2031xfs_inodegc_queue(
2032        struct xfs_inode        *ip)
2033{
2034        struct xfs_mount        *mp = ip->i_mount;
2035        struct xfs_inodegc      *gc;
2036        int                     items;
2037        unsigned int            shrinker_hits;
2038
2039        trace_xfs_inode_set_need_inactive(ip);
2040        spin_lock(&ip->i_flags_lock);
2041        ip->i_flags |= XFS_NEED_INACTIVE;
2042        spin_unlock(&ip->i_flags_lock);
2043
2044        gc = get_cpu_ptr(mp->m_inodegc);
2045        llist_add(&ip->i_gclist, &gc->list);
2046        items = READ_ONCE(gc->items);
2047        WRITE_ONCE(gc->items, items + 1);
2048        shrinker_hits = READ_ONCE(gc->shrinker_hits);
2049        put_cpu_ptr(gc);
2050
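	/*
	 * Editor's note: the queue_work()/flush_work() calls below happen
	 * only after put_cpu_ptr() has re-enabled preemption, since
	 * flush_work() can sleep; the per-cpu pointer itself stays valid
	 * after the put.
	 */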
2051        if (!xfs_is_inodegc_enabled(mp))
2052                return;
2053
2054        if (xfs_inodegc_want_queue_work(ip, items)) {
2055                trace_xfs_inodegc_queue(mp, __return_address);
2056                queue_work(mp->m_inodegc_wq, &gc->work);
2057        }
2058
2059        if (xfs_inodegc_want_flush_work(ip, items, shrinker_hits)) {
2060                trace_xfs_inodegc_throttle(mp, __return_address);
2061                flush_work(&gc->work);
2062        }
2063}
2064
2065/*
2066 * Fold the dead CPU's inodegc queue into the current CPU's queue.
2067 */
2068void
2069xfs_inodegc_cpu_dead(
2070        struct xfs_mount        *mp,
2071        unsigned int            dead_cpu)
2072{
2073        struct xfs_inodegc      *dead_gc, *gc;
2074        struct llist_node       *first, *last;
2075        unsigned int            count = 0;
2076
2077        dead_gc = per_cpu_ptr(mp->m_inodegc, dead_cpu);
2078        cancel_work_sync(&dead_gc->work);
2079
2080        if (llist_empty(&dead_gc->list))
2081                return;
2082
2083        first = dead_gc->list.first;
2084        last = first;
2085        while (last->next) {
2086                last = last->next;
2087                count++;
2088        }
2089        dead_gc->list.first = NULL;
2090        dead_gc->items = 0;
2091
2092        /* Add pending work to current CPU */
2093        gc = get_cpu_ptr(mp->m_inodegc);
2094        llist_add_batch(first, last, &gc->list);
2095        count += READ_ONCE(gc->items);
2096        WRITE_ONCE(gc->items, count);
2097        put_cpu_ptr(gc);
2098
2099        if (xfs_is_inodegc_enabled(mp)) {
2100                trace_xfs_inodegc_queue(mp, __return_address);
2101                queue_work(mp->m_inodegc_wq, &gc->work);
2102        }
2103}
2104
2105/*
2106 * We set the inode flag atomically with the radix tree tag.  Once we get tag
2107 * lookups on the radix tree, this inode flag can go away.
2108 *
2109 * We always use background reclaim here because even if the inode is clean, it
2110 * still may be under IO and hence we have to wait for IO completion to occur
2111 * before we can reclaim the inode. The background reclaim path handles this
2112 * more efficiently than we can here, so simply let background reclaim tear down
2113 * all inodes.
2114 */
2115void
2116xfs_inode_mark_reclaimable(
2117        struct xfs_inode        *ip)
2118{
2119        struct xfs_mount        *mp = ip->i_mount;
2120        bool                    need_inactive;
2121
2122        XFS_STATS_INC(mp, vn_reclaim);
2123
2124        /*
2125         * We should never get here with any of the reclaim flags already set.
2126         */
2127        ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_ALL_IRECLAIM_FLAGS));
2128
2129        need_inactive = xfs_inode_needs_inactive(ip);
2130        if (need_inactive) {
2131                xfs_inodegc_queue(ip);
2132                return;
2133        }
2134
2135        /* Going straight to reclaim, so drop the dquots. */
2136        xfs_qm_dqdetach(ip);
2137        xfs_inodegc_set_reclaimable(ip);
2138}
2139
2140/*
2141 * Register a phony shrinker so that we can run background inodegc sooner when
2142 * there's memory pressure.  Inactivation does not itself free any memory but
2143 * it does make inodes reclaimable, which eventually frees memory.
2144 *
2145 * The count function, seek value, and batch value are crafted to trigger the
2146 * scan function during the second round of scanning.  Hopefully this means
2147 * that we reclaimed enough memory that initiating metadata transactions won't
2148 * make things worse.
2149 */
2150#define XFS_INODEGC_SHRINKER_COUNT      (1UL << DEF_PRIORITY)
2151#define XFS_INODEGC_SHRINKER_BATCH      ((XFS_INODEGC_SHRINKER_COUNT / 2) + 1)
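/*
 * Editor's worked example, assuming the current do_shrink_slab() arithmetic:
 * with DEF_PRIORITY == 12 the count callback reports 4096 objects and the
 * batch is 2049.  Because ->seeks is zero, each reclaim round proposes about
 * freeable / 2 == 2048 objects, just under the batch size, so no scan happens
 * on the first round; the leftover carries forward and the second round
 * crosses the batch threshold, giving the "second round of scanning"
 * behaviour described above.
 */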
2152
2153static unsigned long
2154xfs_inodegc_shrinker_count(
2155        struct shrinker         *shrink,
2156        struct shrink_control   *sc)
2157{
2158        struct xfs_mount        *mp = container_of(shrink, struct xfs_mount,
2159                                                   m_inodegc_shrinker);
2160        struct xfs_inodegc      *gc;
2161        int                     cpu;
2162
2163        if (!xfs_is_inodegc_enabled(mp))
2164                return 0;
2165
2166        for_each_online_cpu(cpu) {
2167                gc = per_cpu_ptr(mp->m_inodegc, cpu);
2168                if (!llist_empty(&gc->list))
2169                        return XFS_INODEGC_SHRINKER_COUNT;
2170        }
2171
2172        return 0;
2173}
2174
2175static unsigned long
2176xfs_inodegc_shrinker_scan(
2177        struct shrinker         *shrink,
2178        struct shrink_control   *sc)
2179{
2180        struct xfs_mount        *mp = container_of(shrink, struct xfs_mount,
2181                                                   m_inodegc_shrinker);
2182        struct xfs_inodegc      *gc;
2183        int                     cpu;
2184        bool                    no_items = true;
2185
2186        if (!xfs_is_inodegc_enabled(mp))
2187                return SHRINK_STOP;
2188
2189        trace_xfs_inodegc_shrinker_scan(mp, sc, __return_address);
2190
2191        for_each_online_cpu(cpu) {
2192                gc = per_cpu_ptr(mp->m_inodegc, cpu);
2193                if (!llist_empty(&gc->list)) {
2194                        unsigned int    h = READ_ONCE(gc->shrinker_hits);
2195
2196                        WRITE_ONCE(gc->shrinker_hits, h + 1);
2197                        queue_work_on(cpu, mp->m_inodegc_wq, &gc->work);
2198                        no_items = false;
2199                }
2200        }
2201
2202        /*
2203         * If there are no inodes to inactivate, we don't want the shrinker
2204         * to think there's deferred work to call us back about.
2205         */
2206        if (no_items)
2207                return LONG_MAX;
2208
2209        return SHRINK_STOP;
2210}
2211
2212/* Register a shrinker so we can accelerate inodegc and throttle queuing. */
2213int
2214xfs_inodegc_register_shrinker(
2215        struct xfs_mount        *mp)
2216{
2217        struct shrinker         *shrink = &mp->m_inodegc_shrinker;
2218
2219        shrink->count_objects = xfs_inodegc_shrinker_count;
2220        shrink->scan_objects = xfs_inodegc_shrinker_scan;
2221        shrink->seeks = 0;
2222        shrink->flags = SHRINKER_NONSLAB;
2223        shrink->batch = XFS_INODEGC_SHRINKER_BATCH;
2224
2225        return register_shrinker(shrink);
2226}
2227