linux/fs/xfs/xfs_sync.c
/*
 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write the Free Software Foundation,
 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_types.h"
#include "xfs_log.h"
#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_bmap_btree.h"
#include "xfs_inode.h"
#include "xfs_dinode.h"
#include "xfs_error.h"
#include "xfs_filestream.h"
#include "xfs_vnodeops.h"
#include "xfs_inode_item.h"
#include "xfs_quota.h"
#include "xfs_trace.h"
#include "xfs_fsops.h"

#include <linux/kthread.h>
#include <linux/freezer.h>

struct workqueue_struct *xfs_syncd_wq;  /* sync workqueue */

/*
 * The inode lookup is done in batches to keep the amount of lock traffic and
 * radix tree lookups to a minimum. The batch size is a trade off between
 * lookup reduction and stack usage. This is in the reclaim path, so we can't
 * be too greedy.
 */
#define XFS_LOOKUP_BATCH        32

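/*
 * Grab a reference to an inode found during an AG walk so that it cannot be
 * reclaimed while we operate on it.  Returns 0 on success, ENOENT if the
 * inode should be skipped, or EFSCORRUPTED if the filesystem is shut down.
 */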
STATIC int
xfs_inode_ag_walk_grab(
        struct xfs_inode        *ip)
{
        struct inode            *inode = VFS_I(ip);

        ASSERT(rcu_read_lock_held());

        /*
         * check for stale RCU freed inode
         *
         * If the inode has been reallocated, it doesn't matter if it's not in
         * the AG we are walking - we are walking for writeback, so if it
         * passes all the "valid inode" checks and is dirty, then we'll write
         * it back anyway.  If it has been reallocated and is still being
         * initialised, the XFS_INEW check below will catch it.
         */
        spin_lock(&ip->i_flags_lock);
        if (!ip->i_ino)
                goto out_unlock_noent;

        /* avoid new or reclaimable inodes. Leave for reclaim code to flush */
        if (__xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
                goto out_unlock_noent;
        spin_unlock(&ip->i_flags_lock);

        /* nothing to sync during shutdown */
        if (XFS_FORCED_SHUTDOWN(ip->i_mount))
                return EFSCORRUPTED;

        /* If we can't grab the inode, it must be on its way to reclaim. */
        if (!igrab(inode))
                return ENOENT;

        if (is_bad_inode(inode)) {
                IRELE(ip);
                return ENOENT;
        }

        /* inode is valid */
        return 0;

out_unlock_noent:
        spin_unlock(&ip->i_flags_lock);
        return ENOENT;
}

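/*
 * Walk all in-core inodes in a single AG, calling @execute on each inode
 * that can be grabbed.  Lookups are batched through the per-AG radix tree
 * to minimise lock traffic; if any callback returns EAGAIN the whole walk
 * is restarted after a short delay.
 */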
STATIC int
xfs_inode_ag_walk(
        struct xfs_mount        *mp,
        struct xfs_perag        *pag,
        int                     (*execute)(struct xfs_inode *ip,
                                           struct xfs_perag *pag, int flags),
        int                     flags)
{
        uint32_t                first_index;
        int                     last_error = 0;
        int                     skipped;
        int                     done;
        int                     nr_found;

restart:
        done = 0;
        skipped = 0;
        first_index = 0;
        nr_found = 0;
        do {
                struct xfs_inode *batch[XFS_LOOKUP_BATCH];
                int             error = 0;
                int             i;

                rcu_read_lock();
                nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
                                        (void **)batch, first_index,
                                        XFS_LOOKUP_BATCH);
                if (!nr_found) {
                        rcu_read_unlock();
                        break;
                }

                /*
                 * Grab the inodes before we drop the lock. If we found
                 * nothing, nr_found == 0 and the loop will be skipped.
                 */
                for (i = 0; i < nr_found; i++) {
                        struct xfs_inode *ip = batch[i];

                        if (done || xfs_inode_ag_walk_grab(ip))
                                batch[i] = NULL;

                        /*
                         * Update the index for the next lookup. Catch
                         * overflows into the next AG range which can occur if
                         * we have inodes in the last block of the AG and we
                         * are currently pointing to the last inode.
                         *
                         * Because we may see inodes that are from the wrong AG
                         * due to RCU freeing and reallocation, only update the
                         * index if it lies in this AG. It was a race that led
                         * us to see this inode, so another lookup from the
                         * same index will not find it again.
                         */
                        if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno)
                                continue;
                        first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
                        if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
                                done = 1;
                }

                /* unlock now that we've grabbed the inodes. */
                rcu_read_unlock();

                for (i = 0; i < nr_found; i++) {
                        if (!batch[i])
                                continue;
                        error = execute(batch[i], pag, flags);
                        IRELE(batch[i]);
                        if (error == EAGAIN) {
                                skipped++;
                                continue;
                        }
                        if (error && last_error != EFSCORRUPTED)
                                last_error = error;
                }

                /* bail out if the filesystem is corrupted.  */
                if (error == EFSCORRUPTED)
                        break;

                cond_resched();

        } while (nr_found && !done);

        if (skipped) {
                delay(1);
                goto restart;
        }
        return last_error;
}

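/*
 * Apply @execute to every grabbable in-core inode in the filesystem by
 * walking each AG in turn.  For example, xfs_sync_data() below passes
 * xfs_sync_inode_data as the callback to write back dirty pagecache across
 * the whole filesystem.  The walk stops early on EFSCORRUPTED.
 */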
int
xfs_inode_ag_iterator(
        struct xfs_mount        *mp,
        int                     (*execute)(struct xfs_inode *ip,
                                           struct xfs_perag *pag, int flags),
        int                     flags)
{
        struct xfs_perag        *pag;
        int                     error = 0;
        int                     last_error = 0;
        xfs_agnumber_t          ag;

        ag = 0;
        while ((pag = xfs_perag_get(mp, ag))) {
                ag = pag->pag_agno + 1;
                error = xfs_inode_ag_walk(mp, pag, execute, flags);
                xfs_perag_put(pag);
                if (error) {
                        last_error = error;
                        if (error == EFSCORRUPTED)
                                break;
                }
        }
        return XFS_ERROR(last_error);
}

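/*
 * Write back the dirty pagecache of a single inode.  With SYNC_TRYLOCK we
 * skip inodes whose iolock is contended; with SYNC_WAIT the pages are
 * written synchronously, otherwise writeback is only started asynchronously.
 */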
STATIC int
xfs_sync_inode_data(
        struct xfs_inode        *ip,
        struct xfs_perag        *pag,
        int                     flags)
{
        struct inode            *inode = VFS_I(ip);
        struct address_space *mapping = inode->i_mapping;
        int                     error = 0;

        if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
                return 0;

        if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) {
                if (flags & SYNC_TRYLOCK)
                        return 0;
                xfs_ilock(ip, XFS_IOLOCK_SHARED);
        }

        error = xfs_flush_pages(ip, 0, -1, (flags & SYNC_WAIT) ?
                                0 : XBF_ASYNC, FI_NONE);
        xfs_iunlock(ip, XFS_IOLOCK_SHARED);
        return error;
}

/*
 * Write out pagecache data for the whole filesystem.
 */
STATIC int
xfs_sync_data(
        struct xfs_mount        *mp,
        int                     flags)
{
        int                     error;

        ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0);

        error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags);
        if (error)
                return XFS_ERROR(error);

        xfs_log_force(mp, (flags & SYNC_WAIT) ? XFS_LOG_SYNC : 0);
        return 0;
}

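/*
 * Write the in-core superblock out to disk, forcing the log first if the
 * superblock buffer is pinned so that the write does not block waiting for
 * the log to be flushed.
 */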
STATIC int
xfs_sync_fsdata(
        struct xfs_mount        *mp)
{
        struct xfs_buf          *bp;
        int                     error;

        /*
         * If the buffer is pinned then push on the log so we won't get stuck
         * waiting in the write for someone, maybe ourselves, to flush the log.
         *
         * Even though we just pushed the log above, we did not have the
         * superblock buffer locked at that point so it can become pinned in
         * between there and here.
         */
        bp = xfs_getsb(mp, 0);
        if (xfs_buf_ispinned(bp))
                xfs_log_force(mp, 0);
        error = xfs_bwrite(bp);
        xfs_buf_relse(bp);
        return error;
}

/*
 * When remounting a filesystem read-only or freezing the filesystem, we have
 * two phases to execute. This first phase is syncing the data before we
 * quiesce the filesystem, and the second is flushing all the inodes out after
 * we've waited for all the transactions created by the first phase to
 * complete. The second phase ensures that the inodes are written to their
 * location on disk rather than just existing in transactions in the log. This
 * means after a quiesce there is no log replay required to write the inodes to
 * disk (this is the main difference between a sync and a quiesce).
 */
/*
 * First stage of freeze - no writers will make progress now we are here,
 * so we flush delwri and delalloc buffers here, then wait for all I/O to
 * complete.  Data is frozen at that point. Metadata is not frozen,
 * transactions can still occur here so don't bother emptying the AIL
 * because it'll just get dirty again.
 */
int
xfs_quiesce_data(
        struct xfs_mount        *mp)
{
        int                     error, error2 = 0;

        /* force out the log */
        xfs_log_force(mp, XFS_LOG_SYNC);

        /* write superblock and hoover up shutdown errors */
        error = xfs_sync_fsdata(mp);

        /* mark the log as covered if needed */
        if (xfs_log_need_covered(mp))
                error2 = xfs_fs_log_dummy(mp);

        return error ? error : error2;
}

/*
 * Second stage of a quiesce. The data is already synced, now we have to take
 * care of the metadata. New transactions are already blocked, so we need to
 * wait for any remaining transactions to drain out before proceeding.
 */
void
xfs_quiesce_attr(
        struct xfs_mount        *mp)
{
        int     error = 0;

        /* wait for all modifications to complete */
        while (atomic_read(&mp->m_active_trans) > 0)
                delay(100);

        /* reclaim inodes to do any IO before the freeze completes */
        xfs_reclaim_inodes(mp, 0);
        xfs_reclaim_inodes(mp, SYNC_WAIT);

        /* flush all pending changes from the AIL */
        xfs_ail_push_all_sync(mp->m_ail);

        /*
         * Just warn here until the VFS can correctly support
         * read-only remount without racing.
         */
        WARN_ON(atomic_read(&mp->m_active_trans) != 0);

        /* Push the superblock and write an unmount record */
        error = xfs_log_sbcount(mp);
        if (error)
                xfs_warn(mp, "xfs_attr_quiesce: failed to log sb changes. "
                                "Frozen image may not be consistent.");
        xfs_log_unmount_write(mp);

        /*
         * At this point we might have modified the superblock again and thus
         * added an item to the AIL, thus flush it again.
         */
        xfs_ail_push_all_sync(mp->m_ail);

        /*
         * The superblock buffer is uncached and xfsaild_push() will lock and
         * set the XBF_ASYNC flag on the buffer. We cannot do xfs_buf_iowait()
         * here but a lock on the superblock buffer will block until iodone()
         * has completed.
         */
        xfs_buf_lock(mp->m_sb_bp);
        xfs_buf_unlock(mp->m_sb_bp);
}

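/*
 * (Re)arm the periodic sync work to run again after the configured
 * xfs_syncd_centisecs interval (30 seconds by default).
 */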
static void
xfs_syncd_queue_sync(
        struct xfs_mount        *mp)
{
        queue_delayed_work(xfs_syncd_wq, &mp->m_sync_work,
                                msecs_to_jiffies(xfs_syncd_centisecs * 10));
}

/*
 * Every sync period we need to unpin all items, reclaim inodes and sync
 * disk quotas.  We might need to cover the log to indicate that the
 * filesystem is idle and not frozen.
 */
STATIC void
xfs_sync_worker(
        struct work_struct *work)
{
        struct xfs_mount *mp = container_of(to_delayed_work(work),
                                        struct xfs_mount, m_sync_work);
        int             error;

        /*
         * We shouldn't write/force the log if we are in the mount/unmount
         * process or on a read only filesystem. The workqueue still needs to be
         * active in both cases, however, because it is used for inode reclaim
         * during these times.  Use the MS_ACTIVE flag to avoid doing anything
         * during mount.  Doing work during unmount is avoided by calling
         * cancel_delayed_work_sync on this work queue before tearing down
         * the ail and the log in xfs_log_unmount.
         */
        if (!(mp->m_super->s_flags & MS_ACTIVE) &&
            !(mp->m_flags & XFS_MOUNT_RDONLY)) {
                /* dgc: errors ignored here */
                if (mp->m_super->s_writers.frozen == SB_UNFROZEN &&
                    xfs_log_need_covered(mp))
                        error = xfs_fs_log_dummy(mp);
                else
                        xfs_log_force(mp, 0);

                /* start pushing all the metadata that is currently
                 * dirty */
                xfs_ail_push_all(mp->m_ail);
        }

        /* queue us up again */
        xfs_syncd_queue_sync(mp);
}

/*
 * Queue a new inode reclaim pass if there are reclaimable inodes and there
 * isn't a reclaim pass already in progress. By default it runs every 5s based
 * on the xfs syncd work default of 30s. Perhaps this should have its own
 * tunable, but that can be done if this method proves to be ineffective or too
 * aggressive.
 */
static void
xfs_syncd_queue_reclaim(
        struct xfs_mount        *mp)
{

        rcu_read_lock();
        if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
                queue_delayed_work(xfs_syncd_wq, &mp->m_reclaim_work,
                        msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
        }
        rcu_read_unlock();
}

/*
 * This is a fast pass over the inode cache to try to get reclaim moving on as
 * many inodes as possible in a short period of time. It kicks itself every few
 * seconds, as well as being kicked by the inode cache shrinker when memory
 * goes low. It scans as quickly as possible avoiding locked inodes or those
 * already being flushed, and once done schedules a future pass.
 */
STATIC void
xfs_reclaim_worker(
        struct work_struct *work)
{
        struct xfs_mount *mp = container_of(to_delayed_work(work),
                                        struct xfs_mount, m_reclaim_work);

        xfs_reclaim_inodes(mp, SYNC_TRYLOCK);
        xfs_syncd_queue_reclaim(mp);
}

/*
 * Flush delayed allocate data, attempting to free up reserved space
 * from existing allocations.  At this point a new allocation attempt
 * has failed with ENOSPC and we are in the process of scratching our
 * heads, looking about for more room.
 *
 * Queue a new data flush if there isn't one already in progress and
 * wait for completion of the flush. This means that we only ever have one
 * inode flush in progress no matter how many ENOSPC events are occurring and
 * so will prevent the system from bogging down due to every concurrent
 * ENOSPC event scanning all the active inodes in the system for writeback.
 */
void
xfs_flush_inodes(
        struct xfs_inode        *ip)
{
        struct xfs_mount        *mp = ip->i_mount;

        queue_work(xfs_syncd_wq, &mp->m_flush_work);
        flush_work_sync(&mp->m_flush_work);
}

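/*
 * Work function behind xfs_flush_inodes(): do a non-blocking pass over the
 * dirty pagecache first, then a blocking pass that waits for the writeback
 * to complete.
 */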
STATIC void
xfs_flush_worker(
        struct work_struct *work)
{
        struct xfs_mount *mp = container_of(work,
                                        struct xfs_mount, m_flush_work);

        xfs_sync_data(mp, SYNC_TRYLOCK);
        xfs_sync_data(mp, SYNC_TRYLOCK | SYNC_WAIT);
}

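/*
 * Initialise the per-mount sync, reclaim and flush work items and kick off
 * the periodic sync work.
 */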
int
xfs_syncd_init(
        struct xfs_mount        *mp)
{
        INIT_WORK(&mp->m_flush_work, xfs_flush_worker);
        INIT_DELAYED_WORK(&mp->m_sync_work, xfs_sync_worker);
        INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);

        xfs_syncd_queue_sync(mp);

        return 0;
}

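/*
 * Cancel all outstanding per-mount sync, reclaim and flush work, waiting
 * for any running work functions to finish.
 */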
void
xfs_syncd_stop(
        struct xfs_mount        *mp)
{
        cancel_delayed_work_sync(&mp->m_sync_work);
        cancel_delayed_work_sync(&mp->m_reclaim_work);
        cancel_work_sync(&mp->m_flush_work);
}

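/*
 * Tag an inode as reclaimable in the per-AG inode radix tree.  The first
 * reclaimable inode in an AG also propagates the tag into the per-mount
 * perag radix tree and schedules background reclaim.  The caller must hold
 * the per-AG inode lock (pag_ici_lock).
 */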
void
__xfs_inode_set_reclaim_tag(
        struct xfs_perag        *pag,
        struct xfs_inode        *ip)
{
        radix_tree_tag_set(&pag->pag_ici_root,
                           XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
                           XFS_ICI_RECLAIM_TAG);

        if (!pag->pag_ici_reclaimable) {
                /* propagate the reclaim tag up into the perag radix tree */
                spin_lock(&ip->i_mount->m_perag_lock);
                radix_tree_tag_set(&ip->i_mount->m_perag_tree,
                                XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
                                XFS_ICI_RECLAIM_TAG);
                spin_unlock(&ip->i_mount->m_perag_lock);

                /* schedule periodic background inode reclaim */
                xfs_syncd_queue_reclaim(ip->i_mount);

                trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno,
                                                        -1, _RET_IP_);
        }
        pag->pag_ici_reclaimable++;
}

/*
 * We set the inode flag atomically with the radix tree tag.
 * Once we get tag lookups on the radix tree, this inode flag
 * can go away.
 */
void
xfs_inode_set_reclaim_tag(
        xfs_inode_t     *ip)
{
        struct xfs_mount *mp = ip->i_mount;
        struct xfs_perag *pag;

        pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
        spin_lock(&pag->pag_ici_lock);
        spin_lock(&ip->i_flags_lock);
        __xfs_inode_set_reclaim_tag(pag, ip);
        __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
        spin_unlock(&ip->i_flags_lock);
        spin_unlock(&pag->pag_ici_lock);
        xfs_perag_put(pag);
}

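/*
 * Account for an inode leaving reclaimable state.  When the last
 * reclaimable inode in the AG goes away, clear the reclaim tag from the
 * per-mount perag radix tree as well.
 */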
STATIC void
__xfs_inode_clear_reclaim(
        xfs_perag_t     *pag,
        xfs_inode_t     *ip)
{
        pag->pag_ici_reclaimable--;
        if (!pag->pag_ici_reclaimable) {
                /* clear the reclaim tag from the perag radix tree */
                spin_lock(&ip->i_mount->m_perag_lock);
                radix_tree_tag_clear(&ip->i_mount->m_perag_tree,
                                XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
                                XFS_ICI_RECLAIM_TAG);
                spin_unlock(&ip->i_mount->m_perag_lock);
                trace_xfs_perag_clear_reclaim(ip->i_mount, pag->pag_agno,
                                                        -1, _RET_IP_);
        }
}

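/*
 * Clear the reclaim tag for an inode in the per-AG inode radix tree and
 * update the per-AG reclaimable inode accounting.  The caller must hold
 * pag->pag_ici_lock.
 */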
void
__xfs_inode_clear_reclaim_tag(
        xfs_mount_t     *mp,
        xfs_perag_t     *pag,
        xfs_inode_t     *ip)
{
        radix_tree_tag_clear(&pag->pag_ici_root,
                        XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
        __xfs_inode_clear_reclaim(pag, ip);
}

/*
 * Grab the inode for reclaim exclusively.
 * Return 0 if we grabbed it, non-zero otherwise.
 */
STATIC int
xfs_reclaim_inode_grab(
        struct xfs_inode        *ip,
        int                     flags)
{
        ASSERT(rcu_read_lock_held());

        /* quick check for stale RCU freed inode */
        if (!ip->i_ino)
                return 1;

        /*
         * If we are asked for non-blocking operation, do unlocked checks to
         * see if the inode already is being flushed or in reclaim to avoid
         * lock traffic.
         */
        if ((flags & SYNC_TRYLOCK) &&
            __xfs_iflags_test(ip, XFS_IFLOCK | XFS_IRECLAIM))
                return 1;

        /*
         * The radix tree lock here protects a thread in xfs_iget from racing
         * with us starting reclaim on the inode.  Once we have the
         * XFS_IRECLAIM flag set it will not touch us.
         *
         * Due to RCU lookup, we may find inodes that have been freed and only
         * have XFS_IRECLAIM set.  Indeed, we may see reallocated inodes that
         * aren't candidates for reclaim at all, so we must check that
         * XFS_IRECLAIMABLE is set first before proceeding to reclaim.
         */
        spin_lock(&ip->i_flags_lock);
        if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) ||
            __xfs_iflags_test(ip, XFS_IRECLAIM)) {
                /* not a reclaim candidate. */
                spin_unlock(&ip->i_flags_lock);
                return 1;
        }
        __xfs_iflags_set(ip, XFS_IRECLAIM);
        spin_unlock(&ip->i_flags_lock);
        return 0;
}

/*
 * Inodes in different states need to be treated differently. The following
 * table lists the inode states and the reclaim actions necessary:
 *
 *      inode state          iflush ret         required action
 *      ---------------      ----------         ---------------
 *      bad                     -               reclaim
 *      shutdown                EIO             unpin and reclaim
 *      clean, unpinned         0               reclaim
 *      stale, unpinned         0               reclaim
 *      clean, pinned(*)        0               requeue
 *      stale, pinned           EAGAIN          requeue
 *      dirty, async            -               requeue
 *      dirty, sync             0               reclaim
 *
 * (*) dgc: I don't think the clean, pinned state is possible but it gets
 * handled anyway given the order of checks implemented.
 *
 * Also, because we get the flush lock first, we know that any inode that has
 * been flushed delwri has had the flush completed by the time we check that
 * the inode is clean.
 *
 * Note that because the inode is flushed delayed write by AIL pushing, the
 * flush lock may already be held here and waiting on it can result in very
 * long latencies.  Hence for sync reclaims, where we wait on the flush lock,
 * the caller should push the AIL first before trying to reclaim inodes to
 * minimise the amount of time spent waiting.  For background reclaim, we only
 * bother to reclaim clean inodes anyway.
 *
 * Hence the order of actions after gaining the locks should be:
 *      bad             => reclaim
 *      shutdown        => unpin and reclaim
 *      pinned, async   => requeue
 *      pinned, sync    => unpin
 *      stale           => reclaim
 *      clean           => reclaim
 *      dirty, async    => requeue
 *      dirty, sync     => flush, wait and reclaim
 */
STATIC int
xfs_reclaim_inode(
        struct xfs_inode        *ip,
        struct xfs_perag        *pag,
        int                     sync_mode)
{
        struct xfs_buf          *bp = NULL;
        int                     error;

restart:
        error = 0;
        xfs_ilock(ip, XFS_ILOCK_EXCL);
        if (!xfs_iflock_nowait(ip)) {
                if (!(sync_mode & SYNC_WAIT))
                        goto out;
                xfs_iflock(ip);
        }

        if (is_bad_inode(VFS_I(ip)))
                goto reclaim;
        if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
                xfs_iunpin_wait(ip);
                xfs_iflush_abort(ip, false);
                goto reclaim;
        }
        if (xfs_ipincount(ip)) {
                if (!(sync_mode & SYNC_WAIT))
                        goto out_ifunlock;
                xfs_iunpin_wait(ip);
        }
        if (xfs_iflags_test(ip, XFS_ISTALE))
                goto reclaim;
        if (xfs_inode_clean(ip))
                goto reclaim;

        /*
         * Never flush out dirty data during non-blocking reclaim, as it would
         * just contend with AIL pushing trying to do the same job.
         */
        if (!(sync_mode & SYNC_WAIT))
                goto out_ifunlock;

        /*
         * Now we have an inode that needs flushing.
         *
         * Note that xfs_iflush will never block on the inode buffer lock, as
         * xfs_ifree_cluster() can lock the inode buffer before it locks the
         * ip->i_lock, and we are doing the exact opposite here.  As a result,
         * doing a blocking xfs_imap_to_bp() to get the cluster buffer would
         * result in an ABBA deadlock with xfs_ifree_cluster().
         *
         * As xfs_ifree_cluster() must gather all inodes that are active in the
         * cache to mark them stale, if we hit this case we don't actually want
         * to do IO here - we want the inode marked stale so we can simply
         * reclaim it.  Hence if we get an EAGAIN error here, just unlock the
         * inode, back off and try again.  Hopefully the next pass through will
         * see the stale flag set on the inode.
         */
        error = xfs_iflush(ip, &bp);
        if (error == EAGAIN) {
                xfs_iunlock(ip, XFS_ILOCK_EXCL);
                /* backoff longer than in xfs_ifree_cluster */
                delay(2);
                goto restart;
        }

        if (!error) {
                error = xfs_bwrite(bp);
                xfs_buf_relse(bp);
        }

        xfs_iflock(ip);
reclaim:
        xfs_ifunlock(ip);
        xfs_iunlock(ip, XFS_ILOCK_EXCL);

        XFS_STATS_INC(xs_ig_reclaims);
        /*
         * Remove the inode from the per-AG radix tree.
         *
         * Because radix_tree_delete won't complain even if the item was never
         * added to the tree, assert that it has been there before to catch
         * problems with the inode lifetime early on.
         */
        spin_lock(&pag->pag_ici_lock);
        if (!radix_tree_delete(&pag->pag_ici_root,
                                XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino)))
                ASSERT(0);
        __xfs_inode_clear_reclaim(pag, ip);
        spin_unlock(&pag->pag_ici_lock);

        /*
         * Here we do an (almost) spurious inode lock in order to coordinate
         * with inode cache radix tree lookups.  This is because the lookup
         * can reference the inodes in the cache without taking references.
         *
         * We make that OK here by ensuring that we wait until the inode is
         * unlocked after the lookup before we go ahead and free it.
         */
        xfs_ilock(ip, XFS_ILOCK_EXCL);
        xfs_qm_dqdetach(ip);
        xfs_iunlock(ip, XFS_ILOCK_EXCL);

        xfs_inode_free(ip);
        return error;

out_ifunlock:
        xfs_ifunlock(ip);
out:
        xfs_iflags_clear(ip, XFS_IRECLAIM);
        xfs_iunlock(ip, XFS_ILOCK_EXCL);
        /*
         * We could return EAGAIN here to make reclaim rescan the inode tree in
         * a short while. However, this just burns CPU time scanning the tree
         * waiting for IO to complete and xfssyncd never goes back to the idle
         * state. Instead, return 0 to let the next scheduled background reclaim
         * attempt to reclaim the inode again.
         */
        return 0;
}

/*
 * Walk the AGs and reclaim the inodes in them. Even if the filesystem is
 * corrupted, we still want to try to reclaim all the inodes. If we don't,
 * then a shutdown during the filesystem unmount reclaim walk would leak all
 * the unreclaimed inodes.
 */
int
xfs_reclaim_inodes_ag(
        struct xfs_mount        *mp,
        int                     flags,
        int                     *nr_to_scan)
{
        struct xfs_perag        *pag;
        int                     error = 0;
        int                     last_error = 0;
        xfs_agnumber_t          ag;
        int                     trylock = flags & SYNC_TRYLOCK;
        int                     skipped;

restart:
        ag = 0;
        skipped = 0;
        while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
                unsigned long   first_index = 0;
                int             done = 0;
                int             nr_found = 0;

                ag = pag->pag_agno + 1;

                if (trylock) {
                        if (!mutex_trylock(&pag->pag_ici_reclaim_lock)) {
                                skipped++;
                                xfs_perag_put(pag);
                                continue;
                        }
                        first_index = pag->pag_ici_reclaim_cursor;
                } else
                        mutex_lock(&pag->pag_ici_reclaim_lock);

                do {
                        struct xfs_inode *batch[XFS_LOOKUP_BATCH];
                        int     i;

                        rcu_read_lock();
                        nr_found = radix_tree_gang_lookup_tag(
                                        &pag->pag_ici_root,
                                        (void **)batch, first_index,
                                        XFS_LOOKUP_BATCH,
                                        XFS_ICI_RECLAIM_TAG);
                        if (!nr_found) {
                                done = 1;
                                rcu_read_unlock();
                                break;
                        }

                        /*
                         * Grab the inodes before we drop the lock. If we found
                         * nothing, nr_found == 0 and the loop will be skipped.
                         */
                        for (i = 0; i < nr_found; i++) {
                                struct xfs_inode *ip = batch[i];

                                if (done || xfs_reclaim_inode_grab(ip, flags))
                                        batch[i] = NULL;

                                /*
                                 * Update the index for the next lookup. Catch
                                 * overflows into the next AG range which can
                                 * occur if we have inodes in the last block of
                                 * the AG and we are currently pointing to the
                                 * last inode.
                                 *
                                 * Because we may see inodes that are from the
                                 * wrong AG due to RCU freeing and
                                 * reallocation, only update the index if it
                                 * lies in this AG. It was a race that led us
                                 * to see this inode, so another lookup from
                                 * the same index will not find it again.
                                 */
                                if (XFS_INO_TO_AGNO(mp, ip->i_ino) !=
                                                                pag->pag_agno)
                                        continue;
                                first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
                                if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
                                        done = 1;
                        }

                        /* unlock now that we've grabbed the inodes. */
                        rcu_read_unlock();

                        for (i = 0; i < nr_found; i++) {
                                if (!batch[i])
                                        continue;
                                error = xfs_reclaim_inode(batch[i], pag, flags);
                                if (error && last_error != EFSCORRUPTED)
                                        last_error = error;
                        }

                        *nr_to_scan -= XFS_LOOKUP_BATCH;

                        cond_resched();

                } while (nr_found && !done && *nr_to_scan > 0);

                if (trylock && !done)
                        pag->pag_ici_reclaim_cursor = first_index;
                else
                        pag->pag_ici_reclaim_cursor = 0;
                mutex_unlock(&pag->pag_ici_reclaim_lock);
                xfs_perag_put(pag);
        }

        /*
         * If we skipped any AG, and we still have scan count remaining, do
         * another pass this time using blocking reclaim semantics (i.e.
         * waiting on the reclaim locks and ignoring the reclaim cursors). This
         * ensures that when we get more reclaimers than AGs we block rather
         * than spin trying to execute reclaim.
         */
        if (skipped && (flags & SYNC_WAIT) && *nr_to_scan > 0) {
                trylock = 0;
                goto restart;
        }
        return XFS_ERROR(last_error);
}

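/*
 * Attempt to reclaim all reclaimable inodes in the filesystem.  @mode is a
 * combination of the SYNC_TRYLOCK/SYNC_WAIT flags that controls whether the
 * reclaim blocks on locks and dirty inodes.
 */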
int
xfs_reclaim_inodes(
        xfs_mount_t     *mp,
        int             mode)
{
        int             nr_to_scan = INT_MAX;

        return xfs_reclaim_inodes_ag(mp, mode, &nr_to_scan);
}

/*
 * Scan a certain number of inodes for reclaim.
 *
 * When called we make sure that there is a background (fast) inode reclaim in
 * progress, while we throttle the speed of reclaim by doing synchronous
 * reclaim of inodes. That means if we come across dirty inodes, we wait for
 * them to be cleaned, which we hope will not be very long due to the
 * background walker having already kicked the IO off on those dirty inodes.
 */
void
xfs_reclaim_inodes_nr(
        struct xfs_mount        *mp,
        int                     nr_to_scan)
{
        /* kick background reclaimer and push the AIL */
        xfs_syncd_queue_reclaim(mp);
        xfs_ail_push_all(mp->m_ail);

        xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, &nr_to_scan);
}

/*
 * Return the number of reclaimable inodes in the filesystem for
 * the shrinker to determine how much to reclaim.
 */
int
xfs_reclaim_inodes_count(
        struct xfs_mount        *mp)
{
        struct xfs_perag        *pag;
        xfs_agnumber_t          ag = 0;
        int                     reclaimable = 0;

        while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
                ag = pag->pag_agno + 1;
                reclaimable += pag->pag_ici_reclaimable;
                xfs_perag_put(pag);
        }
        return reclaimable;
}