linux/fs/fs-writeback.c
   1/*
   2 * fs/fs-writeback.c
   3 *
   4 * Copyright (C) 2002, Linus Torvalds.
   5 *
   6 * Contains all the functions related to writing back and waiting
   7 * upon dirty inodes against superblocks, and writing back dirty
   8 * pages against inodes.  ie: data writeback.  Writeout of the
   9 * inode itself is not handled here.
  10 *
  11 * 10Apr2002    Andrew Morton
  12 *              Split out of fs/inode.c
  13 *              Additions for address_space-based writeback
  14 */
  15
  16#include <linux/kernel.h>
  17#include <linux/module.h>
  18#include <linux/spinlock.h>
  19#include <linux/sched.h>
  20#include <linux/fs.h>
  21#include <linux/mm.h>
  22#include <linux/kthread.h>
  23#include <linux/freezer.h>
  24#include <linux/writeback.h>
  25#include <linux/blkdev.h>
  26#include <linux/backing-dev.h>
  27#include <linux/buffer_head.h>
  28#include "internal.h"
  29
  30#define inode_to_bdi(inode)     ((inode)->i_mapping->backing_dev_info)
  31
  32/*
   33 * We don't actually have pdflush, but this one is exported through /proc...
  34 */
  35int nr_pdflush_threads;
  36
  37/*
  38 * Passed into wb_writeback(), essentially a subset of writeback_control
  39 */
  40struct wb_writeback_args {
  41        long nr_pages;
  42        struct super_block *sb;
  43        enum writeback_sync_modes sync_mode;
  44        int for_kupdate:1;
  45        int range_cyclic:1;
  46        int for_background:1;
  47};
  48
  49/*
  50 * Work items for the bdi_writeback threads
  51 */
  52struct bdi_work {
  53        struct list_head list;          /* pending work list */
  54        struct rcu_head rcu_head;       /* for RCU free/clear of work */
  55
  56        unsigned long seen;             /* threads that have seen this work */
  57        atomic_t pending;               /* number of threads still to do work */
  58
  59        struct wb_writeback_args args;  /* writeback arguments */
  60
  61        unsigned long state;            /* flag bits, see WS_* */
  62};
  63
  64enum {
  65        WS_USED_B = 0,
  66        WS_ONSTACK_B,
  67};
  68
  69#define WS_USED (1 << WS_USED_B)
  70#define WS_ONSTACK (1 << WS_ONSTACK_B)
  71
  72static inline bool bdi_work_on_stack(struct bdi_work *work)
  73{
  74        return test_bit(WS_ONSTACK_B, &work->state);
  75}
  76
  77static inline void bdi_work_init(struct bdi_work *work,
  78                                 struct wb_writeback_args *args)
  79{
  80        INIT_RCU_HEAD(&work->rcu_head);
  81        work->args = *args;
  82        work->state = WS_USED;
  83}
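     /*
      * Usage sketch, condensed from bdi_sync_writeback() below (illustrative
      * only): a bdi_work either lives on the caller's stack, in which case
      * WS_ONSTACK must be set and the caller has to wait for the threads to
      * ack the work, or it is kmalloc'ed and fire-and-forget, as done by
      * bdi_alloc_queue_work():
      *
      *   struct wb_writeback_args args = { .sync_mode = WB_SYNC_ALL, };
      *   struct bdi_work work;
      *
      *   bdi_work_init(&work, &args);
      *   work.state |= WS_ONSTACK;
      *   bdi_queue_work(bdi, &work);
      *   bdi_wait_on_work_clear(&work);
      */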
  84
  85/**
  86 * writeback_in_progress - determine whether there is writeback in progress
  87 * @bdi: the device's backing_dev_info structure.
  88 *
  89 * Determine whether there is writeback waiting to be handled against a
  90 * backing device.
  91 */
  92int writeback_in_progress(struct backing_dev_info *bdi)
  93{
  94        return !list_empty(&bdi->work_list);
  95}
  96
  97static void bdi_work_clear(struct bdi_work *work)
  98{
  99        clear_bit(WS_USED_B, &work->state);
 100        smp_mb__after_clear_bit();
 101        /*
 102         * work can have disappeared at this point. bit waitq functions
 103         * should be able to tolerate this, provided bdi_sched_wait does
  104         * not dereference its pointer argument.
  105         */
 106        wake_up_bit(&work->state, WS_USED_B);
 107}
 108
 109static void bdi_work_free(struct rcu_head *head)
 110{
 111        struct bdi_work *work = container_of(head, struct bdi_work, rcu_head);
 112
 113        if (!bdi_work_on_stack(work))
 114                kfree(work);
 115        else
 116                bdi_work_clear(work);
 117}
 118
 119static void wb_work_complete(struct bdi_work *work)
 120{
 121        const enum writeback_sync_modes sync_mode = work->args.sync_mode;
 122        int onstack = bdi_work_on_stack(work);
 123
 124        /*
 125         * For allocated work, we can clear the done/seen bit right here.
 126         * For on-stack work, we need to postpone both the clear and free
 127         * to after the RCU grace period, since the stack could be invalidated
 128         * as soon as bdi_work_clear() has done the wakeup.
 129         */
 130        if (!onstack)
 131                bdi_work_clear(work);
 132        if (sync_mode == WB_SYNC_NONE || onstack)
 133                call_rcu(&work->rcu_head, bdi_work_free);
 134}
 135
 136static void wb_clear_pending(struct bdi_writeback *wb, struct bdi_work *work)
 137{
 138        /*
 139         * The caller has retrieved the work arguments from this work,
 140         * drop our reference. If this is the last ref, delete and free it
 141         */
 142        if (atomic_dec_and_test(&work->pending)) {
 143                struct backing_dev_info *bdi = wb->bdi;
 144
 145                spin_lock(&bdi->wb_lock);
 146                list_del_rcu(&work->list);
 147                spin_unlock(&bdi->wb_lock);
 148
 149                wb_work_complete(work);
 150        }
 151}
 152
 153static void bdi_queue_work(struct backing_dev_info *bdi, struct bdi_work *work)
 154{
 155        work->seen = bdi->wb_mask;
 156        BUG_ON(!work->seen);
 157        atomic_set(&work->pending, bdi->wb_cnt);
 158        BUG_ON(!bdi->wb_cnt);
 159
 160        /*
 161         * list_add_tail_rcu() contains the necessary barriers to
 162         * make sure the above stores are seen before the item is
 163         * noticed on the list
 164         */
 165        spin_lock(&bdi->wb_lock);
 166        list_add_tail_rcu(&work->list, &bdi->work_list);
 167        spin_unlock(&bdi->wb_lock);
 168
 169        /*
 170         * If the default thread isn't there, make sure we add it. When
 171         * it gets created and wakes up, we'll run this work.
 172         */
 173        if (unlikely(list_empty_careful(&bdi->wb_list)))
 174                wake_up_process(default_backing_dev_info.wb.task);
 175        else {
 176                struct bdi_writeback *wb = &bdi->wb;
 177
 178                if (wb->task)
 179                        wake_up_process(wb->task);
 180        }
 181}
 182
 183/*
 184 * Used for on-stack allocated work items. The caller needs to wait until
 185 * the wb threads have acked the work before it's safe to continue.
 186 */
 187static void bdi_wait_on_work_clear(struct bdi_work *work)
 188{
 189        wait_on_bit(&work->state, WS_USED_B, bdi_sched_wait,
 190                    TASK_UNINTERRUPTIBLE);
 191}
 192
 193static void bdi_alloc_queue_work(struct backing_dev_info *bdi,
 194                                 struct wb_writeback_args *args)
 195{
 196        struct bdi_work *work;
 197
 198        /*
 199         * This is WB_SYNC_NONE writeback, so if allocation fails just
 200         * wakeup the thread for old dirty data writeback
 201         */
 202        work = kmalloc(sizeof(*work), GFP_ATOMIC);
 203        if (work) {
 204                bdi_work_init(work, args);
 205                bdi_queue_work(bdi, work);
 206        } else {
 207                struct bdi_writeback *wb = &bdi->wb;
 208
 209                if (wb->task)
 210                        wake_up_process(wb->task);
 211        }
 212}
 213
 214/**
 215 * bdi_sync_writeback - start and wait for writeback
 216 * @bdi: the backing device to write from
 217 * @sb: write inodes from this super_block
 218 *
 219 * Description:
 220 *   This does WB_SYNC_ALL data integrity writeback and waits for the
 221 *   IO to complete. Callers must hold the sb s_umount semaphore for
 222 *   reading, to avoid having the super disappear before we are done.
 223 */
 224static void bdi_sync_writeback(struct backing_dev_info *bdi,
 225                               struct super_block *sb)
 226{
 227        struct wb_writeback_args args = {
 228                .sb             = sb,
 229                .sync_mode      = WB_SYNC_ALL,
 230                .nr_pages       = LONG_MAX,
 231                .range_cyclic   = 0,
 232        };
 233        struct bdi_work work;
 234
 235        bdi_work_init(&work, &args);
 236        work.state |= WS_ONSTACK;
 237
 238        bdi_queue_work(bdi, &work);
 239        bdi_wait_on_work_clear(&work);
 240}
 241
 242/**
 243 * bdi_start_writeback - start writeback
 244 * @bdi: the backing device to write from
 245 * @nr_pages: the number of pages to write
 246 *
 247 * Description:
  248 *   This does WB_SYNC_NONE opportunistic writeback. The IO is only
  249 *   started when this function returns; we make no guarantees about
  250 *   completion. The caller need not hold the sb s_umount semaphore.
 251 *
 252 */
 253void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb,
 254                         long nr_pages)
 255{
 256        struct wb_writeback_args args = {
 257                .sb             = sb,
 258                .sync_mode      = WB_SYNC_NONE,
 259                .nr_pages       = nr_pages,
 260                .range_cyclic   = 1,
 261        };
 262
 263        /*
 264         * We treat @nr_pages=0 as the special case to do background writeback,
 265         * ie. to sync pages until the background dirty threshold is reached.
 266         */
 267        if (!nr_pages) {
 268                args.nr_pages = LONG_MAX;
 269                args.for_background = 1;
 270        }
 271
 272        bdi_alloc_queue_work(bdi, &args);
 273}
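     /*
      * Usage sketch (illustrative): writeback_inodes_sb() below issues a
      * bounded request, while callers such as balance_dirty_pages() pass
      * nr_pages == 0 to ask for background writeback down to the background
      * dirty threshold:
      *
      *   bdi_start_writeback(sb->s_bdi, sb, nr_to_write);   <- bounded
      *   bdi_start_writeback(bdi, NULL, 0);                  <- background
      */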
 274
 275/*
 276 * Redirty an inode: set its when-it-was dirtied timestamp and move it to the
 277 * furthest end of its superblock's dirty-inode list.
 278 *
 279 * Before stamping the inode's ->dirtied_when, we check to see whether it is
 280 * already the most-recently-dirtied inode on the b_dirty list.  If that is
 281 * the case then the inode must have been redirtied while it was being written
 282 * out and we don't reset its dirtied_when.
 283 */
 284static void redirty_tail(struct inode *inode)
 285{
 286        struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
 287
 288        if (!list_empty(&wb->b_dirty)) {
 289                struct inode *tail;
 290
 291                tail = list_entry(wb->b_dirty.next, struct inode, i_list);
 292                if (time_before(inode->dirtied_when, tail->dirtied_when))
 293                        inode->dirtied_when = jiffies;
 294        }
 295        list_move(&inode->i_list, &wb->b_dirty);
 296}
 297
 298/*
  299 * Requeue an inode for re-scanning after the wb->b_io list is exhausted.
 300 */
 301static void requeue_io(struct inode *inode)
 302{
 303        struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
 304
 305        list_move(&inode->i_list, &wb->b_more_io);
 306}
 307
 308static void inode_sync_complete(struct inode *inode)
 309{
 310        /*
 311         * Prevent speculative execution through spin_unlock(&inode_lock);
 312         */
 313        smp_mb();
 314        wake_up_bit(&inode->i_state, __I_SYNC);
 315}
 316
 317static bool inode_dirtied_after(struct inode *inode, unsigned long t)
 318{
 319        bool ret = time_after(inode->dirtied_when, t);
 320#ifndef CONFIG_64BIT
 321        /*
 322         * For inodes being constantly redirtied, dirtied_when can get stuck.
  323         * It _appears_ to be in the future, but is actually in the distant past.
 324         * This test is necessary to prevent such wrapped-around relative times
 325         * from permanently stopping the whole bdi writeback.
 326         */
 327        ret = ret && time_before_eq(inode->dirtied_when, jiffies);
 328#endif
 329        return ret;
 330}
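     /*
      * Illustration of the wraparound the !CONFIG_64BIT check above guards
      * against (numbers approximate): on 32-bit with HZ=1000, jiffies wraps
      * about every 50 days.  If dirtied_when gets stuck more than roughly 25
      * days in the past, the signed difference computed by time_after()
      * wraps and the stale timestamp suddenly appears to lie in the future,
      * so this inode and everything queued behind it would never be
      * considered expired.  Also requiring dirtied_when <= jiffies rejects
      * such apparently-future values.
      */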
 331
 332/*
 333 * Move expired dirty inodes from @delaying_queue to @dispatch_queue.
 334 */
 335static void move_expired_inodes(struct list_head *delaying_queue,
 336                               struct list_head *dispatch_queue,
 337                                unsigned long *older_than_this)
 338{
 339        LIST_HEAD(tmp);
 340        struct list_head *pos, *node;
 341        struct super_block *sb = NULL;
 342        struct inode *inode;
 343        int do_sb_sort = 0;
 344
 345        while (!list_empty(delaying_queue)) {
 346                inode = list_entry(delaying_queue->prev, struct inode, i_list);
 347                if (older_than_this &&
 348                    inode_dirtied_after(inode, *older_than_this))
 349                        break;
 350                if (sb && sb != inode->i_sb)
 351                        do_sb_sort = 1;
 352                sb = inode->i_sb;
 353                list_move(&inode->i_list, &tmp);
 354        }
 355
 356        /* just one sb in list, splice to dispatch_queue and we're done */
 357        if (!do_sb_sort) {
 358                list_splice(&tmp, dispatch_queue);
 359                return;
 360        }
 361
 362        /* Move inodes from one superblock together */
 363        while (!list_empty(&tmp)) {
 364                inode = list_entry(tmp.prev, struct inode, i_list);
 365                sb = inode->i_sb;
 366                list_for_each_prev_safe(pos, node, &tmp) {
 367                        inode = list_entry(pos, struct inode, i_list);
 368                        if (inode->i_sb == sb)
 369                                list_move(&inode->i_list, dispatch_queue);
 370                }
 371        }
 372}
 373
 374/*
 375 * Queue all expired dirty inodes for io, eldest first.
 376 */
 377static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this)
 378{
 379        list_splice_init(&wb->b_more_io, wb->b_io.prev);
 380        move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this);
 381}
 382
 383static int write_inode(struct inode *inode, int sync)
 384{
 385        if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode))
 386                return inode->i_sb->s_op->write_inode(inode, sync);
 387        return 0;
 388}
 389
 390/*
 391 * Wait for writeback on an inode to complete.
 392 */
 393static void inode_wait_for_writeback(struct inode *inode)
 394{
 395        DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC);
 396        wait_queue_head_t *wqh;
 397
 398        wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
 399        do {
 400                spin_unlock(&inode_lock);
 401                __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE);
 402                spin_lock(&inode_lock);
 403        } while (inode->i_state & I_SYNC);
 404}
 405
 406/*
 407 * Write out an inode's dirty pages.  Called under inode_lock.  Either the
  408 * caller has a ref on the inode (either via __iget or via syscall against an fd)
 409 * or the inode has I_WILL_FREE set (via generic_forget_inode)
 410 *
 411 * If `wait' is set, wait on the writeout.
 412 *
 413 * The whole writeout design is quite complex and fragile.  We want to avoid
 414 * starvation of particular inodes when others are being redirtied, prevent
 415 * livelocks, etc.
 416 *
 417 * Called under inode_lock.
 418 */
 419static int
 420writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 421{
 422        struct address_space *mapping = inode->i_mapping;
 423        int wait = wbc->sync_mode == WB_SYNC_ALL;
 424        unsigned dirty;
 425        int ret;
 426
 427        if (!atomic_read(&inode->i_count))
 428                WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING)));
 429        else
 430                WARN_ON(inode->i_state & I_WILL_FREE);
 431
 432        if (inode->i_state & I_SYNC) {
 433                /*
 434                 * If this inode is locked for writeback and we are not doing
 435                 * writeback-for-data-integrity, move it to b_more_io so that
 436                 * writeback can proceed with the other inodes on s_io.
 437                 *
 438                 * We'll have another go at writing back this inode when we
  439                 * have completed a full scan of b_io.
 440                 */
 441                if (!wait) {
 442                        requeue_io(inode);
 443                        return 0;
 444                }
 445
 446                /*
 447                 * It's a data-integrity sync.  We must wait.
 448                 */
 449                inode_wait_for_writeback(inode);
 450        }
 451
 452        BUG_ON(inode->i_state & I_SYNC);
 453
 454        /* Set I_SYNC, reset I_DIRTY */
 455        dirty = inode->i_state & I_DIRTY;
 456        inode->i_state |= I_SYNC;
 457        inode->i_state &= ~I_DIRTY;
 458
 459        spin_unlock(&inode_lock);
 460
 461        ret = do_writepages(mapping, wbc);
 462
 463        /* Don't write the inode if only I_DIRTY_PAGES was set */
 464        if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
 465                int err = write_inode(inode, wait);
 466                if (ret == 0)
 467                        ret = err;
 468        }
 469
 470        if (wait) {
 471                int err = filemap_fdatawait(mapping);
 472                if (ret == 0)
 473                        ret = err;
 474        }
 475
 476        spin_lock(&inode_lock);
 477        inode->i_state &= ~I_SYNC;
 478        if (!(inode->i_state & (I_FREEING | I_CLEAR))) {
 479                if ((inode->i_state & I_DIRTY_PAGES) && wbc->for_kupdate) {
 480                        /*
 481                         * More pages get dirtied by a fast dirtier.
 482                         */
 483                        goto select_queue;
 484                } else if (inode->i_state & I_DIRTY) {
 485                        /*
 486                         * At least XFS will redirty the inode during the
 487                         * writeback (delalloc) and on io completion (isize).
 488                         */
 489                        redirty_tail(inode);
 490                } else if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
 491                        /*
 492                         * We didn't write back all the pages.  nfs_writepages()
  493                         * sometimes bails out without doing anything. Redirty
 494                         * the inode; Move it from b_io onto b_more_io/b_dirty.
 495                         */
 496                        /*
 497                         * akpm: if the caller was the kupdate function we put
 498                         * this inode at the head of b_dirty so it gets first
 499                         * consideration.  Otherwise, move it to the tail, for
 500                         * the reasons described there.  I'm not really sure
 501                         * how much sense this makes.  Presumably I had a good
  502                         * reason for doing it this way, and I'd rather not
 503                         * muck with it at present.
 504                         */
 505                        if (wbc->for_kupdate) {
 506                                /*
 507                                 * For the kupdate function we move the inode
 508                                 * to b_more_io so it will get more writeout as
 509                                 * soon as the queue becomes uncongested.
 510                                 */
 511                                inode->i_state |= I_DIRTY_PAGES;
 512select_queue:
 513                                if (wbc->nr_to_write <= 0) {
 514                                        /*
 515                                         * slice used up: queue for next turn
 516                                         */
 517                                        requeue_io(inode);
 518                                } else {
 519                                        /*
 520                                         * somehow blocked: retry later
 521                                         */
 522                                        redirty_tail(inode);
 523                                }
 524                        } else {
 525                                /*
 526                                 * Otherwise fully redirty the inode so that
 527                                 * other inodes on this superblock will get some
  528                                 * writeout.  Without this, heavy writing to one
 529                                 * file would indefinitely suspend writeout of
 530                                 * all the other files.
 531                                 */
 532                                inode->i_state |= I_DIRTY_PAGES;
 533                                redirty_tail(inode);
 534                        }
 535                } else if (atomic_read(&inode->i_count)) {
 536                        /*
 537                         * The inode is clean, inuse
 538                         */
 539                        list_move(&inode->i_list, &inode_in_use);
 540                } else {
 541                        /*
 542                         * The inode is clean, unused
 543                         */
 544                        list_move(&inode->i_list, &inode_unused);
 545                }
 546        }
 547        inode_sync_complete(inode);
 548        return ret;
 549}
 550
 551static void unpin_sb_for_writeback(struct super_block **psb)
 552{
 553        struct super_block *sb = *psb;
 554
 555        if (sb) {
 556                up_read(&sb->s_umount);
 557                put_super(sb);
 558                *psb = NULL;
 559        }
 560}
 561
 562/*
 563 * For WB_SYNC_NONE writeback, the caller does not have the sb pinned
 564 * before calling writeback. So make sure that we do pin it, so it doesn't
 565 * go away while we are writing inodes from it.
 566 *
 567 * Returns 0 if the super was successfully pinned (or pinning wasn't needed),
 568 * 1 if we failed.
 569 */
 570static int pin_sb_for_writeback(struct writeback_control *wbc,
 571                                struct inode *inode, struct super_block **psb)
 572{
 573        struct super_block *sb = inode->i_sb;
 574
 575        /*
 576         * If this sb is already pinned, nothing more to do. If not and
 577         * *psb is non-NULL, unpin the old one first
 578         */
 579        if (sb == *psb)
 580                return 0;
 581        else if (*psb)
 582                unpin_sb_for_writeback(psb);
 583
 584        /*
 585         * Caller must already hold the ref for this
 586         */
 587        if (wbc->sync_mode == WB_SYNC_ALL) {
 588                WARN_ON(!rwsem_is_locked(&sb->s_umount));
 589                return 0;
 590        }
 591
 592        spin_lock(&sb_lock);
 593        sb->s_count++;
 594        if (down_read_trylock(&sb->s_umount)) {
 595                if (sb->s_root) {
 596                        spin_unlock(&sb_lock);
 597                        goto pinned;
 598                }
 599                /*
 600                 * umounted, drop rwsem again and fall through to failure
 601                 */
 602                up_read(&sb->s_umount);
 603        }
 604
 605        sb->s_count--;
 606        spin_unlock(&sb_lock);
 607        return 1;
 608pinned:
 609        *psb = sb;
 610        return 0;
 611}
 612
 613static void writeback_inodes_wb(struct bdi_writeback *wb,
 614                                struct writeback_control *wbc)
 615{
 616        struct super_block *sb = wbc->sb, *pin_sb = NULL;
 617        const int is_blkdev_sb = sb_is_blkdev_sb(sb);
 618        const unsigned long start = jiffies;    /* livelock avoidance */
 619
 620        spin_lock(&inode_lock);
 621
 622        if (!wbc->for_kupdate || list_empty(&wb->b_io))
 623                queue_io(wb, wbc->older_than_this);
 624
 625        while (!list_empty(&wb->b_io)) {
 626                struct inode *inode = list_entry(wb->b_io.prev,
 627                                                struct inode, i_list);
 628                long pages_skipped;
 629
 630                /*
 631                 * super block given and doesn't match, skip this inode
 632                 */
 633                if (sb && sb != inode->i_sb) {
 634                        redirty_tail(inode);
 635                        continue;
 636                }
 637
 638                if (!bdi_cap_writeback_dirty(wb->bdi)) {
 639                        redirty_tail(inode);
 640                        if (is_blkdev_sb) {
 641                                /*
 642                                 * Dirty memory-backed blockdev: the ramdisk
 643                                 * driver does this.  Skip just this inode
 644                                 */
 645                                continue;
 646                        }
 647                        /*
 648                         * Dirty memory-backed inode against a filesystem other
 649                         * than the kernel-internal bdev filesystem.  Skip the
 650                         * entire superblock.
 651                         */
 652                        break;
 653                }
 654
 655                if (inode->i_state & (I_NEW | I_WILL_FREE)) {
 656                        requeue_io(inode);
 657                        continue;
 658                }
 659
 660                if (wbc->nonblocking && bdi_write_congested(wb->bdi)) {
 661                        wbc->encountered_congestion = 1;
 662                        if (!is_blkdev_sb)
 663                                break;          /* Skip a congested fs */
 664                        requeue_io(inode);
 665                        continue;               /* Skip a congested blockdev */
 666                }
 667
 668                /*
  669                 * Was this inode dirtied after this writeback pass started?
  670                 * This keeps sync from doing extra work and from livelocking.
 671                 */
 672                if (inode_dirtied_after(inode, start))
 673                        break;
 674
 675                if (pin_sb_for_writeback(wbc, inode, &pin_sb)) {
 676                        requeue_io(inode);
 677                        continue;
 678                }
 679
 680                BUG_ON(inode->i_state & (I_FREEING | I_CLEAR));
 681                __iget(inode);
 682                pages_skipped = wbc->pages_skipped;
 683                writeback_single_inode(inode, wbc);
 684                if (wbc->pages_skipped != pages_skipped) {
 685                        /*
 686                         * writeback is not making progress due to locked
 687                         * buffers.  Skip this inode for now.
 688                         */
 689                        redirty_tail(inode);
 690                }
 691                spin_unlock(&inode_lock);
 692                iput(inode);
 693                cond_resched();
 694                spin_lock(&inode_lock);
 695                if (wbc->nr_to_write <= 0) {
 696                        wbc->more_io = 1;
 697                        break;
 698                }
 699                if (!list_empty(&wb->b_more_io))
 700                        wbc->more_io = 1;
 701        }
 702
 703        unpin_sb_for_writeback(&pin_sb);
 704
 705        spin_unlock(&inode_lock);
 706        /* Leave any unwritten inodes on b_io */
 707}
 708
 709void writeback_inodes_wbc(struct writeback_control *wbc)
 710{
 711        struct backing_dev_info *bdi = wbc->bdi;
 712
 713        writeback_inodes_wb(&bdi->wb, wbc);
 714}
 715
 716/*
 717 * The maximum number of pages to writeout in a single bdi flush/kupdate
 718 * operation.  We do this so we don't hold I_SYNC against an inode for
 719 * enormous amounts of time, which would block a userspace task which has
 720 * been forced to throttle against that inode.  Also, the code reevaluates
  721 * the dirty thresholds each time it has written this many pages.
 722 */
 723#define MAX_WRITEBACK_PAGES     1024
 724
 725static inline bool over_bground_thresh(void)
 726{
 727        unsigned long background_thresh, dirty_thresh;
 728
 729        get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
 730
 731        return (global_page_state(NR_FILE_DIRTY) +
 732                global_page_state(NR_UNSTABLE_NFS) >= background_thresh);
 733}
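     /*
      * Rough worked example (defaults only, illustrative): background_thresh
      * is derived from vm.dirty_background_ratio (10% by default) of the
      * dirtyable memory, so with ~4GB of dirtyable memory it is on the order
      * of 100k pages (~400MB).  Background writeback in wb_writeback() below
      * keeps going until NR_FILE_DIRTY + NR_UNSTABLE_NFS drops under that.
      */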
 734
 735/*
 736 * Explicit flushing or periodic writeback of "old" data.
 737 *
 738 * Define "old": the first time one of an inode's pages is dirtied, we mark the
 739 * dirtying-time in the inode's address_space.  So this periodic writeback code
 740 * just walks the superblock inode list, writing back any inodes which are
 741 * older than a specific point in time.
 742 *
 743 * Try to run once per dirty_writeback_interval.  But if a writeback event
  744 * takes longer than one dirty_writeback_interval, then leave a
 745 * one-second gap.
 746 *
 747 * older_than_this takes precedence over nr_to_write.  So we'll only write back
 748 * all dirty pages if they are all attached to "old" mappings.
 749 */
 750static long wb_writeback(struct bdi_writeback *wb,
 751                         struct wb_writeback_args *args)
 752{
 753        struct writeback_control wbc = {
 754                .bdi                    = wb->bdi,
 755                .sb                     = args->sb,
 756                .sync_mode              = args->sync_mode,
 757                .older_than_this        = NULL,
 758                .for_kupdate            = args->for_kupdate,
 759                .range_cyclic           = args->range_cyclic,
 760        };
 761        unsigned long oldest_jif;
 762        long wrote = 0;
 763        struct inode *inode;
 764
 765        if (wbc.for_kupdate) {
 766                wbc.older_than_this = &oldest_jif;
 767                oldest_jif = jiffies -
 768                                msecs_to_jiffies(dirty_expire_interval * 10);
 769        }
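             /*
              * Units, worked through: dirty_expire_interval is kept in
              * centiseconds (the vm.dirty_expire_centisecs sysctl), so the
              * "* 10" above converts it to milliseconds for msecs_to_jiffies().
              * With the default of 3000 centisecs, oldest_jif ends up about
              * 30 seconds in the past, i.e. kupdate-style writeback only
              * picks up inodes that have been dirty for at least ~30 seconds.
              */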
 770        if (!wbc.range_cyclic) {
 771                wbc.range_start = 0;
 772                wbc.range_end = LLONG_MAX;
 773        }
 774
 775        for (;;) {
 776                /*
 777                 * Stop writeback when nr_pages has been consumed
 778                 */
 779                if (args->nr_pages <= 0)
 780                        break;
 781
 782                /*
 783                 * For background writeout, stop when we are below the
 784                 * background dirty threshold
 785                 */
 786                if (args->for_background && !over_bground_thresh())
 787                        break;
 788
 789                wbc.more_io = 0;
 790                wbc.encountered_congestion = 0;
 791                wbc.nr_to_write = MAX_WRITEBACK_PAGES;
 792                wbc.pages_skipped = 0;
 793                writeback_inodes_wb(wb, &wbc);
 794                args->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
 795                wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write;
 796
 797                /*
 798                 * If we consumed everything, see if we have more
 799                 */
 800                if (wbc.nr_to_write <= 0)
 801                        continue;
 802                /*
 803                 * Didn't write everything and we don't have more IO, bail
 804                 */
 805                if (!wbc.more_io)
 806                        break;
 807                /*
 808                 * Did we write something? Try for more
 809                 */
 810                if (wbc.nr_to_write < MAX_WRITEBACK_PAGES)
 811                        continue;
 812                /*
 813                 * Nothing written. Wait for some inode to
 814                 * become available for writeback. Otherwise
 815                 * we'll just busyloop.
 816                 */
 817                spin_lock(&inode_lock);
 818                if (!list_empty(&wb->b_more_io))  {
 819                        inode = list_entry(wb->b_more_io.prev,
 820                                                struct inode, i_list);
 821                        inode_wait_for_writeback(inode);
 822                }
 823                spin_unlock(&inode_lock);
 824        }
 825
 826        return wrote;
 827}
 828
 829/*
 830 * Return the next bdi_work struct that hasn't been processed by this
 831 * wb thread yet. ->seen is initially set for each thread that exists
 832 * for this device, when a thread first notices a piece of work it
 833 * clears its bit. Depending on writeback type, the thread will notify
 834 * completion on either receiving the work (WB_SYNC_NONE) or after
 835 * it is done (WB_SYNC_ALL).
 836 */
 837static struct bdi_work *get_next_work_item(struct backing_dev_info *bdi,
 838                                           struct bdi_writeback *wb)
 839{
 840        struct bdi_work *work, *ret = NULL;
 841
 842        rcu_read_lock();
 843
 844        list_for_each_entry_rcu(work, &bdi->work_list, list) {
 845                if (!test_bit(wb->nr, &work->seen))
 846                        continue;
 847                clear_bit(wb->nr, &work->seen);
 848
 849                ret = work;
 850                break;
 851        }
 852
 853        rcu_read_unlock();
 854        return ret;
 855}
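     /*
      * Lifecycle sketch for a bdi with a single flusher thread (wb_cnt == 1),
      * tying the pieces above together: bdi_queue_work() sets that thread's
      * bit in ->seen and initializes ->pending to 1.  The thread clears its
      * ->seen bit here, then wb_do_writeback() drops the reference with
      * wb_clear_pending() -- right away for WB_SYNC_NONE, only after the
      * writeback has finished for WB_SYNC_ALL -- which unlinks the work from
      * bdi->work_list and completes or frees it via wb_work_complete().
      */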
 856
 857static long wb_check_old_data_flush(struct bdi_writeback *wb)
 858{
 859        unsigned long expired;
 860        long nr_pages;
 861
 862        expired = wb->last_old_flush +
 863                        msecs_to_jiffies(dirty_writeback_interval * 10);
 864        if (time_before(jiffies, expired))
 865                return 0;
 866
 867        wb->last_old_flush = jiffies;
 868        nr_pages = global_page_state(NR_FILE_DIRTY) +
 869                        global_page_state(NR_UNSTABLE_NFS) +
 870                        (inodes_stat.nr_inodes - inodes_stat.nr_unused);
 871
 872        if (nr_pages) {
 873                struct wb_writeback_args args = {
 874                        .nr_pages       = nr_pages,
 875                        .sync_mode      = WB_SYNC_NONE,
 876                        .for_kupdate    = 1,
 877                        .range_cyclic   = 1,
 878                };
 879
 880                return wb_writeback(wb, &args);
 881        }
 882
 883        return 0;
 884}
 885
 886/*
 887 * Retrieve work items and do the writeback they describe
 888 */
 889long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
 890{
 891        struct backing_dev_info *bdi = wb->bdi;
 892        struct bdi_work *work;
 893        long wrote = 0;
 894
 895        while ((work = get_next_work_item(bdi, wb)) != NULL) {
 896                struct wb_writeback_args args = work->args;
 897
 898                /*
 899                 * Override sync mode, in case we must wait for completion
 900                 */
 901                if (force_wait)
 902                        work->args.sync_mode = args.sync_mode = WB_SYNC_ALL;
 903
 904                /*
 905                 * If this isn't a data integrity operation, just notify
 906                 * that we have seen this work and we are now starting it.
 907                 */
 908                if (args.sync_mode == WB_SYNC_NONE)
 909                        wb_clear_pending(wb, work);
 910
 911                wrote += wb_writeback(wb, &args);
 912
 913                /*
 914                 * This is a data integrity writeback, so only do the
 915                 * notification when we have completed the work.
 916                 */
 917                if (args.sync_mode == WB_SYNC_ALL)
 918                        wb_clear_pending(wb, work);
 919        }
 920
 921        /*
 922         * Check for periodic writeback, kupdated() style
 923         */
 924        wrote += wb_check_old_data_flush(wb);
 925
 926        return wrote;
 927}
 928
 929/*
 930 * Handle writeback of dirty data for the device backed by this bdi. Also
 931 * wakes up periodically and does kupdated style flushing.
 932 */
 933int bdi_writeback_task(struct bdi_writeback *wb)
 934{
 935        unsigned long last_active = jiffies;
 936        unsigned long wait_jiffies = -1UL;
 937        long pages_written;
 938
 939        while (!kthread_should_stop()) {
 940                pages_written = wb_do_writeback(wb, 0);
 941
 942                if (pages_written)
 943                        last_active = jiffies;
 944                else if (wait_jiffies != -1UL) {
 945                        unsigned long max_idle;
 946
 947                        /*
 948                         * Longest period of inactivity that we tolerate. If we
 949                         * see dirty data again later, the task will get
 950                         * recreated automatically.
 951                         */
 952                        max_idle = max(5UL * 60 * HZ, wait_jiffies);
 953                        if (time_after(jiffies, max_idle + last_active))
 954                                break;
 955                }
 956
 957                wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10);
 958                schedule_timeout_interruptible(wait_jiffies);
 959                try_to_freeze();
 960        }
 961
 962        return 0;
 963}
 964
 965/*
 966 * Schedule writeback for all backing devices. This does WB_SYNC_NONE
 967 * writeback, for integrity writeback see bdi_sync_writeback().
 968 */
 969static void bdi_writeback_all(struct super_block *sb, long nr_pages)
 970{
 971        struct wb_writeback_args args = {
 972                .sb             = sb,
 973                .nr_pages       = nr_pages,
 974                .sync_mode      = WB_SYNC_NONE,
 975        };
 976        struct backing_dev_info *bdi;
 977
 978        rcu_read_lock();
 979
 980        list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
 981                if (!bdi_has_dirty_io(bdi))
 982                        continue;
 983
 984                bdi_alloc_queue_work(bdi, &args);
 985        }
 986
 987        rcu_read_unlock();
 988}
 989
 990/*
 991 * Start writeback of `nr_pages' pages.  If `nr_pages' is zero, write back
 992 * the whole world.
 993 */
 994void wakeup_flusher_threads(long nr_pages)
 995{
 996        if (nr_pages == 0)
 997                nr_pages = global_page_state(NR_FILE_DIRTY) +
 998                                global_page_state(NR_UNSTABLE_NFS);
 999        bdi_writeback_all(NULL, nr_pages);
1000}
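     /*
      * Usage sketch (illustrative): a caller that wants to kick writeback of
      * everything, e.g. the sync(2) path before its per-sb passes, just does
      *
      *   wakeup_flusher_threads(0);
      *
      * Like all of bdi_writeback_all() this is WB_SYNC_NONE, so it must not
      * be relied upon for completion.
      */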
1001
1002static noinline void block_dump___mark_inode_dirty(struct inode *inode)
1003{
1004        if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) {
1005                struct dentry *dentry;
1006                const char *name = "?";
1007
1008                dentry = d_find_alias(inode);
1009                if (dentry) {
1010                        spin_lock(&dentry->d_lock);
1011                        name = (const char *) dentry->d_name.name;
1012                }
1013                printk(KERN_DEBUG
1014                       "%s(%d): dirtied inode %lu (%s) on %s\n",
1015                       current->comm, task_pid_nr(current), inode->i_ino,
1016                       name, inode->i_sb->s_id);
1017                if (dentry) {
1018                        spin_unlock(&dentry->d_lock);
1019                        dput(dentry);
1020                }
1021        }
1022}
1023
1024/**
1025 *      __mark_inode_dirty -    internal function
1026 *      @inode: inode to mark
 1027 *      @flags: what kind of dirty (e.g. I_DIRTY_SYNC)
1028 *      Mark an inode as dirty. Callers should use mark_inode_dirty or
1029 *      mark_inode_dirty_sync.
1030 *
1031 * Put the inode on the super block's dirty list.
1032 *
1033 * CAREFUL! We mark it dirty unconditionally, but move it onto the
1034 * dirty list only if it is hashed or if it refers to a blockdev.
1035 * If it was not hashed, it will never be added to the dirty list
1036 * even if it is later hashed, as it will have been marked dirty already.
1037 *
1038 * In short, make sure you hash any inodes _before_ you start marking
1039 * them dirty.
1040 *
1041 * This function *must* be atomic for the I_DIRTY_PAGES case -
1042 * set_page_dirty() is called under spinlock in several places.
1043 *
1044 * Note that for blockdevs, inode->dirtied_when represents the dirtying time of
1045 * the block-special inode (/dev/hda1) itself.  And the ->dirtied_when field of
1046 * the kernel-internal blockdev inode represents the dirtying time of the
1047 * blockdev's pages.  This is why for I_DIRTY_PAGES we always use
1048 * page->mapping->host, so the page-dirtying time is recorded in the internal
1049 * blockdev inode.
1050 */
1051void __mark_inode_dirty(struct inode *inode, int flags)
1052{
1053        struct super_block *sb = inode->i_sb;
1054
1055        /*
1056         * Don't do this for I_DIRTY_PAGES - that doesn't actually
1057         * dirty the inode itself
1058         */
1059        if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
1060                if (sb->s_op->dirty_inode)
1061                        sb->s_op->dirty_inode(inode);
1062        }
1063
1064        /*
1065         * make sure that changes are seen by all cpus before we test i_state
1066         * -- mikulas
1067         */
1068        smp_mb();
1069
1070        /* avoid the locking if we can */
1071        if ((inode->i_state & flags) == flags)
1072                return;
1073
1074        if (unlikely(block_dump))
1075                block_dump___mark_inode_dirty(inode);
1076
1077        spin_lock(&inode_lock);
1078        if ((inode->i_state & flags) != flags) {
1079                const int was_dirty = inode->i_state & I_DIRTY;
1080
1081                inode->i_state |= flags;
1082
1083                /*
1084                 * If the inode is being synced, just update its dirty state.
1085                 * The unlocker will place the inode on the appropriate
1086                 * superblock list, based upon its state.
1087                 */
1088                if (inode->i_state & I_SYNC)
1089                        goto out;
1090
1091                /*
1092                 * Only add valid (hashed) inodes to the superblock's
1093                 * dirty list.  Add blockdev inodes as well.
1094                 */
1095                if (!S_ISBLK(inode->i_mode)) {
1096                        if (hlist_unhashed(&inode->i_hash))
1097                                goto out;
1098                }
1099                if (inode->i_state & (I_FREEING|I_CLEAR))
1100                        goto out;
1101
1102                /*
1103                 * If the inode was already on b_dirty/b_io/b_more_io, don't
1104                 * reposition it (that would break b_dirty time-ordering).
1105                 */
1106                if (!was_dirty) {
1107                        struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
1108                        struct backing_dev_info *bdi = wb->bdi;
1109
1110                        if (bdi_cap_writeback_dirty(bdi) &&
1111                            !test_bit(BDI_registered, &bdi->state)) {
1112                                WARN_ON(1);
1113                                printk(KERN_ERR "bdi-%s not registered\n",
1114                                                                bdi->name);
1115                        }
1116
1117                        inode->dirtied_when = jiffies;
1118                        list_move(&inode->i_list, &wb->b_dirty);
1119                }
1120        }
1121out:
1122        spin_unlock(&inode_lock);
1123}
1124EXPORT_SYMBOL(__mark_inode_dirty);
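     /*
      * As the comment above says, filesystems normally go through the inline
      * helpers in include/linux/fs.h instead of calling this directly;
      * roughly:
      *
      *   static inline void mark_inode_dirty(struct inode *inode)
      *   {
      *           __mark_inode_dirty(inode, I_DIRTY);
      *   }
      *
      *   static inline void mark_inode_dirty_sync(struct inode *inode)
      *   {
      *           __mark_inode_dirty(inode, I_DIRTY_SYNC);
      *   }
      *
      * where I_DIRTY is I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES.
      */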
1125
1126/*
1127 * Write out a superblock's list of dirty inodes.  A wait will be performed
1128 * upon no inodes, all inodes or the final one, depending upon sync_mode.
1129 *
1130 * If older_than_this is non-NULL, then only write out inodes which
1131 * had their first dirtying at a time earlier than *older_than_this.
1132 *
1133 * If `bdi' is non-zero then we're being asked to writeback a specific queue.
1134 * This function assumes that the blockdev superblock's inodes are backed by
1135 * a variety of queues, so all inodes are searched.  For other superblocks,
1136 * assume that all inodes are backed by the same queue.
1137 *
1138 * The inodes to be written are parked on bdi->b_io.  They are moved back onto
1139 * bdi->b_dirty as they are selected for writing.  This way, none can be missed
1140 * on the writer throttling path, and we get decent balancing between many
1141 * throttled threads: we don't want them all piling up on inode_sync_wait.
1142 */
1143static void wait_sb_inodes(struct super_block *sb)
1144{
1145        struct inode *inode, *old_inode = NULL;
1146
1147        /*
1148         * We need to be protected against the filesystem going from
1149         * r/o to r/w or vice versa.
1150         */
1151        WARN_ON(!rwsem_is_locked(&sb->s_umount));
1152
1153        spin_lock(&inode_lock);
1154
1155        /*
1156         * Data integrity sync. Must wait for all pages under writeback,
 1157         * because there may have been pages dirtied before our sync
 1158         * call whose writeout was started before we got to them.
 1159         * In that case, the inode may not be on the dirty list, but
 1160         * we still have to wait for that writeout.
1161         */
1162        list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
1163                struct address_space *mapping;
1164
1165                if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
1166                        continue;
1167                mapping = inode->i_mapping;
1168                if (mapping->nrpages == 0)
1169                        continue;
1170                __iget(inode);
1171                spin_unlock(&inode_lock);
1172                /*
1173                 * We hold a reference to 'inode' so it couldn't have
1174                 * been removed from s_inodes list while we dropped the
1175                 * inode_lock.  We cannot iput the inode now as we can
1176                 * be holding the last reference and we cannot iput it
1177                 * under inode_lock. So we keep the reference and iput
1178                 * it later.
1179                 */
1180                iput(old_inode);
1181                old_inode = inode;
1182
1183                filemap_fdatawait(mapping);
1184
1185                cond_resched();
1186
1187                spin_lock(&inode_lock);
1188        }
1189        spin_unlock(&inode_lock);
1190        iput(old_inode);
1191}
1192
1193/**
1194 * writeback_inodes_sb  -       writeback dirty inodes from given super_block
1195 * @sb: the superblock
1196 *
1197 * Start writeback on some inodes on this super_block. No guarantees are made
1198 * on how many (if any) will be written, and this function does not wait
 1199 * for IO completion of the submitted IO; the work is simply handed off
 1200 * to the bdi flusher thread.
1201 */
1202void writeback_inodes_sb(struct super_block *sb)
1203{
1204        unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
1205        unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
1206        long nr_to_write;
1207
1208        nr_to_write = nr_dirty + nr_unstable +
1209                        (inodes_stat.nr_inodes - inodes_stat.nr_unused);
1210
1211        bdi_start_writeback(sb->s_bdi, sb, nr_to_write);
1212}
1213EXPORT_SYMBOL(writeback_inodes_sb);
1214
1215/**
1216 * sync_inodes_sb       -       sync sb inode pages
1217 * @sb: the superblock
1218 *
1219 * This function writes and waits on any dirty inode belonging to this
 1220 * super_block, and also waits on pages that were already under writeback.
1221 */
1222void sync_inodes_sb(struct super_block *sb)
1223{
1224        bdi_sync_writeback(sb->s_bdi, sb);
1225        wait_sb_inodes(sb);
1226}
1227EXPORT_SYMBOL(sync_inodes_sb);
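     /*
      * Usage sketch (illustrative, loosely modelled on the sync(2) path in
      * fs/sync.c): callers hold s_umount for reading -- wait_sb_inodes()
      * warns if they don't -- and typically kick an async pass before the
      * data-integrity pass:
      *
      *   down_read(&sb->s_umount);
      *   writeback_inodes_sb(sb);    <- WB_SYNC_NONE, returns immediately
      *   sync_inodes_sb(sb);         <- WB_SYNC_ALL, writes and waits
      *   up_read(&sb->s_umount);
      */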
1228
1229/**
1230 * write_inode_now      -       write an inode to disk
1231 * @inode: inode to write to disk
1232 * @sync: whether the write should be synchronous or not
1233 *
1234 * This function commits an inode to disk immediately if it is dirty. This is
1235 * primarily needed by knfsd.
1236 *
1237 * The caller must either have a ref on the inode or must have set I_WILL_FREE.
1238 */
1239int write_inode_now(struct inode *inode, int sync)
1240{
1241        int ret;
1242        struct writeback_control wbc = {
1243                .nr_to_write = LONG_MAX,
1244                .sync_mode = sync ? WB_SYNC_ALL : WB_SYNC_NONE,
1245                .range_start = 0,
1246                .range_end = LLONG_MAX,
1247        };
1248
1249        if (!mapping_cap_writeback_dirty(inode->i_mapping))
1250                wbc.nr_to_write = 0;
1251
1252        might_sleep();
1253        spin_lock(&inode_lock);
1254        ret = writeback_single_inode(inode, &wbc);
1255        spin_unlock(&inode_lock);
1256        if (sync)
1257                inode_sync_wait(inode);
1258        return ret;
1259}
1260EXPORT_SYMBOL(write_inode_now);
1261
1262/**
1263 * sync_inode - write an inode and its pages to disk.
1264 * @inode: the inode to sync
1265 * @wbc: controls the writeback mode
1266 *
1267 * sync_inode() will write an inode and its pages to disk.  It will also
1268 * correctly update the inode on its superblock's dirty inode lists and will
1269 * update inode->i_state.
1270 *
1271 * The caller must have a ref on the inode.
1272 */
1273int sync_inode(struct inode *inode, struct writeback_control *wbc)
1274{
1275        int ret;
1276
1277        spin_lock(&inode_lock);
1278        ret = writeback_single_inode(inode, wbc);
1279        spin_unlock(&inode_lock);
1280        return ret;
1281}
1282EXPORT_SYMBOL(sync_inode);
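     /*
      * Usage sketch (illustrative): sync_inode() is the flexible form of
      * write_inode_now() above -- the caller supplies the writeback_control.
      * A data-integrity writeout of a single inode could look like:
      *
      *   struct writeback_control wbc = {
      *           .sync_mode      = WB_SYNC_ALL,
      *           .nr_to_write    = LONG_MAX,
      *           .range_start    = 0,
      *           .range_end      = LLONG_MAX,
      *   };
      *   int err = sync_inode(inode, &wbc);
      */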
1283