linux/fs/fs-writeback.c
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * fs/fs-writeback.c
   4 *
   5 * Copyright (C) 2002, Linus Torvalds.
   6 *
   7 * Contains all the functions related to writing back and waiting
   8 * upon dirty inodes against superblocks, and writing back dirty
   9 * pages against inodes.  ie: data writeback.  Writeout of the
  10 * inode itself is not handled here.
  11 *
  12 * 10Apr2002    Andrew Morton
  13 *              Split out of fs/inode.c
  14 *              Additions for address_space-based writeback
  15 */
  16
  17#include <linux/kernel.h>
  18#include <linux/export.h>
  19#include <linux/spinlock.h>
  20#include <linux/slab.h>
  21#include <linux/sched.h>
  22#include <linux/fs.h>
  23#include <linux/mm.h>
  24#include <linux/pagemap.h>
  25#include <linux/kthread.h>
  26#include <linux/writeback.h>
  27#include <linux/blkdev.h>
  28#include <linux/backing-dev.h>
  29#include <linux/tracepoint.h>
  30#include <linux/device.h>
  31#include <linux/memcontrol.h>
  32#include "internal.h"
  33
  34/*
  35 * 4MB minimal write chunk size
  36 */
  37#define MIN_WRITEBACK_PAGES     (4096UL >> (PAGE_SHIFT - 10))
  38
  39struct wb_completion {
  40        atomic_t                cnt;
  41};
  42
  43/*
  44 * Passed into wb_writeback(), essentially a subset of writeback_control
  45 */
  46struct wb_writeback_work {
  47        long nr_pages;
  48        struct super_block *sb;
  49        unsigned long *older_than_this;
  50        enum writeback_sync_modes sync_mode;
  51        unsigned int tagged_writepages:1;
  52        unsigned int for_kupdate:1;
  53        unsigned int range_cyclic:1;
  54        unsigned int for_background:1;
  55        unsigned int for_sync:1;        /* sync(2) WB_SYNC_ALL writeback */
  56        unsigned int auto_free:1;       /* free on completion */
  57        enum wb_reason reason;          /* why was writeback initiated? */
  58
  59        struct list_head list;          /* pending work list */
  60        struct wb_completion *done;     /* set if the caller waits */
  61};
  62
  63/*
  64 * If one wants to wait for one or more wb_writeback_works, each work's
  65 * ->done should be set to a wb_completion defined using the following
  66 * macro.  Once all work items are issued with wb_queue_work(), the caller
  67 * can wait for the completion of all using wb_wait_for_completion().  Work
  68 * items which are waited upon aren't freed automatically on completion.
  69 */
  70#define DEFINE_WB_COMPLETION_ONSTACK(cmpl)                              \
  71        struct wb_completion cmpl = {                                   \
  72                .cnt            = ATOMIC_INIT(1),                       \
  73        }
  74
  75
  76/*
  77 * If an inode is constantly having its pages dirtied, but then the
  78 * updates stop dirtytime_expire_interval seconds in the past, it's
  79 * possible for the worst case time between when an inode has its
  80 * timestamps updated and when they finally get written out to be two
  81 * dirtytime_expire_intervals.  We set the default to 12 hours (in
  82 * seconds), which means most of the time inodes will have their
  83 * timestamps written to disk after 12 hours, but in the worst case a
  84 * few inodes might not have their timestamps written out for 24 hours.
  85 */
  86unsigned int dirtytime_expire_interval = 12 * 60 * 60;
  87
  88static inline struct inode *wb_inode(struct list_head *head)
  89{
  90        return list_entry(head, struct inode, i_io_list);
  91}
  92
  93/*
  94 * Include the creation of the trace points after defining the
  95 * wb_writeback_work structure and inline functions so that the definition
  96 * remains local to this file.
  97 */
  98#define CREATE_TRACE_POINTS
  99#include <trace/events/writeback.h>
 100
 101EXPORT_TRACEPOINT_SYMBOL_GPL(wbc_writepage);
 102
 103static bool wb_io_lists_populated(struct bdi_writeback *wb)
 104{
 105        if (wb_has_dirty_io(wb)) {
 106                return false;
 107        } else {
 108                set_bit(WB_has_dirty_io, &wb->state);
 109                WARN_ON_ONCE(!wb->avg_write_bandwidth);
 110                atomic_long_add(wb->avg_write_bandwidth,
 111                                &wb->bdi->tot_write_bandwidth);
 112                return true;
 113        }
 114}
 115
 116static void wb_io_lists_depopulated(struct bdi_writeback *wb)
 117{
 118        if (wb_has_dirty_io(wb) && list_empty(&wb->b_dirty) &&
 119            list_empty(&wb->b_io) && list_empty(&wb->b_more_io)) {
 120                clear_bit(WB_has_dirty_io, &wb->state);
 121                WARN_ON_ONCE(atomic_long_sub_return(wb->avg_write_bandwidth,
 122                                        &wb->bdi->tot_write_bandwidth) < 0);
 123        }
 124}
 125
 126/**
 127 * inode_io_list_move_locked - move an inode onto a bdi_writeback IO list
 128 * @inode: inode to be moved
 129 * @wb: target bdi_writeback
 130 * @head: one of @wb->b_{dirty|io|more_io|dirty_time}
 131 *
 132 * Move @inode->i_io_list to @head of @wb and set %WB_has_dirty_io.
 133 * Returns %true if @inode is the first occupant of the !dirty_time IO
 134 * lists; otherwise, %false.
 135 */
 136static bool inode_io_list_move_locked(struct inode *inode,
 137                                      struct bdi_writeback *wb,
 138                                      struct list_head *head)
 139{
 140        assert_spin_locked(&wb->list_lock);
 141
 142        list_move(&inode->i_io_list, head);
 143
 144        /* dirty_time doesn't count as dirty_io until expiration */
 145        if (head != &wb->b_dirty_time)
 146                return wb_io_lists_populated(wb);
 147
 148        wb_io_lists_depopulated(wb);
 149        return false;
 150}
 151
 152/**
 153 * inode_io_list_del_locked - remove an inode from its bdi_writeback IO list
 154 * @inode: inode to be removed
 155 * @wb: bdi_writeback @inode is being removed from
 156 *
 157 * Remove @inode which may be on one of @wb->b_{dirty|io|more_io} lists and
 158 * clear %WB_has_dirty_io if all are empty afterwards.
 159 */
 160static void inode_io_list_del_locked(struct inode *inode,
 161                                     struct bdi_writeback *wb)
 162{
 163        assert_spin_locked(&wb->list_lock);
 164
 165        list_del_init(&inode->i_io_list);
 166        wb_io_lists_depopulated(wb);
 167}
 168
 169static void wb_wakeup(struct bdi_writeback *wb)
 170{
 171        spin_lock_bh(&wb->work_lock);
 172        if (test_bit(WB_registered, &wb->state))
 173                mod_delayed_work(bdi_wq, &wb->dwork, 0);
 174        spin_unlock_bh(&wb->work_lock);
 175}
 176
 177static void finish_writeback_work(struct bdi_writeback *wb,
 178                                  struct wb_writeback_work *work)
 179{
 180        struct wb_completion *done = work->done;
 181
 182        if (work->auto_free)
 183                kfree(work);
 184        if (done && atomic_dec_and_test(&done->cnt))
 185                wake_up_all(&wb->bdi->wb_waitq);
 186}
 187
 188static void wb_queue_work(struct bdi_writeback *wb,
 189                          struct wb_writeback_work *work)
 190{
 191        trace_writeback_queue(wb, work);
 192
 193        if (work->done)
 194                atomic_inc(&work->done->cnt);
 195
 196        spin_lock_bh(&wb->work_lock);
 197
 198        if (test_bit(WB_registered, &wb->state)) {
 199                list_add_tail(&work->list, &wb->work_list);
 200                mod_delayed_work(bdi_wq, &wb->dwork, 0);
 201        } else
 202                finish_writeback_work(wb, work);
 203
 204        spin_unlock_bh(&wb->work_lock);
 205}
 206
 207/**
 208 * wb_wait_for_completion - wait for completion of bdi_writeback_works
 209 * @bdi: bdi work items were issued to
 210 * @done: target wb_completion
 211 *
 212 * Wait for one or more work items issued to @bdi with their ->done field
 213 * set to @done, which should have been defined with
 214 * DEFINE_WB_COMPLETION_ONSTACK().  This function returns after all such
 215 * work items are completed.  Work items which are waited upon aren't freed
 216 * automatically on completion.
 217 */
 218static void wb_wait_for_completion(struct backing_dev_info *bdi,
 219                                   struct wb_completion *done)
 220{
 221        atomic_dec(&done->cnt);         /* put down the initial count */
 222        wait_event(bdi->wb_waitq, !atomic_read(&done->cnt));
 223}
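
/*
 * Illustrative sketch (editorial addition, not part of the original file):
 * how DEFINE_WB_COMPLETION_ONSTACK() is meant to be combined with
 * wb_queue_work() and wb_wait_for_completion() above.  The helper name and
 * the work item field values are assumptions chosen for the example only.
 */
static void __maybe_unused example_queue_work_and_wait(struct bdi_writeback *wb)
{
        DEFINE_WB_COMPLETION_ONSTACK(done);
        struct wb_writeback_work work = {
                .nr_pages       = LONG_MAX,
                .sync_mode      = WB_SYNC_NONE,
                .range_cyclic   = 1,
                .reason         = WB_REASON_SYNC,
                .done           = &done,        /* caller waits, so no ->auto_free */
        };

        wb_queue_work(wb, &work);
        /* returns once the flusher has finished this work item */
        wb_wait_for_completion(wb->bdi, &done);
}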
 224
 225#ifdef CONFIG_CGROUP_WRITEBACK
 226
 227/* parameters for foreign inode detection, see wbc_detach_inode() */
 228#define WB_FRN_TIME_SHIFT       13      /* 1s = 2^13, up to 8 secs w/ 16bit */
 229#define WB_FRN_TIME_AVG_SHIFT   3       /* avg = avg * 7/8 + new * 1/8 */
 230#define WB_FRN_TIME_CUT_DIV     2       /* ignore rounds < avg / 2 */
 231#define WB_FRN_TIME_PERIOD      (2 * (1 << WB_FRN_TIME_SHIFT))  /* 2s */
 232
 233#define WB_FRN_HIST_SLOTS       16      /* inode->i_wb_frn_history is 16bit */
 234#define WB_FRN_HIST_UNIT        (WB_FRN_TIME_PERIOD / WB_FRN_HIST_SLOTS)
 235                                        /* each slot's duration is 2s / 16 */
 236#define WB_FRN_HIST_THR_SLOTS   (WB_FRN_HIST_SLOTS / 2)
 237                                        /* if foreign slots >= 8, switch */
 238#define WB_FRN_HIST_MAX_SLOTS   (WB_FRN_HIST_THR_SLOTS / 2 + 1)
 239                                        /* one round can affect up to 5 slots */
 240
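/*
 * Worked example (editorial sketch, not in the original file): with the
 * defaults above one time unit is 1s/8192, WB_FRN_TIME_PERIOD is 2s and
 * WB_FRN_HIST_UNIT covers 2s/16 = 125ms.  The helper below mirrors how
 * wbc_detach_inode() folds one round's IO time into the 16-bit history,
 * filling the shifted-in slots only when a foreign wb won the round.
 */
static u16 __maybe_unused example_frn_fold_history(u16 history,
                                                   unsigned long max_time,
                                                   bool foreign_won)
{
        int slots = min(DIV_ROUND_UP(max_time, WB_FRN_HIST_UNIT),
                        (unsigned long)WB_FRN_HIST_MAX_SLOTS);

        history <<= slots;
        if (foreign_won)
                history |= (1U << slots) - 1;
        return history;
}
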
 241static atomic_t isw_nr_in_flight = ATOMIC_INIT(0);
 242static struct workqueue_struct *isw_wq;
 243
 244void __inode_attach_wb(struct inode *inode, struct page *page)
 245{
 246        struct backing_dev_info *bdi = inode_to_bdi(inode);
 247        struct bdi_writeback *wb = NULL;
 248
 249        if (inode_cgwb_enabled(inode)) {
 250                struct cgroup_subsys_state *memcg_css;
 251
 252                if (page) {
 253                        memcg_css = mem_cgroup_css_from_page(page);
 254                        wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
 255                } else {
 256                        /* must pin memcg_css, see wb_get_create() */
 257                        memcg_css = task_get_css(current, memory_cgrp_id);
 258                        wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
 259                        css_put(memcg_css);
 260                }
 261        }
 262
 263        if (!wb)
 264                wb = &bdi->wb;
 265
 266        /*
 267         * There may be multiple instances of this function racing to
 268         * update the same inode.  Use cmpxchg() to tell the winner.
 269         */
 270        if (unlikely(cmpxchg(&inode->i_wb, NULL, wb)))
 271                wb_put(wb);
 272}
 273EXPORT_SYMBOL_GPL(__inode_attach_wb);
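
/*
 * Editorial sketch: callers normally attach at most once per inode.  This
 * mirrors, as an assumption, the cheap ->i_wb check that the header-side
 * inode_attach_wb() wrapper performs before falling back to
 * __inode_attach_wb() above; the helper name here is hypothetical.
 */
static inline void example_attach_wb_once(struct inode *inode, struct page *page)
{
        if (!inode->i_wb)
                __inode_attach_wb(inode, page);
}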
 274
 275/**
 276 * locked_inode_to_wb_and_lock_list - determine a locked inode's wb and lock it
 277 * @inode: inode of interest with i_lock held
 278 *
 279 * Returns @inode's wb with its list_lock held.  @inode->i_lock must be
 280 * held on entry and is released on return.  The returned wb is guaranteed
 281 * to stay @inode's associated wb until its list_lock is released.
 282 */
 283static struct bdi_writeback *
 284locked_inode_to_wb_and_lock_list(struct inode *inode)
 285        __releases(&inode->i_lock)
 286        __acquires(&wb->list_lock)
 287{
 288        while (true) {
 289                struct bdi_writeback *wb = inode_to_wb(inode);
 290
 291                /*
 292                 * inode_to_wb() association is protected by both
 293                 * @inode->i_lock and @wb->list_lock but list_lock nests
 294                 * outside i_lock.  Drop i_lock and verify that the
 295                 * association hasn't changed after acquiring list_lock.
 296                 */
 297                wb_get(wb);
 298                spin_unlock(&inode->i_lock);
 299                spin_lock(&wb->list_lock);
 300
 301                /* i_wb may have changed in the meantime, can't use inode_to_wb() */
 302                if (likely(wb == inode->i_wb)) {
 303                        wb_put(wb);     /* @inode already has ref */
 304                        return wb;
 305                }
 306
 307                spin_unlock(&wb->list_lock);
 308                wb_put(wb);
 309                cpu_relax();
 310                spin_lock(&inode->i_lock);
 311        }
 312}
 313
 314/**
 315 * inode_to_wb_and_lock_list - determine an inode's wb and lock it
 316 * @inode: inode of interest
 317 *
 318 * Same as locked_inode_to_wb_and_lock_list() but @inode->i_lock isn't held
 319 * on entry.
 320 */
 321static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
 322        __acquires(&wb->list_lock)
 323{
 324        spin_lock(&inode->i_lock);
 325        return locked_inode_to_wb_and_lock_list(inode);
 326}
 327
 328struct inode_switch_wbs_context {
 329        struct inode            *inode;
 330        struct bdi_writeback    *new_wb;
 331
 332        struct rcu_head         rcu_head;
 333        struct work_struct      work;
 334};
 335
 336static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi)
 337{
 338        down_write(&bdi->wb_switch_rwsem);
 339}
 340
 341static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi)
 342{
 343        up_write(&bdi->wb_switch_rwsem);
 344}
 345
 346static void inode_switch_wbs_work_fn(struct work_struct *work)
 347{
 348        struct inode_switch_wbs_context *isw =
 349                container_of(work, struct inode_switch_wbs_context, work);
 350        struct inode *inode = isw->inode;
 351        struct backing_dev_info *bdi = inode_to_bdi(inode);
 352        struct address_space *mapping = inode->i_mapping;
 353        struct bdi_writeback *old_wb = inode->i_wb;
 354        struct bdi_writeback *new_wb = isw->new_wb;
 355        XA_STATE(xas, &mapping->i_pages, 0);
 356        struct page *page;
 357        bool switched = false;
 358
 359        /*
 360         * If @inode switches cgwb membership while sync_inodes_sb() is
 361         * being issued, sync_inodes_sb() might miss it.  Synchronize.
 362         */
 363        down_read(&bdi->wb_switch_rwsem);
 364
 365        /*
 366         * By the time control reaches here, RCU grace period has passed
 367         * since I_WB_SWITCH assertion and all wb stat update transactions
 368         * between unlocked_inode_to_wb_begin/end() are guaranteed to be
 369         * synchronizing against the i_pages lock.
 370         *
 371         * Grabbing old_wb->list_lock, inode->i_lock and the i_pages lock
 372         * gives us exclusion against all wb related operations on @inode
 373         * including IO list manipulations and stat updates.
 374         */
 375        if (old_wb < new_wb) {
 376                spin_lock(&old_wb->list_lock);
 377                spin_lock_nested(&new_wb->list_lock, SINGLE_DEPTH_NESTING);
 378        } else {
 379                spin_lock(&new_wb->list_lock);
 380                spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING);
 381        }
 382        spin_lock(&inode->i_lock);
 383        xa_lock_irq(&mapping->i_pages);
 384
 385        /*
 386         * Once I_FREEING is visible under i_lock, the eviction path owns
 387         * the inode and we shouldn't modify ->i_io_list.
 388         */
 389        if (unlikely(inode->i_state & I_FREEING))
 390                goto skip_switch;
 391
 392        /*
 393         * Count and transfer stats.  Note that PAGECACHE_TAG_DIRTY points
 394         * to possibly dirty pages while PAGECACHE_TAG_WRITEBACK points to
 395         * pages actually under writeback.
 396         */
 397        xas_for_each_marked(&xas, page, ULONG_MAX, PAGECACHE_TAG_DIRTY) {
 398                if (PageDirty(page)) {
 399                        dec_wb_stat(old_wb, WB_RECLAIMABLE);
 400                        inc_wb_stat(new_wb, WB_RECLAIMABLE);
 401                }
 402        }
 403
 404        xas_set(&xas, 0);
 405        xas_for_each_marked(&xas, page, ULONG_MAX, PAGECACHE_TAG_WRITEBACK) {
 406                WARN_ON_ONCE(!PageWriteback(page));
 407                dec_wb_stat(old_wb, WB_WRITEBACK);
 408                inc_wb_stat(new_wb, WB_WRITEBACK);
 409        }
 410
 411        wb_get(new_wb);
 412
 413        /*
 414         * Transfer to @new_wb's IO list if necessary.  The specific list
 415         * @inode was on is ignored and the inode is put on ->b_dirty which
 416         * is always correct including from ->b_dirty_time.  The transfer
 417         * preserves @inode->dirtied_when ordering.
 418         */
 419        if (!list_empty(&inode->i_io_list)) {
 420                struct inode *pos;
 421
 422                inode_io_list_del_locked(inode, old_wb);
 423                inode->i_wb = new_wb;
 424                list_for_each_entry(pos, &new_wb->b_dirty, i_io_list)
 425                        if (time_after_eq(inode->dirtied_when,
 426                                          pos->dirtied_when))
 427                                break;
 428                inode_io_list_move_locked(inode, new_wb, pos->i_io_list.prev);
 429        } else {
 430                inode->i_wb = new_wb;
 431        }
 432
 433        /* ->i_wb_frn updates may race with wbc_detach_inode() but that's harmless */
 434        inode->i_wb_frn_winner = 0;
 435        inode->i_wb_frn_avg_time = 0;
 436        inode->i_wb_frn_history = 0;
 437        switched = true;
 438skip_switch:
 439        /*
 440         * Paired with load_acquire in unlocked_inode_to_wb_begin() and
 441         * ensures that the new wb is visible to readers that see !I_WB_SWITCH.
 442         */
 443        smp_store_release(&inode->i_state, inode->i_state & ~I_WB_SWITCH);
 444
 445        xa_unlock_irq(&mapping->i_pages);
 446        spin_unlock(&inode->i_lock);
 447        spin_unlock(&new_wb->list_lock);
 448        spin_unlock(&old_wb->list_lock);
 449
 450        up_read(&bdi->wb_switch_rwsem);
 451
 452        if (switched) {
 453                wb_wakeup(new_wb);
 454                wb_put(old_wb);
 455        }
 456        wb_put(new_wb);
 457
 458        iput(inode);
 459        kfree(isw);
 460
 461        atomic_dec(&isw_nr_in_flight);
 462}
 463
 464static void inode_switch_wbs_rcu_fn(struct rcu_head *rcu_head)
 465{
 466        struct inode_switch_wbs_context *isw = container_of(rcu_head,
 467                                struct inode_switch_wbs_context, rcu_head);
 468
 469        /* needs to grab bh-unsafe locks, bounce to work item */
 470        INIT_WORK(&isw->work, inode_switch_wbs_work_fn);
 471        queue_work(isw_wq, &isw->work);
 472}
 473
 474/**
 475 * inode_switch_wbs - change the wb association of an inode
 476 * @inode: target inode
 477 * @new_wb_id: ID of the new wb
 478 *
 479 * Switch @inode's wb association to the wb identified by @new_wb_id.  The
 480 * switching is performed asynchronously and may fail silently.
 481 */
 482static void inode_switch_wbs(struct inode *inode, int new_wb_id)
 483{
 484        struct backing_dev_info *bdi = inode_to_bdi(inode);
 485        struct cgroup_subsys_state *memcg_css;
 486        struct inode_switch_wbs_context *isw;
 487
 488        /* noop if a switch already seems to be in progress */
 489        if (inode->i_state & I_WB_SWITCH)
 490                return;
 491
 492        /*
 493         * Avoid starting new switches while sync_inodes_sb() is in
 494         * progress.  Otherwise, if the down_write protected issue path
 495         * blocks heavily, we might end up starting a large number of
 496         * switches which will block on the rwsem.
 497         */
 498        if (!down_read_trylock(&bdi->wb_switch_rwsem))
 499                return;
 500
 501        isw = kzalloc(sizeof(*isw), GFP_ATOMIC);
 502        if (!isw)
 503                goto out_unlock;
 504
 505        /* find and pin the new wb */
 506        rcu_read_lock();
 507        memcg_css = css_from_id(new_wb_id, &memory_cgrp_subsys);
 508        if (memcg_css)
 509                isw->new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
 510        rcu_read_unlock();
 511        if (!isw->new_wb)
 512                goto out_free;
 513
 514        /* while holding I_WB_SWITCH, no one else can update the association */
 515        spin_lock(&inode->i_lock);
 516        if (!(inode->i_sb->s_flags & SB_ACTIVE) ||
 517            inode->i_state & (I_WB_SWITCH | I_FREEING) ||
 518            inode_to_wb(inode) == isw->new_wb) {
 519                spin_unlock(&inode->i_lock);
 520                goto out_free;
 521        }
 522        inode->i_state |= I_WB_SWITCH;
 523        __iget(inode);
 524        spin_unlock(&inode->i_lock);
 525
 526        isw->inode = inode;
 527
 528        /*
 529         * In addition to synchronizing among switchers, I_WB_SWITCH tells
 530         * the RCU protected stat update paths to grab the i_pages
 531         * lock so that stat transfer can synchronize against them.
 532         * Let's continue after I_WB_SWITCH is guaranteed to be visible.
 533         */
 534        call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn);
 535
 536        atomic_inc(&isw_nr_in_flight);
 537
 538        goto out_unlock;
 539
 540out_free:
 541        if (isw->new_wb)
 542                wb_put(isw->new_wb);
 543        kfree(isw);
 544out_unlock:
 545        up_read(&bdi->wb_switch_rwsem);
 546}
 547
 548/**
 549 * wbc_attach_and_unlock_inode - associate wbc with target inode and unlock it
 550 * @wbc: writeback_control of interest
 551 * @inode: target inode
 552 *
 553 * @inode is locked and about to be written back under the control of @wbc.
 554 * Record @inode's writeback context into @wbc and unlock the i_lock.  On
 555 * writeback completion, wbc_detach_inode() should be called.  This is used
 556 * to track the cgroup writeback context.
 557 */
 558void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
 559                                 struct inode *inode)
 560{
 561        if (!inode_cgwb_enabled(inode)) {
 562                spin_unlock(&inode->i_lock);
 563                return;
 564        }
 565
 566        wbc->wb = inode_to_wb(inode);
 567        wbc->inode = inode;
 568
 569        wbc->wb_id = wbc->wb->memcg_css->id;
 570        wbc->wb_lcand_id = inode->i_wb_frn_winner;
 571        wbc->wb_tcand_id = 0;
 572        wbc->wb_bytes = 0;
 573        wbc->wb_lcand_bytes = 0;
 574        wbc->wb_tcand_bytes = 0;
 575
 576        wb_get(wbc->wb);
 577        spin_unlock(&inode->i_lock);
 578
 579        /*
 580         * A dying wb indicates that the memcg-blkcg mapping has changed
 581         * and a new wb is already serving the memcg.  Switch immediately.
 582         */
 583        if (unlikely(wb_dying(wbc->wb)))
 584                inode_switch_wbs(inode, wbc->wb_id);
 585}
 586EXPORT_SYMBOL_GPL(wbc_attach_and_unlock_inode);
 587
 588/**
 589 * wbc_detach_inode - disassociate wbc from inode and perform foreign detection
 590 * @wbc: writeback_control of the just finished writeback
 591 *
 592 * To be called after a writeback attempt of an inode finishes and undoes
 593 * wbc_attach_and_unlock_inode().  Can be called under any context.
 594 *
 595 * As concurrent write sharing of an inode is expected to be very rare and
 596 * memcg only tracks page ownership on a first-use basis, severely confining
 597 * the usefulness of such sharing, cgroup writeback tracks ownership
 598 * per-inode.  While the support for concurrent write sharing of an inode
 599 * is deemed unnecessary, an inode being written to by different cgroups at
 600 * different points in time is a lot more common, and, more importantly,
 601 * charging only by first-use can too readily lead to grossly incorrect
 602 * behaviors (a single foreign page can lead to gigabytes of writeback being
 603 * incorrectly attributed).
 604 *
 605 * To resolve this issue, cgroup writeback detects the majority dirtier of
 606 * an inode and transfers the ownership to it.  To avoid unnecessary
 607 * oscillation, the detection mechanism keeps track of history and gives
 608 * out the switch verdict only if the foreign usage pattern is stable over
 609 * a certain amount of time and/or writeback attempts.
 610 *
 611 * On each writeback attempt, @wbc tries to detect the majority writer
 612 * using the Boyer-Moore majority vote algorithm.  In addition to the byte
 613 * count from the majority voting, it also counts the bytes written for the
 614 * current wb and the last round's winner wb (max of last round's current
 615 * wb, the winner from two rounds ago, and the last round's majority
 616 * candidate).  Keeping track of the historical winner helps the algorithm
 617 * to semi-reliably detect the most active writer even when it's not the
 618 * absolute majority.
 619 *
 620 * Once the winner of the round is determined, whether the winner is
 621 * foreign or not and how much IO time the round consumed is recorded in
 622 * inode->i_wb_frn_history.  If the amount of recorded foreign IO time is
 623 * over a certain threshold, the switch verdict is given.
 624 */
 625void wbc_detach_inode(struct writeback_control *wbc)
 626{
 627        struct bdi_writeback *wb = wbc->wb;
 628        struct inode *inode = wbc->inode;
 629        unsigned long avg_time, max_bytes, max_time;
 630        u16 history;
 631        int max_id;
 632
 633        if (!wb)
 634                return;
 635
 636        history = inode->i_wb_frn_history;
 637        avg_time = inode->i_wb_frn_avg_time;
 638
 639        /* pick the winner of this round */
 640        if (wbc->wb_bytes >= wbc->wb_lcand_bytes &&
 641            wbc->wb_bytes >= wbc->wb_tcand_bytes) {
 642                max_id = wbc->wb_id;
 643                max_bytes = wbc->wb_bytes;
 644        } else if (wbc->wb_lcand_bytes >= wbc->wb_tcand_bytes) {
 645                max_id = wbc->wb_lcand_id;
 646                max_bytes = wbc->wb_lcand_bytes;
 647        } else {
 648                max_id = wbc->wb_tcand_id;
 649                max_bytes = wbc->wb_tcand_bytes;
 650        }
 651
 652        /*
 653         * Calculate the amount of IO time the winner consumed and fold it
 654         * into the running average kept per inode.  If the consumed IO
 655         * time is lower than avg / WB_FRN_TIME_CUT_DIV, ignore it for
 656         * deciding whether to switch or not.  This is to prevent one-off
 657         * small dirtiers from skewing the verdict.
 658         */
 659        max_time = DIV_ROUND_UP((max_bytes >> PAGE_SHIFT) << WB_FRN_TIME_SHIFT,
 660                                wb->avg_write_bandwidth);
 661        if (avg_time)
 662                avg_time += (max_time >> WB_FRN_TIME_AVG_SHIFT) -
 663                            (avg_time >> WB_FRN_TIME_AVG_SHIFT);
 664        else
 665                avg_time = max_time;    /* immediate catch up on first run */
 666
 667        if (max_time >= avg_time / WB_FRN_TIME_CUT_DIV) {
 668                int slots;
 669
 670                /*
 671                 * The switch verdict is reached if foreign wbs consume
 672                 * more than a certain proportion of IO time in a
 673                 * WB_FRN_TIME_PERIOD.  This is loosely tracked by 16 slot
 674                 * history mask where each bit represents one sixteenth of
 675                 * the period.  Determine the number of slots to shift into
 676                 * history from @max_time.
 677                 */
 678                slots = min(DIV_ROUND_UP(max_time, WB_FRN_HIST_UNIT),
 679                            (unsigned long)WB_FRN_HIST_MAX_SLOTS);
 680                history <<= slots;
 681                if (wbc->wb_id != max_id)
 682                        history |= (1U << slots) - 1;
 683
 684                /*
 685                 * Switch if the current wb isn't the consistent winner.
 686                 * If there are multiple closely competing dirtiers, the
 687                 * inode may switch across them repeatedly over time, which
 688                 * is okay.  The main goal is avoiding keeping an inode on
 689                 * the wrong wb for an extended period of time.
 690                 */
 691                if (hweight32(history) > WB_FRN_HIST_THR_SLOTS)
 692                        inode_switch_wbs(inode, max_id);
 693        }
 694
 695        /*
 696         * Multiple instances of this function may race to update the
 697         * following fields but we don't mind occasional inaccuracies.
 698         */
 699        inode->i_wb_frn_winner = max_id;
 700        inode->i_wb_frn_avg_time = min(avg_time, (unsigned long)U16_MAX);
 701        inode->i_wb_frn_history = history;
 702
 703        wb_put(wbc->wb);
 704        wbc->wb = NULL;
 705}
 706EXPORT_SYMBOL_GPL(wbc_detach_inode);
 707
 708/**
 709 * wbc_account_cgroup_owner - account writeback to update inode cgroup ownership
 710 * @wbc: writeback_control of the writeback in progress
 711 * @page: page being written out
 712 * @bytes: number of bytes being written out
 713 *
 714 * @bytes from @page are about to be written out during the writeback
 715 * controlled by @wbc.  Keep the book for foreign inode detection.  See
 716 * wbc_detach_inode().
 717 */
 718void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page,
 719                              size_t bytes)
 720{
 721        struct cgroup_subsys_state *css;
 722        int id;
 723
 724        /*
 725         * pageout() path doesn't attach @wbc to the inode being written
 726         * out.  This is intentional as we don't want the function to block
 727         * behind a slow cgroup.  Ultimately, we want pageout() to kick off
 728         * regular writeback instead of writing things out itself.
 729         */
 730        if (!wbc->wb || wbc->no_cgroup_owner)
 731                return;
 732
 733        css = mem_cgroup_css_from_page(page);
 734        /* dead cgroups shouldn't contribute to inode ownership arbitration */
 735        if (!(css->flags & CSS_ONLINE))
 736                return;
 737
 738        id = css->id;
 739
 740        if (id == wbc->wb_id) {
 741                wbc->wb_bytes += bytes;
 742                return;
 743        }
 744
 745        if (id == wbc->wb_lcand_id)
 746                wbc->wb_lcand_bytes += bytes;
 747
 748        /* Boyer-Moore majority vote algorithm */
 749        if (!wbc->wb_tcand_bytes)
 750                wbc->wb_tcand_id = id;
 751        if (id == wbc->wb_tcand_id)
 752                wbc->wb_tcand_bytes += bytes;
 753        else
 754                wbc->wb_tcand_bytes -= min(bytes, wbc->wb_tcand_bytes);
 755}
 756EXPORT_SYMBOL_GPL(wbc_account_cgroup_owner);
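
/*
 * Editorial sketch of the calling protocol for the helpers above, as seen
 * from a filesystem's per-inode writeback path.  The function name and the
 * single-page body are hypothetical; only wbc_attach_and_unlock_inode(),
 * wbc_account_cgroup_owner() and wbc_detach_inode() come from this file.
 */
static void __maybe_unused example_wbc_protocol(struct inode *inode,
                                                struct writeback_control *wbc,
                                                struct page *page)
{
        spin_lock(&inode->i_lock);
        wbc_attach_and_unlock_inode(wbc, inode);        /* drops i_lock */

        /* ... for each page actually submitted for IO ... */
        wbc_account_cgroup_owner(wbc, page, PAGE_SIZE);

        wbc_detach_inode(wbc);          /* may trigger a foreign wb switch */
}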
 757
 758/**
 759 * inode_congested - test whether an inode is congested
 760 * @inode: inode to test for congestion (may be NULL)
 761 * @cong_bits: mask of WB_[a]sync_congested bits to test
 762 *
 763 * Tests whether @inode is congested.  @cong_bits is the mask of congestion
 764 * bits to test and the return value is the mask of set bits.
 765 *
 766 * If cgroup writeback is enabled for @inode, the congestion state is
 767 * determined by whether the cgwb (cgroup bdi_writeback) for the blkcg
 768 * associated with @inode is congested; otherwise, the root wb's congestion
 769 * state is used.
 770 *
 771 * @inode is allowed to be NULL as this function is often called on
 772 * mapping->host which is NULL for the swapper space.
 773 */
 774int inode_congested(struct inode *inode, int cong_bits)
 775{
 776        /*
 777         * Once set, ->i_wb never becomes NULL while the inode is alive.
 778         * Start transaction iff ->i_wb is visible.
 779         */
 780        if (inode && inode_to_wb_is_valid(inode)) {
 781                struct bdi_writeback *wb;
 782                struct wb_lock_cookie lock_cookie = {};
 783                bool congested;
 784
 785                wb = unlocked_inode_to_wb_begin(inode, &lock_cookie);
 786                congested = wb_congested(wb, cong_bits);
 787                unlocked_inode_to_wb_end(inode, &lock_cookie);
 788                return congested;
 789        }
 790
 791        return wb_congested(&inode_to_bdi(inode)->wb, cong_bits);
 792}
 793EXPORT_SYMBOL_GPL(inode_congested);
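
/*
 * Editorial sketch: @cong_bits is a mask of WB_[a]sync_congested bits, so a
 * reclaim-style caller interested only in write congestion could test it as
 * below.  The helper name is hypothetical; WB_async_congested is assumed to
 * come from backing-dev-defs.h.
 */
static bool __maybe_unused example_inode_write_congested(struct inode *inode)
{
        return inode_congested(inode, 1 << WB_async_congested);
}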
 794
 795/**
 796 * wb_split_bdi_pages - split nr_pages to write according to bandwidth
 797 * @wb: target bdi_writeback to split @nr_pages to
 798 * @nr_pages: number of pages to write for the whole bdi
 799 *
 800 * Split @wb's portion of @nr_pages according to @wb's write bandwidth in
 801 * relation to the total write bandwidth of all wb's w/ dirty inodes on
 802 * @wb->bdi.
 803 */
 804static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages)
 805{
 806        unsigned long this_bw = wb->avg_write_bandwidth;
 807        unsigned long tot_bw = atomic_long_read(&wb->bdi->tot_write_bandwidth);
 808
 809        if (nr_pages == LONG_MAX)
 810                return LONG_MAX;
 811
 812        /*
 813         * This may be called on clean wbs where proportional distribution
 814         * doesn't make sense; just use the original @nr_pages in those
 815         * cases.  In general, we want to err on the side of writing more.
 816         */
 817        if (!tot_bw || this_bw >= tot_bw)
 818                return nr_pages;
 819        else
 820                return DIV_ROUND_UP_ULL((u64)nr_pages * this_bw, tot_bw);
 821}
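
/*
 * Worked example (editorial): with nr_pages = 1024, this_bw = 100 MB/s and
 * tot_bw = 400 MB/s, the proportional split above yields
 * DIV_ROUND_UP_ULL(1024 * 100, 400) = 256 pages for this wb.
 */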
 822
 823/**
 824 * bdi_split_work_to_wbs - split a wb_writeback_work to all wb's of a bdi
 825 * @bdi: target backing_dev_info
 826 * @base_work: wb_writeback_work to issue
 827 * @skip_if_busy: skip wb's which already have writeback in progress
 828 *
 829 * Split and issue @base_work to all wb's (bdi_writeback's) of @bdi which
 830 * have dirty inodes.  If @base_work->nr_pages isn't %LONG_MAX, it's
 831 * distributed to the busy wbs according to each wb's proportion in the
 832 * total active write bandwidth of @bdi.
 833 */
 834static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
 835                                  struct wb_writeback_work *base_work,
 836                                  bool skip_if_busy)
 837{
 838        struct bdi_writeback *last_wb = NULL;
 839        struct bdi_writeback *wb = list_entry(&bdi->wb_list,
 840                                              struct bdi_writeback, bdi_node);
 841
 842        might_sleep();
 843restart:
 844        rcu_read_lock();
 845        list_for_each_entry_continue_rcu(wb, &bdi->wb_list, bdi_node) {
 846                DEFINE_WB_COMPLETION_ONSTACK(fallback_work_done);
 847                struct wb_writeback_work fallback_work;
 848                struct wb_writeback_work *work;
 849                long nr_pages;
 850
 851                if (last_wb) {
 852                        wb_put(last_wb);
 853                        last_wb = NULL;
 854                }
 855
 856                /* SYNC_ALL writes out I_DIRTY_TIME too */
 857                if (!wb_has_dirty_io(wb) &&
 858                    (base_work->sync_mode == WB_SYNC_NONE ||
 859                     list_empty(&wb->b_dirty_time)))
 860                        continue;
 861                if (skip_if_busy && writeback_in_progress(wb))
 862                        continue;
 863
 864                nr_pages = wb_split_bdi_pages(wb, base_work->nr_pages);
 865
 866                work = kmalloc(sizeof(*work), GFP_ATOMIC);
 867                if (work) {
 868                        *work = *base_work;
 869                        work->nr_pages = nr_pages;
 870                        work->auto_free = 1;
 871                        wb_queue_work(wb, work);
 872                        continue;
 873                }
 874
 875                /* alloc failed, execute synchronously using on-stack fallback */
 876                work = &fallback_work;
 877                *work = *base_work;
 878                work->nr_pages = nr_pages;
 879                work->auto_free = 0;
 880                work->done = &fallback_work_done;
 881
 882                wb_queue_work(wb, work);
 883
 884                /*
 885                 * Pin @wb so that it stays on @bdi->wb_list.  This allows
 886                 * continuing iteration from @wb after dropping and
 887                 * regrabbing rcu read lock.
 888                 */
 889                wb_get(wb);
 890                last_wb = wb;
 891
 892                rcu_read_unlock();
 893                wb_wait_for_completion(bdi, &fallback_work_done);
 894                goto restart;
 895        }
 896        rcu_read_unlock();
 897
 898        if (last_wb)
 899                wb_put(last_wb);
 900}
 901
 902/**
 903 * cgroup_writeback_umount - flush inode wb switches for umount
 904 *
 905 * This function is called when a super_block is about to be destroyed and
 906 * flushes in-flight inode wb switches.  An inode wb switch goes through
 907 * RCU and then workqueue, so the two need to be flushed in order to ensure
 908 * that all previously scheduled switches are finished.  As wb switches are
 909 * rare occurrences and rcu_barrier() can take a while, perform
 910 * flushing iff wb switches are in flight.
 911 */
 912void cgroup_writeback_umount(void)
 913{
 914        if (atomic_read(&isw_nr_in_flight)) {
 915                /*
 916                 * Use rcu_barrier() to wait for all pending callbacks to
 917                 * ensure that all in-flight wb switches are in the workqueue.
 918                 */
 919                rcu_barrier();
 920                flush_workqueue(isw_wq);
 921        }
 922}
 923
 924static int __init cgroup_writeback_init(void)
 925{
 926        isw_wq = alloc_workqueue("inode_switch_wbs", 0, 0);
 927        if (!isw_wq)
 928                return -ENOMEM;
 929        return 0;
 930}
 931fs_initcall(cgroup_writeback_init);
 932
 933#else   /* CONFIG_CGROUP_WRITEBACK */
 934
 935static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi) { }
 936static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi) { }
 937
 938static struct bdi_writeback *
 939locked_inode_to_wb_and_lock_list(struct inode *inode)
 940        __releases(&inode->i_lock)
 941        __acquires(&wb->list_lock)
 942{
 943        struct bdi_writeback *wb = inode_to_wb(inode);
 944
 945        spin_unlock(&inode->i_lock);
 946        spin_lock(&wb->list_lock);
 947        return wb;
 948}
 949
 950static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
 951        __acquires(&wb->list_lock)
 952{
 953        struct bdi_writeback *wb = inode_to_wb(inode);
 954
 955        spin_lock(&wb->list_lock);
 956        return wb;
 957}
 958
 959static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages)
 960{
 961        return nr_pages;
 962}
 963
 964static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
 965                                  struct wb_writeback_work *base_work,
 966                                  bool skip_if_busy)
 967{
 968        might_sleep();
 969
 970        if (!skip_if_busy || !writeback_in_progress(&bdi->wb)) {
 971                base_work->auto_free = 0;
 972                wb_queue_work(&bdi->wb, base_work);
 973        }
 974}
 975
 976#endif  /* CONFIG_CGROUP_WRITEBACK */
 977
 978/*
 979 * Add in the number of potentially dirty inodes, because each inode
 980 * write can dirty pagecache in the underlying blockdev.
 981 */
 982static unsigned long get_nr_dirty_pages(void)
 983{
 984        return global_node_page_state(NR_FILE_DIRTY) +
 985                global_node_page_state(NR_UNSTABLE_NFS) +
 986                get_nr_dirty_inodes();
 987}
 988
 989static void wb_start_writeback(struct bdi_writeback *wb, enum wb_reason reason)
 990{
 991        if (!wb_has_dirty_io(wb))
 992                return;
 993
 994        /*
 995         * All callers of this function want to start writeback of all
 996         * dirty pages. Places like vmscan can call this at a very
 997         * high frequency, causing pointless allocations of tons of
 998         * work items and keeping the flusher threads busy retrieving
 999         * that work. Ensure that we only allow one of them pending and
1000         * inflight at the time.
1001         * in flight at a time.
1002        if (test_bit(WB_start_all, &wb->state) ||
1003            test_and_set_bit(WB_start_all, &wb->state))
1004                return;
1005
1006        wb->start_all_reason = reason;
1007        wb_wakeup(wb);
1008}
1009
1010/**
1011 * wb_start_background_writeback - start background writeback
1012 * @wb: bdi_writeback to write from
1013 *
1014 * Description:
1015 *   This makes sure WB_SYNC_NONE background writeback happens. When
1016 *   this function returns, it is only guaranteed that for the given wb
1017 *   some IO is happening if we are over the background dirty threshold.
1018 *   The caller need not hold the sb s_umount semaphore.
1019 */
1020void wb_start_background_writeback(struct bdi_writeback *wb)
1021{
1022        /*
1023         * We just wake up the flusher thread. It will perform background
1024         * writeback as soon as there is no other work to do.
1025         */
1026        trace_writeback_wake_background(wb);
1027        wb_wakeup(wb);
1028}
1029
1030/*
1031 * Remove the inode from the writeback list it is on.
1032 */
1033void inode_io_list_del(struct inode *inode)
1034{
1035        struct bdi_writeback *wb;
1036
1037        wb = inode_to_wb_and_lock_list(inode);
1038        inode_io_list_del_locked(inode, wb);
1039        spin_unlock(&wb->list_lock);
1040}
1041
1042/*
1043 * mark an inode as under writeback on the sb
1044 */
1045void sb_mark_inode_writeback(struct inode *inode)
1046{
1047        struct super_block *sb = inode->i_sb;
1048        unsigned long flags;
1049
1050        if (list_empty(&inode->i_wb_list)) {
1051                spin_lock_irqsave(&sb->s_inode_wblist_lock, flags);
1052                if (list_empty(&inode->i_wb_list)) {
1053                        list_add_tail(&inode->i_wb_list, &sb->s_inodes_wb);
1054                        trace_sb_mark_inode_writeback(inode);
1055                }
1056                spin_unlock_irqrestore(&sb->s_inode_wblist_lock, flags);
1057        }
1058}
1059
1060/*
1061 * clear an inode as under writeback on the sb
1062 */
1063void sb_clear_inode_writeback(struct inode *inode)
1064{
1065        struct super_block *sb = inode->i_sb;
1066        unsigned long flags;
1067
1068        if (!list_empty(&inode->i_wb_list)) {
1069                spin_lock_irqsave(&sb->s_inode_wblist_lock, flags);
1070                if (!list_empty(&inode->i_wb_list)) {
1071                        list_del_init(&inode->i_wb_list);
1072                        trace_sb_clear_inode_writeback(inode);
1073                }
1074                spin_unlock_irqrestore(&sb->s_inode_wblist_lock, flags);
1075        }
1076}
1077
1078/*
1079 * Redirty an inode: set its when-it-was-dirtied timestamp and move it to the
1080 * furthest end of its superblock's dirty-inode list.
1081 *
1082 * Before stamping the inode's ->dirtied_when, we check to see whether it is
1083 * already the most-recently-dirtied inode on the b_dirty list.  If that is
1084 * the case then the inode must have been redirtied while it was being written
1085 * out and we don't reset its dirtied_when.
1086 */
1087static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
1088{
1089        if (!list_empty(&wb->b_dirty)) {
1090                struct inode *tail;
1091
1092                tail = wb_inode(wb->b_dirty.next);
1093                if (time_before(inode->dirtied_when, tail->dirtied_when))
1094                        inode->dirtied_when = jiffies;
1095        }
1096        inode_io_list_move_locked(inode, wb, &wb->b_dirty);
1097}
1098
1099/*
1100 * requeue inode for re-scanning after bdi->b_io list is exhausted.
1101 */
1102static void requeue_io(struct inode *inode, struct bdi_writeback *wb)
1103{
1104        inode_io_list_move_locked(inode, wb, &wb->b_more_io);
1105}
1106
1107static void inode_sync_complete(struct inode *inode)
1108{
1109        inode->i_state &= ~I_SYNC;
1110        /* If inode is clean and unused, put it into LRU now... */
1111        inode_add_lru(inode);
1112        /* Waiters must see I_SYNC cleared before being woken up */
1113        smp_mb();
1114        wake_up_bit(&inode->i_state, __I_SYNC);
1115}
1116
1117static bool inode_dirtied_after(struct inode *inode, unsigned long t)
1118{
1119        bool ret = time_after(inode->dirtied_when, t);
1120#ifndef CONFIG_64BIT
1121        /*
1122         * For inodes being constantly redirtied, dirtied_when can get stuck.
1123         * It _appears_ to be in the future, but is actually in distant past.
1124         * This test is necessary to prevent such wrapped-around relative times
1125         * from permanently stopping the whole bdi writeback.
1126         */
1127        ret = ret && time_before_eq(inode->dirtied_when, jiffies);
1128#endif
1129        return ret;
1130}
1131
1132#define EXPIRE_DIRTY_ATIME 0x0001
1133
1134/*
1135 * Move expired (dirtied before work->older_than_this) dirty inodes from
1136 * @delaying_queue to @dispatch_queue.
1137 */
1138static int move_expired_inodes(struct list_head *delaying_queue,
1139                               struct list_head *dispatch_queue,
1140                               int flags,
1141                               struct wb_writeback_work *work)
1142{
1143        unsigned long *older_than_this = NULL;
1144        unsigned long expire_time;
1145        LIST_HEAD(tmp);
1146        struct list_head *pos, *node;
1147        struct super_block *sb = NULL;
1148        struct inode *inode;
1149        int do_sb_sort = 0;
1150        int moved = 0;
1151
1152        if ((flags & EXPIRE_DIRTY_ATIME) == 0)
1153                older_than_this = work->older_than_this;
1154        else if (!work->for_sync) {
1155                expire_time = jiffies - (dirtytime_expire_interval * HZ);
1156                older_than_this = &expire_time;
1157        }
1158        while (!list_empty(delaying_queue)) {
1159                inode = wb_inode(delaying_queue->prev);
1160                if (older_than_this &&
1161                    inode_dirtied_after(inode, *older_than_this))
1162                        break;
1163                list_move(&inode->i_io_list, &tmp);
1164                moved++;
1165                if (flags & EXPIRE_DIRTY_ATIME)
1166                        set_bit(__I_DIRTY_TIME_EXPIRED, &inode->i_state);
1167                if (sb_is_blkdev_sb(inode->i_sb))
1168                        continue;
1169                if (sb && sb != inode->i_sb)
1170                        do_sb_sort = 1;
1171                sb = inode->i_sb;
1172        }
1173
1174        /* just one sb in list, splice to dispatch_queue and we're done */
1175        if (!do_sb_sort) {
1176                list_splice(&tmp, dispatch_queue);
1177                goto out;
1178        }
1179
1180        /* Move inodes from one superblock together */
1181        while (!list_empty(&tmp)) {
1182                sb = wb_inode(tmp.prev)->i_sb;
1183                list_for_each_prev_safe(pos, node, &tmp) {
1184                        inode = wb_inode(pos);
1185                        if (inode->i_sb == sb)
1186                                list_move(&inode->i_io_list, dispatch_queue);
1187                }
1188        }
1189out:
1190        return moved;
1191}
1192
1193/*
1194 * Queue all expired dirty inodes for io, eldest first.
1195 * Before
1196 *         newly dirtied     b_dirty    b_io    b_more_io
1197 *         =============>    gf         edc     BA
1198 * After
1199 *         newly dirtied     b_dirty    b_io    b_more_io
1200 *         =============>    g          fBAedc
1201 *                                           |
1202 *                                           +--> dequeue for IO
1203 */
1204static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work)
1205{
1206        int moved;
1207
1208        assert_spin_locked(&wb->list_lock);
1209        list_splice_init(&wb->b_more_io, &wb->b_io);
1210        moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, 0, work);
1211        moved += move_expired_inodes(&wb->b_dirty_time, &wb->b_io,
1212                                     EXPIRE_DIRTY_ATIME, work);
1213        if (moved)
1214                wb_io_lists_populated(wb);
1215        trace_writeback_queue_io(wb, work, moved);
1216}
1217
1218static int write_inode(struct inode *inode, struct writeback_control *wbc)
1219{
1220        int ret;
1221
1222        if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) {
1223                trace_writeback_write_inode_start(inode, wbc);
1224                ret = inode->i_sb->s_op->write_inode(inode, wbc);
1225                trace_writeback_write_inode(inode, wbc);
1226                return ret;
1227        }
1228        return 0;
1229}
1230
1231/*
1232 * Wait for writeback on an inode to complete. Called with i_lock held.
1233 * Caller must make sure inode cannot go away when we drop i_lock.
1234 */
1235static void __inode_wait_for_writeback(struct inode *inode)
1236        __releases(inode->i_lock)
1237        __acquires(inode->i_lock)
1238{
1239        DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC);
1240        wait_queue_head_t *wqh;
1241
1242        wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
1243        while (inode->i_state & I_SYNC) {
1244                spin_unlock(&inode->i_lock);
1245                __wait_on_bit(wqh, &wq, bit_wait,
1246                              TASK_UNINTERRUPTIBLE);
1247                spin_lock(&inode->i_lock);
1248        }
1249}
1250
1251/*
1252 * Wait for writeback on an inode to complete. Caller must have inode pinned.
1253 */
1254void inode_wait_for_writeback(struct inode *inode)
1255{
1256        spin_lock(&inode->i_lock);
1257        __inode_wait_for_writeback(inode);
1258        spin_unlock(&inode->i_lock);
1259}
1260
1261/*
1262 * Sleep until I_SYNC is cleared. This function must be called with i_lock
1263 * held and drops it. It is aimed for callers not holding any inode reference
1264 * so once i_lock is dropped, inode can go away.
1265 */
1266static void inode_sleep_on_writeback(struct inode *inode)
1267        __releases(inode->i_lock)
1268{
1269        DEFINE_WAIT(wait);
1270        wait_queue_head_t *wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
1271        int sleep;
1272
1273        prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
1274        sleep = inode->i_state & I_SYNC;
1275        spin_unlock(&inode->i_lock);
1276        if (sleep)
1277                schedule();
1278        finish_wait(wqh, &wait);
1279}
1280
1281/*
1282 * Find proper writeback list for the inode depending on its current state and
1283 * possibly also change of its state while we were doing writeback.  Here we
1284 * handle things such as livelock prevention or fairness of writeback among
1285 * inodes. This function can be called only by the flusher thread - no one else
1286 * processes all inodes in writeback lists and requeueing inodes behind flusher
1287 * thread's back can have unexpected consequences.
1288 */
1289static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
1290                          struct writeback_control *wbc)
1291{
1292        if (inode->i_state & I_FREEING)
1293                return;
1294
1295        /*
1296         * Sync livelock prevention. Each inode is tagged and synced in one
1297         * shot. If still dirty, it will be redirty_tail()'ed below.  Update
1298         * the dirty time to prevent enqueue and sync it again.
1299         */
1300        if ((inode->i_state & I_DIRTY) &&
1301            (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages))
1302                inode->dirtied_when = jiffies;
1303
1304        if (wbc->pages_skipped) {
1305                /*
1306                 * writeback is not making progress due to locked
1307                 * buffers. Skip this inode for now.
1308                 */
1309                redirty_tail(inode, wb);
1310                return;
1311        }
1312
1313        if (mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
1314                /*
1315                 * We didn't write back all the pages.  nfs_writepages()
1316                 * sometimes bails out without doing anything.
1317                 */
1318                if (wbc->nr_to_write <= 0) {
1319                        /* Slice used up. Queue for next turn. */
1320                        requeue_io(inode, wb);
1321                } else {
1322                        /*
1323                         * Writeback blocked by something other than
1324                         * congestion. Delay the inode for some time to
1325                         * avoid spinning on the CPU (100% iowait)
1326                         * retrying writeback of the dirty page/inode
1327                         * that cannot be performed immediately.
1328                         */
1329                        redirty_tail(inode, wb);
1330                }
1331        } else if (inode->i_state & I_DIRTY) {
1332                /*
1333                 * Filesystems can dirty the inode during writeback operations,
1334                 * such as delayed allocation during submission or metadata
1335                 * updates after data IO completion.
1336                 */
1337                redirty_tail(inode, wb);
1338        } else if (inode->i_state & I_DIRTY_TIME) {
1339                inode->dirtied_when = jiffies;
1340                inode_io_list_move_locked(inode, wb, &wb->b_dirty_time);
1341        } else {
1342                /* The inode is clean. Remove from writeback lists. */
1343                inode_io_list_del_locked(inode, wb);
1344        }
1345}
1346
1347/*
1348 * Write out an inode and its dirty pages. Do not update the writeback list
1349 * linkage. That is left to the caller. The caller is also responsible for
1350 * setting I_SYNC flag and calling inode_sync_complete() to clear it.
1351 */
1352static int
1353__writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
1354{
1355        struct address_space *mapping = inode->i_mapping;
1356        long nr_to_write = wbc->nr_to_write;
1357        unsigned dirty;
1358        int ret;
1359
1360        WARN_ON(!(inode->i_state & I_SYNC));
1361
1362        trace_writeback_single_inode_start(inode, wbc, nr_to_write);
1363
1364        ret = do_writepages(mapping, wbc);
1365
1366        /*
1367         * Make sure to wait on the data before writing out the metadata.
1368         * This is important for filesystems that modify metadata on data
1369         * I/O completion. We don't do it for sync(2) writeback because it has a
1370         * separate, external IO completion path and ->sync_fs for guaranteeing
1371         * inode metadata is written back correctly.
1372         */
1373        if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) {
1374                int err = filemap_fdatawait(mapping);
1375                if (ret == 0)
1376                        ret = err;
1377        }
1378
1379        /*
1380         * Some filesystems may redirty the inode during the writeback
1381         * due to delalloc; clear dirty metadata flags right before
1382         * write_inode()
1383         */
1384        spin_lock(&inode->i_lock);
1385
1386        dirty = inode->i_state & I_DIRTY;
1387        if (inode->i_state & I_DIRTY_TIME) {
1388                if ((dirty & I_DIRTY_INODE) ||
1389                    wbc->sync_mode == WB_SYNC_ALL ||
1390                    unlikely(inode->i_state & I_DIRTY_TIME_EXPIRED) ||
1391                    unlikely(time_after(jiffies,
1392                                        (inode->dirtied_time_when +
1393                                         dirtytime_expire_interval * HZ)))) {
1394                        dirty |= I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED;
1395                        trace_writeback_lazytime(inode);
1396                }
1397        } else
1398                inode->i_state &= ~I_DIRTY_TIME_EXPIRED;
1399        inode->i_state &= ~dirty;
1400
1401        /*
1402         * Paired with smp_mb() in __mark_inode_dirty().  This allows
1403         * __mark_inode_dirty() to test i_state without grabbing i_lock -
1404         * either they see the I_DIRTY bits cleared or we see the dirtied
1405         * inode.
1406         *
1407         * I_DIRTY_PAGES is always cleared together above even if @mapping
1408         * still has dirty pages.  The flag is reinstated after smp_mb() if
1409         * necessary.  This guarantees that either __mark_inode_dirty()
1410         * sees clear I_DIRTY_PAGES or we see PAGECACHE_TAG_DIRTY.
1411         */
1412        smp_mb();
1413
1414        if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
1415                inode->i_state |= I_DIRTY_PAGES;
1416
1417        spin_unlock(&inode->i_lock);
1418
1419        if (dirty & I_DIRTY_TIME)
1420                mark_inode_dirty_sync(inode);
1421        /* Don't write the inode if only I_DIRTY_PAGES was set */
1422        if (dirty & ~I_DIRTY_PAGES) {
1423                int err = write_inode(inode, wbc);
1424                if (ret == 0)
1425                        ret = err;
1426        }
1427        trace_writeback_single_inode(inode, wbc, nr_to_write);
1428        return ret;
1429}
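
/*
 * Condensed caller pattern for the function above (an illustrative sketch,
 * not a new code path; the real callers, writeback_single_inode() and
 * writeback_sb_inodes() below, add writeback list handling and cgroup
 * bookkeeping on top of this skeleton):
 *
 *        spin_lock(&inode->i_lock);
 *        inode->i_state |= I_SYNC;
 *        wbc_attach_and_unlock_inode(wbc, inode);    releases i_lock
 *
 *        ret = __writeback_single_inode(inode, wbc);
 *
 *        wbc_detach_inode(wbc);
 *        spin_lock(&inode->i_lock);
 *        inode_sync_complete(inode);                 clears I_SYNC, wakes waiters
 *        spin_unlock(&inode->i_lock);
 */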
1430
1431/*
1432 * Write out an inode's dirty pages. Either the caller has an active reference
1433 * on the inode or the inode has I_WILL_FREE set.
1434 *
1435 * This function is designed to be called for writing back one inode, which
1436 * we do e.g. from the filesystem. The flusher thread uses __writeback_single_inode()
1437 * and does more thorough writeback list handling in writeback_sb_inodes().
1438 */
1439static int writeback_single_inode(struct inode *inode,
1440                                  struct writeback_control *wbc)
1441{
1442        struct bdi_writeback *wb;
1443        int ret = 0;
1444
1445        spin_lock(&inode->i_lock);
1446        if (!atomic_read(&inode->i_count))
1447                WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING)));
1448        else
1449                WARN_ON(inode->i_state & I_WILL_FREE);
1450
1451        if (inode->i_state & I_SYNC) {
1452                if (wbc->sync_mode != WB_SYNC_ALL)
1453                        goto out;
1454                /*
1455                 * It's a data-integrity sync. We must wait. Since callers hold
1456                 * inode reference or inode has I_WILL_FREE set, it cannot go
1457                 * away under us.
1458                 */
1459                __inode_wait_for_writeback(inode);
1460        }
1461        WARN_ON(inode->i_state & I_SYNC);
1462        /*
1463         * Skip inode if it is clean and we have no outstanding writeback in
1464         * WB_SYNC_ALL mode. We don't want to mess with writeback lists in this
1465         * function since the flusher thread may, for example, be doing a sync in
1466         * parallel and if we move the inode, it could get skipped. So here we
1467         * make sure inode is on some writeback list and leave it there unless
1468         * we have completely cleaned the inode.
1469         */
1470        if (!(inode->i_state & I_DIRTY_ALL) &&
1471            (wbc->sync_mode != WB_SYNC_ALL ||
1472             !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK)))
1473                goto out;
1474        inode->i_state |= I_SYNC;
1475        wbc_attach_and_unlock_inode(wbc, inode);
1476
1477        ret = __writeback_single_inode(inode, wbc);
1478
1479        wbc_detach_inode(wbc);
1480
1481        wb = inode_to_wb_and_lock_list(inode);
1482        spin_lock(&inode->i_lock);
1483        /*
1484         * If inode is clean, remove it from writeback lists. Otherwise don't
1485         * touch it. See comment above for explanation.
1486         */
1487        if (!(inode->i_state & I_DIRTY_ALL))
1488                inode_io_list_del_locked(inode, wb);
1489        spin_unlock(&wb->list_lock);
1490        inode_sync_complete(inode);
1491out:
1492        spin_unlock(&inode->i_lock);
1493        return ret;
1494}
1495
1496static long writeback_chunk_size(struct bdi_writeback *wb,
1497                                 struct wb_writeback_work *work)
1498{
1499        long pages;
1500
1501        /*
1502         * WB_SYNC_ALL mode does livelock avoidance by syncing dirty
1503         * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX
1504         * here avoids calling into writeback_inodes_wb() more than once.
1505         *
1506         * The intended call sequence for WB_SYNC_ALL writeback is:
1507         *
1508         *      wb_writeback()
1509         *          writeback_sb_inodes()       <== called only once
1510         *              write_cache_pages()     <== called once for each inode
1511         *                   (quickly) tag currently dirty pages
1512         *                   (maybe slowly) sync all tagged pages
1513         */
1514        if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages)
1515                pages = LONG_MAX;
1516        else {
1517                pages = min(wb->avg_write_bandwidth / 2,
1518                            global_wb_domain.dirty_limit / DIRTY_SCOPE);
1519                pages = min(pages, work->nr_pages);
1520                pages = round_down(pages + MIN_WRITEBACK_PAGES,
1521                                   MIN_WRITEBACK_PAGES);
1522        }
1523
1524        return pages;
1525}
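
/*
 * Worked example for the non-integrity branch above (illustrative numbers
 * only; assumes 4KiB pages, so MIN_WRITEBACK_PAGES is 4096 >> (12 - 10) =
 * 1024 pages, i.e. 4MB):
 *
 *        avg_write_bandwidth / 2                 = 2500 pages
 *        global dirty_limit / DIRTY_SCOPE        = 40000 pages
 *        work->nr_pages                          = 100000 pages
 *
 *        pages = min(2500, 40000)                 -> 2500
 *        pages = min(2500, 100000)                -> 2500
 *        pages = round_down(2500 + 1024, 1024)    -> 3072
 *
 * so the resulting chunk is always a whole multiple of MIN_WRITEBACK_PAGES
 * and never smaller than it.
 */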
1526
1527/*
1528 * Write a portion of b_io inodes which belong to @sb.
1529 *
1530 * Return the number of pages and/or inodes written.
1531 *
1532 * NOTE! This is called with wb->list_lock held, and will
1533 * unlock and relock that for each inode it ends up doing
1534 * IO for.
1535 */
1536static long writeback_sb_inodes(struct super_block *sb,
1537                                struct bdi_writeback *wb,
1538                                struct wb_writeback_work *work)
1539{
1540        struct writeback_control wbc = {
1541                .sync_mode              = work->sync_mode,
1542                .tagged_writepages      = work->tagged_writepages,
1543                .for_kupdate            = work->for_kupdate,
1544                .for_background         = work->for_background,
1545                .for_sync               = work->for_sync,
1546                .range_cyclic           = work->range_cyclic,
1547                .range_start            = 0,
1548                .range_end              = LLONG_MAX,
1549        };
1550        unsigned long start_time = jiffies;
1551        long write_chunk;
1552        long wrote = 0;  /* count both pages and inodes */
1553
1554        while (!list_empty(&wb->b_io)) {
1555                struct inode *inode = wb_inode(wb->b_io.prev);
1556                struct bdi_writeback *tmp_wb;
1557
1558                if (inode->i_sb != sb) {
1559                        if (work->sb) {
1560                                /*
1561                                 * We only want to write back data for this
1562                                 * superblock, move all inodes not belonging
1563                                 * to it back onto the dirty list.
1564                                 */
1565                                redirty_tail(inode, wb);
1566                                continue;
1567                        }
1568
1569                        /*
1570                         * The inode belongs to a different superblock.
1571                         * Bounce back to the caller to unpin this and
1572                         * pin the next superblock.
1573                         */
1574                        break;
1575                }
1576
1577                /*
1578                 * Don't bother with new inodes or inodes being freed: the first
1579                 * kind does not need periodic writeout yet, and for the latter
1580                 * kind writeout is handled by the freer.
1581                 */
1582                spin_lock(&inode->i_lock);
1583                if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
1584                        spin_unlock(&inode->i_lock);
1585                        redirty_tail(inode, wb);
1586                        continue;
1587                }
1588                if ((inode->i_state & I_SYNC) && wbc.sync_mode != WB_SYNC_ALL) {
1589                        /*
1590                         * If this inode is locked for writeback and we are not
1591                         * doing writeback-for-data-integrity, move it to
1592                         * b_more_io so that writeback can proceed with the
1593                         * other inodes on s_io.
1594                         *
1595                         * We'll have another go at writing back this inode
1596                         * when we have completed a full scan of b_io.
1597                         */
1598                        spin_unlock(&inode->i_lock);
1599                        requeue_io(inode, wb);
1600                        trace_writeback_sb_inodes_requeue(inode);
1601                        continue;
1602                }
1603                spin_unlock(&wb->list_lock);
1604
1605                /*
1606                 * We already requeued the inode if it had I_SYNC set and we
1607                 * are doing WB_SYNC_NONE writeback. So this catches only the
1608                 * WB_SYNC_ALL case.
1609                 */
1610                if (inode->i_state & I_SYNC) {
1611                        /* Wait for I_SYNC. This function drops i_lock... */
1612                        inode_sleep_on_writeback(inode);
1613                        /* Inode may be gone, start again */
1614                        spin_lock(&wb->list_lock);
1615                        continue;
1616                }
1617                inode->i_state |= I_SYNC;
1618                wbc_attach_and_unlock_inode(&wbc, inode);
1619
1620                write_chunk = writeback_chunk_size(wb, work);
1621                wbc.nr_to_write = write_chunk;
1622                wbc.pages_skipped = 0;
1623
1624                /*
1625                 * We use I_SYNC to pin the inode in memory. While it is set
1626                 * evict_inode() will wait so the inode cannot be freed.
1627                 */
1628                __writeback_single_inode(inode, &wbc);
1629
1630                wbc_detach_inode(&wbc);
1631                work->nr_pages -= write_chunk - wbc.nr_to_write;
1632                wrote += write_chunk - wbc.nr_to_write;
1633
1634                if (need_resched()) {
1635                        /*
1636                         * We're trying to balance between building up a nice
1637                         * long list of IOs to improve our merge rate, and
1638                         * getting those IOs out quickly for anyone throttling
1639                         * in balance_dirty_pages().  cond_resched() doesn't
1640                         * unplug, so get our IOs out the door before we
1641                         * give up the CPU.
1642                         */
1643                        blk_flush_plug(current);
1644                        cond_resched();
1645                }
1646
1647                /*
1648                 * Requeue @inode if still dirty.  Be careful as @inode may
1649                 * have been switched to another wb in the meantime.
1650                 */
1651                tmp_wb = inode_to_wb_and_lock_list(inode);
1652                spin_lock(&inode->i_lock);
1653                if (!(inode->i_state & I_DIRTY_ALL))
1654                        wrote++;
1655                requeue_inode(inode, tmp_wb, &wbc);
1656                inode_sync_complete(inode);
1657                spin_unlock(&inode->i_lock);
1658
1659                if (unlikely(tmp_wb != wb)) {
1660                        spin_unlock(&tmp_wb->list_lock);
1661                        spin_lock(&wb->list_lock);
1662                }
1663
1664                /*
1665                 * bail out to wb_writeback() often enough to check
1666                 * background threshold and other termination conditions.
1667                 */
1668                if (wrote) {
1669                        if (time_is_before_jiffies(start_time + HZ / 10UL))
1670                                break;
1671                        if (work->nr_pages <= 0)
1672                                break;
1673                }
1674        }
1675        return wrote;
1676}
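
/*
 * A note on the bail-out test at the bottom of the loop above (a worked
 * example, not new logic): time_is_before_jiffies(t) is time_after(jiffies, t),
 * so with HZ == 250 (an assumed config) the test
 *
 *        time_is_before_jiffies(start_time + HZ / 10UL)
 *
 * fires once more than 25 ticks, i.e. roughly 100ms, have elapsed since
 * start_time.  Once something has been written, that caps how long a single
 * call keeps going before returning to wb_writeback() to re-evaluate the
 * background threshold and the other termination conditions.
 */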
1677
1678static long __writeback_inodes_wb(struct bdi_writeback *wb,
1679                                  struct wb_writeback_work *work)
1680{
1681        unsigned long start_time = jiffies;
1682        long wrote = 0;
1683
1684        while (!list_empty(&wb->b_io)) {
1685                struct inode *inode = wb_inode(wb->b_io.prev);
1686                struct super_block *sb = inode->i_sb;
1687
1688                if (!trylock_super(sb)) {
1689                        /*
1690                         * trylock_super() may fail consistently due to
1691                         * s_umount being grabbed by someone else. Don't use
1692                         * requeue_io() to avoid busy retrying the inode/sb.
1693                         */
1694                        redirty_tail(inode, wb);
1695                        continue;
1696                }
1697                wrote += writeback_sb_inodes(sb, wb, work);
1698                up_read(&sb->s_umount);
1699
1700                /* refer to the same tests at the end of writeback_sb_inodes */
1701                if (wrote) {
1702                        if (time_is_before_jiffies(start_time + HZ / 10UL))
1703                                break;
1704                        if (work->nr_pages <= 0)
1705                                break;
1706                }
1707        }
1708        /* Leave any unwritten inodes on b_io */
1709        return wrote;
1710}
1711
1712static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
1713                                enum wb_reason reason)
1714{
1715        struct wb_writeback_work work = {
1716                .nr_pages       = nr_pages,
1717                .sync_mode      = WB_SYNC_NONE,
1718                .range_cyclic   = 1,
1719                .reason         = reason,
1720        };
1721        struct blk_plug plug;
1722
1723        blk_start_plug(&plug);
1724        spin_lock(&wb->list_lock);
1725        if (list_empty(&wb->b_io))
1726                queue_io(wb, &work);
1727        __writeback_inodes_wb(wb, &work);
1728        spin_unlock(&wb->list_lock);
1729        blk_finish_plug(&plug);
1730
1731        return nr_pages - work.nr_pages;
1732}
1733
1734/*
1735 * Explicit flushing or periodic writeback of "old" data.
1736 *
1737 * Define "old": the first time one of an inode's pages is dirtied, we mark the
1738 * dirtying-time in the inode's address_space.  So this periodic writeback code
1739 * just walks the superblock inode list, writing back any inodes which are
1740 * older than a specific point in time.
1741 *
1742 * Try to run once per dirty_writeback_interval.  But if a writeback event
1743 * takes longer than a dirty_writeback_interval, then leave a
1744 * one-second gap.
1745 *
1746 * older_than_this takes precedence over nr_to_write.  So we'll only write back
1747 * all dirty pages if they are all attached to "old" mappings.
1748 */
1749static long wb_writeback(struct bdi_writeback *wb,
1750                         struct wb_writeback_work *work)
1751{
1752        unsigned long wb_start = jiffies;
1753        long nr_pages = work->nr_pages;
1754        unsigned long oldest_jif;
1755        struct inode *inode;
1756        long progress;
1757        struct blk_plug plug;
1758
1759        oldest_jif = jiffies;
1760        work->older_than_this = &oldest_jif;
1761
1762        blk_start_plug(&plug);
1763        spin_lock(&wb->list_lock);
1764        for (;;) {
1765                /*
1766                 * Stop writeback when nr_pages has been consumed
1767                 */
1768                if (work->nr_pages <= 0)
1769                        break;
1770
1771                /*
1772                 * Background writeout and kupdate-style writeback may
1773                 * run forever. Stop them if there is other work to do
1774                 * so that e.g. sync can proceed. They'll be restarted
1775                 * after the other works are all done.
1776                 */
1777                if ((work->for_background || work->for_kupdate) &&
1778                    !list_empty(&wb->work_list))
1779                        break;
1780
1781                /*
1782                 * For background writeout, stop when we are below the
1783                 * background dirty threshold
1784                 */
1785                if (work->for_background && !wb_over_bg_thresh(wb))
1786                        break;
1787
1788                /*
1789                 * Kupdate and background works are special and we want to
1790                 * include all inodes that need writing. Livelock avoidance is
1791                 * handled by these works yielding to any other work so we are
1792                 * safe.
1793                 */
1794                if (work->for_kupdate) {
1795                        oldest_jif = jiffies -
1796                                msecs_to_jiffies(dirty_expire_interval * 10);
1797                } else if (work->for_background)
1798                        oldest_jif = jiffies;
1799
1800                trace_writeback_start(wb, work);
1801                if (list_empty(&wb->b_io))
1802                        queue_io(wb, work);
1803                if (work->sb)
1804                        progress = writeback_sb_inodes(work->sb, wb, work);
1805                else
1806                        progress = __writeback_inodes_wb(wb, work);
1807                trace_writeback_written(wb, work);
1808
1809                wb_update_bandwidth(wb, wb_start);
1810
1811                /*
1812                 * Did we write something? Try for more
1813                 *
1814                 * Dirty inodes are moved to b_io for writeback in batches.
1815                 * The completion of the current batch does not necessarily
1816                 * mean the overall work is done. So we keep looping as long
1817                 * as we made some progress on cleaning pages or inodes.
1818                 */
1819                if (progress)
1820                        continue;
1821                /*
1822                 * No more inodes for IO, bail
1823                 */
1824                if (list_empty(&wb->b_more_io))
1825                        break;
1826                /*
1827                 * Nothing written. Wait for some inode to
1828                 * become available for writeback. Otherwise
1829                 * we'll just busyloop.
1830                 */
1831                trace_writeback_wait(wb, work);
1832                inode = wb_inode(wb->b_more_io.prev);
1833                spin_lock(&inode->i_lock);
1834                spin_unlock(&wb->list_lock);
1835                /* This function drops i_lock... */
1836                inode_sleep_on_writeback(inode);
1837                spin_lock(&wb->list_lock);
1838        }
1839        spin_unlock(&wb->list_lock);
1840        blk_finish_plug(&plug);
1841
1842        return nr_pages - work->nr_pages;
1843}
1844
1845/*
1846 * Return the next wb_writeback_work struct that hasn't been processed yet.
1847 */
1848static struct wb_writeback_work *get_next_work_item(struct bdi_writeback *wb)
1849{
1850        struct wb_writeback_work *work = NULL;
1851
1852        spin_lock_bh(&wb->work_lock);
1853        if (!list_empty(&wb->work_list)) {
1854                work = list_entry(wb->work_list.next,
1855                                  struct wb_writeback_work, list);
1856                list_del_init(&work->list);
1857        }
1858        spin_unlock_bh(&wb->work_lock);
1859        return work;
1860}
1861
1862static long wb_check_background_flush(struct bdi_writeback *wb)
1863{
1864        if (wb_over_bg_thresh(wb)) {
1865
1866                struct wb_writeback_work work = {
1867                        .nr_pages       = LONG_MAX,
1868                        .sync_mode      = WB_SYNC_NONE,
1869                        .for_background = 1,
1870                        .range_cyclic   = 1,
1871                        .reason         = WB_REASON_BACKGROUND,
1872                };
1873
1874                return wb_writeback(wb, &work);
1875        }
1876
1877        return 0;
1878}
1879
1880static long wb_check_old_data_flush(struct bdi_writeback *wb)
1881{
1882        unsigned long expired;
1883        long nr_pages;
1884
1885        /*
1886         * When set to zero, disable periodic writeback
1887         */
1888        if (!dirty_writeback_interval)
1889                return 0;
1890
1891        expired = wb->last_old_flush +
1892                        msecs_to_jiffies(dirty_writeback_interval * 10);
1893        if (time_before(jiffies, expired))
1894                return 0;
1895
1896        wb->last_old_flush = jiffies;
1897        nr_pages = get_nr_dirty_pages();
1898
1899        if (nr_pages) {
1900                struct wb_writeback_work work = {
1901                        .nr_pages       = nr_pages,
1902                        .sync_mode      = WB_SYNC_NONE,
1903                        .for_kupdate    = 1,
1904                        .range_cyclic   = 1,
1905                        .reason         = WB_REASON_PERIODIC,
1906                };
1907
1908                return wb_writeback(wb, &work);
1909        }
1910
1911        return 0;
1912}
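
/*
 * Unit check for the expiry test above (a worked example; the actual default
 * lives in mm/page-writeback.c, not here): dirty_writeback_interval is kept
 * in centiseconds, which is why it is multiplied by 10 to get milliseconds
 * before converting to jiffies.  With the usual default of 500 centiseconds:
 *
 *        expired = wb->last_old_flush + msecs_to_jiffies(500 * 10)
 *                = wb->last_old_flush + msecs_to_jiffies(5000)
 *
 * i.e. kupdate-style writeback runs at most once every 5 seconds per wb.
 */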
1913
1914static long wb_check_start_all(struct bdi_writeback *wb)
1915{
1916        long nr_pages;
1917
1918        if (!test_bit(WB_start_all, &wb->state))
1919                return 0;
1920
1921        nr_pages = get_nr_dirty_pages();
1922        if (nr_pages) {
1923                struct wb_writeback_work work = {
1924                        .nr_pages       = wb_split_bdi_pages(wb, nr_pages),
1925                        .sync_mode      = WB_SYNC_NONE,
1926                        .range_cyclic   = 1,
1927                        .reason         = wb->start_all_reason,
1928                };
1929
1930                nr_pages = wb_writeback(wb, &work);
1931        }
1932
1933        clear_bit(WB_start_all, &wb->state);
1934        return nr_pages;
1935}
1936
1937
1938/*
1939 * Retrieve work items and do the writeback they describe
1940 */
1941static long wb_do_writeback(struct bdi_writeback *wb)
1942{
1943        struct wb_writeback_work *work;
1944        long wrote = 0;
1945
1946        set_bit(WB_writeback_running, &wb->state);
1947        while ((work = get_next_work_item(wb)) != NULL) {
1948                trace_writeback_exec(wb, work);
1949                wrote += wb_writeback(wb, work);
1950                finish_writeback_work(wb, work);
1951        }
1952
1953        /*
1954         * Check for a flush-everything request
1955         */
1956        wrote += wb_check_start_all(wb);
1957
1958        /*
1959         * Check for periodic writeback, kupdated() style
1960         */
1961        wrote += wb_check_old_data_flush(wb);
1962        wrote += wb_check_background_flush(wb);
1963        clear_bit(WB_writeback_running, &wb->state);
1964
1965        return wrote;
1966}
1967
1968/*
1969 * Handle writeback of dirty data for the device backed by this bdi. Also
1970 * reschedules periodically and does kupdated style flushing.
1971 */
1972void wb_workfn(struct work_struct *work)
1973{
1974        struct bdi_writeback *wb = container_of(to_delayed_work(work),
1975                                                struct bdi_writeback, dwork);
1976        long pages_written;
1977
1978        set_worker_desc("flush-%s", dev_name(wb->bdi->dev));
1979        current->flags |= PF_SWAPWRITE;
1980
1981        if (likely(!current_is_workqueue_rescuer() ||
1982                   !test_bit(WB_registered, &wb->state))) {
1983                /*
1984                 * The normal path.  Keep writing back @wb until its
1985                 * work_list is empty.  Note that this path is also taken
1986                 * if @wb is shutting down even when we're running off the
1987                 * rescuer as work_list needs to be drained.
1988                 */
1989                do {
1990                        pages_written = wb_do_writeback(wb);
1991                        trace_writeback_pages_written(pages_written);
1992                } while (!list_empty(&wb->work_list));
1993        } else {
1994                /*
1995                 * bdi_wq can't get enough workers and we're running off
1996                 * the emergency worker.  Don't hog it.  Hopefully, 1024 is
1997                 * enough for efficient IO.
1998                 */
1999                pages_written = writeback_inodes_wb(wb, 1024,
2000                                                    WB_REASON_FORKER_THREAD);
2001                trace_writeback_pages_written(pages_written);
2002        }
2003
2004        if (!list_empty(&wb->work_list))
2005                wb_wakeup(wb);
2006        else if (wb_has_dirty_io(wb) && dirty_writeback_interval)
2007                wb_wakeup_delayed(wb);
2008
2009        current->flags &= ~PF_SWAPWRITE;
2010}
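
/*
 * For orientation only (a sketch; the setup code lives in mm/backing-dev.c
 * and the wakeup helpers earlier in this file): wb_workfn() is the handler
 * behind wb->dwork, armed roughly like this:
 *
 *        INIT_DELAYED_WORK(&wb->dwork, wb_workfn);                 wb_init()
 *        ...
 *        mod_delayed_work(bdi_wq, &wb->dwork, 0);                  wb_wakeup()
 *        queue_delayed_work(bdi_wq, &wb->dwork, timeout);          wb_wakeup_delayed()
 */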
2011
2012/*
2013 * Start writeback on all member wbs of this bdi so that all currently
2014 * dirty pages get written back.
2015 */
2016static void __wakeup_flusher_threads_bdi(struct backing_dev_info *bdi,
2017                                         enum wb_reason reason)
2018{
2019        struct bdi_writeback *wb;
2020
2021        if (!bdi_has_dirty_io(bdi))
2022                return;
2023
2024        list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node)
2025                wb_start_writeback(wb, reason);
2026}
2027
2028void wakeup_flusher_threads_bdi(struct backing_dev_info *bdi,
2029                                enum wb_reason reason)
2030{
2031        rcu_read_lock();
2032        __wakeup_flusher_threads_bdi(bdi, reason);
2033        rcu_read_unlock();
2034}
2035
2036/*
2037 * Wakeup the flusher threads to start writeback of all currently dirty pages
2038 */
2039void wakeup_flusher_threads(enum wb_reason reason)
2040{
2041        struct backing_dev_info *bdi;
2042
2043        /*
2044         * If we are expecting writeback progress we must submit plugged IO.
2045         */
2046        if (blk_needs_flush_plug(current))
2047                blk_schedule_flush_plug(current);
2048
2049        rcu_read_lock();
2050        list_for_each_entry_rcu(bdi, &bdi_list, bdi_list)
2051                __wakeup_flusher_threads_bdi(bdi, reason);
2052        rcu_read_unlock();
2053}
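
/*
 * Example callers (they live elsewhere in the tree; shown purely as a usage
 * sketch): memory reclaim and sync(2) use this when they want dirty pages
 * pushed out system-wide, e.g.
 *
 *        wakeup_flusher_threads(WB_REASON_VMSCAN);        from mm/vmscan.c
 *        wakeup_flusher_threads(WB_REASON_SYNC);          from fs/sync.c
 */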
2054
2055/*
2056 * Wake up bdi's periodically to make sure dirtytime inodes get
2057 * written back periodically.  We deliberately do *not* check the
2058 * b_dirtytime list in wb_has_dirty_io(), since this would cause the
2059 * kernel to be constantly waking up once there are any dirtytime
2060 * inodes on the system.  So instead we define a separate delayed work
2061 * function which gets called much more rarely.  (By default, only
2062 * once every 12 hours.)
2063 *
2064 * If there is any other write activity going on in the file system,
2065 * this function won't be necessary.  But if the only thing that has
2066 * happened on the file system is a dirtytime inode caused by an atime
2067 * update, we need this infrastructure below to make sure that inode
2068 * eventually gets pushed out to disk.
2069 */
2070static void wakeup_dirtytime_writeback(struct work_struct *w);
2071static DECLARE_DELAYED_WORK(dirtytime_work, wakeup_dirtytime_writeback);
2072
2073static void wakeup_dirtytime_writeback(struct work_struct *w)
2074{
2075        struct backing_dev_info *bdi;
2076
2077        rcu_read_lock();
2078        list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
2079                struct bdi_writeback *wb;
2080
2081                list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node)
2082                        if (!list_empty(&wb->b_dirty_time))
2083                                wb_wakeup(wb);
2084        }
2085        rcu_read_unlock();
2086        schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ);
2087}
2088
2089static int __init start_dirtytime_writeback(void)
2090{
2091        schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ);
2092        return 0;
2093}
2094__initcall(start_dirtytime_writeback);
2095
2096int dirtytime_interval_handler(struct ctl_table *table, int write,
2097                               void __user *buffer, size_t *lenp, loff_t *ppos)
2098{
2099        int ret;
2100
2101        ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
2102        if (ret == 0 && write)
2103                mod_delayed_work(system_wq, &dirtytime_work, 0);
2104        return ret;
2105}
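
/*
 * The handler above is wired up as a sysctl; its table entry lives in
 * kernel/sysctl.c and looks roughly like the sketch below (extra limit
 * fields omitted), so a write from userspace immediately kicks
 * dirtytime_work via mod_delayed_work():
 *
 *        {
 *                .procname       = "dirtytime_expire_seconds",
 *                .data           = &dirtytime_expire_interval,
 *                .maxlen         = sizeof(dirtytime_expire_interval),
 *                .mode           = 0644,
 *                .proc_handler   = dirtytime_interval_handler,
 *        },
 */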
2106
2107static noinline void block_dump___mark_inode_dirty(struct inode *inode)
2108{
2109        if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) {
2110                struct dentry *dentry;
2111                const char *name = "?";
2112
2113                dentry = d_find_alias(inode);
2114                if (dentry) {
2115                        spin_lock(&dentry->d_lock);
2116                        name = (const char *) dentry->d_name.name;
2117                }
2118                printk(KERN_DEBUG
2119                       "%s(%d): dirtied inode %lu (%s) on %s\n",
2120                       current->comm, task_pid_nr(current), inode->i_ino,
2121                       name, inode->i_sb->s_id);
2122                if (dentry) {
2123                        spin_unlock(&dentry->d_lock);
2124                        dput(dentry);
2125                }
2126        }
2127}
2128
2129/**
2130 * __mark_inode_dirty - internal function
2131 *
2132 * @inode: inode to mark
2133 * @flags: what kind of dirty (e.g. I_DIRTY_SYNC)
2134 *
2135 * Mark an inode as dirty. Callers should use mark_inode_dirty or
2136 * mark_inode_dirty_sync.
2137 *
2138 * Put the inode on the super block's dirty list.
2139 *
2140 * CAREFUL! We mark it dirty unconditionally, but move it onto the
2141 * dirty list only if it is hashed or if it refers to a blockdev.
2142 * If it was not hashed, it will never be added to the dirty list
2143 * even if it is later hashed, as it will have been marked dirty already.
2144 *
2145 * In short, make sure you hash any inodes _before_ you start marking
2146 * them dirty.
2147 *
2148 * Note that for blockdevs, inode->dirtied_when represents the dirtying time of
2149 * the block-special inode (/dev/hda1) itself.  And the ->dirtied_when field of
2150 * the kernel-internal blockdev inode represents the dirtying time of the
2151 * blockdev's pages.  This is why for I_DIRTY_PAGES we always use
2152 * page->mapping->host, so the page-dirtying time is recorded in the internal
2153 * blockdev inode.
2154 */
2155void __mark_inode_dirty(struct inode *inode, int flags)
2156{
2157        struct super_block *sb = inode->i_sb;
2158        int dirtytime;
2159
2160        trace_writeback_mark_inode_dirty(inode, flags);
2161
2162        /*
2163         * Don't do this for I_DIRTY_PAGES - that doesn't actually
2164         * dirty the inode itself
2165         */
2166        if (flags & (I_DIRTY_INODE | I_DIRTY_TIME)) {
2167                trace_writeback_dirty_inode_start(inode, flags);
2168
2169                if (sb->s_op->dirty_inode)
2170                        sb->s_op->dirty_inode(inode, flags);
2171
2172                trace_writeback_dirty_inode(inode, flags);
2173        }
2174        if (flags & I_DIRTY_INODE)
2175                flags &= ~I_DIRTY_TIME;
2176        dirtytime = flags & I_DIRTY_TIME;
2177
2178        /*
2179         * Paired with smp_mb() in __writeback_single_inode() for the
2180         * following lockless i_state test.  See there for details.
2181         */
2182        smp_mb();
2183
2184        if (((inode->i_state & flags) == flags) ||
2185            (dirtytime && (inode->i_state & I_DIRTY_INODE)))
2186                return;
2187
2188        if (unlikely(block_dump))
2189                block_dump___mark_inode_dirty(inode);
2190
2191        spin_lock(&inode->i_lock);
2192        if (dirtytime && (inode->i_state & I_DIRTY_INODE))
2193                goto out_unlock_inode;
2194        if ((inode->i_state & flags) != flags) {
2195                const int was_dirty = inode->i_state & I_DIRTY;
2196
2197                inode_attach_wb(inode, NULL);
2198
2199                if (flags & I_DIRTY_INODE)
2200                        inode->i_state &= ~I_DIRTY_TIME;
2201                inode->i_state |= flags;
2202
2203                /*
2204                 * If the inode is being synced, just update its dirty state.
2205                 * The unlocker will place the inode on the appropriate
2206                 * superblock list, based upon its state.
2207                 */
2208                if (inode->i_state & I_SYNC)
2209                        goto out_unlock_inode;
2210
2211                /*
2212                 * Only add valid (hashed) inodes to the superblock's
2213                 * dirty list.  Add blockdev inodes as well.
2214                 */
2215                if (!S_ISBLK(inode->i_mode)) {
2216                        if (inode_unhashed(inode))
2217                                goto out_unlock_inode;
2218                }
2219                if (inode->i_state & I_FREEING)
2220                        goto out_unlock_inode;
2221
2222                /*
2223                 * If the inode was already on b_dirty/b_io/b_more_io, don't
2224                 * reposition it (that would break b_dirty time-ordering).
2225                 */
2226                if (!was_dirty) {
2227                        struct bdi_writeback *wb;
2228                        struct list_head *dirty_list;
2229                        bool wakeup_bdi = false;
2230
2231                        wb = locked_inode_to_wb_and_lock_list(inode);
2232
2233                        WARN(bdi_cap_writeback_dirty(wb->bdi) &&
2234                             !test_bit(WB_registered, &wb->state),
2235                             "bdi-%s not registered\n", wb->bdi->name);
2236
2237                        inode->dirtied_when = jiffies;
2238                        if (dirtytime)
2239                                inode->dirtied_time_when = jiffies;
2240
2241                        if (inode->i_state & I_DIRTY)
2242                                dirty_list = &wb->b_dirty;
2243                        else
2244                                dirty_list = &wb->b_dirty_time;
2245
2246                        wakeup_bdi = inode_io_list_move_locked(inode, wb,
2247                                                               dirty_list);
2248
2249                        spin_unlock(&wb->list_lock);
2250                        trace_writeback_dirty_inode_enqueue(inode);
2251
2252                        /*
2253                         * If this is the first dirty inode for this bdi,
2254                         * we have to wake-up the corresponding bdi thread
2255                         * to make sure background write-back happens
2256                         * later.
2257                         */
2258                        if (bdi_cap_writeback_dirty(wb->bdi) && wakeup_bdi)
2259                                wb_wakeup_delayed(wb);
2260                        return;
2261                }
2262        }
2263out_unlock_inode:
2264        spin_unlock(&inode->i_lock);
2265}
2266EXPORT_SYMBOL(__mark_inode_dirty);
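
/*
 * Typical filesystem usage (an illustrative sketch; the wrappers are static
 * inlines in <linux/fs.h>): after updating in-core fields, a filesystem
 * calls one of the wrappers rather than this function directly, e.g.
 *
 *        inode->i_mtime = inode->i_ctime = current_time(inode);
 *        mark_inode_dirty_sync(inode);     wraps __mark_inode_dirty(inode, I_DIRTY_SYNC)
 *
 * while mark_inode_dirty() passes the full I_DIRTY mask and lazytime
 * timestamp updates arrive here as I_DIRTY_TIME.
 */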
2267
2268/*
2269 * The @s_sync_lock is used to serialise concurrent sync operations
2270 * to avoid lock contention problems with concurrent wait_sb_inodes() calls.
2271 * Concurrent callers will block on the s_sync_lock rather than doing contending
2272 * walks. The queueing maintains the behaviour required by sync(2): all the IO
2273 * that has been issued up to the time this function is entered is guaranteed to be
2274 * completed by the time we have gained the lock and waited for all IO that is
2275 * in progress regardless of the order callers are granted the lock.
2276 */
2277static void wait_sb_inodes(struct super_block *sb)
2278{
2279        LIST_HEAD(sync_list);
2280
2281        /*
2282         * We need to be protected against the filesystem going from
2283         * r/o to r/w or vice versa.
2284         */
2285        WARN_ON(!rwsem_is_locked(&sb->s_umount));
2286
2287        mutex_lock(&sb->s_sync_lock);
2288
2289        /*
2290         * Splice the writeback list onto a temporary list to avoid waiting on
2291         * inodes that have started writeback after this point.
2292         *
2293         * Use rcu_read_lock() to keep the inodes around until we have a
2294         * reference. s_inode_wblist_lock protects sb->s_inodes_wb as well as
2295         * the local list because inodes can be dropped from either by writeback
2296         * completion.
2297         */
2298        rcu_read_lock();
2299        spin_lock_irq(&sb->s_inode_wblist_lock);
2300        list_splice_init(&sb->s_inodes_wb, &sync_list);
2301
2302        /*
2303         * Data integrity sync. Must wait for all pages under writeback, because
2304         * there may have been pages dirtied before our sync call whose writeout
2305         * had already started before we got to them.  In that case, the inode
2306         * may not be on the dirty list, but we still have to wait for that
2307         * writeout.
2308         */
2309        while (!list_empty(&sync_list)) {
2310                struct inode *inode = list_first_entry(&sync_list, struct inode,
2311                                                       i_wb_list);
2312                struct address_space *mapping = inode->i_mapping;
2313
2314                /*
2315                 * Move each inode back to the wb list before we drop the lock
2316                 * to preserve consistency between i_wb_list and the mapping
2317                 * writeback tag. Writeback completion is responsible to remove
2318                 * the inode from either list once the writeback tag is cleared.
2319                 */
2320                list_move_tail(&inode->i_wb_list, &sb->s_inodes_wb);
2321
2322                /*
2323                 * The mapping can appear untagged while still on-list since we
2324                 * do not have the mapping lock. Skip it here, wb completion
2325                 * will remove it.
2326                 */
2327                if (!mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK))
2328                        continue;
2329
2330                spin_unlock_irq(&sb->s_inode_wblist_lock);
2331
2332                spin_lock(&inode->i_lock);
2333                if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) {
2334                        spin_unlock(&inode->i_lock);
2335
2336                        spin_lock_irq(&sb->s_inode_wblist_lock);
2337                        continue;
2338                }
2339                __iget(inode);
2340                spin_unlock(&inode->i_lock);
2341                rcu_read_unlock();
2342
2343                /*
2344                 * We keep the error status of individual mapping so that
2345                 * applications can catch the writeback error using fsync(2).
2346                 * See filemap_fdatawait_keep_errors() for details.
2347                 */
2348                filemap_fdatawait_keep_errors(mapping);
2349
2350                cond_resched();
2351
2352                iput(inode);
2353
2354                rcu_read_lock();
2355                spin_lock_irq(&sb->s_inode_wblist_lock);
2356        }
2357        spin_unlock_irq(&sb->s_inode_wblist_lock);
2358        rcu_read_unlock();
2359        mutex_unlock(&sb->s_sync_lock);
2360}
2361
2362static void __writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr,
2363                                     enum wb_reason reason, bool skip_if_busy)
2364{
2365        DEFINE_WB_COMPLETION_ONSTACK(done);
2366        struct wb_writeback_work work = {
2367                .sb                     = sb,
2368                .sync_mode              = WB_SYNC_NONE,
2369                .tagged_writepages      = 1,
2370                .done                   = &done,
2371                .nr_pages               = nr,
2372                .reason                 = reason,
2373        };
2374        struct backing_dev_info *bdi = sb->s_bdi;
2375
2376        if (!bdi_has_dirty_io(bdi) || bdi == &noop_backing_dev_info)
2377                return;
2378        WARN_ON(!rwsem_is_locked(&sb->s_umount));
2379
2380        bdi_split_work_to_wbs(sb->s_bdi, &work, skip_if_busy);
2381        wb_wait_for_completion(bdi, &done);
2382}
2383
2384/**
2385 * writeback_inodes_sb_nr -     writeback dirty inodes from given super_block
2386 * @sb: the superblock
2387 * @nr: the number of pages to write
2388 * @reason: reason why some writeback work was initiated
2389 *
2390 * Start writeback on some inodes on this super_block. No guarantees are made
2391 * on how many (if any) will be written, and this function does not wait
2392 * for IO completion of submitted IO.
2393 */
2394void writeback_inodes_sb_nr(struct super_block *sb,
2395                            unsigned long nr,
2396                            enum wb_reason reason)
2397{
2398        __writeback_inodes_sb_nr(sb, nr, reason, false);
2399}
2400EXPORT_SYMBOL(writeback_inodes_sb_nr);
2401
2402/**
2403 * writeback_inodes_sb  -       writeback dirty inodes from given super_block
2404 * @sb: the superblock
2405 * @reason: reason why some writeback work was initiated
2406 *
2407 * Start writeback on some inodes on this super_block. No guarantees are made
2408 * on how many (if any) will be written, and this function does not wait
2409 * for IO completion of submitted IO.
2410 */
2411void writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
2412{
2413        return writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason);
2414}
2415EXPORT_SYMBOL(writeback_inodes_sb);
2416
2417/**
2418 * try_to_writeback_inodes_sb - try to start writeback if none underway
2419 * @sb: the superblock
2420 * @reason: reason why some writeback work was initiated
2421 *
2422 * Invoke __writeback_inodes_sb_nr if no writeback is currently underway.
2423 */
2424void try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
2425{
2426        if (!down_read_trylock(&sb->s_umount))
2427                return;
2428
2429        __writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason, true);
2430        up_read(&sb->s_umount);
2431}
2432EXPORT_SYMBOL(try_to_writeback_inodes_sb);
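
/*
 * Usage sketch (the callers live in individual filesystems): this is the
 * best-effort variant that refuses to block on s_umount, used for example
 * when a filesystem is about to return ENOSPC and first wants its dirty
 * delalloc data pushed out:
 *
 *        if (low_on_space(sbi))          low_on_space() is hypothetical
 *                try_to_writeback_inodes_sb(sb, WB_REASON_FS_FREE_SPACE);
 */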
2433
2434/**
2435 * sync_inodes_sb       -       sync sb inode pages
2436 * @sb: the superblock
2437 *
2438 * This function writes and waits on any dirty inode belonging to this
2439 * super_block.
2440 */
2441void sync_inodes_sb(struct super_block *sb)
2442{
2443        DEFINE_WB_COMPLETION_ONSTACK(done);
2444        struct wb_writeback_work work = {
2445                .sb             = sb,
2446                .sync_mode      = WB_SYNC_ALL,
2447                .nr_pages       = LONG_MAX,
2448                .range_cyclic   = 0,
2449                .done           = &done,
2450                .reason         = WB_REASON_SYNC,
2451                .for_sync       = 1,
2452        };
2453        struct backing_dev_info *bdi = sb->s_bdi;
2454
2455        /*
2456         * Can't skip on !bdi_has_dirty() because we should wait for !dirty
2457         * inodes that are still under writeback, and because I_DIRTY_TIME
2458         * inodes ignored by bdi_has_dirty() need to be written out too.
2459         */
2460        if (bdi == &noop_backing_dev_info)
2461                return;
2462        WARN_ON(!rwsem_is_locked(&sb->s_umount));
2463
2464        /* protect against inode wb switch, see inode_switch_wbs_work_fn() */
2465        bdi_down_write_wb_switch_rwsem(bdi);
2466        bdi_split_work_to_wbs(bdi, &work, false);
2467        wb_wait_for_completion(bdi, &done);
2468        bdi_up_write_wb_switch_rwsem(bdi);
2469
2470        wait_sb_inodes(sb);
2471}
2472EXPORT_SYMBOL(sync_inodes_sb);
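
/*
 * For context (a sketch of the caller in fs/sync.c, not code in this file):
 * the sync(2)/syncfs(2) path pairs this with the filesystem's own metadata
 * flush, roughly:
 *
 *        sync_inodes_sb(sb);
 *        if (sb->s_op->sync_fs)
 *                sb->s_op->sync_fs(sb, 1);
 */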
2473
2474/**
2475 * write_inode_now      -       write an inode to disk
2476 * @inode: inode to write to disk
2477 * @sync: whether the write should be synchronous or not
2478 *
2479 * This function commits an inode to disk immediately if it is dirty. This is
2480 * primarily needed by knfsd.
2481 *
2482 * The caller must either have a ref on the inode or must have set I_WILL_FREE.
2483 */
2484int write_inode_now(struct inode *inode, int sync)
2485{
2486        struct writeback_control wbc = {
2487                .nr_to_write = LONG_MAX,
2488                .sync_mode = sync ? WB_SYNC_ALL : WB_SYNC_NONE,
2489                .range_start = 0,
2490                .range_end = LLONG_MAX,
2491        };
2492
2493        if (!mapping_cap_writeback_dirty(inode->i_mapping))
2494                wbc.nr_to_write = 0;
2495
2496        might_sleep();
2497        return writeback_single_inode(inode, &wbc);
2498}
2499EXPORT_SYMBOL(write_inode_now);
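
/*
 * Usage sketch (illustrative only): a caller that must not leave a dirty
 * inode behind, such as knfsd or a filesystem about to evict an inode it
 * still needs on disk, forces a synchronous flush with
 *
 *        err = write_inode_now(inode, 1);        sync != 0 selects WB_SYNC_ALL
 */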
2500
2501/**
2502 * sync_inode - write an inode and its pages to disk.
2503 * @inode: the inode to sync
2504 * @wbc: controls the writeback mode
2505 *
2506 * sync_inode() will write an inode and its pages to disk.  It will also
2507 * correctly update the inode on its superblock's dirty inode lists and will
2508 * update inode->i_state.
2509 *
2510 * The caller must have a ref on the inode.
2511 */
2512int sync_inode(struct inode *inode, struct writeback_control *wbc)
2513{
2514        return writeback_single_inode(inode, wbc);
2515}
2516EXPORT_SYMBOL(sync_inode);
2517
2518/**
2519 * sync_inode_metadata - write an inode to disk
2520 * @inode: the inode to sync
2521 * @wait: wait for I/O to complete.
2522 *
2523 * Write an inode to disk and adjust its dirty state after completion.
2524 *
2525 * Note: only writes the actual inode, no associated data or other metadata.
2526 */
2527int sync_inode_metadata(struct inode *inode, int wait)
2528{
2529        struct writeback_control wbc = {
2530                .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE,
2531                .nr_to_write = 0, /* metadata-only */
2532        };
2533
2534        return sync_inode(inode, &wbc);
2535}
2536EXPORT_SYMBOL(sync_inode_metadata);
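
/*
 * Usage sketch (illustrative only): an fsync implementation that has already
 * flushed and waited on the data range can finish by writing just the inode:
 *
 *        err = filemap_write_and_wait_range(inode->i_mapping, start, end);
 *        if (!err && needs_inode_writeout)       needs_inode_writeout is hypothetical
 *                err = sync_inode_metadata(inode, 1);
 */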
2537