linux/mm/backing-dev.c
#include <linux/wait.h>
#include <linux/backing-dev.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/module.h>
#include <linux/writeback.h>
#include <linux/device.h>
#include <trace/events/writeback.h>

static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);

struct backing_dev_info default_backing_dev_info = {
        .name           = "default",
        .ra_pages       = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE,
        .state          = 0,
        .capabilities   = BDI_CAP_MAP_COPY,
};
EXPORT_SYMBOL_GPL(default_backing_dev_info);

struct backing_dev_info noop_backing_dev_info = {
        .name           = "noop",
        .capabilities   = BDI_CAP_NO_ACCT_AND_WRITEBACK,
};
EXPORT_SYMBOL_GPL(noop_backing_dev_info);

static struct class *bdi_class;

/*
 * bdi_lock protects updates to bdi_list and bdi_pending_list, and also
 * provides reader-side protection for bdi_pending_list. bdi_list itself
 * has RCU reader-side locking.
 */
DEFINE_SPINLOCK(bdi_lock);
LIST_HEAD(bdi_list);
LIST_HEAD(bdi_pending_list);

static struct task_struct *sync_supers_tsk;
static struct timer_list sync_supers_timer;

static int bdi_sync_supers(void *);
static void sync_supers_timer_fn(unsigned long);

#ifdef CONFIG_DEBUG_FS
#include <linux/debugfs.h>
#include <linux/seq_file.h>

static struct dentry *bdi_debug_root;

static void bdi_debug_init(void)
{
        bdi_debug_root = debugfs_create_dir("bdi", NULL);
}

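/*
 * Dump the bdi's writeback counters, dirty thresholds and queue lengths
 * to its debugfs "stats" file.
 */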
static int bdi_debug_stats_show(struct seq_file *m, void *v)
{
        struct backing_dev_info *bdi = m->private;
        struct bdi_writeback *wb = &bdi->wb;
        unsigned long background_thresh;
        unsigned long dirty_thresh;
        unsigned long bdi_thresh;
        unsigned long nr_dirty, nr_io, nr_more_io, nr_wb;
        struct inode *inode;

        nr_wb = nr_dirty = nr_io = nr_more_io = 0;
        spin_lock(&inode_wb_list_lock);
        list_for_each_entry(inode, &wb->b_dirty, i_wb_list)
                nr_dirty++;
        list_for_each_entry(inode, &wb->b_io, i_wb_list)
                nr_io++;
        list_for_each_entry(inode, &wb->b_more_io, i_wb_list)
                nr_more_io++;
        spin_unlock(&inode_wb_list_lock);

        global_dirty_limits(&background_thresh, &dirty_thresh);
        bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);

#define K(x) ((x) << (PAGE_SHIFT - 10))
        seq_printf(m,
                   "BdiWriteback:     %8lu kB\n"
                   "BdiReclaimable:   %8lu kB\n"
                   "BdiDirtyThresh:   %8lu kB\n"
                   "DirtyThresh:      %8lu kB\n"
                   "BackgroundThresh: %8lu kB\n"
                   "b_dirty:          %8lu\n"
                   "b_io:             %8lu\n"
                   "b_more_io:        %8lu\n"
                   "bdi_list:         %8u\n"
                   "state:            %8lx\n",
                   (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)),
                   (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)),
                   K(bdi_thresh), K(dirty_thresh),
                   K(background_thresh), nr_dirty, nr_io, nr_more_io,
                   !list_empty(&bdi->bdi_list), bdi->state);
#undef K

        return 0;
}

static int bdi_debug_stats_open(struct inode *inode, struct file *file)
{
        return single_open(file, bdi_debug_stats_show, inode->i_private);
}

static const struct file_operations bdi_debug_stats_fops = {
        .open           = bdi_debug_stats_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = single_release,
};

static void bdi_debug_register(struct backing_dev_info *bdi, const char *name)
{
        bdi->debug_dir = debugfs_create_dir(name, bdi_debug_root);
        bdi->debug_stats = debugfs_create_file("stats", 0444, bdi->debug_dir,
                                               bdi, &bdi_debug_stats_fops);
}

static void bdi_debug_unregister(struct backing_dev_info *bdi)
{
        debugfs_remove(bdi->debug_stats);
        debugfs_remove(bdi->debug_dir);
}
#else
static inline void bdi_debug_init(void)
{
}
static inline void bdi_debug_register(struct backing_dev_info *bdi,
                                      const char *name)
{
}
static inline void bdi_debug_unregister(struct backing_dev_info *bdi)
{
}
#endif

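/*
 * sysfs store for read_ahead_kb: the value is given in kilobytes and
 * converted to pages before being stored in bdi->ra_pages.
 */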
static ssize_t read_ahead_kb_store(struct device *dev,
                                  struct device_attribute *attr,
                                  const char *buf, size_t count)
{
        struct backing_dev_info *bdi = dev_get_drvdata(dev);
        char *end;
        unsigned long read_ahead_kb;
        ssize_t ret = -EINVAL;

        read_ahead_kb = simple_strtoul(buf, &end, 10);
        if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) {
                bdi->ra_pages = read_ahead_kb >> (PAGE_SHIFT - 10);
                ret = count;
        }
        return ret;
}

#define K(pages) ((pages) << (PAGE_SHIFT - 10))

#define BDI_SHOW(name, expr)                                            \
static ssize_t name##_show(struct device *dev,                          \
                           struct device_attribute *attr, char *page)   \
{                                                                       \
        struct backing_dev_info *bdi = dev_get_drvdata(dev);            \
                                                                        \
        return snprintf(page, PAGE_SIZE-1, "%lld\n", (long long)expr);  \
}

BDI_SHOW(read_ahead_kb, K(bdi->ra_pages))

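/*
 * sysfs store for min_ratio: the minimum percentage of the global dirty
 * threshold reserved for this bdi.
 */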
static ssize_t min_ratio_store(struct device *dev,
                struct device_attribute *attr, const char *buf, size_t count)
{
        struct backing_dev_info *bdi = dev_get_drvdata(dev);
        char *end;
        unsigned int ratio;
        ssize_t ret = -EINVAL;

        ratio = simple_strtoul(buf, &end, 10);
        if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) {
                ret = bdi_set_min_ratio(bdi, ratio);
                if (!ret)
                        ret = count;
        }
        return ret;
}
BDI_SHOW(min_ratio, bdi->min_ratio)

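/*
 * sysfs store for max_ratio: the maximum percentage of the global dirty
 * threshold this bdi is allowed to consume.
 */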
static ssize_t max_ratio_store(struct device *dev,
                struct device_attribute *attr, const char *buf, size_t count)
{
        struct backing_dev_info *bdi = dev_get_drvdata(dev);
        char *end;
        unsigned int ratio;
        ssize_t ret = -EINVAL;

        ratio = simple_strtoul(buf, &end, 10);
        if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) {
                ret = bdi_set_max_ratio(bdi, ratio);
                if (!ret)
                        ret = count;
        }
        return ret;
}
BDI_SHOW(max_ratio, bdi->max_ratio)

#define __ATTR_RW(attr) __ATTR(attr, 0644, attr##_show, attr##_store)

static struct device_attribute bdi_dev_attrs[] = {
        __ATTR_RW(read_ahead_kb),
        __ATTR_RW(min_ratio),
        __ATTR_RW(max_ratio),
        __ATTR_NULL,
};

static __init int bdi_class_init(void)
{
        bdi_class = class_create(THIS_MODULE, "bdi");
        if (IS_ERR(bdi_class))
                return PTR_ERR(bdi_class);

        bdi_class->dev_attrs = bdi_dev_attrs;
        bdi_debug_init();
        return 0;
}
postcore_initcall(bdi_class_init);

static int __init default_bdi_init(void)
{
        int err;

        sync_supers_tsk = kthread_run(bdi_sync_supers, NULL, "sync_supers");
        BUG_ON(IS_ERR(sync_supers_tsk));

        setup_timer(&sync_supers_timer, sync_supers_timer_fn, 0);
        bdi_arm_supers_timer();

        err = bdi_init(&default_backing_dev_info);
        if (!err)
                bdi_register(&default_backing_dev_info, NULL, "default");
        err = bdi_init(&noop_backing_dev_info);

        return err;
}
subsys_initcall(default_bdi_init);

int bdi_has_dirty_io(struct backing_dev_info *bdi)
{
        return wb_has_dirty_io(&bdi->wb);
}

static void bdi_flush_io(struct backing_dev_info *bdi)
{
        struct writeback_control wbc = {
                .sync_mode              = WB_SYNC_NONE,
                .older_than_this        = NULL,
                .range_cyclic           = 1,
                .nr_to_write            = 1024,
        };

        writeback_inodes_wb(&bdi->wb, &wbc);
}

/*
 * kupdated() used to do this. We cannot do it from the bdi_forker_thread()
 * or we risk deadlocking on ->s_umount. The longer term solution would be
 * to implement sync_supers_bdi() or similar and simply do it from each
 * bdi writeback thread individually.
 */
static int bdi_sync_supers(void *unused)
{
        set_user_nice(current, 0);

        while (!kthread_should_stop()) {
                set_current_state(TASK_INTERRUPTIBLE);
                schedule();

                /*
                 * Do this periodically, like kupdated() did before.
                 */
                sync_supers();
        }

        return 0;
}

void bdi_arm_supers_timer(void)
{
        unsigned long next;

        if (!dirty_writeback_interval)
                return;

        next = msecs_to_jiffies(dirty_writeback_interval * 10) + jiffies;
        mod_timer(&sync_supers_timer, round_jiffies_up(next));
}

static void sync_supers_timer_fn(unsigned long unused)
{
        wake_up_process(sync_supers_tsk);
        bdi_arm_supers_timer();
}

static void wakeup_timer_fn(unsigned long data)
{
        struct backing_dev_info *bdi = (struct backing_dev_info *)data;

        spin_lock_bh(&bdi->wb_lock);
        if (bdi->wb.task) {
                trace_writeback_wake_thread(bdi);
                wake_up_process(bdi->wb.task);
        } else {
                /*
                 * When bdi tasks are inactive for a long time, they are
                 * killed. In this case we have to wake up the forker thread
                 * which should create and run the bdi thread.
                 */
                trace_writeback_wake_forker_thread(bdi);
                wake_up_process(default_backing_dev_info.wb.task);
        }
        spin_unlock_bh(&bdi->wb_lock);
}

/*
 * This function is used when the first inode for this bdi is marked dirty. It
 * wakes up the corresponding bdi thread which should then take care of the
 * periodic background write-out of dirty inodes. Since the write-out would
 * start only 'dirty_writeback_interval' centisecs from now anyway, we just
 * set up a timer which wakes the bdi thread up later.
 *
 * Note, we wouldn't bother setting up the timer, but this function is on the
 * fast-path (used by '__mark_inode_dirty()'), so we save a few context
 * switches by delaying the wake-up.
 */
void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi)
{
        unsigned long timeout;

        timeout = msecs_to_jiffies(dirty_writeback_interval * 10);
        mod_timer(&bdi->wb.wakeup_timer, jiffies + timeout);
}

/*
 * Calculate the longest interval (jiffies) bdi threads are allowed to be
 * inactive.
 */
static unsigned long bdi_longest_inactive(void)
{
        unsigned long interval;

        interval = msecs_to_jiffies(dirty_writeback_interval * 10);
        return max(5UL * 60 * HZ, interval);
}

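/*
 * The forker thread runs on behalf of the default bdi. It forks per-bdi
 * flusher threads on demand when a bdi has dirty data or queued work but
 * no thread, and it kills threads that have been idle for longer than
 * bdi_longest_inactive().
 */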
static int bdi_forker_thread(void *ptr)
{
        struct bdi_writeback *me = ptr;

        current->flags |= PF_SWAPWRITE;
        set_freezable();

        /*
         * Our parent may run at a different priority, just set us to normal
         */
        set_user_nice(current, 0);

        for (;;) {
                struct task_struct *task = NULL;
                struct backing_dev_info *bdi;
                enum {
                        NO_ACTION,   /* Nothing to do */
                        FORK_THREAD, /* Fork bdi thread */
                        KILL_THREAD, /* Kill inactive bdi thread */
                } action = NO_ACTION;

                /*
                 * Temporary measure: we want to make sure we don't see
                 * dirty data on the default backing_dev_info
                 */
                if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list)) {
                        del_timer(&me->wakeup_timer);
                        wb_do_writeback(me, 0);
                }

                spin_lock_bh(&bdi_lock);
                set_current_state(TASK_INTERRUPTIBLE);

                list_for_each_entry(bdi, &bdi_list, bdi_list) {
                        bool have_dirty_io;

                        if (!bdi_cap_writeback_dirty(bdi) ||
                             bdi_cap_flush_forker(bdi))
                                continue;

                        WARN(!test_bit(BDI_registered, &bdi->state),
                             "bdi %p/%s is not registered!\n", bdi, bdi->name);

                        have_dirty_io = !list_empty(&bdi->work_list) ||
                                        wb_has_dirty_io(&bdi->wb);

                        /*
                         * If the bdi has work to do, but the thread does not
                         * exist - create it.
                         */
                        if (!bdi->wb.task && have_dirty_io) {
                                /*
                                 * Set the pending bit - if someone tries to
                                 * unregister this bdi, it'll wait on this bit.
                                 */
                                set_bit(BDI_pending, &bdi->state);
                                action = FORK_THREAD;
                                break;
                        }

                        spin_lock(&bdi->wb_lock);

                        /*
                         * If there is no work to do and the bdi thread was
                         * inactive long enough - kill it. The wb_lock is taken
                         * to make sure no-one adds more work to this bdi and
                         * wakes the bdi thread up.
                         */
                        if (bdi->wb.task && !have_dirty_io &&
                            time_after(jiffies, bdi->wb.last_active +
                                                bdi_longest_inactive())) {
                                task = bdi->wb.task;
                                bdi->wb.task = NULL;
                                spin_unlock(&bdi->wb_lock);
                                set_bit(BDI_pending, &bdi->state);
                                action = KILL_THREAD;
                                break;
                        }
                        spin_unlock(&bdi->wb_lock);
                }
                spin_unlock_bh(&bdi_lock);

                /* Keep working if default bdi still has things to do */
                if (!list_empty(&me->bdi->work_list))
                        __set_current_state(TASK_RUNNING);

                switch (action) {
                case FORK_THREAD:
                        __set_current_state(TASK_RUNNING);
                        task = kthread_create(bdi_writeback_thread, &bdi->wb,
                                              "flush-%s", dev_name(bdi->dev));
                        if (IS_ERR(task)) {
                                /*
                                 * If thread creation fails, force writeout of
                                 * the bdi from the forker thread.
                                 */
                                bdi_flush_io(bdi);
                        } else {
                                /*
                                 * The spinlock makes sure we do not lose
                                 * wake-ups when racing with 'bdi_queue_work()'.
                                 * And as soon as the bdi thread is visible, we
                                 * can start it.
                                 */
                                spin_lock_bh(&bdi->wb_lock);
                                bdi->wb.task = task;
                                spin_unlock_bh(&bdi->wb_lock);
                                wake_up_process(task);
                        }
                        break;

                case KILL_THREAD:
                        __set_current_state(TASK_RUNNING);
                        kthread_stop(task);
                        break;

                case NO_ACTION:
                        if (!wb_has_dirty_io(me) || !dirty_writeback_interval)
                                /*
                                 * There is no dirty data. The only thing we
                                 * should now care about is checking for
                                 * inactive bdi threads and killing them. Thus,
                                 * let's sleep for a longer time, save energy
                                 * and be friendly to battery-powered devices.
                                 */
                                schedule_timeout(bdi_longest_inactive());
                        else
                                schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
                        try_to_freeze();
                        /* Back to the main loop */
                        continue;
                }

                /*
                 * Clear pending bit and wakeup anybody waiting to tear us down.
                 */
                clear_bit(BDI_pending, &bdi->state);
                smp_mb__after_clear_bit();
                wake_up_bit(&bdi->state, BDI_pending);
        }

        return 0;
}

/*
 * Remove bdi from bdi_list, and ensure that it is no longer visible
 */
static void bdi_remove_from_list(struct backing_dev_info *bdi)
{
        spin_lock_bh(&bdi_lock);
        list_del_rcu(&bdi->bdi_list);
        spin_unlock_bh(&bdi_lock);

        synchronize_rcu();
}

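/*
 * Register a bdi: create its device in the "bdi" class, start the forker
 * thread if this is the flush-forker (default) bdi, hook up debugfs and
 * make the bdi visible on bdi_list.
 */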
int bdi_register(struct backing_dev_info *bdi, struct device *parent,
                const char *fmt, ...)
{
        va_list args;
        struct device *dev;

        if (bdi->dev)   /* The driver needs to use separate queues per device */
                return 0;

        va_start(args, fmt);
        dev = device_create_vargs(bdi_class, parent, MKDEV(0, 0), bdi, fmt, args);
        va_end(args);
        if (IS_ERR(dev))
                return PTR_ERR(dev);

        bdi->dev = dev;

        /*
         * Just start the forker thread for our default backing_dev_info,
         * and add other bdi's to the list. They will get a thread created
         * on-demand when they need it.
         */
        if (bdi_cap_flush_forker(bdi)) {
                struct bdi_writeback *wb = &bdi->wb;

                wb->task = kthread_run(bdi_forker_thread, wb, "bdi-%s",
                                                dev_name(dev));
                if (IS_ERR(wb->task))
                        return PTR_ERR(wb->task);
        }

        bdi_debug_register(bdi, dev_name(dev));
        set_bit(BDI_registered, &bdi->state);

        spin_lock_bh(&bdi_lock);
        list_add_tail_rcu(&bdi->bdi_list, &bdi_list);
        spin_unlock_bh(&bdi_lock);

        trace_writeback_bdi_register(bdi);
        return 0;
}
EXPORT_SYMBOL(bdi_register);

int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev)
{
        return bdi_register(bdi, NULL, "%u:%u", MAJOR(dev), MINOR(dev));
}
EXPORT_SYMBOL(bdi_register_dev);

/*
 * Remove bdi from the global list and shut down any threads we have running
 */
static void bdi_wb_shutdown(struct backing_dev_info *bdi)
{
        if (!bdi_cap_writeback_dirty(bdi))
                return;

        /*
         * Make sure nobody finds us on the bdi_list anymore
         */
        bdi_remove_from_list(bdi);

        /*
         * If setup is pending, wait for that to complete first
         */
        wait_on_bit(&bdi->state, BDI_pending, bdi_sched_wait,
                        TASK_UNINTERRUPTIBLE);

        /*
         * Finally, kill the kernel thread. We don't need to be RCU
         * safe anymore, since the bdi is gone from visibility. Force
         * unfreeze of the thread before calling kthread_stop(), otherwise
         * it would never exit if it is currently stuck in the refrigerator.
         */
        if (bdi->wb.task) {
                thaw_process(bdi->wb.task);
                kthread_stop(bdi->wb.task);
        }
}

/*
 * This bdi is going away now, make sure that no super_blocks point to it
 */
static void bdi_prune_sb(struct backing_dev_info *bdi)
{
        struct super_block *sb;

        spin_lock(&sb_lock);
        list_for_each_entry(sb, &super_blocks, s_list) {
                if (sb->s_bdi == bdi)
                        sb->s_bdi = &default_backing_dev_info;
        }
        spin_unlock(&sb_lock);
}

void bdi_unregister(struct backing_dev_info *bdi)
{
        if (bdi->dev) {
                trace_writeback_bdi_unregister(bdi);
                bdi_prune_sb(bdi);
                del_timer_sync(&bdi->wb.wakeup_timer);

                if (!bdi_cap_flush_forker(bdi))
                        bdi_wb_shutdown(bdi);
                bdi_debug_unregister(bdi);
                device_unregister(bdi->dev);
                bdi->dev = NULL;
        }
}
EXPORT_SYMBOL(bdi_unregister);

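/*
 * Initialize the bdi_writeback structure embedded in a backing_dev_info.
 */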
static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
{
        memset(wb, 0, sizeof(*wb));

        wb->bdi = bdi;
        wb->last_old_flush = jiffies;
        INIT_LIST_HEAD(&wb->b_dirty);
        INIT_LIST_HEAD(&wb->b_io);
        INIT_LIST_HEAD(&wb->b_more_io);
        setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi);
}

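/*
 * Initialize a backing_dev_info before it is registered: dirty ratios,
 * lists, the embedded writeback state, the per-CPU bdi_stat counters and
 * the completion proportions.
 */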
int bdi_init(struct backing_dev_info *bdi)
{
        int i, err;

        bdi->dev = NULL;

        bdi->min_ratio = 0;
        bdi->max_ratio = 100;
        bdi->max_prop_frac = PROP_FRAC_BASE;
        spin_lock_init(&bdi->wb_lock);
        INIT_LIST_HEAD(&bdi->bdi_list);
        INIT_LIST_HEAD(&bdi->work_list);

        bdi_wb_init(&bdi->wb, bdi);

        for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
                err = percpu_counter_init(&bdi->bdi_stat[i], 0);
                if (err)
                        goto err;
        }

        bdi->dirty_exceeded = 0;
        err = prop_local_init_percpu(&bdi->completions);

        if (err) {
err:
                while (i--)
                        percpu_counter_destroy(&bdi->bdi_stat[i]);
        }

        return err;
}
EXPORT_SYMBOL(bdi_init);

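/*
 * Tear down a bdi: move any remaining dirty inodes over to the default
 * bdi, unregister the device and free the per-CPU state.
 */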
void bdi_destroy(struct backing_dev_info *bdi)
{
        int i;

        /*
         * Splice our entries to the default_backing_dev_info, if this
         * bdi disappears
         */
        if (bdi_has_dirty_io(bdi)) {
                struct bdi_writeback *dst = &default_backing_dev_info.wb;

                spin_lock(&inode_wb_list_lock);
                list_splice(&bdi->wb.b_dirty, &dst->b_dirty);
                list_splice(&bdi->wb.b_io, &dst->b_io);
                list_splice(&bdi->wb.b_more_io, &dst->b_more_io);
                spin_unlock(&inode_wb_list_lock);
        }

        bdi_unregister(bdi);

        for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
                percpu_counter_destroy(&bdi->bdi_stat[i]);

        prop_local_destroy_percpu(&bdi->completions);
}
EXPORT_SYMBOL(bdi_destroy);

/*
 * For use from filesystems to quickly init and register a bdi associated
 * with dirty writeback
 */
int bdi_setup_and_register(struct backing_dev_info *bdi, char *name,
                           unsigned int cap)
{
        char tmp[32];
        int err;

        bdi->name = name;
        bdi->capabilities = cap;
        err = bdi_init(bdi);
        if (err)
                return err;

        sprintf(tmp, "%.28s%s", name, "-%d");
        err = bdi_register(bdi, NULL, tmp, atomic_long_inc_return(&bdi_seq));
        if (err) {
                bdi_destroy(bdi);
                return err;
        }

        return 0;
}
EXPORT_SYMBOL(bdi_setup_and_register);
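
/*
 * Illustrative usage sketch (not part of this file): a filesystem that
 * embeds a backing_dev_info in its per-superblock info could call this
 * from its mount/fill_super path roughly as follows; 'sbi' and "myfs"
 * are made-up names:
 *
 *      err = bdi_setup_and_register(&sbi->bdi, "myfs", BDI_CAP_MAP_COPY);
 *      if (err)
 *              return err;
 *      sb->s_bdi = &sbi->bdi;
 *
 * bdi_destroy() would then be called on the same bdi when the superblock
 * is torn down.
 */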

static wait_queue_head_t congestion_wqh[2] = {
                __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
                __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
        };
static atomic_t nr_bdi_congested[2];

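/*
 * Clear the sync or async congestion bit for this bdi, drop the global
 * congested count and wake anybody sleeping in congestion_wait().
 */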
void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
{
        enum bdi_state bit;
        wait_queue_head_t *wqh = &congestion_wqh[sync];

        bit = sync ? BDI_sync_congested : BDI_async_congested;
        if (test_and_clear_bit(bit, &bdi->state))
                atomic_dec(&nr_bdi_congested[sync]);
        smp_mb__after_clear_bit();
        if (waitqueue_active(wqh))
                wake_up(wqh);
}
EXPORT_SYMBOL(clear_bdi_congested);

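/*
 * Mark this bdi congested for the given direction and bump the global
 * congested count.
 */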
void set_bdi_congested(struct backing_dev_info *bdi, int sync)
{
        enum bdi_state bit;

        bit = sync ? BDI_sync_congested : BDI_async_congested;
        if (!test_and_set_bit(bit, &bdi->state))
                atomic_inc(&nr_bdi_congested[sync]);
}
EXPORT_SYMBOL(set_bdi_congested);

/**
 * congestion_wait - wait for a backing_dev to become uncongested
 * @sync: SYNC or ASYNC IO
 * @timeout: timeout in jiffies
 *
 * Waits for up to @timeout jiffies for a backing_dev (any backing_dev) to exit
 * write congestion.  If no backing_devs are congested then just wait for the
 * next write to be completed.
 */
long congestion_wait(int sync, long timeout)
{
        long ret;
        unsigned long start = jiffies;
        DEFINE_WAIT(wait);
        wait_queue_head_t *wqh = &congestion_wqh[sync];

        prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
        ret = io_schedule_timeout(timeout);
        finish_wait(wqh, &wait);

        trace_writeback_congestion_wait(jiffies_to_usecs(timeout),
                                        jiffies_to_usecs(jiffies - start));

        return ret;
}
EXPORT_SYMBOL(congestion_wait);

/**
 * wait_iff_congested - Conditionally wait for a backing_dev to become uncongested or a zone to complete writes
 * @zone: A zone to check if it is heavily congested
 * @sync: SYNC or ASYNC IO
 * @timeout: timeout in jiffies
 *
 * If a backing_dev (any backing_dev) is congested and the given @zone has
 * experienced recent congestion, this waits for up to @timeout jiffies for
 * either a BDI to exit congestion of the given @sync queue or a write to
 * complete.
 *
 * In the absence of zone congestion, cond_resched() is called to yield
 * the processor if necessary, but the function otherwise does not sleep.
 *
 * The return value is 0 if the sleep is for the full timeout. Otherwise,
 * it is the number of jiffies that were still remaining when the function
 * returned. return_value == timeout implies the function did not sleep.
 */
long wait_iff_congested(struct zone *zone, int sync, long timeout)
{
        long ret;
        unsigned long start = jiffies;
        DEFINE_WAIT(wait);
        wait_queue_head_t *wqh = &congestion_wqh[sync];

        /*
         * If there is no congestion, or heavy congestion is not being
         * encountered in the current zone, yield if necessary instead
         * of sleeping on the congestion queue
         */
        if (atomic_read(&nr_bdi_congested[sync]) == 0 ||
                        !zone_is_reclaim_congested(zone)) {
                cond_resched();

                /* In case we scheduled, work out time remaining */
                ret = timeout - (jiffies - start);
                if (ret < 0)
                        ret = 0;

                goto out;
        }

        /* Sleep until uncongested or a write happens */
        prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
        ret = io_schedule_timeout(timeout);
        finish_wait(wqh, &wait);

out:
        trace_writeback_wait_iff_congested(jiffies_to_usecs(timeout),
                                        jiffies_to_usecs(jiffies - start));

        return ret;
}
EXPORT_SYMBOL(wait_iff_congested);