linux/mm/backing-dev.c

#include <linux/wait.h>
#include <linux/backing-dev.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/module.h>
#include <linux/writeback.h>
#include <linux/device.h>
#include <trace/events/writeback.h>

static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);

void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
{
}
EXPORT_SYMBOL(default_unplug_io_fn);

struct backing_dev_info default_backing_dev_info = {
        .name           = "default",
        .ra_pages       = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE,
        .state          = 0,
        .capabilities   = BDI_CAP_MAP_COPY,
        .unplug_io_fn   = default_unplug_io_fn,
};
EXPORT_SYMBOL_GPL(default_backing_dev_info);

struct backing_dev_info noop_backing_dev_info = {
        .name           = "noop",
        .capabilities   = BDI_CAP_NO_ACCT_AND_WRITEBACK,
};
EXPORT_SYMBOL_GPL(noop_backing_dev_info);

static struct class *bdi_class;

/*
 * bdi_lock protects updates to bdi_list and bdi_pending_list, and also
 * provides reader-side protection for bdi_pending_list. bdi_list itself
 * uses RCU for reader-side locking.
 */
DEFINE_SPINLOCK(bdi_lock);
LIST_HEAD(bdi_list);
LIST_HEAD(bdi_pending_list);
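
/*
 * Illustrative sketch (not part of this file): readers traverse bdi_list
 * under RCU while writers hold bdi_lock and use the _rcu list helpers.
 * A reader would look roughly like this, where do_something() is just a
 * placeholder:
 *
 *        rcu_read_lock();
 *        list_for_each_entry_rcu(bdi, &bdi_list, bdi_list)
 *                do_something(bdi);
 *        rcu_read_unlock();
 */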

static struct task_struct *sync_supers_tsk;
static struct timer_list sync_supers_timer;

static int bdi_sync_supers(void *);
static void sync_supers_timer_fn(unsigned long);

#ifdef CONFIG_DEBUG_FS
#include <linux/debugfs.h>
#include <linux/seq_file.h>

static struct dentry *bdi_debug_root;

static void bdi_debug_init(void)
{
        bdi_debug_root = debugfs_create_dir("bdi", NULL);
}

static int bdi_debug_stats_show(struct seq_file *m, void *v)
{
        struct backing_dev_info *bdi = m->private;
        struct bdi_writeback *wb = &bdi->wb;
        unsigned long background_thresh;
        unsigned long dirty_thresh;
        unsigned long bdi_thresh;
        unsigned long nr_dirty, nr_io, nr_more_io, nr_wb;
        struct inode *inode;

        nr_wb = nr_dirty = nr_io = nr_more_io = 0;
        spin_lock(&inode_lock);
        list_for_each_entry(inode, &wb->b_dirty, i_wb_list)
                nr_dirty++;
        list_for_each_entry(inode, &wb->b_io, i_wb_list)
                nr_io++;
        list_for_each_entry(inode, &wb->b_more_io, i_wb_list)
                nr_more_io++;
        spin_unlock(&inode_lock);

        global_dirty_limits(&background_thresh, &dirty_thresh);
        bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);

#define K(x) ((x) << (PAGE_SHIFT - 10))
        seq_printf(m,
                   "BdiWriteback:     %8lu kB\n"
                   "BdiReclaimable:   %8lu kB\n"
                   "BdiDirtyThresh:   %8lu kB\n"
                   "DirtyThresh:      %8lu kB\n"
                   "BackgroundThresh: %8lu kB\n"
                   "b_dirty:          %8lu\n"
                   "b_io:             %8lu\n"
                   "b_more_io:        %8lu\n"
                   "bdi_list:         %8u\n"
                   "state:            %8lx\n",
                   (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)),
                   (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)),
                   K(bdi_thresh), K(dirty_thresh),
                   K(background_thresh), nr_dirty, nr_io, nr_more_io,
                   !list_empty(&bdi->bdi_list), bdi->state);
#undef K

        return 0;
}

static int bdi_debug_stats_open(struct inode *inode, struct file *file)
{
        return single_open(file, bdi_debug_stats_show, inode->i_private);
}

static const struct file_operations bdi_debug_stats_fops = {
        .open           = bdi_debug_stats_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = single_release,
};

static void bdi_debug_register(struct backing_dev_info *bdi, const char *name)
{
        bdi->debug_dir = debugfs_create_dir(name, bdi_debug_root);
        bdi->debug_stats = debugfs_create_file("stats", 0444, bdi->debug_dir,
                                               bdi, &bdi_debug_stats_fops);
}

static void bdi_debug_unregister(struct backing_dev_info *bdi)
{
        debugfs_remove(bdi->debug_stats);
        debugfs_remove(bdi->debug_dir);
}
#else
static inline void bdi_debug_init(void)
{
}
static inline void bdi_debug_register(struct backing_dev_info *bdi,
                                      const char *name)
{
}
static inline void bdi_debug_unregister(struct backing_dev_info *bdi)
{
}
#endif

static ssize_t read_ahead_kb_store(struct device *dev,
                                  struct device_attribute *attr,
                                  const char *buf, size_t count)
{
        struct backing_dev_info *bdi = dev_get_drvdata(dev);
        char *end;
        unsigned long read_ahead_kb;
        ssize_t ret = -EINVAL;

        read_ahead_kb = simple_strtoul(buf, &end, 10);
        if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) {
                bdi->ra_pages = read_ahead_kb >> (PAGE_SHIFT - 10);
                ret = count;
        }
        return ret;
}
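
/*
 * Worked example (illustrative, assuming 4 KiB pages so PAGE_SHIFT == 12):
 * writing "512" to read_ahead_kb stores 512 >> (12 - 10) == 128 pages in
 * bdi->ra_pages, and the _show side converts back with 128 << 2 == 512 kB.
 */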

#define K(pages) ((pages) << (PAGE_SHIFT - 10))

#define BDI_SHOW(name, expr)                                            \
static ssize_t name##_show(struct device *dev,                          \
                           struct device_attribute *attr, char *page)   \
{                                                                       \
        struct backing_dev_info *bdi = dev_get_drvdata(dev);            \
                                                                        \
        return snprintf(page, PAGE_SIZE-1, "%lld\n", (long long)expr);  \
}

BDI_SHOW(read_ahead_kb, K(bdi->ra_pages))
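
/*
 * For reference, the BDI_SHOW() use above expands (roughly; this is an
 * illustrative expansion rather than literal preprocessor output) to:
 *
 *        static ssize_t read_ahead_kb_show(struct device *dev,
 *                                          struct device_attribute *attr,
 *                                          char *page)
 *        {
 *                struct backing_dev_info *bdi = dev_get_drvdata(dev);
 *
 *                return snprintf(page, PAGE_SIZE-1, "%lld\n",
 *                                (long long)K(bdi->ra_pages));
 *        }
 */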

static ssize_t min_ratio_store(struct device *dev,
                struct device_attribute *attr, const char *buf, size_t count)
{
        struct backing_dev_info *bdi = dev_get_drvdata(dev);
        char *end;
        unsigned int ratio;
        ssize_t ret = -EINVAL;

        ratio = simple_strtoul(buf, &end, 10);
        if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) {
                ret = bdi_set_min_ratio(bdi, ratio);
                if (!ret)
                        ret = count;
        }
        return ret;
}
BDI_SHOW(min_ratio, bdi->min_ratio)

static ssize_t max_ratio_store(struct device *dev,
                struct device_attribute *attr, const char *buf, size_t count)
{
        struct backing_dev_info *bdi = dev_get_drvdata(dev);
        char *end;
        unsigned int ratio;
        ssize_t ret = -EINVAL;

        ratio = simple_strtoul(buf, &end, 10);
        if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) {
                ret = bdi_set_max_ratio(bdi, ratio);
                if (!ret)
                        ret = count;
        }
        return ret;
}
BDI_SHOW(max_ratio, bdi->max_ratio)

#define __ATTR_RW(attr) __ATTR(attr, 0644, attr##_show, attr##_store)

static struct device_attribute bdi_dev_attrs[] = {
        __ATTR_RW(read_ahead_kb),
        __ATTR_RW(min_ratio),
        __ATTR_RW(max_ratio),
        __ATTR_NULL,
};
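
/*
 * These attributes appear under /sys/class/bdi/<name>/.  Illustrative use
 * from userspace (the "8:0" device name is just an example):
 *
 *        $ cat /sys/class/bdi/8:0/read_ahead_kb
 *        $ echo 20 > /sys/class/bdi/8:0/max_ratio
 */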

static __init int bdi_class_init(void)
{
        bdi_class = class_create(THIS_MODULE, "bdi");
        if (IS_ERR(bdi_class))
                return PTR_ERR(bdi_class);

        bdi_class->dev_attrs = bdi_dev_attrs;
        bdi_debug_init();
        return 0;
}
postcore_initcall(bdi_class_init);

static int __init default_bdi_init(void)
{
        int err;

        sync_supers_tsk = kthread_run(bdi_sync_supers, NULL, "sync_supers");
        BUG_ON(IS_ERR(sync_supers_tsk));

        setup_timer(&sync_supers_timer, sync_supers_timer_fn, 0);
        bdi_arm_supers_timer();

        err = bdi_init(&default_backing_dev_info);
        if (!err)
                bdi_register(&default_backing_dev_info, NULL, "default");
        err = bdi_init(&noop_backing_dev_info);

        return err;
}
subsys_initcall(default_bdi_init);

int bdi_has_dirty_io(struct backing_dev_info *bdi)
{
        return wb_has_dirty_io(&bdi->wb);
}

static void bdi_flush_io(struct backing_dev_info *bdi)
{
        struct writeback_control wbc = {
                .sync_mode              = WB_SYNC_NONE,
                .older_than_this        = NULL,
                .range_cyclic           = 1,
                .nr_to_write            = 1024,
        };

        writeback_inodes_wb(&bdi->wb, &wbc);
}

/*
 * kupdated() used to do this. We cannot do it from the bdi_forker_thread()
 * or we risk deadlocking on ->s_umount. The longer-term solution would be
 * to implement sync_supers_bdi() or similar and simply do it from each
 * bdi writeback thread individually.
 */
static int bdi_sync_supers(void *unused)
{
        set_user_nice(current, 0);

        while (!kthread_should_stop()) {
                set_current_state(TASK_INTERRUPTIBLE);
                schedule();

                /*
                 * Do this periodically, like kupdated() did before.
                 */
                sync_supers();
        }

        return 0;
}

void bdi_arm_supers_timer(void)
{
        unsigned long next;

        if (!dirty_writeback_interval)
                return;

        next = msecs_to_jiffies(dirty_writeback_interval * 10) + jiffies;
        mod_timer(&sync_supers_timer, round_jiffies_up(next));
}
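
/*
 * Worked example (illustrative): dirty_writeback_interval is in centisecs,
 * so with the usual default of 500 the timer is armed about
 * msecs_to_jiffies(500 * 10) = 5 seconds into the future, rounded up to a
 * whole second boundary by round_jiffies_up().
 */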

static void sync_supers_timer_fn(unsigned long unused)
{
        wake_up_process(sync_supers_tsk);
        bdi_arm_supers_timer();
}

static void wakeup_timer_fn(unsigned long data)
{
        struct backing_dev_info *bdi = (struct backing_dev_info *)data;

        spin_lock_bh(&bdi->wb_lock);
        if (bdi->wb.task) {
                trace_writeback_wake_thread(bdi);
                wake_up_process(bdi->wb.task);
        } else {
                /*
                 * When bdi threads are inactive for a long time, they are
                 * killed. In this case we have to wake up the forker thread,
                 * which should then create and run the bdi thread.
                 */
                trace_writeback_wake_forker_thread(bdi);
                wake_up_process(default_backing_dev_info.wb.task);
        }
        spin_unlock_bh(&bdi->wb_lock);
}

/*
 * This function is used when the first inode for this bdi is marked dirty. It
 * wakes up the corresponding bdi thread, which should then take care of the
 * periodic background write-out of dirty inodes. Since the write-out would
 * start only 'dirty_writeback_interval' centisecs from now anyway, we just
 * set up a timer which wakes the bdi thread up later.
 *
 * Note, we would not bother with a timer and would wake the thread directly,
 * but this function is on the fast path (used by '__mark_inode_dirty()'), so
 * we save a few context switches by delaying the wake-up.
 */
void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi)
{
        unsigned long timeout;

        timeout = msecs_to_jiffies(dirty_writeback_interval * 10);
        mod_timer(&bdi->wb.wakeup_timer, jiffies + timeout);
}

/*
 * Calculate the longest interval (jiffies) bdi threads are allowed to be
 * inactive.
 */
static unsigned long bdi_longest_inactive(void)
{
        unsigned long interval;

        interval = msecs_to_jiffies(dirty_writeback_interval * 10);
        return max(5UL * 60 * HZ, interval);
}
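
/*
 * Worked example (illustrative): with the usual default of
 * dirty_writeback_interval == 500 centisecs the interval above is only
 * 5 seconds worth of jiffies, so the max() picks the 5-minute floor and an
 * idle bdi thread is killed only after at least five minutes of inactivity.
 */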

static int bdi_forker_thread(void *ptr)
{
        struct bdi_writeback *me = ptr;

        current->flags |= PF_SWAPWRITE;
        set_freezable();

        /*
         * Our parent may run at a different priority, just set us to normal
         */
        set_user_nice(current, 0);

        for (;;) {
                struct task_struct *task = NULL;
                struct backing_dev_info *bdi;
                enum {
                        NO_ACTION,   /* Nothing to do */
                        FORK_THREAD, /* Fork bdi thread */
                        KILL_THREAD, /* Kill inactive bdi thread */
                } action = NO_ACTION;

                /*
                 * Temporary measure, we want to make sure we don't see
                 * dirty data on the default backing_dev_info
                 */
                if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list)) {
                        del_timer(&me->wakeup_timer);
                        wb_do_writeback(me, 0);
                }

                spin_lock_bh(&bdi_lock);
                set_current_state(TASK_INTERRUPTIBLE);

                list_for_each_entry(bdi, &bdi_list, bdi_list) {
                        bool have_dirty_io;

                        if (!bdi_cap_writeback_dirty(bdi) ||
                             bdi_cap_flush_forker(bdi))
                                continue;

                        WARN(!test_bit(BDI_registered, &bdi->state),
                             "bdi %p/%s is not registered!\n", bdi, bdi->name);

                        have_dirty_io = !list_empty(&bdi->work_list) ||
                                        wb_has_dirty_io(&bdi->wb);

                        /*
                         * If the bdi has work to do, but the thread does not
                         * exist - create it.
                         */
                        if (!bdi->wb.task && have_dirty_io) {
                                /*
                                 * Set the pending bit - if someone tries to
                                 * unregister this bdi, it'll wait on this bit.
                                 */
                                set_bit(BDI_pending, &bdi->state);
                                action = FORK_THREAD;
                                break;
                        }

                        spin_lock(&bdi->wb_lock);

                        /*
                         * If there is no work to do and the bdi thread was
                         * inactive long enough - kill it. The wb_lock is taken
                         * to make sure no-one adds more work to this bdi and
                         * wakes the bdi thread up.
                         */
                        if (bdi->wb.task && !have_dirty_io &&
                            time_after(jiffies, bdi->wb.last_active +
                                                bdi_longest_inactive())) {
                                task = bdi->wb.task;
                                bdi->wb.task = NULL;
                                spin_unlock(&bdi->wb_lock);
                                set_bit(BDI_pending, &bdi->state);
                                action = KILL_THREAD;
                                break;
                        }
                        spin_unlock(&bdi->wb_lock);
                }
                spin_unlock_bh(&bdi_lock);

                /* Keep working if default bdi still has things to do */
                if (!list_empty(&me->bdi->work_list))
                        __set_current_state(TASK_RUNNING);

                switch (action) {
                case FORK_THREAD:
                        __set_current_state(TASK_RUNNING);
                        task = kthread_create(bdi_writeback_thread, &bdi->wb,
                                              "flush-%s", dev_name(bdi->dev));
                        if (IS_ERR(task)) {
                                /*
                                 * If thread creation fails, force writeout of
                                 * the bdi from the forker thread itself.
                                 */
                                bdi_flush_io(bdi);
                        } else {
                                /*
                                 * The spinlock makes sure we do not lose
                                 * wake-ups when racing with 'bdi_queue_work()'.
                                 * And as soon as the bdi thread is visible, we
                                 * can start it.
                                 */
                                spin_lock_bh(&bdi->wb_lock);
                                bdi->wb.task = task;
                                spin_unlock_bh(&bdi->wb_lock);
                                wake_up_process(task);
                        }
                        break;

                case KILL_THREAD:
                        __set_current_state(TASK_RUNNING);
                        kthread_stop(task);
                        break;

                case NO_ACTION:
                        if (!wb_has_dirty_io(me) || !dirty_writeback_interval)
                                /*
                                 * There is no dirty data. The only thing we
                                 * should now care about is checking for
                                 * inactive bdi threads and killing them. Thus,
                                 * let's sleep for a longer time, save energy
                                 * and be friendly to battery-powered devices.
                                 */
                                schedule_timeout(bdi_longest_inactive());
                        else
                                schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
                        try_to_freeze();
                        /* Back to the main loop */
                        continue;
                }

                /*
                 * Clear the pending bit and wake up anybody waiting to tear
                 * us down.
                 */
                clear_bit(BDI_pending, &bdi->state);
                smp_mb__after_clear_bit();
                wake_up_bit(&bdi->state, BDI_pending);
        }

        return 0;
}

/*
 * Remove bdi from bdi_list, and ensure that it is no longer visible
 */
static void bdi_remove_from_list(struct backing_dev_info *bdi)
{
        spin_lock_bh(&bdi_lock);
        list_del_rcu(&bdi->bdi_list);
        spin_unlock_bh(&bdi_lock);

        synchronize_rcu();
}

int bdi_register(struct backing_dev_info *bdi, struct device *parent,
                const char *fmt, ...)
{
        va_list args;
        struct device *dev;

        if (bdi->dev)   /* The driver needs to use separate queues per device */
                return 0;

        va_start(args, fmt);
        dev = device_create_vargs(bdi_class, parent, MKDEV(0, 0), bdi, fmt, args);
        va_end(args);
        if (IS_ERR(dev))
                return PTR_ERR(dev);

        bdi->dev = dev;

        /*
         * Just start the forker thread for our default backing_dev_info,
         * and add other bdis to the list. They will get a thread created
         * on demand when they need one.
         */
        if (bdi_cap_flush_forker(bdi)) {
                struct bdi_writeback *wb = &bdi->wb;

                wb->task = kthread_run(bdi_forker_thread, wb, "bdi-%s",
                                                dev_name(dev));
                if (IS_ERR(wb->task))
                        return PTR_ERR(wb->task);
        }

        bdi_debug_register(bdi, dev_name(dev));
        set_bit(BDI_registered, &bdi->state);

        spin_lock_bh(&bdi_lock);
        list_add_tail_rcu(&bdi->bdi_list, &bdi_list);
        spin_unlock_bh(&bdi_lock);

        trace_writeback_bdi_register(bdi);
        return 0;
}
EXPORT_SYMBOL(bdi_register);

int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev)
{
        return bdi_register(bdi, NULL, "%u:%u", MAJOR(dev), MINOR(dev));
}
EXPORT_SYMBOL(bdi_register_dev);
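
/*
 * Example (illustrative): a block device with dev_t 8:0 registered through
 * bdi_register_dev() gets a device named "8:0", so it shows up as
 * /sys/class/bdi/8:0 and its on-demand flusher thread is named "flush-8:0".
 */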

/*
 * Remove bdi from the global list and shutdown any threads we have running
 */
static void bdi_wb_shutdown(struct backing_dev_info *bdi)
{
        if (!bdi_cap_writeback_dirty(bdi))
                return;

        /*
         * Make sure nobody finds us on the bdi_list anymore
         */
        bdi_remove_from_list(bdi);

        /*
         * If setup is pending, wait for that to complete first
         */
        wait_on_bit(&bdi->state, BDI_pending, bdi_sched_wait,
                        TASK_UNINTERRUPTIBLE);

        /*
         * Finally, kill the kernel thread. We don't need to be RCU
         * safe anymore, since the bdi is gone from visibility. Force
         * unfreeze of the thread before calling kthread_stop(), otherwise
         * it would never exit if it is currently stuck in the refrigerator.
         */
        if (bdi->wb.task) {
                thaw_process(bdi->wb.task);
                kthread_stop(bdi->wb.task);
        }
}

/*
 * This bdi is going away now, make sure that no super_blocks point to it
 */
static void bdi_prune_sb(struct backing_dev_info *bdi)
{
        struct super_block *sb;

        spin_lock(&sb_lock);
        list_for_each_entry(sb, &super_blocks, s_list) {
                if (sb->s_bdi == bdi)
                        sb->s_bdi = NULL;
        }
        spin_unlock(&sb_lock);
}

void bdi_unregister(struct backing_dev_info *bdi)
{
        if (bdi->dev) {
                trace_writeback_bdi_unregister(bdi);
                bdi_prune_sb(bdi);
                del_timer_sync(&bdi->wb.wakeup_timer);

                if (!bdi_cap_flush_forker(bdi))
                        bdi_wb_shutdown(bdi);
                bdi_debug_unregister(bdi);
                device_unregister(bdi->dev);
                bdi->dev = NULL;
        }
}
EXPORT_SYMBOL(bdi_unregister);

static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
{
        memset(wb, 0, sizeof(*wb));

        wb->bdi = bdi;
        wb->last_old_flush = jiffies;
        INIT_LIST_HEAD(&wb->b_dirty);
        INIT_LIST_HEAD(&wb->b_io);
        INIT_LIST_HEAD(&wb->b_more_io);
        setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi);
}

int bdi_init(struct backing_dev_info *bdi)
{
        int i, err;

        bdi->dev = NULL;

        bdi->min_ratio = 0;
        bdi->max_ratio = 100;
        bdi->max_prop_frac = PROP_FRAC_BASE;
        spin_lock_init(&bdi->wb_lock);
        INIT_LIST_HEAD(&bdi->bdi_list);
        INIT_LIST_HEAD(&bdi->work_list);

        bdi_wb_init(&bdi->wb, bdi);

        for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
                err = percpu_counter_init(&bdi->bdi_stat[i], 0);
                if (err)
                        goto err;
        }

        bdi->dirty_exceeded = 0;
        err = prop_local_init_percpu(&bdi->completions);

        if (err) {
err:
                while (i--)
                        percpu_counter_destroy(&bdi->bdi_stat[i]);
        }

        return err;
}
EXPORT_SYMBOL(bdi_init);

void bdi_destroy(struct backing_dev_info *bdi)
{
        int i;

        /*
         * Splice our entries to the default_backing_dev_info, if this
         * bdi disappears
         */
        if (bdi_has_dirty_io(bdi)) {
                struct bdi_writeback *dst = &default_backing_dev_info.wb;

                spin_lock(&inode_lock);
                list_splice(&bdi->wb.b_dirty, &dst->b_dirty);
                list_splice(&bdi->wb.b_io, &dst->b_io);
                list_splice(&bdi->wb.b_more_io, &dst->b_more_io);
                spin_unlock(&inode_lock);
        }

        bdi_unregister(bdi);

        for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
                percpu_counter_destroy(&bdi->bdi_stat[i]);

        prop_local_destroy_percpu(&bdi->completions);
}
EXPORT_SYMBOL(bdi_destroy);

/*
 * For use from filesystems to quickly init and register a bdi associated
 * with dirty writeback
 */
int bdi_setup_and_register(struct backing_dev_info *bdi, char *name,
                           unsigned int cap)
{
        char tmp[32];
        int err;

        bdi->name = name;
        bdi->capabilities = cap;
        err = bdi_init(bdi);
        if (err)
                return err;

        sprintf(tmp, "%.28s%s", name, "-%d");
        err = bdi_register(bdi, NULL, tmp, atomic_long_inc_return(&bdi_seq));
        if (err) {
                bdi_destroy(bdi);
                return err;
        }

        return 0;
}
EXPORT_SYMBOL(bdi_setup_and_register);
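
/*
 * Usage sketch (illustrative, not from this file): a filesystem that embeds
 * a backing_dev_info in its per-sb info ("foofs" and "sbi" are hypothetical
 * names) could do the following at mount time and call bdi_destroy() on the
 * teardown path:
 *
 *        err = bdi_setup_and_register(&sbi->bdi, "foofs", BDI_CAP_MAP_COPY);
 *        if (err)
 *                return err;
 *        sb->s_bdi = &sbi->bdi;
 *
 * The registered name gets a "-%d" sequence number appended ("foofs-1",
 * "foofs-2", ...), so multiple instances do not collide in /sys/class/bdi/.
 */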

static wait_queue_head_t congestion_wqh[2] = {
                __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
                __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
        };
static atomic_t nr_bdi_congested[2];

void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
{
        enum bdi_state bit;
        wait_queue_head_t *wqh = &congestion_wqh[sync];

        bit = sync ? BDI_sync_congested : BDI_async_congested;
        if (test_and_clear_bit(bit, &bdi->state))
                atomic_dec(&nr_bdi_congested[sync]);
        smp_mb__after_clear_bit();
        if (waitqueue_active(wqh))
                wake_up(wqh);
}
EXPORT_SYMBOL(clear_bdi_congested);

void set_bdi_congested(struct backing_dev_info *bdi, int sync)
{
        enum bdi_state bit;

        bit = sync ? BDI_sync_congested : BDI_async_congested;
        if (!test_and_set_bit(bit, &bdi->state))
                atomic_inc(&nr_bdi_congested[sync]);
}
EXPORT_SYMBOL(set_bdi_congested);

/**
 * congestion_wait - wait for a backing_dev to become uncongested
 * @sync: SYNC or ASYNC IO
 * @timeout: timeout in jiffies
 *
 * Waits for up to @timeout jiffies for a backing_dev (any backing_dev) to exit
 * write congestion.  If no backing_devs are congested then just wait for the
 * next write to be completed.
 */
long congestion_wait(int sync, long timeout)
{
        long ret;
        unsigned long start = jiffies;
        DEFINE_WAIT(wait);
        wait_queue_head_t *wqh = &congestion_wqh[sync];

        prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
        ret = io_schedule_timeout(timeout);
        finish_wait(wqh, &wait);

        trace_writeback_congestion_wait(jiffies_to_usecs(timeout),
                                        jiffies_to_usecs(jiffies - start));

        return ret;
}
EXPORT_SYMBOL(congestion_wait);
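
/*
 * Usage sketch (illustrative): callers in reclaim and allocation paths
 * typically back off briefly when everything looks congested, e.g.
 *
 *        congestion_wait(BLK_RW_ASYNC, HZ/50);
 *
 * sleeps for up to 20ms, or returns earlier once some backing_dev leaves
 * async congestion or a write completes.
 */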

/**
 * wait_iff_congested - Conditionally wait for a backing_dev to become uncongested or a zone to complete writes
 * @zone: A zone to check if it is heavily congested
 * @sync: SYNC or ASYNC IO
 * @timeout: timeout in jiffies
 *
 * If any backing_dev is congested and the given @zone has experienced
 * recent congestion, this waits for up to @timeout jiffies for either a
 * BDI to exit congestion of the given @sync queue or a write to complete.
 *
 * In the absence of zone congestion, this function calls cond_resched()
 * to yield the processor if necessary, but otherwise does not sleep.
 *
 * The return value is 0 if the sleep lasted the full timeout. Otherwise,
 * it is the number of jiffies that were still remaining when the function
 * returned. return_value == timeout implies the function did not sleep.
 */
long wait_iff_congested(struct zone *zone, int sync, long timeout)
{
        long ret;
        unsigned long start = jiffies;
        DEFINE_WAIT(wait);
        wait_queue_head_t *wqh = &congestion_wqh[sync];

        /*
         * If there is no congestion, or heavy congestion is not being
         * encountered in the current zone, yield if necessary instead
         * of sleeping on the congestion queue
         */
        if (atomic_read(&nr_bdi_congested[sync]) == 0 ||
                        !zone_is_reclaim_congested(zone)) {
                cond_resched();

                /* In case we scheduled, work out time remaining */
                ret = timeout - (jiffies - start);
                if (ret < 0)
                        ret = 0;

                goto out;
        }

        /* Sleep until uncongested or a write happens */
        prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
        ret = io_schedule_timeout(timeout);
        finish_wait(wqh, &wait);

out:
        trace_writeback_wait_iff_congested(jiffies_to_usecs(timeout),
                                        jiffies_to_usecs(jiffies - start));

        return ret;
}
EXPORT_SYMBOL(wait_iff_congested);
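
/*
 * Usage sketch (illustrative): page reclaim can call this instead of an
 * unconditional congestion_wait() so it only sleeps when the zone being
 * scanned is actually backed by congested devices, e.g.
 *
 *        wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
 *
 * When nothing is congested this costs at most a cond_resched().
 */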