/*
 *  linux/fs/super.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  super.c contains code to handle: - mount structures
 *                                   - super-block tables
 *                                   - filesystem drivers list
 *                                   - mount system call
 *                                   - umount system call
 *                                   - ustat system call
 *
 * GK 2/5/95  -  Changed to support mounting the root fs via NFS
 *
 *  Added kerneld support: Jacques Gelinas and Bjorn Ekwall
 *  Added change_root: Werner Almesberger & Hans Lermen, Feb '96
 *  Added options to /proc/mounts:
 *    Torbjörn Lindh (torbjorn.lindh@gopta.se), April 14, 1996.
 *  Added devfs support: Richard Gooch <rgooch@atnf.csiro.au>, 13-JAN-1998
 *  Heavily rewritten for 'one fs - one tree' dcache architecture. AV, Mar 2000
 */

#include <linux/export.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/mount.h>
#include <linux/security.h>
#include <linux/writeback.h>            /* for the emergency remount stuff */
#include <linux/idr.h>
#include <linux/mutex.h>
#include <linux/backing-dev.h>
#include <linux/rculist_bl.h>
#include <linux/cleancache.h>
#include <linux/fsnotify.h>
#include <linux/lockdep.h>
#include <linux/user_namespace.h>
#include "internal.h"


static LIST_HEAD(super_blocks);
static DEFINE_SPINLOCK(sb_lock);

static char *sb_writers_name[SB_FREEZE_LEVELS] = {
        "sb_writers",
        "sb_pagefaults",
        "sb_internal",
};
/*
 * One thing we have to be careful of with a per-sb shrinker is that we don't
 * drop the last active reference to the superblock from within the shrinker.
 * If that happens we could trigger unregistering the shrinker from within the
 * shrinker path and that leads to deadlock on the shrinker_rwsem. Hence we
 * take a passive reference to the superblock to prevent this from occurring.
 */
static unsigned long super_cache_scan(struct shrinker *shrink,
                                      struct shrink_control *sc)
{
        struct super_block *sb;
        long    fs_objects = 0;
        long    total_objects;
        long    freed = 0;
        long    dentries;
        long    inodes;

        sb = container_of(shrink, struct super_block, s_shrink);

        /*
         * Deadlock avoidance.  We may hold various FS locks, and we don't want
         * to recurse into the FS that called us in clear_inode() and friends.
         */
        if (!(sc->gfp_mask & __GFP_FS))
                return SHRINK_STOP;

        if (!trylock_super(sb))
                return SHRINK_STOP;

        if (sb->s_op->nr_cached_objects)
                fs_objects = sb->s_op->nr_cached_objects(sb, sc);

        inodes = list_lru_shrink_count(&sb->s_inode_lru, sc);
        dentries = list_lru_shrink_count(&sb->s_dentry_lru, sc);
        total_objects = dentries + inodes + fs_objects + 1;
        if (!total_objects)
                total_objects = 1;

        /* proportion the scan between the caches */
        dentries = mult_frac(sc->nr_to_scan, dentries, total_objects);
        inodes = mult_frac(sc->nr_to_scan, inodes, total_objects);
        fs_objects = mult_frac(sc->nr_to_scan, fs_objects, total_objects);

        /*
         * prune the dcache first as the icache is pinned by it, then
         * prune the icache, followed by the filesystem specific caches
         *
         * Ensure that we always scan at least one object - memcg kmem
         * accounting uses this to fully empty the caches.
         */
        sc->nr_to_scan = dentries + 1;
        freed = prune_dcache_sb(sb, sc);
        sc->nr_to_scan = inodes + 1;
        freed += prune_icache_sb(sb, sc);

        if (fs_objects) {
                sc->nr_to_scan = fs_objects + 1;
                freed += sb->s_op->free_cached_objects(sb, sc);
        }

        up_read(&sb->s_umount);
        return freed;
}

static unsigned long super_cache_count(struct shrinker *shrink,
                                       struct shrink_control *sc)
{
        struct super_block *sb;
        long    total_objects = 0;

        sb = container_of(shrink, struct super_block, s_shrink);

        /*
         * Don't call trylock_super as it is a potential
         * scalability bottleneck. The counts could get updated
         * between super_cache_count and super_cache_scan anyway.
         * Calling super_cache_count with shrinker_rwsem held
         * ensures that calls to list_lru_shrink_count() and
         * s_op->nr_cached_objects() are safe.
         */
        if (sb->s_op && sb->s_op->nr_cached_objects)
                total_objects = sb->s_op->nr_cached_objects(sb, sc);

        total_objects += list_lru_shrink_count(&sb->s_dentry_lru, sc);
        total_objects += list_lru_shrink_count(&sb->s_inode_lru, sc);

        total_objects = vfs_pressure_ratio(total_objects);
        return total_objects;
}

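/*
 * Final freeing happens in two hops: the RCU callback below bounces to a
 * workqueue because percpu_free_rwsem() may sleep and therefore must run
 * in process context, not from an RCU callback.
 */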
static void destroy_super_work(struct work_struct *work)
{
        struct super_block *s = container_of(work, struct super_block,
                                                        destroy_work);
        int i;

        for (i = 0; i < SB_FREEZE_LEVELS; i++)
                percpu_free_rwsem(&s->s_writers.rw_sem[i]);
        kfree(s);
}

static void destroy_super_rcu(struct rcu_head *head)
{
        struct super_block *s = container_of(head, struct super_block, rcu);
        INIT_WORK(&s->destroy_work, destroy_super_work);
        schedule_work(&s->destroy_work);
}

/**
 *      destroy_super   -       frees a superblock
 *      @s: superblock to free
 *
 *      Frees a superblock.
 */
static void destroy_super(struct super_block *s)
{
        list_lru_destroy(&s->s_dentry_lru);
        list_lru_destroy(&s->s_inode_lru);
        security_sb_free(s);
        WARN_ON(!list_empty(&s->s_mounts));
        put_user_ns(s->s_user_ns);
        kfree(s->s_subtype);
        kfree(s->s_options);
        call_rcu(&s->rcu, destroy_super_rcu);
}

/**
 *      alloc_super     -       create new superblock
 *      @type:  filesystem type superblock should belong to
 *      @flags: the mount flags
 *      @user_ns: User namespace for the super_block
 *
 *      Allocates and initializes a new &struct super_block.  alloc_super()
 *      returns a pointer to the new superblock or %NULL if allocation fails.
 */
static struct super_block *alloc_super(struct file_system_type *type, int flags,
                                       struct user_namespace *user_ns)
{
        struct super_block *s = kzalloc(sizeof(struct super_block), GFP_USER);
        static const struct super_operations default_op;
        int i;

        if (!s)
                return NULL;

        INIT_LIST_HEAD(&s->s_mounts);
        s->s_user_ns = get_user_ns(user_ns);

        if (security_sb_alloc(s))
                goto fail;

        for (i = 0; i < SB_FREEZE_LEVELS; i++) {
                if (__percpu_init_rwsem(&s->s_writers.rw_sem[i],
                                        sb_writers_name[i],
                                        &type->s_writers_key[i]))
                        goto fail;
        }
        init_waitqueue_head(&s->s_writers.wait_unfrozen);
        s->s_bdi = &noop_backing_dev_info;
        s->s_flags = flags;
        if (s->s_user_ns != &init_user_ns)
                s->s_iflags |= SB_I_NODEV;
        INIT_HLIST_NODE(&s->s_instances);
        INIT_HLIST_BL_HEAD(&s->s_anon);
        mutex_init(&s->s_sync_lock);
        INIT_LIST_HEAD(&s->s_inodes);
        spin_lock_init(&s->s_inode_list_lock);
        INIT_LIST_HEAD(&s->s_inodes_wb);
        spin_lock_init(&s->s_inode_wblist_lock);

        if (list_lru_init_memcg(&s->s_dentry_lru))
                goto fail;
        if (list_lru_init_memcg(&s->s_inode_lru))
                goto fail;

        init_rwsem(&s->s_umount);
        lockdep_set_class(&s->s_umount, &type->s_umount_key);
        /*
         * sget() can have s_umount recursion.
         *
         * When it cannot find a suitable sb, it allocates a new
         * one (this one), and tries again to find a suitable old
         * one.
         *
         * If that succeeds, it will acquire the s_umount
         * lock of the old one. Since these are clearly distinct
         * locks, and this object isn't exposed yet, there's no
         * risk of deadlocks.
         *
         * Annotate this by putting this lock in a different
         * subclass.
         */
        down_write_nested(&s->s_umount, SINGLE_DEPTH_NESTING);
        s->s_count = 1;
        atomic_set(&s->s_active, 1);
        mutex_init(&s->s_vfs_rename_mutex);
        lockdep_set_class(&s->s_vfs_rename_mutex, &type->s_vfs_rename_key);
        mutex_init(&s->s_dquot.dqio_mutex);
        mutex_init(&s->s_dquot.dqonoff_mutex);
        s->s_maxbytes = MAX_NON_LFS;
        s->s_op = &default_op;
        s->s_time_gran = 1000000000;
        s->cleancache_poolid = CLEANCACHE_NO_POOL;

        s->s_shrink.seeks = DEFAULT_SEEKS;
        s->s_shrink.scan_objects = super_cache_scan;
        s->s_shrink.count_objects = super_cache_count;
        s->s_shrink.batch = 1024;
        s->s_shrink.flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE;
        return s;

fail:
        destroy_super(s);
        return NULL;
}

/* Superblock refcounting */

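/*
 * Two counters are in play: ->s_count is a passive reference that keeps
 * the struct super_block itself alive, while ->s_active is an active
 * reference that keeps the filesystem instance usable.  Dropping the last
 * active reference shuts the filesystem down; dropping the last passive
 * reference frees the structure.
 */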
/*
 * Drop a superblock's refcount.  The caller must hold sb_lock.
 */
static void __put_super(struct super_block *sb)
{
        if (!--sb->s_count) {
                list_del_init(&sb->s_list);
                destroy_super(sb);
        }
}

/**
 *      put_super       -       drop a temporary reference to superblock
 *      @sb: superblock in question
 *
 *      Drops a temporary reference, and frees the superblock if there are
 *      no references left.
 */
static void put_super(struct super_block *sb)
{
        spin_lock(&sb_lock);
        __put_super(sb);
        spin_unlock(&sb_lock);
}


/**
 *      deactivate_locked_super -       drop an active reference to superblock
 *      @s: superblock to deactivate
 *
 *      Drops an active reference to superblock, converting it into a temporary
 *      one if there are no other active references left.  In that case we
 *      tell the fs driver to shut it down and drop the temporary reference we
 *      had just acquired.
 *
 *      Caller holds exclusive lock on superblock; that lock is released.
 */
void deactivate_locked_super(struct super_block *s)
{
        struct file_system_type *fs = s->s_type;
        if (atomic_dec_and_test(&s->s_active)) {
                cleancache_invalidate_fs(s);
                unregister_shrinker(&s->s_shrink);
                fs->kill_sb(s);

                /*
                 * Since list_lru_destroy() may sleep, we cannot call it from
                 * put_super(), where we hold the sb_lock. Therefore we destroy
                 * the lru lists right now.
                 */
                list_lru_destroy(&s->s_dentry_lru);
                list_lru_destroy(&s->s_inode_lru);

                put_filesystem(fs);
                put_super(s);
        } else {
                up_write(&s->s_umount);
        }
}

EXPORT_SYMBOL(deactivate_locked_super);

/**
 *      deactivate_super        -       drop an active reference to superblock
 *      @s: superblock to deactivate
 *
 *      Variant of deactivate_locked_super(), except that superblock is *not*
 *      locked by caller.  If we are going to drop the final active reference,
 *      lock will be acquired prior to that.
 */
void deactivate_super(struct super_block *s)
{
        if (!atomic_add_unless(&s->s_active, -1, 1)) {
                down_write(&s->s_umount);
                deactivate_locked_super(s);
        }
}

EXPORT_SYMBOL(deactivate_super);

/**
 *      grab_super - acquire an active reference
 *      @s: reference we are trying to make active
 *
 *      Tries to acquire an active reference.  grab_super() is used when we
 *      had just found a superblock in super_blocks or fs_type->fs_supers
 *      and want to turn it into a full-blown active reference.  grab_super()
 *      is called with sb_lock held and drops it.  Returns 1 in case of
 *      success, 0 on failure (the superblock was already dead or dying
 *      when grab_super() was called).  Note that this is only called for
 *      superblocks not in rundown mode (== ones still on ->fs_supers
 *      of their type), so increment of ->s_count is OK here.
 */
static int grab_super(struct super_block *s) __releases(sb_lock)
{
        s->s_count++;
        spin_unlock(&sb_lock);
        down_write(&s->s_umount);
        if ((s->s_flags & MS_BORN) && atomic_inc_not_zero(&s->s_active)) {
                put_super(s);
                return 1;
        }
        up_write(&s->s_umount);
        put_super(s);
        return 0;
}

/*
 *      trylock_super - try to grab ->s_umount shared
 *      @sb: reference we are trying to grab
 *
 *      Try to prevent fs shutdown.  This is used in places where we
 *      cannot take an active reference but we need to ensure that the
 *      filesystem is not shut down while we are working on it. It returns
 *      false if we cannot acquire s_umount or if we lose the race and the
 *      filesystem already got into shutdown, and returns true with the s_umount
 *      lock held in read mode in case of success. On successful return,
 *      the caller must drop the s_umount lock when done.
 *
 *      Note that unlike get_super() et al. this one does *not* bump ->s_count.
 *      The reason why it's safe is that we are OK with doing trylock instead
 *      of down_read().  There are a couple of places that are OK with that, but
 *      it's very much not a general-purpose interface.
 */
bool trylock_super(struct super_block *sb)
{
        if (down_read_trylock(&sb->s_umount)) {
                if (!hlist_unhashed(&sb->s_instances) &&
                    sb->s_root && (sb->s_flags & MS_BORN))
                        return true;
                up_read(&sb->s_umount);
        }

        return false;
}

/**
 *      generic_shutdown_super  -       common helper for ->kill_sb()
 *      @sb: superblock to kill
 *
 *      generic_shutdown_super() does all fs-independent work on superblock
 *      shutdown.  Typical ->kill_sb() should pick all fs-specific objects
 *      that need destruction out of superblock, call generic_shutdown_super()
 *      and release aforementioned objects.  Note: dentries and inodes _are_
 *      taken care of and do not need specific handling.
 *
 *      Upon calling this function, the filesystem may no longer alter or
 *      rearrange the set of dentries belonging to this super_block, nor may it
 *      change the attachments of dentries to inodes.
 */
void generic_shutdown_super(struct super_block *sb)
{
        const struct super_operations *sop = sb->s_op;

        if (sb->s_root) {
                shrink_dcache_for_umount(sb);
                sync_filesystem(sb);
                sb->s_flags &= ~MS_ACTIVE;

                fsnotify_unmount_inodes(sb);
                cgroup_writeback_umount();

                evict_inodes(sb);

                if (sb->s_dio_done_wq) {
                        destroy_workqueue(sb->s_dio_done_wq);
                        sb->s_dio_done_wq = NULL;
                }

                if (sop->put_super)
                        sop->put_super(sb);

                if (!list_empty(&sb->s_inodes)) {
                        printk("VFS: Busy inodes after unmount of %s. "
                           "Self-destruct in 5 seconds.  Have a nice day...\n",
                           sb->s_id);
                }
        }
        spin_lock(&sb_lock);
        /* use hlist_del_init() so that hlist_unhashed() checks keep working */
        hlist_del_init(&sb->s_instances);
        spin_unlock(&sb_lock);
        up_write(&sb->s_umount);
}

EXPORT_SYMBOL(generic_shutdown_super);
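
/*
 * Illustrative sketch (not part of this file): a typical ->kill_sb() for a
 * filesystem keeping private state in sb->s_fs_info frees that state only
 * after the generic teardown; foo_kill_sb() and struct foo_sb_info are
 * hypothetical names:
 *
 *      static void foo_kill_sb(struct super_block *sb)
 *      {
 *              struct foo_sb_info *info = sb->s_fs_info;
 *
 *              generic_shutdown_super(sb);
 *              kfree(info);
 *      }
 */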

/**
 *      sget_userns -   find or create a superblock
 *      @type:  filesystem type superblock should belong to
 *      @test:  comparison callback
 *      @set:   setup callback
 *      @flags: mount flags
 *      @user_ns: User namespace for the super_block
 *      @data:  argument to each of them
 */
struct super_block *sget_userns(struct file_system_type *type,
                        int (*test)(struct super_block *,void *),
                        int (*set)(struct super_block *,void *),
                        int flags, struct user_namespace *user_ns,
                        void *data)
{
        struct super_block *s = NULL;
        struct super_block *old;
        int err;

        if (!(flags & MS_KERNMOUNT) &&
            !(type->fs_flags & FS_USERNS_MOUNT) &&
            !capable(CAP_SYS_ADMIN))
                return ERR_PTR(-EPERM);
retry:
        spin_lock(&sb_lock);
        if (test) {
                hlist_for_each_entry(old, &type->fs_supers, s_instances) {
                        if (!test(old, data))
                                continue;
                        if (user_ns != old->s_user_ns) {
                                spin_unlock(&sb_lock);
                                if (s) {
                                        up_write(&s->s_umount);
                                        destroy_super(s);
                                }
                                return ERR_PTR(-EBUSY);
                        }
                        if (!grab_super(old))
                                goto retry;
                        if (s) {
                                up_write(&s->s_umount);
                                destroy_super(s);
                                s = NULL;
                        }
                        return old;
                }
        }
        if (!s) {
                spin_unlock(&sb_lock);
                s = alloc_super(type, flags, user_ns);
                if (!s)
                        return ERR_PTR(-ENOMEM);
                goto retry;
        }

        err = set(s, data);
        if (err) {
                spin_unlock(&sb_lock);
                up_write(&s->s_umount);
                destroy_super(s);
                return ERR_PTR(err);
        }
        s->s_type = type;
        strlcpy(s->s_id, type->name, sizeof(s->s_id));
        list_add_tail(&s->s_list, &super_blocks);
        hlist_add_head(&s->s_instances, &type->fs_supers);
        spin_unlock(&sb_lock);
        get_filesystem(type);
        register_shrinker(&s->s_shrink);
        return s;
}

EXPORT_SYMBOL(sget_userns);

/**
 *      sget    -       find or create a superblock
 *      @type:    filesystem type superblock should belong to
 *      @test:    comparison callback
 *      @set:     setup callback
 *      @flags:   mount flags
 *      @data:    argument to each of them
 */
struct super_block *sget(struct file_system_type *type,
                        int (*test)(struct super_block *,void *),
                        int (*set)(struct super_block *,void *),
                        int flags,
                        void *data)
{
        struct user_namespace *user_ns = current_user_ns();

        /* Ensure the requestor has permissions over the target filesystem */
        if (!(flags & MS_KERNMOUNT) && !ns_capable(user_ns, CAP_SYS_ADMIN))
                return ERR_PTR(-EPERM);

        return sget_userns(type, test, set, flags, user_ns, data);
}

EXPORT_SYMBOL(sget);
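
/*
 * The canonical @test/@set pair for block-device-backed filesystems is
 * test_bdev_super()/set_bdev_super() below; mount_bdev() hands them to
 * sget() together with the struct block_device being mounted.
 */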
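/**
 *      drop_super      -       drop a temporary reference to superblock
 *      @sb: superblock in question
 *
 *      Releases the s_umount lock taken by get_super() and friends and
 *      drops the temporary reference they acquired.
 */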
void drop_super(struct super_block *sb)
{
        up_read(&sb->s_umount);
        put_super(sb);
}

EXPORT_SYMBOL(drop_super);

/**
 *      iterate_supers - call function for all active superblocks
 *      @f: function to call
 *      @arg: argument to pass to it
 *
 *      Scans the superblock list and calls the given function, passing it
 *      each locked superblock and the given argument.
 */
void iterate_supers(void (*f)(struct super_block *, void *), void *arg)
{
        struct super_block *sb, *p = NULL;

        spin_lock(&sb_lock);
        list_for_each_entry(sb, &super_blocks, s_list) {
                if (hlist_unhashed(&sb->s_instances))
                        continue;
                sb->s_count++;
                spin_unlock(&sb_lock);

                down_read(&sb->s_umount);
                if (sb->s_root && (sb->s_flags & MS_BORN))
                        f(sb, arg);
                up_read(&sb->s_umount);

                spin_lock(&sb_lock);
                if (p)
                        __put_super(p);
                p = sb;
        }
        if (p)
                __put_super(p);
        spin_unlock(&sb_lock);
}

/**
 *      iterate_supers_type - call function for superblocks of given type
 *      @type: fs type
 *      @f: function to call
 *      @arg: argument to pass to it
 *
 *      Scans the superblock list and calls the given function, passing it
 *      each locked superblock and the given argument.
 */
void iterate_supers_type(struct file_system_type *type,
        void (*f)(struct super_block *, void *), void *arg)
{
        struct super_block *sb, *p = NULL;

        spin_lock(&sb_lock);
        hlist_for_each_entry(sb, &type->fs_supers, s_instances) {
                sb->s_count++;
                spin_unlock(&sb_lock);

                down_read(&sb->s_umount);
                if (sb->s_root && (sb->s_flags & MS_BORN))
                        f(sb, arg);
                up_read(&sb->s_umount);

                spin_lock(&sb_lock);
                if (p)
                        __put_super(p);
                p = sb;
        }
        if (p)
                __put_super(p);
        spin_unlock(&sb_lock);
}

EXPORT_SYMBOL(iterate_supers_type);

/**
 *      get_super - get the superblock of a device
 *      @bdev: device to get the superblock for
 *
 *      Scans the superblock list and finds the superblock of the file system
 *      mounted on the given device. %NULL is returned if no match is found.
 */
struct super_block *get_super(struct block_device *bdev)
{
        struct super_block *sb;

        if (!bdev)
                return NULL;

        spin_lock(&sb_lock);
rescan:
        list_for_each_entry(sb, &super_blocks, s_list) {
                if (hlist_unhashed(&sb->s_instances))
                        continue;
                if (sb->s_bdev == bdev) {
                        sb->s_count++;
                        spin_unlock(&sb_lock);
                        down_read(&sb->s_umount);
                        /* still alive? */
                        if (sb->s_root && (sb->s_flags & MS_BORN))
                                return sb;
                        up_read(&sb->s_umount);
                        /* nope, got unmounted */
                        spin_lock(&sb_lock);
                        __put_super(sb);
                        goto rescan;
                }
        }
        spin_unlock(&sb_lock);
        return NULL;
}

EXPORT_SYMBOL(get_super);

/**
 *      get_super_thawed - get thawed superblock of a device
 *      @bdev: device to get the superblock for
 *
 *      Scans the superblock list and finds the superblock of the file system
 *      mounted on the device. The superblock is returned once it is thawed
 *      (or immediately if it was not frozen). %NULL is returned if no match
 *      is found.
 */
struct super_block *get_super_thawed(struct block_device *bdev)
{
        while (1) {
                struct super_block *s = get_super(bdev);
                if (!s || s->s_writers.frozen == SB_UNFROZEN)
                        return s;
                up_read(&s->s_umount);
                wait_event(s->s_writers.wait_unfrozen,
                           s->s_writers.frozen == SB_UNFROZEN);
                put_super(s);
        }
}
EXPORT_SYMBOL(get_super_thawed);

/**
 * get_active_super - get an active reference to the superblock of a device
 * @bdev: device to get the superblock for
 *
 * Scans the superblock list and finds the superblock of the file system
 * mounted on the device given.  Returns the superblock with an active
 * reference or %NULL if none was found.
 */
struct super_block *get_active_super(struct block_device *bdev)
{
        struct super_block *sb;

        if (!bdev)
                return NULL;

restart:
        spin_lock(&sb_lock);
        list_for_each_entry(sb, &super_blocks, s_list) {
                if (hlist_unhashed(&sb->s_instances))
                        continue;
                if (sb->s_bdev == bdev) {
                        if (!grab_super(sb))
                                goto restart;
                        up_write(&sb->s_umount);
                        return sb;
                }
        }
        spin_unlock(&sb_lock);
        return NULL;
}

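/*
 * Like get_super(), but the filesystem is looked up by device number
 * rather than by struct block_device; used by the ustat() system call.
 */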
struct super_block *user_get_super(dev_t dev)
{
        struct super_block *sb;

        spin_lock(&sb_lock);
rescan:
        list_for_each_entry(sb, &super_blocks, s_list) {
                if (hlist_unhashed(&sb->s_instances))
                        continue;
                if (sb->s_dev == dev) {
                        sb->s_count++;
                        spin_unlock(&sb_lock);
                        down_read(&sb->s_umount);
                        /* still alive? */
                        if (sb->s_root && (sb->s_flags & MS_BORN))
                                return sb;
                        up_read(&sb->s_umount);
                        /* nope, got unmounted */
                        spin_lock(&sb_lock);
                        __put_super(sb);
                        goto rescan;
                }
        }
        spin_unlock(&sb_lock);
        return NULL;
}

/**
 *      do_remount_sb - asks filesystem to change mount options.
 *      @sb:    superblock in question
 *      @flags: numeric part of options
 *      @data:  the rest of options
 *      @force: whether or not to force the change
 *
 *      Alters the mount options of a mounted file system.
 */
int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
{
        int retval;
        int remount_ro;

        if (sb->s_writers.frozen != SB_UNFROZEN)
                return -EBUSY;

#ifdef CONFIG_BLOCK
        if (!(flags & MS_RDONLY) && bdev_read_only(sb->s_bdev))
                return -EACCES;
#endif

        remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY);

        if (remount_ro) {
                if (!hlist_empty(&sb->s_pins)) {
                        up_write(&sb->s_umount);
                        group_pin_kill(&sb->s_pins);
                        down_write(&sb->s_umount);
                        if (!sb->s_root)
                                return 0;
                        if (sb->s_writers.frozen != SB_UNFROZEN)
                                return -EBUSY;
                        remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY);
                }
        }
        shrink_dcache_sb(sb);

        /* If we are remounting RDONLY and current sb is read/write,
           make sure there are no rw files opened */
        if (remount_ro) {
                if (force) {
                        sb->s_readonly_remount = 1;
                        smp_wmb();
                } else {
                        retval = sb_prepare_remount_readonly(sb);
                        if (retval)
                                return retval;
                }
        }

        if (sb->s_op->remount_fs) {
                retval = sb->s_op->remount_fs(sb, &flags, data);
                if (retval) {
                        if (!force)
                                goto cancel_readonly;
                        /* If forced remount, go ahead despite any errors */
                        WARN(1, "forced remount of a %s fs returned %i\n",
                             sb->s_type->name, retval);
                }
        }
        sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK);
        /* Needs to be ordered wrt mnt_is_readonly() */
        smp_wmb();
        sb->s_readonly_remount = 0;

        /*
         * Some filesystems modify their metadata via some other path than the
         * bdev buffer cache (eg. use a private mapping, or directories in
         * pagecache, etc). Also file data modifications go via their own
         * mappings. So if we try to mount readonly then copy the filesystem
         * from bdev, we could get stale data, so invalidate it to give a best
         * effort at coherency.
         */
        if (remount_ro && sb->s_bdev)
                invalidate_bdev(sb->s_bdev);
        return 0;

cancel_readonly:
        sb->s_readonly_remount = 0;
        return retval;
}

static void do_emergency_remount(struct work_struct *work)
{
        struct super_block *sb, *p = NULL;

        spin_lock(&sb_lock);
        list_for_each_entry(sb, &super_blocks, s_list) {
                if (hlist_unhashed(&sb->s_instances))
                        continue;
                sb->s_count++;
                spin_unlock(&sb_lock);
                down_write(&sb->s_umount);
                if (sb->s_root && sb->s_bdev && (sb->s_flags & MS_BORN) &&
                    !(sb->s_flags & MS_RDONLY)) {
                        /*
                         * What lock protects sb->s_flags??
                         */
                        do_remount_sb(sb, MS_RDONLY, NULL, 1);
                }
                up_write(&sb->s_umount);
                spin_lock(&sb_lock);
                if (p)
                        __put_super(p);
                p = sb;
        }
        if (p)
                __put_super(p);
        spin_unlock(&sb_lock);
        kfree(work);
        printk("Emergency Remount complete\n");
}

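/*
 * Called from atomic context (e.g. the SysRq 'u' handler), hence the
 * GFP_ATOMIC allocation and the deferral of the real work to a workqueue.
 */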
void emergency_remount(void)
{
        struct work_struct *work;

        work = kmalloc(sizeof(*work), GFP_ATOMIC);
        if (work) {
                INIT_WORK(work, do_emergency_remount);
                schedule_work(work);
        }
}

/*
 * Unnamed block devices are dummy devices used by virtual
 * filesystems which don't use real block-devices.  -- jrs
 */

static DEFINE_IDA(unnamed_dev_ida);
static DEFINE_SPINLOCK(unnamed_dev_lock); /* protects the above */
/* Many userspace utilities consider an FSID of 0 invalid.
 * Always return at least 1 from get_anon_bdev.
 */
static int unnamed_dev_start = 1;

int get_anon_bdev(dev_t *p)
{
        int dev;
        int error;

 retry:
        if (ida_pre_get(&unnamed_dev_ida, GFP_ATOMIC) == 0)
                return -ENOMEM;
        spin_lock(&unnamed_dev_lock);
        error = ida_get_new_above(&unnamed_dev_ida, unnamed_dev_start, &dev);
        if (!error)
                unnamed_dev_start = dev + 1;
        spin_unlock(&unnamed_dev_lock);
        if (error == -EAGAIN)
                /* We raced and lost with another CPU. */
                goto retry;
        else if (error)
                return -EAGAIN;

        if (dev >= (1 << MINORBITS)) {
                spin_lock(&unnamed_dev_lock);
                ida_remove(&unnamed_dev_ida, dev);
                if (unnamed_dev_start > dev)
                        unnamed_dev_start = dev;
                spin_unlock(&unnamed_dev_lock);
                return -EMFILE;
        }
        *p = MKDEV(0, dev & MINORMASK);
        return 0;
}
EXPORT_SYMBOL(get_anon_bdev);

void free_anon_bdev(dev_t dev)
{
        int slot = MINOR(dev);
        spin_lock(&unnamed_dev_lock);
        ida_remove(&unnamed_dev_ida, slot);
        if (slot < unnamed_dev_start)
                unnamed_dev_start = slot;
        spin_unlock(&unnamed_dev_lock);
}
EXPORT_SYMBOL(free_anon_bdev);

int set_anon_super(struct super_block *s, void *data)
{
        return get_anon_bdev(&s->s_dev);
}

EXPORT_SYMBOL(set_anon_super);

void kill_anon_super(struct super_block *sb)
{
        dev_t dev = sb->s_dev;
        generic_shutdown_super(sb);
        free_anon_bdev(dev);
}

EXPORT_SYMBOL(kill_anon_super);

void kill_litter_super(struct super_block *sb)
{
        if (sb->s_root)
                d_genocide(sb->s_root);
        kill_anon_super(sb);
}

EXPORT_SYMBOL(kill_litter_super);

static int ns_test_super(struct super_block *sb, void *data)
{
        return sb->s_fs_info == data;
}

static int ns_set_super(struct super_block *sb, void *data)
{
        sb->s_fs_info = data;
        return set_anon_super(sb, NULL);
}

struct dentry *mount_ns(struct file_system_type *fs_type,
        int flags, void *data, void *ns, struct user_namespace *user_ns,
        int (*fill_super)(struct super_block *, void *, int))
{
        struct super_block *sb;

        /* Don't allow mounting unless the caller has CAP_SYS_ADMIN
         * over the namespace.
         */
        if (!(flags & MS_KERNMOUNT) && !ns_capable(user_ns, CAP_SYS_ADMIN))
                return ERR_PTR(-EPERM);

        sb = sget_userns(fs_type, ns_test_super, ns_set_super, flags,
                         user_ns, ns);
        if (IS_ERR(sb))
                return ERR_CAST(sb);

        if (!sb->s_root) {
                int err;
                err = fill_super(sb, data, flags & MS_SILENT ? 1 : 0);
                if (err) {
                        deactivate_locked_super(sb);
                        return ERR_PTR(err);
                }

                sb->s_flags |= MS_ACTIVE;
        }

        return dget(sb->s_root);
}

EXPORT_SYMBOL(mount_ns);

#ifdef CONFIG_BLOCK
static int set_bdev_super(struct super_block *s, void *data)
{
        s->s_bdev = data;
        s->s_dev = s->s_bdev->bd_dev;

        /*
         * We set the bdi here to the queue's backing device; filesystems
         * can override this in ->fill_super().
         */
        s->s_bdi = &bdev_get_queue(s->s_bdev)->backing_dev_info;
        return 0;
}

static int test_bdev_super(struct super_block *s, void *data)
{
        return (void *)s->s_bdev == data;
}

struct dentry *mount_bdev(struct file_system_type *fs_type,
        int flags, const char *dev_name, void *data,
        int (*fill_super)(struct super_block *, void *, int))
{
        struct block_device *bdev;
        struct super_block *s;
        fmode_t mode = FMODE_READ | FMODE_EXCL;
        int error = 0;

        if (!(flags & MS_RDONLY))
                mode |= FMODE_WRITE;

        bdev = blkdev_get_by_path(dev_name, mode, fs_type);
        if (IS_ERR(bdev))
                return ERR_CAST(bdev);

        /*
         * once the super is inserted into the list by sget, s_umount
         * will protect the lockfs code from trying to start a snapshot
         * while we are mounting
         */
        mutex_lock(&bdev->bd_fsfreeze_mutex);
        if (bdev->bd_fsfreeze_count > 0) {
                mutex_unlock(&bdev->bd_fsfreeze_mutex);
                error = -EBUSY;
                goto error_bdev;
        }
        s = sget(fs_type, test_bdev_super, set_bdev_super, flags | MS_NOSEC,
                 bdev);
        mutex_unlock(&bdev->bd_fsfreeze_mutex);
        if (IS_ERR(s))
                goto error_s;

        if (s->s_root) {
                if ((flags ^ s->s_flags) & MS_RDONLY) {
                        deactivate_locked_super(s);
                        error = -EBUSY;
                        goto error_bdev;
                }

                /*
                 * s_umount nests inside bd_mutex during
                 * __invalidate_device().  blkdev_put() acquires
                 * bd_mutex and can't be called under s_umount.  Drop
                 * s_umount temporarily.  This is safe as we're
                 * holding an active reference.
                 */
                up_write(&s->s_umount);
                blkdev_put(bdev, mode);
                down_write(&s->s_umount);
        } else {
                s->s_mode = mode;
                snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev);
                sb_set_blocksize(s, block_size(bdev));
                error = fill_super(s, data, flags & MS_SILENT ? 1 : 0);
                if (error) {
                        deactivate_locked_super(s);
                        goto error;
                }

                s->s_flags |= MS_ACTIVE;
                bdev->bd_super = s;
        }

        return dget(s->s_root);

error_s:
        error = PTR_ERR(s);
error_bdev:
        blkdev_put(bdev, mode);
error:
        return ERR_PTR(error);
}
EXPORT_SYMBOL(mount_bdev);
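
/*
 * Illustrative sketch (not from this file): a block-device filesystem's
 * ->mount() method typically just forwards here; foo_mount() and
 * foo_fill_super() are hypothetical names:
 *
 *      static struct dentry *foo_mount(struct file_system_type *fs_type,
 *                      int flags, const char *dev_name, void *data)
 *      {
 *              return mount_bdev(fs_type, flags, dev_name, data,
 *                                foo_fill_super);
 *      }
 */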
void kill_block_super(struct super_block *sb)
{
        struct block_device *bdev = sb->s_bdev;
        fmode_t mode = sb->s_mode;

        bdev->bd_super = NULL;
        generic_shutdown_super(sb);
        sync_blockdev(bdev);
        WARN_ON_ONCE(!(mode & FMODE_EXCL));
        blkdev_put(bdev, mode | FMODE_EXCL);
}

EXPORT_SYMBOL(kill_block_super);
#endif

struct dentry *mount_nodev(struct file_system_type *fs_type,
        int flags, void *data,
        int (*fill_super)(struct super_block *, void *, int))
{
        int error;
        struct super_block *s = sget(fs_type, NULL, set_anon_super, flags, NULL);

        if (IS_ERR(s))
                return ERR_CAST(s);

        error = fill_super(s, data, flags & MS_SILENT ? 1 : 0);
        if (error) {
                deactivate_locked_super(s);
                return ERR_PTR(error);
        }
        s->s_flags |= MS_ACTIVE;
        return dget(s->s_root);
}
EXPORT_SYMBOL(mount_nodev);

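/*
 * Unlike mount_nodev(), mount_single() shares one superblock across all
 * mounts of the filesystem type: compare_single() matches any existing
 * sb, so a second mount reuses the first instance and merely has its
 * options remounted.  debugfs is one user of this helper.
 */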
static int compare_single(struct super_block *s, void *p)
{
        return 1;
}

struct dentry *mount_single(struct file_system_type *fs_type,
        int flags, void *data,
        int (*fill_super)(struct super_block *, void *, int))
{
        struct super_block *s;
        int error;

        s = sget(fs_type, compare_single, set_anon_super, flags, NULL);
        if (IS_ERR(s))
                return ERR_CAST(s);
        if (!s->s_root) {
                error = fill_super(s, data, flags & MS_SILENT ? 1 : 0);
                if (error) {
                        deactivate_locked_super(s);
                        return ERR_PTR(error);
                }
                s->s_flags |= MS_ACTIVE;
        } else {
                do_remount_sb(s, flags, data, 0);
        }
        return dget(s->s_root);
}
EXPORT_SYMBOL(mount_single);

struct dentry *
mount_fs(struct file_system_type *type, int flags, const char *name, void *data)
{
        struct dentry *root;
        struct super_block *sb;
        char *secdata = NULL;
        int error = -ENOMEM;

        if (data && !(type->fs_flags & FS_BINARY_MOUNTDATA)) {
                secdata = alloc_secdata();
                if (!secdata)
                        goto out;

                error = security_sb_copy_data(data, secdata);
                if (error)
                        goto out_free_secdata;
        }

        root = type->mount(type, flags, name, data);
        if (IS_ERR(root)) {
                error = PTR_ERR(root);
                goto out_free_secdata;
        }
        sb = root->d_sb;
        BUG_ON(!sb);
        WARN_ON(!sb->s_bdi);
        sb->s_flags |= MS_BORN;

        error = security_sb_kern_mount(sb, flags, secdata);
        if (error)
                goto out_sb;

        /*
         * filesystems should never set s_maxbytes larger than MAX_LFS_FILESIZE
         * but s_maxbytes was an unsigned long long for many releases. Throw
         * this warning for a little while to try and catch filesystems that
         * violate this rule.
         */
        WARN((sb->s_maxbytes < 0), "%s set sb->s_maxbytes to "
                "negative value (%lld)\n", type->name, sb->s_maxbytes);

        up_write(&sb->s_umount);
        free_secdata(secdata);
        return root;
out_sb:
        dput(root);
        deactivate_locked_super(sb);
out_free_secdata:
        free_secdata(secdata);
out:
        return ERR_PTR(error);
}

/*
 * This is an internal function, please use sb_end_{write,pagefault,intwrite}
 * instead.
 */
void __sb_end_write(struct super_block *sb, int level)
{
        percpu_up_read(sb->s_writers.rw_sem + level-1);
}
EXPORT_SYMBOL(__sb_end_write);

/*
 * This is an internal function, please use sb_start_{write,pagefault,intwrite}
 * instead.
 */
int __sb_start_write(struct super_block *sb, int level, bool wait)
{
        bool force_trylock = false;
        int ret = 1;

#ifdef CONFIG_LOCKDEP
        /*
         * We want lockdep to tell us about possible deadlocks with freezing
         * but it's a bit tricky to properly instrument it. Getting freeze
         * protection works like getting a read lock but there are subtle
         * problems. XFS for example gets freeze protection on internal level
         * twice in some cases, which is OK only because we already hold a
         * freeze protection also on a higher level. Due to these cases we
         * have to use wait == false (trylock mode) which must not fail.
         */
        if (wait) {
                int i;

                for (i = 0; i < level - 1; i++)
                        if (percpu_rwsem_is_held(sb->s_writers.rw_sem + i)) {
                                force_trylock = true;
                                break;
                        }
        }
#endif
        if (wait && !force_trylock)
                percpu_down_read(sb->s_writers.rw_sem + level-1);
        else
                ret = percpu_down_read_trylock(sb->s_writers.rw_sem + level-1);

        WARN_ON(force_trylock && !ret);
        return ret;
}
EXPORT_SYMBOL(__sb_start_write);
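
/*
 * Callers normally use the level-specific wrappers from <linux/fs.h>
 * rather than these internal helpers:
 *
 *      sb_start_write(sb);
 *      ...modify the filesystem...
 *      sb_end_write(sb);
 *
 * which map to __sb_start_write(sb, SB_FREEZE_WRITE, true) and
 * __sb_end_write(sb, SB_FREEZE_WRITE) respectively.
 */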

/**
 * sb_wait_write - wait until all writers to given file system finish
 * @sb: the super for which we wait
 * @level: type of writers we wait for (normal vs page fault)
 *
 * This function waits until there are no writers of given type to given file
 * system.
 */
static void sb_wait_write(struct super_block *sb, int level)
{
        percpu_down_write(sb->s_writers.rw_sem + level-1);
}

/*
 * We are going to return to userspace and forget about these locks, the
 * ownership goes to the caller of thaw_super() which does unlock().
 */
static void lockdep_sb_freeze_release(struct super_block *sb)
{
        int level;

        for (level = SB_FREEZE_LEVELS - 1; level >= 0; level--)
                percpu_rwsem_release(sb->s_writers.rw_sem + level, 0, _THIS_IP_);
}

/*
 * Tell lockdep we are holding these locks before we call ->unfreeze_fs(sb).
 */
static void lockdep_sb_freeze_acquire(struct super_block *sb)
{
        int level;

        for (level = 0; level < SB_FREEZE_LEVELS; ++level)
                percpu_rwsem_acquire(sb->s_writers.rw_sem + level, 0, _THIS_IP_);
}

static void sb_freeze_unlock(struct super_block *sb)
{
        int level;

        for (level = SB_FREEZE_LEVELS - 1; level >= 0; level--)
                percpu_up_write(sb->s_writers.rw_sem + level);
}

/**
 * freeze_super - lock the filesystem and force it into a consistent state
 * @sb: the super to lock
 *
 * Syncs the super to make sure the filesystem is consistent and calls the fs's
 * freeze_fs.  Subsequent calls to this without first thawing the fs will return
 * -EBUSY.
 *
 * During this function, sb->s_writers.frozen goes through these values:
 *
 * SB_UNFROZEN: File system is normal, all writes progress as usual.
 *
 * SB_FREEZE_WRITE: The file system is in the process of being frozen.  New
 * writes should be blocked, though page faults are still allowed. We wait for
 * all writes to complete and then proceed to the next stage.
 *
 * SB_FREEZE_PAGEFAULT: Freezing continues. Now also page faults are blocked
 * but internal fs threads can still modify the filesystem (although they
 * should not dirty new pages or inodes), writeback can run etc. After waiting
 * for all running page faults we sync the filesystem which will clean all
 * dirty pages and inodes (no new dirty pages or inodes can be created when
 * sync is running).
 *
 * SB_FREEZE_FS: The file system is frozen. Now all internal sources of fs
 * modification are blocked (e.g. XFS preallocation truncation on inode
 * reclaim). This is usually implemented by blocking new transactions for
 * filesystems that have them and need this additional guard. After all
 * internal writers are finished we call ->freeze_fs() to finish filesystem
 * freezing. Then we transition to SB_FREEZE_COMPLETE state. This state is
 * mostly auxiliary for filesystems to verify they do not modify frozen fs.
 *
 * sb->s_writers.frozen is protected by sb->s_umount.
 */
int freeze_super(struct super_block *sb)
{
        int ret;

        atomic_inc(&sb->s_active);
        down_write(&sb->s_umount);
        if (sb->s_writers.frozen != SB_UNFROZEN) {
                deactivate_locked_super(sb);
                return -EBUSY;
        }

        if (!(sb->s_flags & MS_BORN)) {
                up_write(&sb->s_umount);
                return 0;       /* sic - it's "nothing to do" */
        }

        if (sb->s_flags & MS_RDONLY) {
                /* Nothing to do really... */
                sb->s_writers.frozen = SB_FREEZE_COMPLETE;
                up_write(&sb->s_umount);
                return 0;
        }

        sb->s_writers.frozen = SB_FREEZE_WRITE;
        /* Release s_umount to preserve sb_start_write -> s_umount ordering */
        up_write(&sb->s_umount);
        sb_wait_write(sb, SB_FREEZE_WRITE);
        down_write(&sb->s_umount);

        /* Now we go and block page faults... */
        sb->s_writers.frozen = SB_FREEZE_PAGEFAULT;
        sb_wait_write(sb, SB_FREEZE_PAGEFAULT);

        /* All writers are done so after syncing there won't be dirty data */
        sync_filesystem(sb);

        /* Now wait for internal filesystem counter */
        sb->s_writers.frozen = SB_FREEZE_FS;
        sb_wait_write(sb, SB_FREEZE_FS);

        if (sb->s_op->freeze_fs) {
                ret = sb->s_op->freeze_fs(sb);
                if (ret) {
                        printk(KERN_ERR
                                "VFS: Filesystem freeze failed\n");
                        sb->s_writers.frozen = SB_UNFROZEN;
                        sb_freeze_unlock(sb);
                        wake_up(&sb->s_writers.wait_unfrozen);
                        deactivate_locked_super(sb);
                        return ret;
                }
        }
        /*
         * For debugging purposes so that fs can warn if it sees write activity
         * when frozen is set to SB_FREEZE_COMPLETE, and for thaw_super().
         */
        sb->s_writers.frozen = SB_FREEZE_COMPLETE;
        lockdep_sb_freeze_release(sb);
        up_write(&sb->s_umount);
        return 0;
}
EXPORT_SYMBOL(freeze_super);
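
/*
 * Userspace reaches freeze_super()/thaw_super() through the FIFREEZE and
 * FITHAW ioctls (see ioctl_fsfreeze() in fs/ioctl.c); freeze_bdev() uses
 * the same pair when snapshotting a block device.
 */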

/**
 * thaw_super -- unlock filesystem
 * @sb: the super to thaw
 *
 * Unlocks the filesystem and marks it writeable again after freeze_super().
 */
int thaw_super(struct super_block *sb)
{
        int error;

        down_write(&sb->s_umount);
        if (sb->s_writers.frozen != SB_FREEZE_COMPLETE) {
                up_write(&sb->s_umount);
                return -EINVAL;
        }

        if (sb->s_flags & MS_RDONLY) {
                sb->s_writers.frozen = SB_UNFROZEN;
                goto out;
        }

        lockdep_sb_freeze_acquire(sb);

        if (sb->s_op->unfreeze_fs) {
                error = sb->s_op->unfreeze_fs(sb);
                if (error) {
                        printk(KERN_ERR
                                "VFS: Filesystem thaw failed\n");
                        lockdep_sb_freeze_release(sb);
                        up_write(&sb->s_umount);
                        return error;
                }
        }

        sb->s_writers.frozen = SB_UNFROZEN;
        sb_freeze_unlock(sb);
out:
        wake_up(&sb->s_writers.wait_unfrozen);
        deactivate_locked_super(sb);
        return 0;
}
EXPORT_SYMBOL(thaw_super);