LXR linux/fs/ext4/super.c

   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 *  linux/fs/ext4/super.c
   4 *
   5 * Copyright (C) 1992, 1993, 1994, 1995
   6 * Remy Card (card@masi.ibp.fr)
   7 * Laboratoire MASI - Institut Blaise Pascal
   8 * Universite Pierre et Marie Curie (Paris VI)
   9 *
  10 *  from
  11 *
  12 *  linux/fs/minix/inode.c
  13 *
  14 *  Copyright (C) 1991, 1992  Linus Torvalds
  15 *
  16 *  Big-endian to little-endian byte-swapping/bitmaps by
  17 *        David S. Miller (davem@caip.rutgers.edu), 1995
  18 */
  19
  20#include <linux/module.h>
  21#include <linux/string.h>
  22#include <linux/fs.h>
  23#include <linux/time.h>
  24#include <linux/vmalloc.h>
  25#include <linux/slab.h>
  26#include <linux/init.h>
  27#include <linux/blkdev.h>
  28#include <linux/backing-dev.h>
  29#include <linux/parser.h>
  30#include <linux/buffer_head.h>
  31#include <linux/exportfs.h>
  32#include <linux/vfs.h>
  33#include <linux/random.h>
  34#include <linux/mount.h>
  35#include <linux/namei.h>
  36#include <linux/quotaops.h>
  37#include <linux/seq_file.h>
  38#include <linux/ctype.h>
  39#include <linux/log2.h>
  40#include <linux/crc16.h>
  41#include <linux/dax.h>
  42#include <linux/cleancache.h>
  43#include <linux/uaccess.h>
  44#include <linux/iversion.h>
  45
  46#include <linux/kthread.h>
  47#include <linux/freezer.h>
  48
  49#include "ext4.h"
  50#include "ext4_extents.h"       /* Needed for trace points definition */
  51#include "ext4_jbd2.h"
  52#include "xattr.h"
  53#include "acl.h"
  54#include "mballoc.h"
  55#include "fsmap.h"
  56
  57#define CREATE_TRACE_POINTS
  58#include <trace/events/ext4.h>
  59
  60static struct ext4_lazy_init *ext4_li_info;
  61static struct mutex ext4_li_mtx;
  62static struct ratelimit_state ext4_mount_msg_ratelimit;
  63
  64static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
  65                             unsigned long journal_devnum);
  66static int ext4_show_options(struct seq_file *seq, struct dentry *root);
  67static int ext4_commit_super(struct super_block *sb, int sync);
  68static void ext4_mark_recovery_complete(struct super_block *sb,
  69                                        struct ext4_super_block *es);
  70static void ext4_clear_journal_err(struct super_block *sb,
  71                                   struct ext4_super_block *es);
  72static int ext4_sync_fs(struct super_block *sb, int wait);
  73static int ext4_remount(struct super_block *sb, int *flags, char *data);
  74static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf);
  75static int ext4_unfreeze(struct super_block *sb);
  76static int ext4_freeze(struct super_block *sb);
  77static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
  78                       const char *dev_name, void *data);
  79static inline int ext2_feature_set_ok(struct super_block *sb);
  80static inline int ext3_feature_set_ok(struct super_block *sb);
  81static int ext4_feature_set_ok(struct super_block *sb, int readonly);
  82static void ext4_destroy_lazyinit_thread(void);
  83static void ext4_unregister_li_request(struct super_block *sb);
  84static void ext4_clear_request_list(void);
  85static struct inode *ext4_get_journal_inode(struct super_block *sb,
  86                                            unsigned int journal_inum);
  87
  88/*
  89 * Lock ordering
  90 *
  91 * Note the difference between i_mmap_sem (EXT4_I(inode)->i_mmap_sem) and
  92 * i_mmap_rwsem (inode->i_mmap_rwsem)!
  93 *
  94 * page fault path:
  95 * mmap_sem -> sb_start_pagefault -> i_mmap_sem (r) -> transaction start ->
  96 *   page lock -> i_data_sem (rw)
  97 *
  98 * buffered write path:
  99 * sb_start_write -> i_mutex -> mmap_sem
 100 * sb_start_write -> i_mutex -> transaction start -> page lock ->
 101 *   i_data_sem (rw)
 102 *
 103 * truncate:
 104 * sb_start_write -> i_mutex -> i_mmap_sem (w) -> i_mmap_rwsem (w) -> page lock
 105 * sb_start_write -> i_mutex -> i_mmap_sem (w) -> transaction start ->
 106 *   i_data_sem (rw)
 107 *
 108 * direct IO:
 109 * sb_start_write -> i_mutex -> mmap_sem
 110 * sb_start_write -> i_mutex -> transaction start -> i_data_sem (rw)
 111 *
 112 * writepages:
 113 * transaction start -> page lock(s) -> i_data_sem (rw)
 114 */
 115
 116#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT2)
 117static struct file_system_type ext2_fs_type = {
 118        .owner          = THIS_MODULE,
 119        .name           = "ext2",
 120        .mount          = ext4_mount,
 121        .kill_sb        = kill_block_super,
 122        .fs_flags       = FS_REQUIRES_DEV,
 123};
 124MODULE_ALIAS_FS("ext2");
 125MODULE_ALIAS("ext2");
 126#define IS_EXT2_SB(sb) ((sb)->s_bdev->bd_holder == &ext2_fs_type)
 127#else
 128#define IS_EXT2_SB(sb) (0)
 129#endif
 130
 131
 132static struct file_system_type ext3_fs_type = {
 133        .owner          = THIS_MODULE,
 134        .name           = "ext3",
 135        .mount          = ext4_mount,
 136        .kill_sb        = kill_block_super,
 137        .fs_flags       = FS_REQUIRES_DEV,
 138};
 139MODULE_ALIAS_FS("ext3");
 140MODULE_ALIAS("ext3");
 141#define IS_EXT3_SB(sb) ((sb)->s_bdev->bd_holder == &ext3_fs_type)
 142
 143/*
 144 * This works like sb_bread() except it uses ERR_PTR for error
 145 * returns.  Currently with sb_bread it's impossible to distinguish
 146 * between ENOMEM and EIO situations (since both result in a NULL
 147 * return.
 148 */
 149struct buffer_head *
 150ext4_sb_bread(struct super_block *sb, sector_t block, int op_flags)
 151{
 152        struct buffer_head *bh = sb_getblk(sb, block);
 153
 154        if (bh == NULL)
 155                return ERR_PTR(-ENOMEM);
 156        if (buffer_uptodate(bh))
 157                return bh;
 158        ll_rw_block(REQ_OP_READ, REQ_META | op_flags, 1, &bh);
 159        wait_on_buffer(bh);
 160        if (buffer_uptodate(bh))
 161                return bh;
 162        put_bh(bh);
 163        return ERR_PTR(-EIO);
 164}
 165
 166static int ext4_verify_csum_type(struct super_block *sb,
 167                                 struct ext4_super_block *es)
 168{
 169        if (!ext4_has_feature_metadata_csum(sb))
 170                return 1;
 171
 172        return es->s_checksum_type == EXT4_CRC32C_CHKSUM;
 173}
 174
 175static __le32 ext4_superblock_csum(struct super_block *sb,
 176                                   struct ext4_super_block *es)
 177{
 178        struct ext4_sb_info *sbi = EXT4_SB(sb);
 179        int offset = offsetof(struct ext4_super_block, s_checksum);
 180        __u32 csum;
 181
 182        csum = ext4_chksum(sbi, ~0, (char *)es, offset);
 183
 184        return cpu_to_le32(csum);
 185}
 186
 187static int ext4_superblock_csum_verify(struct super_block *sb,
 188                                       struct ext4_super_block *es)
 189{
 190        if (!ext4_has_metadata_csum(sb))
 191                return 1;
 192
 193        return es->s_checksum == ext4_superblock_csum(sb, es);
 194}
 195
 196void ext4_superblock_csum_set(struct super_block *sb)
 197{
 198        struct ext4_super_block *es = EXT4_SB(sb)->s_es;
 199
 200        if (!ext4_has_metadata_csum(sb))
 201                return;
 202
 203        es->s_checksum = ext4_superblock_csum(sb, es);
 204}
 205
 206void *ext4_kvmalloc(size_t size, gfp_t flags)
 207{
 208        void *ret;
 209
 210        ret = kmalloc(size, flags | __GFP_NOWARN);
 211        if (!ret)
 212                ret = __vmalloc(size, flags, PAGE_KERNEL);
 213        return ret;
 214}
 215
 216void *ext4_kvzalloc(size_t size, gfp_t flags)
 217{
 218        void *ret;
 219
 220        ret = kzalloc(size, flags | __GFP_NOWARN);
 221        if (!ret)
 222                ret = __vmalloc(size, flags | __GFP_ZERO, PAGE_KERNEL);
 223        return ret;
 224}
 225
 226ext4_fsblk_t ext4_block_bitmap(struct super_block *sb,
 227                               struct ext4_group_desc *bg)
 228{
 229        return le32_to_cpu(bg->bg_block_bitmap_lo) |
 230                (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
 231                 (ext4_fsblk_t)le32_to_cpu(bg->bg_block_bitmap_hi) << 32 : 0);
 232}
 233
 234ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb,
 235                               struct ext4_group_desc *bg)
 236{
 237        return le32_to_cpu(bg->bg_inode_bitmap_lo) |
 238                (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
 239                 (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_bitmap_hi) << 32 : 0);
 240}
 241
 242ext4_fsblk_t ext4_inode_table(struct super_block *sb,
 243                              struct ext4_group_desc *bg)
 244{
 245        return le32_to_cpu(bg->bg_inode_table_lo) |
 246                (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
 247                 (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0);
 248}
 249
 250__u32 ext4_free_group_clusters(struct super_block *sb,
 251                               struct ext4_group_desc *bg)
 252{
 253        return le16_to_cpu(bg->bg_free_blocks_count_lo) |
 254                (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
 255                 (__u32)le16_to_cpu(bg->bg_free_blocks_count_hi) << 16 : 0);
 256}
 257
 258__u32 ext4_free_inodes_count(struct super_block *sb,
 259                              struct ext4_group_desc *bg)
 260{
 261        return le16_to_cpu(bg->bg_free_inodes_count_lo) |
 262                (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
 263                 (__u32)le16_to_cpu(bg->bg_free_inodes_count_hi) << 16 : 0);
 264}
 265
 266__u32 ext4_used_dirs_count(struct super_block *sb,
 267                              struct ext4_group_desc *bg)
 268{
 269        return le16_to_cpu(bg->bg_used_dirs_count_lo) |
 270                (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
 271                 (__u32)le16_to_cpu(bg->bg_used_dirs_count_hi) << 16 : 0);
 272}
 273
 274__u32 ext4_itable_unused_count(struct super_block *sb,
 275                              struct ext4_group_desc *bg)
 276{
 277        return le16_to_cpu(bg->bg_itable_unused_lo) |
 278                (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
 279                 (__u32)le16_to_cpu(bg->bg_itable_unused_hi) << 16 : 0);
 280}
 281
 282void ext4_block_bitmap_set(struct super_block *sb,
 283                           struct ext4_group_desc *bg, ext4_fsblk_t blk)
 284{
 285        bg->bg_block_bitmap_lo = cpu_to_le32((u32)blk);
 286        if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
 287                bg->bg_block_bitmap_hi = cpu_to_le32(blk >> 32);
 288}
 289
 290void ext4_inode_bitmap_set(struct super_block *sb,
 291                           struct ext4_group_desc *bg, ext4_fsblk_t blk)
 292{
 293        bg->bg_inode_bitmap_lo  = cpu_to_le32((u32)blk);
 294        if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
 295                bg->bg_inode_bitmap_hi = cpu_to_le32(blk >> 32);
 296}
 297
 298void ext4_inode_table_set(struct super_block *sb,
 299                          struct ext4_group_desc *bg, ext4_fsblk_t blk)
 300{
 301        bg->bg_inode_table_lo = cpu_to_le32((u32)blk);
 302        if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
 303                bg->bg_inode_table_hi = cpu_to_le32(blk >> 32);
 304}
 305
 306void ext4_free_group_clusters_set(struct super_block *sb,
 307                                  struct ext4_group_desc *bg, __u32 count)
 308{
 309        bg->bg_free_blocks_count_lo = cpu_to_le16((__u16)count);
 310        if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
 311                bg->bg_free_blocks_count_hi = cpu_to_le16(count >> 16);
 312}
 313
 314void ext4_free_inodes_set(struct super_block *sb,
 315                          struct ext4_group_desc *bg, __u32 count)
 316{
 317        bg->bg_free_inodes_count_lo = cpu_to_le16((__u16)count);
 318        if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
 319                bg->bg_free_inodes_count_hi = cpu_to_le16(count >> 16);
 320}
 321
 322void ext4_used_dirs_set(struct super_block *sb,
 323                          struct ext4_group_desc *bg, __u32 count)
 324{
 325        bg->bg_used_dirs_count_lo = cpu_to_le16((__u16)count);
 326        if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
 327                bg->bg_used_dirs_count_hi = cpu_to_le16(count >> 16);
 328}
 329
 330void ext4_itable_unused_set(struct super_block *sb,
 331                          struct ext4_group_desc *bg, __u32 count)
 332{
 333        bg->bg_itable_unused_lo = cpu_to_le16((__u16)count);
 334        if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
 335                bg->bg_itable_unused_hi = cpu_to_le16(count >> 16);
 336}
 337
 338static void __ext4_update_tstamp(__le32 *lo, __u8 *hi)
 339{
 340        time64_t now = ktime_get_real_seconds();
 341
 342        now = clamp_val(now, 0, (1ull << 40) - 1);
 343
 344        *lo = cpu_to_le32(lower_32_bits(now));
 345        *hi = upper_32_bits(now);
 346}
 347
 348static time64_t __ext4_get_tstamp(__le32 *lo, __u8 *hi)
 349{
 350        return ((time64_t)(*hi) << 32) + le32_to_cpu(*lo);
 351}
 352#define ext4_update_tstamp(es, tstamp) \
 353        __ext4_update_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi)
 354#define ext4_get_tstamp(es, tstamp) \
 355        __ext4_get_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi)
 356
 357static void __save_error_info(struct super_block *sb, const char *func,
 358                            unsigned int line)
 359{
 360        struct ext4_super_block *es = EXT4_SB(sb)->s_es;
 361
 362        EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
 363        if (bdev_read_only(sb->s_bdev))
 364                return;
 365        es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
 366        ext4_update_tstamp(es, s_last_error_time);
 367        strncpy(es->s_last_error_func, func, sizeof(es->s_last_error_func));
 368        es->s_last_error_line = cpu_to_le32(line);
 369        if (!es->s_first_error_time) {
 370                es->s_first_error_time = es->s_last_error_time;
 371                es->s_first_error_time_hi = es->s_last_error_time_hi;
 372                strncpy(es->s_first_error_func, func,
 373                        sizeof(es->s_first_error_func));
 374                es->s_first_error_line = cpu_to_le32(line);
 375                es->s_first_error_ino = es->s_last_error_ino;
 376                es->s_first_error_block = es->s_last_error_block;
 377        }
 378        /*
 379         * Start the daily error reporting function if it hasn't been
 380         * started already
 381         */
 382        if (!es->s_error_count)
 383                mod_timer(&EXT4_SB(sb)->s_err_report, jiffies + 24*60*60*HZ);
 384        le32_add_cpu(&es->s_error_count, 1);
 385}
 386
 387static void save_error_info(struct super_block *sb, const char *func,
 388                            unsigned int line)
 389{
 390        __save_error_info(sb, func, line);
 391        ext4_commit_super(sb, 1);
 392}
 393
 394/*
 395 * The del_gendisk() function uninitializes the disk-specific data
 396 * structures, including the bdi structure, without telling anyone
 397 * else.  Once this happens, any attempt to call mark_buffer_dirty()
 398 * (for example, by ext4_commit_super), will cause a kernel OOPS.
 399 * This is a kludge to prevent these oops until we can put in a proper
 400 * hook in del_gendisk() to inform the VFS and file system layers.
 401 */
 402static int block_device_ejected(struct super_block *sb)
 403{
 404        struct inode *bd_inode = sb->s_bdev->bd_inode;
 405        struct backing_dev_info *bdi = inode_to_bdi(bd_inode);
 406
 407        return bdi->dev == NULL;
 408}
 409
 410static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn)
 411{
 412        struct super_block              *sb = journal->j_private;
 413        struct ext4_sb_info             *sbi = EXT4_SB(sb);
 414        int                             error = is_journal_aborted(journal);
 415        struct ext4_journal_cb_entry    *jce;
 416
 417        BUG_ON(txn->t_state == T_FINISHED);
 418
 419        ext4_process_freed_data(sb, txn->t_tid);
 420
 421        spin_lock(&sbi->s_md_lock);
 422        while (!list_empty(&txn->t_private_list)) {
 423                jce = list_entry(txn->t_private_list.next,
 424                                 struct ext4_journal_cb_entry, jce_list);
 425                list_del_init(&jce->jce_list);
 426                spin_unlock(&sbi->s_md_lock);
 427                jce->jce_func(sb, jce, error);
 428                spin_lock(&sbi->s_md_lock);
 429        }
 430        spin_unlock(&sbi->s_md_lock);
 431}
 432
 433static bool system_going_down(void)
 434{
 435        return system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF
 436                || system_state == SYSTEM_RESTART;
 437}
 438
 439/* Deal with the reporting of failure conditions on a filesystem such as
 440 * inconsistencies detected or read IO failures.
 441 *
 442 * On ext2, we can store the error state of the filesystem in the
 443 * superblock.  That is not possible on ext4, because we may have other
 444 * write ordering constraints on the superblock which prevent us from
 445 * writing it out straight away; and given that the journal is about to
 446 * be aborted, we can't rely on the current, or future, transactions to
 447 * write out the superblock safely.
 448 *
 449 * We'll just use the jbd2_journal_abort() error code to record an error in
 450 * the journal instead.  On recovery, the journal will complain about
 451 * that error until we've noted it down and cleared it.
 452 */
 453
 454static void ext4_handle_error(struct super_block *sb)
 455{
 456        if (test_opt(sb, WARN_ON_ERROR))
 457                WARN_ON_ONCE(1);
 458
 459        if (sb_rdonly(sb))
 460                return;
 461
 462        if (!test_opt(sb, ERRORS_CONT)) {
 463                journal_t *journal = EXT4_SB(sb)->s_journal;
 464
 465                EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED;
 466                if (journal)
 467                        jbd2_journal_abort(journal, -EIO);
 468        }
 469        /*
 470         * We force ERRORS_RO behavior when system is rebooting. Otherwise we
 471         * could panic during 'reboot -f' as the underlying device got already
 472         * disabled.
 473         */
 474        if (test_opt(sb, ERRORS_RO) || system_going_down()) {
 475                ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
 476                /*
 477                 * Make sure updated value of ->s_mount_flags will be visible
 478                 * before ->s_flags update
 479                 */
 480                smp_wmb();
 481                sb->s_flags |= SB_RDONLY;
 482        } else if (test_opt(sb, ERRORS_PANIC)) {
 483                if (EXT4_SB(sb)->s_journal &&
 484                  !(EXT4_SB(sb)->s_journal->j_flags & JBD2_REC_ERR))
 485                        return;
 486                panic("EXT4-fs (device %s): panic forced after error\n",
 487                        sb->s_id);
 488        }
 489}
 490
 491#define ext4_error_ratelimit(sb)                                        \
 492                ___ratelimit(&(EXT4_SB(sb)->s_err_ratelimit_state),     \
 493                             "EXT4-fs error")
 494
 495void __ext4_error(struct super_block *sb, const char *function,
 496                  unsigned int line, const char *fmt, ...)
 497{
 498        struct va_format vaf;
 499        va_list args;
 500
 501        if (unlikely(ext4_forced_shutdown(EXT4_SB(sb))))
 502                return;
 503
 504        trace_ext4_error(sb, function, line);
 505        if (ext4_error_ratelimit(sb)) {
 506                va_start(args, fmt);
 507                vaf.fmt = fmt;
 508                vaf.va = &args;
 509                printk(KERN_CRIT
 510                       "EXT4-fs error (device %s): %s:%d: comm %s: %pV\n",
 511                       sb->s_id, function, line, current->comm, &vaf);
 512                va_end(args);
 513        }
 514        save_error_info(sb, function, line);
 515        ext4_handle_error(sb);
 516}
 517
 518void __ext4_error_inode(struct inode *inode, const char *function,
 519                        unsigned int line, ext4_fsblk_t block,
 520                        const char *fmt, ...)
 521{
 522        va_list args;
 523        struct va_format vaf;
 524        struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
 525
 526        if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
 527                return;
 528
 529        trace_ext4_error(inode->i_sb, function, line);
 530        es->s_last_error_ino = cpu_to_le32(inode->i_ino);
 531        es->s_last_error_block = cpu_to_le64(block);
 532        if (ext4_error_ratelimit(inode->i_sb)) {
 533                va_start(args, fmt);
 534                vaf.fmt = fmt;
 535                vaf.va = &args;
 536                if (block)
 537                        printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: "
 538                               "inode #%lu: block %llu: comm %s: %pV\n",
 539                               inode->i_sb->s_id, function, line, inode->i_ino,
 540                               block, current->comm, &vaf);
 541                else
 542                        printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: "
 543                               "inode #%lu: comm %s: %pV\n",
 544                               inode->i_sb->s_id, function, line, inode->i_ino,
 545                               current->comm, &vaf);
 546                va_end(args);
 547        }
 548        save_error_info(inode->i_sb, function, line);
 549        ext4_handle_error(inode->i_sb);
 550}
 551
 552void __ext4_error_file(struct file *file, const char *function,
 553                       unsigned int line, ext4_fsblk_t block,
 554                       const char *fmt, ...)
 555{
 556        va_list args;
 557        struct va_format vaf;
 558        struct ext4_super_block *es;
 559        struct inode *inode = file_inode(file);
 560        char pathname[80], *path;
 561
 562        if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
 563                return;
 564
 565        trace_ext4_error(inode->i_sb, function, line);
 566        es = EXT4_SB(inode->i_sb)->s_es;
 567        es->s_last_error_ino = cpu_to_le32(inode->i_ino);
 568        if (ext4_error_ratelimit(inode->i_sb)) {
 569                path = file_path(file, pathname, sizeof(pathname));
 570                if (IS_ERR(path))
 571                        path = "(unknown)";
 572                va_start(args, fmt);
 573                vaf.fmt = fmt;
 574                vaf.va = &args;
 575                if (block)
 576                        printk(KERN_CRIT
 577                               "EXT4-fs error (device %s): %s:%d: inode #%lu: "
 578                               "block %llu: comm %s: path %s: %pV\n",
 579                               inode->i_sb->s_id, function, line, inode->i_ino,
 580                               block, current->comm, path, &vaf);
 581                else
 582                        printk(KERN_CRIT
 583                               "EXT4-fs error (device %s): %s:%d: inode #%lu: "
 584                               "comm %s: path %s: %pV\n",
 585                               inode->i_sb->s_id, function, line, inode->i_ino,
 586                               current->comm, path, &vaf);
 587                va_end(args);
 588        }
 589        save_error_info(inode->i_sb, function, line);
 590        ext4_handle_error(inode->i_sb);
 591}
 592
 593const char *ext4_decode_error(struct super_block *sb, int errno,
 594                              char nbuf[16])
 595{
 596        char *errstr = NULL;
 597
 598        switch (errno) {
 599        case -EFSCORRUPTED:
 600                errstr = "Corrupt filesystem";
 601                break;
 602        case -EFSBADCRC:
 603                errstr = "Filesystem failed CRC";
 604                break;
 605        case -EIO:
 606                errstr = "IO failure";
 607                break;
 608        case -ENOMEM:
 609                errstr = "Out of memory";
 610                break;
 611        case -EROFS:
 612                if (!sb || (EXT4_SB(sb)->s_journal &&
 613                            EXT4_SB(sb)->s_journal->j_flags & JBD2_ABORT))
 614                        errstr = "Journal has aborted";
 615                else
 616                        errstr = "Readonly filesystem";
 617                break;
 618        default:
 619                /* If the caller passed in an extra buffer for unknown
 620                 * errors, textualise them now.  Else we just return
 621                 * NULL. */
 622                if (nbuf) {
 623                        /* Check for truncated error codes... */
 624                        if (snprintf(nbuf, 16, "error %d", -errno) >= 0)
 625                                errstr = nbuf;
 626                }
 627                break;
 628        }
 629
 630        return errstr;
 631}
 632
 633/* __ext4_std_error decodes expected errors from journaling functions
 634 * automatically and invokes the appropriate error response.  */
 635
 636void __ext4_std_error(struct super_block *sb, const char *function,
 637                      unsigned int line, int errno)
 638{
 639        char nbuf[16];
 640        const char *errstr;
 641
 642        if (unlikely(ext4_forced_shutdown(EXT4_SB(sb))))
 643                return;
 644
 645        /* Special case: if the error is EROFS, and we're not already
 646         * inside a transaction, then there's really no point in logging
 647         * an error. */
 648        if (errno == -EROFS && journal_current_handle() == NULL && sb_rdonly(sb))
 649                return;
 650
 651        if (ext4_error_ratelimit(sb)) {
 652                errstr = ext4_decode_error(sb, errno, nbuf);
 653                printk(KERN_CRIT "EXT4-fs error (device %s) in %s:%d: %s\n",
 654                       sb->s_id, function, line, errstr);
 655        }
 656
 657        save_error_info(sb, function, line);
 658        ext4_handle_error(sb);
 659}
 660
 661/*
 662 * ext4_abort is a much stronger failure handler than ext4_error.  The
 663 * abort function may be used to deal with unrecoverable failures such
 664 * as journal IO errors or ENOMEM at a critical moment in log management.
 665 *
 666 * We unconditionally force the filesystem into an ABORT|READONLY state,
 667 * unless the error response on the fs has been set to panic in which
 668 * case we take the easy way out and panic immediately.
 669 */
 670
 671void __ext4_abort(struct super_block *sb, const char *function,
 672                unsigned int line, const char *fmt, ...)
 673{
 674        struct va_format vaf;
 675        va_list args;
 676
 677        if (unlikely(ext4_forced_shutdown(EXT4_SB(sb))))
 678                return;
 679
 680        save_error_info(sb, function, line);
 681        va_start(args, fmt);
 682        vaf.fmt = fmt;
 683        vaf.va = &args;
 684        printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: %pV\n",
 685               sb->s_id, function, line, &vaf);
 686        va_end(args);
 687
 688        if (sb_rdonly(sb) == 0) {
 689                ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
 690                EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED;
 691                /*
 692                 * Make sure updated value of ->s_mount_flags will be visible
 693                 * before ->s_flags update
 694                 */
 695                smp_wmb();
 696                sb->s_flags |= SB_RDONLY;
 697                if (EXT4_SB(sb)->s_journal)
 698                        jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
 699                save_error_info(sb, function, line);
 700        }
 701        if (test_opt(sb, ERRORS_PANIC) && !system_going_down()) {
 702                if (EXT4_SB(sb)->s_journal &&
 703                  !(EXT4_SB(sb)->s_journal->j_flags & JBD2_REC_ERR))
 704                        return;
 705                panic("EXT4-fs panic from previous error\n");
 706        }
 707}
 708
 709void __ext4_msg(struct super_block *sb,
 710                const char *prefix, const char *fmt, ...)
 711{
 712        struct va_format vaf;
 713        va_list args;
 714
 715        if (!___ratelimit(&(EXT4_SB(sb)->s_msg_ratelimit_state), "EXT4-fs"))
 716                return;
 717
 718        va_start(args, fmt);
 719        vaf.fmt = fmt;
 720        vaf.va = &args;
 721        printk("%sEXT4-fs (%s): %pV\n", prefix, sb->s_id, &vaf);
 722        va_end(args);
 723}
 724
 725#define ext4_warning_ratelimit(sb)                                      \
 726                ___ratelimit(&(EXT4_SB(sb)->s_warning_ratelimit_state), \
 727                             "EXT4-fs warning")
 728
 729void __ext4_warning(struct super_block *sb, const char *function,
 730                    unsigned int line, const char *fmt, ...)
 731{
 732        struct va_format vaf;
 733        va_list args;
 734
 735        if (!ext4_warning_ratelimit(sb))
 736                return;
 737
 738        va_start(args, fmt);
 739        vaf.fmt = fmt;
 740        vaf.va = &args;
 741        printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: %pV\n",
 742               sb->s_id, function, line, &vaf);
 743        va_end(args);
 744}
 745
 746void __ext4_warning_inode(const struct inode *inode, const char *function,
 747                          unsigned int line, const char *fmt, ...)
 748{
 749        struct va_format vaf;
 750        va_list args;
 751
 752        if (!ext4_warning_ratelimit(inode->i_sb))
 753                return;
 754
 755        va_start(args, fmt);
 756        vaf.fmt = fmt;
 757        vaf.va = &args;
 758        printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: "
 759               "inode #%lu: comm %s: %pV\n", inode->i_sb->s_id,
 760               function, line, inode->i_ino, current->comm, &vaf);
 761        va_end(args);
 762}
 763
 764void __ext4_grp_locked_error(const char *function, unsigned int line,
 765                             struct super_block *sb, ext4_group_t grp,
 766                             unsigned long ino, ext4_fsblk_t block,
 767                             const char *fmt, ...)
 768__releases(bitlock)
 769__acquires(bitlock)
 770{
 771        struct va_format vaf;
 772        va_list args;
 773        struct ext4_super_block *es = EXT4_SB(sb)->s_es;
 774
 775        if (unlikely(ext4_forced_shutdown(EXT4_SB(sb))))
 776                return;
 777
 778        trace_ext4_error(sb, function, line);
 779        es->s_last_error_ino = cpu_to_le32(ino);
 780        es->s_last_error_block = cpu_to_le64(block);
 781        __save_error_info(sb, function, line);
 782
 783        if (ext4_error_ratelimit(sb)) {
 784                va_start(args, fmt);
 785                vaf.fmt = fmt;
 786                vaf.va = &args;
 787                printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u, ",
 788                       sb->s_id, function, line, grp);
 789                if (ino)
 790                        printk(KERN_CONT "inode %lu: ", ino);
 791                if (block)
 792                        printk(KERN_CONT "block %llu:",
 793                               (unsigned long long) block);
 794                printk(KERN_CONT "%pV\n", &vaf);
 795                va_end(args);
 796        }
 797
 798        if (test_opt(sb, WARN_ON_ERROR))
 799                WARN_ON_ONCE(1);
 800
 801        if (test_opt(sb, ERRORS_CONT)) {
 802                ext4_commit_super(sb, 0);
 803                return;
 804        }
 805
 806        ext4_unlock_group(sb, grp);
 807        ext4_commit_super(sb, 1);
 808        ext4_handle_error(sb);
 809        /*
 810         * We only get here in the ERRORS_RO case; relocking the group
 811         * may be dangerous, but nothing bad will happen since the
 812         * filesystem will have already been marked read/only and the
 813         * journal has been aborted.  We return 1 as a hint to callers
 814         * who might what to use the return value from
 815         * ext4_grp_locked_error() to distinguish between the
 816         * ERRORS_CONT and ERRORS_RO case, and perhaps return more
 817         * aggressively from the ext4 function in question, with a
 818         * more appropriate error code.
 819         */
 820        ext4_lock_group(sb, grp);
 821        return;
 822}
 823
 824void ext4_mark_group_bitmap_corrupted(struct super_block *sb,
 825                                     ext4_group_t group,
 826                                     unsigned int flags)
 827{
 828        struct ext4_sb_info *sbi = EXT4_SB(sb);
 829        struct ext4_group_info *grp = ext4_get_group_info(sb, group);
 830        struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL);
 831        int ret;
 832
 833        if (flags & EXT4_GROUP_INFO_BBITMAP_CORRUPT) {
 834                ret = ext4_test_and_set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT,
 835                                            &grp->bb_state);
 836                if (!ret)
 837                        percpu_counter_sub(&sbi->s_freeclusters_counter,
 838                                           grp->bb_free);
 839        }
 840
 841        if (flags & EXT4_GROUP_INFO_IBITMAP_CORRUPT) {
 842                ret = ext4_test_and_set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT,
 843                                            &grp->bb_state);
 844                if (!ret && gdp) {
 845                        int count;
 846
 847                        count = ext4_free_inodes_count(sb, gdp);
 848                        percpu_counter_sub(&sbi->s_freeinodes_counter,
 849                                           count);
 850                }
 851        }
 852}
 853
 854void ext4_update_dynamic_rev(struct super_block *sb)
 855{
 856        struct ext4_super_block *es = EXT4_SB(sb)->s_es;
 857
 858        if (le32_to_cpu(es->s_rev_level) > EXT4_GOOD_OLD_REV)
 859                return;
 860
 861        ext4_warning(sb,
 862                     "updating to rev %d because of new feature flag, "
 863                     "running e2fsck is recommended",
 864                     EXT4_DYNAMIC_REV);
 865
 866        es->s_first_ino = cpu_to_le32(EXT4_GOOD_OLD_FIRST_INO);
 867        es->s_inode_size = cpu_to_le16(EXT4_GOOD_OLD_INODE_SIZE);
 868        es->s_rev_level = cpu_to_le32(EXT4_DYNAMIC_REV);
 869        /* leave es->s_feature_*compat flags alone */
 870        /* es->s_uuid will be set by e2fsck if empty */
 871
 872        /*
 873         * The rest of the superblock fields should be zero, and if not it
 874         * means they are likely already in use, so leave them alone.  We
 875         * can leave it up to e2fsck to clean up any inconsistencies there.
 876         */
 877}
 878
 879/*
 880 * Open the external journal device
 881 */
 882static struct block_device *ext4_blkdev_get(dev_t dev, struct super_block *sb)
 883{
 884        struct block_device *bdev;
 885        char b[BDEVNAME_SIZE];
 886
 887        bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb);
 888        if (IS_ERR(bdev))
 889                goto fail;
 890        return bdev;
 891
 892fail:
 893        ext4_msg(sb, KERN_ERR, "failed to open journal device %s: %ld",
 894                        __bdevname(dev, b), PTR_ERR(bdev));
 895        return NULL;
 896}
 897
 898/*
 899 * Release the journal device
 900 */
 901static void ext4_blkdev_put(struct block_device *bdev)
 902{
 903        blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
 904}
 905
 906static void ext4_blkdev_remove(struct ext4_sb_info *sbi)
 907{
 908        struct block_device *bdev;
 909        bdev = sbi->journal_bdev;
 910        if (bdev) {
 911                ext4_blkdev_put(bdev);
 912                sbi->journal_bdev = NULL;
 913        }
 914}
 915
 916static inline struct inode *orphan_list_entry(struct list_head *l)
 917{
 918        return &list_entry(l, struct ext4_inode_info, i_orphan)->vfs_inode;
 919}
 920
 921static void dump_orphan_list(struct super_block *sb, struct ext4_sb_info *sbi)
 922{
 923        struct list_head *l;
 924
 925        ext4_msg(sb, KERN_ERR, "sb orphan head is %d",
 926                 le32_to_cpu(sbi->s_es->s_last_orphan));
 927
 928        printk(KERN_ERR "sb_info orphan list:\n");
 929        list_for_each(l, &sbi->s_orphan) {
 930                struct inode *inode = orphan_list_entry(l);
 931                printk(KERN_ERR "  "
 932                       "inode %s:%lu at %p: mode %o, nlink %d, next %d\n",
 933                       inode->i_sb->s_id, inode->i_ino, inode,
 934                       inode->i_mode, inode->i_nlink,
 935                       NEXT_ORPHAN(inode));
 936        }
 937}
 938
 939#ifdef CONFIG_QUOTA
 940static int ext4_quota_off(struct super_block *sb, int type);
 941
 942static inline void ext4_quota_off_umount(struct super_block *sb)
 943{
 944        int type;
 945
 946        /* Use our quota_off function to clear inode flags etc. */
 947        for (type = 0; type < EXT4_MAXQUOTAS; type++)
 948                ext4_quota_off(sb, type);
 949}
 950
 951/*
 952 * This is a helper function which is used in the mount/remount
 953 * codepaths (which holds s_umount) to fetch the quota file name.
 954 */
 955static inline char *get_qf_name(struct super_block *sb,
 956                                struct ext4_sb_info *sbi,
 957                                int type)
 958{
 959        return rcu_dereference_protected(sbi->s_qf_names[type],
 960                                         lockdep_is_held(&sb->s_umount));
 961}
 962#else
 963static inline void ext4_quota_off_umount(struct super_block *sb)
 964{
 965}
 966#endif
 967
 968static void ext4_put_super(struct super_block *sb)
 969{
 970        struct ext4_sb_info *sbi = EXT4_SB(sb);
 971        struct ext4_super_block *es = sbi->s_es;
 972        int aborted = 0;
 973        int i, err;
 974
 975        ext4_unregister_li_request(sb);
 976        ext4_quota_off_umount(sb);
 977
 978        destroy_workqueue(sbi->rsv_conversion_wq);
 979
 980        if (sbi->s_journal) {
 981                aborted = is_journal_aborted(sbi->s_journal);
 982                err = jbd2_journal_destroy(sbi->s_journal);
 983                sbi->s_journal = NULL;
 984                if ((err < 0) && !aborted)
 985                        ext4_abort(sb, "Couldn't clean up the journal");
 986        }
 987
 988        ext4_unregister_sysfs(sb);
 989        ext4_es_unregister_shrinker(sbi);
 990        del_timer_sync(&sbi->s_err_report);
 991        ext4_release_system_zone(sb);
 992        ext4_mb_release(sb);
 993        ext4_ext_release(sb);
 994
 995        if (!sb_rdonly(sb) && !aborted) {
 996                ext4_clear_feature_journal_needs_recovery(sb);
 997                es->s_state = cpu_to_le16(sbi->s_mount_state);
 998        }
 999        if (!sb_rdonly(sb))
1000                ext4_commit_super(sb, 1);

1001
1002        for (i = 0; i < sbi->s_gdb_count; i++)
1003                brelse(sbi->s_group_desc[i]);
1004        kvfree(sbi->s_group_desc);
1005        kvfree(sbi->s_flex_groups);
1006        percpu_counter_destroy(&sbi->s_freeclusters_counter);
1007        percpu_counter_destroy(&sbi->s_freeinodes_counter);
1008        percpu_counter_destroy(&sbi->s_dirs_counter);
1009        percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
1010        percpu_free_rwsem(&sbi->s_journal_flag_rwsem);
1011#ifdef CONFIG_QUOTA
1012        for (i = 0; i < EXT4_MAXQUOTAS; i++)
1013                kfree(get_qf_name(sb, sbi, i));
1014#endif
1015
1016        /* Debugging code just in case the in-memory inode orphan list
1017         * isn't empty.  The on-disk one can be non-empty if we've
1018         * detected an error and taken the fs readonly, but the
1019         * in-memory list had better be clean by this point. */
1020        if (!list_empty(&sbi->s_orphan))
1021                dump_orphan_list(sb, sbi);
1022        J_ASSERT(list_empty(&sbi->s_orphan));
1023
1024        sync_blockdev(sb->s_bdev);
1025        invalidate_bdev(sb->s_bdev);
1026        if (sbi->journal_bdev && sbi->journal_bdev != sb->s_bdev) {
1027                /*
1028                 * Invalidate the journal device's buffers.  We don't want them
1029                 * floating about in memory - the physical journal device may
1030                 * hotswapped, and it breaks the `ro-after' testing code.
1031                 */
1032                sync_blockdev(sbi->journal_bdev);
1033                invalidate_bdev(sbi->journal_bdev);
1034                ext4_blkdev_remove(sbi);
1035        }
1036
1037        ext4_xattr_destroy_cache(sbi->s_ea_inode_cache);
1038        sbi->s_ea_inode_cache = NULL;
1039
1040        ext4_xattr_destroy_cache(sbi->s_ea_block_cache);
1041        sbi->s_ea_block_cache = NULL;
1042
1043        if (sbi->s_mmp_tsk)
1044                kthread_stop(sbi->s_mmp_tsk);
1045        brelse(sbi->s_sbh);
1046        sb->s_fs_info = NULL;
1047        /*
1048         * Now that we are completely done shutting down the
1049         * superblock, we need to actually destroy the kobject.
1050         */
1051        kobject_put(&sbi->s_kobj);
1052        wait_for_completion(&sbi->s_kobj_unregister);
1053        if (sbi->s_chksum_driver)
1054                crypto_free_shash(sbi->s_chksum_driver);
1055        kfree(sbi->s_blockgroup_lock);
1056        fs_put_dax(sbi->s_daxdev);
1057        kfree(sbi);
1058}
1059
1060static struct kmem_cache *ext4_inode_cachep;
1061
1062/*
1063 * Called inside transaction, so use GFP_NOFS
1064 */
1065static struct inode *ext4_alloc_inode(struct super_block *sb)
1066{
1067        struct ext4_inode_info *ei;
1068
1069        ei = kmem_cache_alloc(ext4_inode_cachep, GFP_NOFS);
1070        if (!ei)
1071                return NULL;
1072
1073        inode_set_iversion(&ei->vfs_inode, 1);
1074        spin_lock_init(&ei->i_raw_lock);
1075        INIT_LIST_HEAD(&ei->i_prealloc_list);
1076        spin_lock_init(&ei->i_prealloc_lock);
1077        ext4_es_init_tree(&ei->i_es_tree);
1078        rwlock_init(&ei->i_es_lock);
1079        INIT_LIST_HEAD(&ei->i_es_list);
1080        ei->i_es_all_nr = 0;
1081        ei->i_es_shk_nr = 0;
1082        ei->i_es_shrink_lblk = 0;
1083        ei->i_reserved_data_blocks = 0;
1084        ei->i_da_metadata_calc_len = 0;
1085        ei->i_da_metadata_calc_last_lblock = 0;
1086        spin_lock_init(&(ei->i_block_reservation_lock));
1087        ext4_init_pending_tree(&ei->i_pending_tree);
1088#ifdef CONFIG_QUOTA
1089        ei->i_reserved_quota = 0;
1090        memset(&ei->i_dquot, 0, sizeof(ei->i_dquot));
1091#endif
1092        ei->jinode = NULL;
1093        INIT_LIST_HEAD(&ei->i_rsv_conversion_list);
1094        spin_lock_init(&ei->i_completed_io_lock);
1095        ei->i_sync_tid = 0;
1096        ei->i_datasync_tid = 0;
1097        atomic_set(&ei->i_unwritten, 0);
1098        INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work);
1099        return &ei->vfs_inode;
1100}
1101
/* Defer to generic_drop_inode(), emitting a tracepoint with its verdict. */
static int ext4_drop_inode(struct inode *inode)
{
	int should_drop;

	should_drop = generic_drop_inode(inode);
	trace_ext4_drop_inode(inode, should_drop);

	return should_drop;
}
1109
1110static void ext4_i_callback(struct rcu_head *head)
1111{
1112        struct inode *inode = container_of(head, struct inode, i_rcu);
1113        kmem_cache_free(ext4_inode_cachep, EXT4_I(inode));
1114}
1115
1116static void ext4_destroy_inode(struct inode *inode)
1117{
1118        if (!list_empty(&(EXT4_I(inode)->i_orphan))) {
1119                ext4_msg(inode->i_sb, KERN_ERR,
1120                         "Inode %lu (%p): orphan list check failed!",
1121                         inode->i_ino, EXT4_I(inode));
1122                print_hex_dump(KERN_INFO, "", DUMP_PREFIX_ADDRESS, 16, 4,
1123                                EXT4_I(inode), sizeof(struct ext4_inode_info),
1124                                true);
1125                dump_stack();
1126        }
1127        call_rcu(&inode->i_rcu, ext4_i_callback);
1128}
1129
1130static void init_once(void *foo)
1131{
1132        struct ext4_inode_info *ei = (struct ext4_inode_info *) foo;
1133
1134        INIT_LIST_HEAD(&ei->i_orphan);
1135        init_rwsem(&ei->xattr_sem);
1136        init_rwsem(&ei->i_data_sem);
1137        init_rwsem(&ei->i_mmap_sem);
1138        inode_init_once(&ei->vfs_inode);
1139}
1140
1141static int __init init_inodecache(void)
1142{
1143        ext4_inode_cachep = kmem_cache_create_usercopy("ext4_inode_cache",
1144                                sizeof(struct ext4_inode_info), 0,
1145                                (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|
1146                                        SLAB_ACCOUNT),
1147                                offsetof(struct ext4_inode_info, i_data),
1148                                sizeof_field(struct ext4_inode_info, i_data),
1149                                init_once);
1150        if (ext4_inode_cachep == NULL)
1151                return -ENOMEM;
1152        return 0;
1153}
1154
1155static void destroy_inodecache(void)
1156{
1157        /*
1158         * Make sure all delayed rcu free inodes are flushed before we
1159         * destroy cache.
1160         */
1161        rcu_barrier();
1162        kmem_cache_destroy(ext4_inode_cachep);
1163}
1164
1165void ext4_clear_inode(struct inode *inode)
1166{
1167        invalidate_inode_buffers(inode);
1168        clear_inode(inode);
1169        dquot_drop(inode);
1170        ext4_discard_preallocations(inode);
1171        ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS);
1172        if (EXT4_I(inode)->jinode) {
1173                jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode),
1174                                               EXT4_I(inode)->jinode);
1175                jbd2_free_inode(EXT4_I(inode)->jinode);
1176                EXT4_I(inode)->jinode = NULL;
1177        }
1178        fscrypt_put_encryption_info(inode);
1179}
1180
1181static struct inode *ext4_nfs_get_inode(struct super_block *sb,
1182                                        u64 ino, u32 generation)
1183{
1184        struct inode *inode;
1185
1186        /*
1187         * Currently we don't know the generation for parent directory, so
1188         * a generation of 0 means "accept any"
1189         */
1190        inode = ext4_iget(sb, ino, EXT4_IGET_HANDLE);
1191        if (IS_ERR(inode))
1192                return ERR_CAST(inode);
1193        if (generation && inode->i_generation != generation) {
1194                iput(inode);
1195                return ERR_PTR(-ESTALE);
1196        }
1197
1198        return inode;
1199}
1200
1201static struct dentry *ext4_fh_to_dentry(struct super_block *sb, struct fid *fid,
1202                                        int fh_len, int fh_type)
1203{
1204        return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
1205                                    ext4_nfs_get_inode);
1206}
1207
1208static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid,
1209                                        int fh_len, int fh_type)
1210{
1211        return generic_fh_to_parent(sb, fid, fh_len, fh_type,
1212                                    ext4_nfs_get_inode);
1213}
1214
1215static int ext4_nfs_commit_metadata(struct inode *inode)
1216{
1217        struct writeback_control wbc = {
1218                .sync_mode = WB_SYNC_ALL
1219        };
1220
1221        trace_ext4_nfs_commit_metadata(inode);
1222        return ext4_write_inode(inode, &wbc);
1223}
1224
1225/*
1226 * Try to release metadata pages (indirect blocks, directories) which are
1227 * mapped via the block device.  Since these pages could have journal heads
1228 * which would prevent try_to_free_buffers() from freeing them, we must use
1229 * jbd2 layer's try_to_free_buffers() function to release them.
1230 */
1231static int bdev_try_to_free_page(struct super_block *sb, struct page *page,
1232                                 gfp_t wait)
1233{
1234        journal_t *journal = EXT4_SB(sb)->s_journal;
1235
1236        WARN_ON(PageChecked(page));
1237        if (!page_has_buffers(page))
1238                return 0;
1239        if (journal)
1240                return jbd2_journal_try_to_free_buffers(journal, page,
1241                                                wait & ~__GFP_DIRECT_RECLAIM);
1242        return try_to_free_buffers(page);
1243}
1244
1245#ifdef CONFIG_EXT4_FS_ENCRYPTION
1246static int ext4_get_context(struct inode *inode, void *ctx, size_t len)
1247{
1248        return ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION,
1249                                 EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, ctx, len);
1250}
1251
1252static int ext4_set_context(struct inode *inode, const void *ctx, size_t len,
1253                                                        void *fs_data)
1254{
1255        handle_t *handle = fs_data;
1256        int res, res2, credits, retries = 0;
1257
1258        /*
1259         * Encrypting the root directory is not allowed because e2fsck expects
1260         * lost+found to exist and be unencrypted, and encrypting the root
1261         * directory would imply encrypting the lost+found directory as well as
1262         * the filename "lost+found" itself.
1263         */
1264        if (inode->i_ino == EXT4_ROOT_INO)
1265                return -EPERM;
1266
1267        if (WARN_ON_ONCE(IS_DAX(inode) && i_size_read(inode)))
1268                return -EINVAL;
1269
1270        res = ext4_convert_inline_data(inode);
1271        if (res)
1272                return res;
1273
1274        /*
1275         * If a journal handle was specified, then the encryption context is
1276         * being set on a new inode via inheritance and is part of a larger
1277         * transaction to create the inode.  Otherwise the encryption context is
1278         * being set on an existing inode in its own transaction.  Only in the
1279         * latter case should the "retry on ENOSPC" logic be used.
1280         */
1281
1282        if (handle) {
1283                res = ext4_xattr_set_handle(handle, inode,
1284                                            EXT4_XATTR_INDEX_ENCRYPTION,
1285                                            EXT4_XATTR_NAME_ENCRYPTION_CONTEXT,
1286                                            ctx, len, 0);
1287                if (!res) {
1288                        ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT);
1289                        ext4_clear_inode_state(inode,
1290                                        EXT4_STATE_MAY_INLINE_DATA);
1291                        /*
1292                         * Update inode->i_flags - S_ENCRYPTED will be enabled,
1293                         * S_DAX may be disabled
1294                         */
1295                        ext4_set_inode_flags(inode);
1296                }
1297                return res;
1298        }
1299
1300        res = dquot_initialize(inode);
1301        if (res)
1302                return res;
1303retry:
1304        res = ext4_xattr_set_credits(inode, len, false /* is_create */,
1305                                     &credits);
1306        if (res)
1307                return res;
1308
1309        handle = ext4_journal_start(inode, EXT4_HT_MISC, credits);
1310        if (IS_ERR(handle))
1311                return PTR_ERR(handle);
1312
1313        res = ext4_xattr_set_handle(handle, inode, EXT4_XATTR_INDEX_ENCRYPTION,
1314                                    EXT4_XATTR_NAME_ENCRYPTION_CONTEXT,
1315                                    ctx, len, 0);
1316        if (!res) {
1317                ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT);
1318                /*
1319                 * Update inode->i_flags - S_ENCRYPTED will be enabled,
1320                 * S_DAX may be disabled
1321                 */
1322                ext4_set_inode_flags(inode);
1323                res = ext4_mark_inode_dirty(handle, inode);
1324                if (res)
1325                        EXT4_ERROR_INODE(inode, "Failed to mark inode dirty");
1326        }
1327        res2 = ext4_journal_stop(handle);
1328
1329        if (res == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
1330                goto retry;
1331        if (!res)
1332                res = res2;
1333        return res;
1334}
1335
1336static bool ext4_dummy_context(struct inode *inode)
1337{
1338        return DUMMY_ENCRYPTION_ENABLED(EXT4_SB(inode->i_sb));
1339}
1340
1341static const struct fscrypt_operations ext4_cryptops = {
1342        .key_prefix             = "ext4:",
1343        .get_context            = ext4_get_context,
1344        .set_context            = ext4_set_context,
1345        .dummy_context          = ext4_dummy_context,
1346        .empty_dir              = ext4_empty_dir,
1347        .max_namelen            = EXT4_NAME_LEN,
1348};
1349#endif
1350
1351#ifdef CONFIG_QUOTA
1352static const char * const quotatypes[] = INITQFNAMES;
1353#define QTYPE2NAME(t) (quotatypes[t])
1354
1355static int ext4_write_dquot(struct dquot *dquot);
1356static int ext4_acquire_dquot(struct dquot *dquot);
1357static int ext4_release_dquot(struct dquot *dquot);
1358static int ext4_mark_dquot_dirty(struct dquot *dquot);
1359static int ext4_write_info(struct super_block *sb, int type);
1360static int ext4_quota_on(struct super_block *sb, int type, int format_id,
1361                         const struct path *path);
1362static int ext4_quota_on_mount(struct super_block *sb, int type);
1363static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
1364                               size_t len, loff_t off);
1365static ssize_t ext4_quota_write(struct super_block *sb, int type,
1366                                const char *data, size_t len, loff_t off);
1367static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
1368                             unsigned int flags);
1369static int ext4_enable_quotas(struct super_block *sb);
1370static int ext4_get_next_id(struct super_block *sb, struct kqid *qid);
1371
1372static struct dquot **ext4_get_dquots(struct inode *inode)
1373{
1374        return EXT4_I(inode)->i_dquot;
1375}
1376
1377static const struct dquot_operations ext4_quota_operations = {
1378        .get_reserved_space     = ext4_get_reserved_space,
1379        .write_dquot            = ext4_write_dquot,
1380        .acquire_dquot          = ext4_acquire_dquot,
1381        .release_dquot          = ext4_release_dquot,
1382        .mark_dirty             = ext4_mark_dquot_dirty,
1383        .write_info             = ext4_write_info,
1384        .alloc_dquot            = dquot_alloc,
1385        .destroy_dquot          = dquot_destroy,
1386        .get_projid             = ext4_get_projid,
1387        .get_inode_usage        = ext4_get_inode_usage,
1388        .get_next_id            = ext4_get_next_id,
1389};
1390
1391static const struct quotactl_ops ext4_qctl_operations = {
1392        .quota_on       = ext4_quota_on,
1393        .quota_off      = ext4_quota_off,
1394        .quota_sync     = dquot_quota_sync,
1395        .get_state      = dquot_get_state,
1396        .set_info       = dquot_set_dqinfo,
1397        .get_dqblk      = dquot_get_dqblk,
1398        .set_dqblk      = dquot_set_dqblk,
1399        .get_nextdqblk  = dquot_get_next_dqblk,
1400};
1401#endif
1402
1403static const struct super_operations ext4_sops = {
1404        .alloc_inode    = ext4_alloc_inode,
1405        .destroy_inode  = ext4_destroy_inode,
1406        .write_inode    = ext4_write_inode,
1407        .dirty_inode    = ext4_dirty_inode,
1408        .drop_inode     = ext4_drop_inode,
1409        .evict_inode    = ext4_evict_inode,
1410        .put_super      = ext4_put_super,
1411        .sync_fs        = ext4_sync_fs,
1412        .freeze_fs      = ext4_freeze,
1413        .unfreeze_fs    = ext4_unfreeze,
1414        .statfs         = ext4_statfs,
1415        .remount_fs     = ext4_remount,
1416        .show_options   = ext4_show_options,
1417#ifdef CONFIG_QUOTA
1418        .quota_read     = ext4_quota_read,
1419        .quota_write    = ext4_quota_write,
1420        .get_dquots     = ext4_get_dquots,
1421#endif
1422        .bdev_try_to_free_page = bdev_try_to_free_page,
1423};
1424
1425static const struct export_operations ext4_export_ops = {
1426        .fh_to_dentry = ext4_fh_to_dentry,
1427        .fh_to_parent = ext4_fh_to_parent,
1428        .get_parent = ext4_get_parent,
1429        .commit_metadata = ext4_nfs_commit_metadata,
1430};
1431
/*
 * Mount option identifiers; the match table maps option strings to
 * these.  The enumerator order (and thus the values) is unchanged.
 */
enum {
	Opt_bsd_df,
	Opt_minix_df,
	Opt_grpid,
	Opt_nogrpid,
	Opt_resgid,
	Opt_resuid,
	Opt_sb,
	Opt_err_cont,
	Opt_err_panic,
	Opt_err_ro,
	Opt_nouid32,
	Opt_debug,
	Opt_removed,
	Opt_user_xattr,
	Opt_nouser_xattr,
	Opt_acl,
	Opt_noacl,
	Opt_auto_da_alloc,
	Opt_noauto_da_alloc,
	Opt_noload,
	Opt_commit,
	Opt_min_batch_time,
	Opt_max_batch_time,
	Opt_journal_dev,
	Opt_journal_path,
	Opt_journal_checksum,
	Opt_journal_async_commit,
	Opt_abort,
	Opt_data_journal,
	Opt_data_ordered,
	Opt_data_writeback,
	Opt_data_err_abort,
	Opt_data_err_ignore,
	Opt_test_dummy_encryption,
	Opt_usrjquota,
	Opt_grpjquota,
	Opt_offusrjquota,
	Opt_offgrpjquota,
	Opt_jqfmt_vfsold,
	Opt_jqfmt_vfsv0,
	Opt_jqfmt_vfsv1,
	Opt_quota,
	Opt_noquota,
	Opt_barrier,
	Opt_nobarrier,
	Opt_err,
	Opt_usrquota,
	Opt_grpquota,
	Opt_prjquota,
	Opt_i_version,
	Opt_dax,
	Opt_stripe,
	Opt_delalloc,
	Opt_nodelalloc,
	Opt_warn_on_error,
	Opt_nowarn_on_error,
	Opt_mblk_io_submit,
	Opt_lazytime,
	Opt_nolazytime,
	Opt_debug_want_extra_isize,
	Opt_nomblk_io_submit,
	Opt_block_validity,
	Opt_noblock_validity,
	Opt_inode_readahead_blks,
	Opt_journal_ioprio,
	Opt_dioread_nolock,
	Opt_dioread_lock,
	Opt_discard,
	Opt_nodiscard,
	Opt_init_itable,
	Opt_noinit_itable,
	Opt_max_dir_size_kb,
	Opt_nojournal_checksum,
	Opt_nombcache,
};
1455
1456static const match_table_t tokens = {
1457        {Opt_bsd_df, "bsddf"},
1458        {Opt_minix_df, "minixdf"},
1459        {Opt_grpid, "grpid"},
1460        {Opt_grpid, "bsdgroups"},
1461        {Opt_nogrpid, "nogrpid"},
1462        {Opt_nogrpid, "sysvgroups"},
1463        {Opt_resgid, "resgid=%u"},
1464        {Opt_resuid, "resuid=%u"},
1465        {Opt_sb, "sb=%u"},
1466        {Opt_err_cont, "errors=continue"},
1467        {Opt_err_panic, "errors=panic"},
1468        {Opt_err_ro, "errors=remount-ro"},
1469        {Opt_nouid32, "nouid32"},
1470        {Opt_debug, "debug"},
1471        {Opt_removed, "oldalloc"},
1472        {Opt_removed, "orlov"},
1473        {Opt_user_xattr, "user_xattr"},
1474        {Opt_nouser_xattr, "nouser_xattr"},
1475        {Opt_acl, "acl"},
1476        {Opt_noacl, "noacl"},
1477        {Opt_noload, "norecovery"},
1478        {Opt_noload, "noload"},
1479        {Opt_removed, "nobh"},
1480        {Opt_removed, "bh"},
1481        {Opt_commit, "commit=%u"},
1482        {Opt_min_batch_time, "min_batch_time=%u"},
1483        {Opt_max_batch_time, "max_batch_time=%u"},
1484        {Opt_journal_dev, "journal_dev=%u"},
1485        {Opt_journal_path, "journal_path=%s"},
1486        {Opt_journal_checksum, "journal_checksum"},
1487        {Opt_nojournal_checksum, "nojournal_checksum"},
1488        {Opt_journal_async_commit, "journal_async_commit"},
1489        {Opt_abort, "abort"},
1490        {Opt_data_journal, "data=journal"},
1491        {Opt_data_ordered, "data=ordered"},
1492        {Opt_data_writeback, "data=writeback"},
1493        {Opt_data_err_abort, "data_err=abort"},
1494        {Opt_data_err_ignore, "data_err=ignore"},
1495        {Opt_offusrjquota, "usrjquota="},
1496        {Opt_usrjquota, "usrjquota=%s"},
1497        {Opt_offgrpjquota, "grpjquota="},
1498        {Opt_grpjquota, "grpjquota=%s"},
1499        {Opt_jqfmt_vfsold, "jqfmt=vfsold"},
1500        {Opt_jqfmt_vfsv0, "jqfmt=vfsv0"},
1501        {Opt_jqfmt_vfsv1, "jqfmt=vfsv1"},
1502        {Opt_grpquota, "grpquota"},
1503        {Opt_noquota, "noquota"},
1504        {Opt_quota, "quota"},
1505        {Opt_usrquota, "usrquota"},
1506        {Opt_prjquota, "prjquota"},
1507        {Opt_barrier, "barrier=%u"},
1508        {Opt_barrier, "barrier"},
1509        {Opt_nobarrier, "nobarrier"},
1510        {Opt_i_version, "i_version"},
1511        {Opt_dax, "dax"},
1512        {Opt_stripe, "stripe=%u"},
1513        {Opt_delalloc, "delalloc"},
1514        {Opt_warn_on_error, "warn_on_error"},
1515        {Opt_nowarn_on_error, "nowarn_on_error"},
1516        {Opt_lazytime, "lazytime"},
1517        {Opt_nolazytime, "nolazytime"},
1518        {Opt_debug_want_extra_isize, "debug_want_extra_isize=%u"},
1519        {Opt_nodelalloc, "nodelalloc"},
1520        {Opt_removed, "mblk_io_submit"},
1521        {Opt_removed, "nomblk_io_submit"},
1522        {Opt_block_validity, "block_validity"},
1523        {Opt_noblock_validity, "noblock_validity"},
1524        {Opt_inode_readahead_blks, "inode_readahead_blks=%u"},
1525        {Opt_journal_ioprio, "journal_ioprio=%u"},
1526        {Opt_auto_da_alloc, "auto_da_alloc=%u"},
1527        {Opt_auto_da_alloc, "auto_da_alloc"},
1528        {Opt_noauto_da_alloc, "noauto_da_alloc"},
1529        {Opt_dioread_nolock, "dioread_nolock"},
1530        {Opt_dioread_lock, "dioread_lock"},
1531        {Opt_discard, "discard"},
1532        {Opt_nodiscard, "nodiscard"},
1533        {Opt_init_itable, "init_itable=%u"},
1534        {Opt_init_itable, "init_itable"},
1535        {Opt_noinit_itable, "noinit_itable"},
1536        {Opt_max_dir_size_kb, "max_dir_size_kb=%u"},
1537        {Opt_test_dummy_encryption, "test_dummy_encryption"},
1538        {Opt_nombcache, "nombcache"},
1539        {Opt_nombcache, "no_mbcache"},  /* for backward compatibility */
1540        {Opt_removed, "check=none"},    /* mount option from ext2/3 */
1541        {Opt_removed, "nocheck"},       /* mount option from ext2/3 */
1542        {Opt_removed, "reservation"},   /* mount option from ext2/3 */
1543        {Opt_removed, "noreservation"}, /* mount option from ext2/3 */
1544        {Opt_removed, "journal=%u"},    /* mount option from ext2/3 */
1545        {Opt_err, NULL},
1546};
1547
1548static ext4_fsblk_t get_sb_block(void **data)
1549{
1550        ext4_fsblk_t    sb_block;
1551        char            *options = (char *) *data;
1552
1553        if (!options || strncmp(options, "sb=", 3) != 0)
1554                return 1;       /* Default location */
1555
1556        options += 3;
1557        /* TODO: use simple_strtoll with >32bit ext4 */
1558        sb_block = simple_strtoul(options, &options, 0);
1559        if (*options && *options != ',') {
1560                printk(KERN_ERR "EXT4-fs: Invalid sb specification: %s\n",
1561                       (char *) *data);
1562                return 1;
1563        }
1564        if (*options == ',')
1565                options++;
1566        *data = (void *) options;
1567
1568        return sb_block;
1569}
1570
/* Default journal I/O priority: best-effort class, level 3. */
#define DEFAULT_JOURNAL_IOPRIO \
	(IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3))

/*
 * Format string for announcing a mount option that is going away; it
 * takes two %s arguments (the option, and when it will be removed).
 */
static const char deprecated_msg[] =
	"Mount option \"%s\" will be removed by %s\n"
	"Contact linux-ext4@vger.kernel.org if you think we should keep it.\n";
1575
1576#ifdef CONFIG_QUOTA
1577static int set_qf_name(struct super_block *sb, int qtype, substring_t *args)
1578{
1579        struct ext4_sb_info *sbi = EXT4_SB(sb);
1580        char *qname, *old_qname = get_qf_name(sb, sbi, qtype);
1581        int ret = -1;
1582
1583        if (sb_any_quota_loaded(sb) && !old_qname) {
1584                ext4_msg(sb, KERN_ERR,
1585                        "Cannot change journaled "
1586                        "quota options when quota turned on");
1587                return -1;
1588        }
1589        if (ext4_has_feature_quota(sb)) {
1590                ext4_msg(sb, KERN_INFO, "Journaled quota options "
1591                         "ignored when QUOTA feature is enabled");
1592                return 1;
1593        }
1594        qname = match_strdup(args);
1595        if (!qname) {
1596                ext4_msg(sb, KERN_ERR,
1597                        "Not enough memory for storing quotafile name");
1598                return -1;
1599        }
1600        if (old_qname) {
1601                if (strcmp(old_qname, qname) == 0)
1602                        ret = 1;
1603                else
1604                        ext4_msg(sb, KERN_ERR,
1605                                 "%s quota file already specified",
1606                                 QTYPE2NAME(qtype));
1607                goto errout;
1608        }
1609        if (strchr(qname, '/')) {
1610                ext4_msg(sb, KERN_ERR,
1611                        "quotafile must be on filesystem root");
1612                goto errout;
1613        }
1614        rcu_assign_pointer(sbi->s_qf_names[qtype], qname);
1615        set_opt(sb, QUOTA);
1616        return 1;
1617errout:
1618        kfree(qname);
1619        return ret;
1620}
1621
1622static int clear_qf_name(struct super_block *sb, int qtype)
1623{
1624
1625        struct ext4_sb_info *sbi = EXT4_SB(sb);
1626        char *old_qname = get_qf_name(sb, sbi, qtype);
1627
1628        if (sb_any_quota_loaded(sb) && old_qname) {
1629                ext4_msg(sb, KERN_ERR, "Cannot change journaled quota options"
1630                        " when quota turned on");
1631                return -1;
1632        }
1633        rcu_assign_pointer(sbi->s_qf_names[qtype], NULL);
1634        synchronize_rcu();
1635        kfree(old_qname);
1636        return 1;
1637}
1638#endif
1639
/*
 * Per-option behaviour flags used in ext4_mount_opts[] and interpreted by
 * handle_mount_opt().
 */
#define MOPT_SET        (1 << 0)        /* option turns mount_opt bits on */
#define MOPT_CLEAR      (1 << 1)        /* option turns mount_opt bits off */
#define MOPT_NOSUPPORT  (1 << 2)        /* logged as "option not supported" */
#define MOPT_EXPLICIT   (1 << 3)        /* also recorded via set_opt2(EXPLICIT_*) */
#define MOPT_CLEAR_ERR  (1 << 4)        /* clear ERRORS_MASK before applying */
#define MOPT_GTE0       (1 << 5)        /* numeric argument must be >= 0 */
#ifdef CONFIG_QUOTA
#define MOPT_Q          0
#define MOPT_QFMT       (1 << 6)        /* mount_opt is a journaled quota format */
#else
/* Without quota support every quota option is flagged as unsupported. */
#define MOPT_Q          MOPT_NOSUPPORT
#define MOPT_QFMT       MOPT_NOSUPPORT
#endif
#define MOPT_DATAJ      (1 << 7)        /* selects the data= journalling mode */
#define MOPT_NO_EXT2    (1 << 8)        /* rejected when IS_EXT2_SB() */
#define MOPT_NO_EXT3    (1 << 9)        /* rejected when IS_EXT3_SB() */
#define MOPT_EXT4_ONLY  (MOPT_NO_EXT2 | MOPT_NO_EXT3)
#define MOPT_STRING     (1 << 10)       /* argument is not parsed as an integer */
1658
1659static const struct mount_opts {
1660        int     token;
1661        int     mount_opt;
1662        int     flags;
1663} ext4_mount_opts[] = {
1664        {Opt_minix_df, EXT4_MOUNT_MINIX_DF, MOPT_SET},
1665        {Opt_bsd_df, EXT4_MOUNT_MINIX_DF, MOPT_CLEAR},
1666        {Opt_grpid, EXT4_MOUNT_GRPID, MOPT_SET},
1667        {Opt_nogrpid, EXT4_MOUNT_GRPID, MOPT_CLEAR},
1668        {Opt_block_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_SET},
1669        {Opt_noblock_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_CLEAR},
1670        {Opt_dioread_nolock, EXT4_MOUNT_DIOREAD_NOLOCK,
1671         MOPT_EXT4_ONLY | MOPT_SET},
1672        {Opt_dioread_lock, EXT4_MOUNT_DIOREAD_NOLOCK,
1673         MOPT_EXT4_ONLY | MOPT_CLEAR},
1674        {Opt_discard, EXT4_MOUNT_DISCARD, MOPT_SET},
1675        {Opt_nodiscard, EXT4_MOUNT_DISCARD, MOPT_CLEAR},
1676        {Opt_delalloc, EXT4_MOUNT_DELALLOC,
1677         MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT},
1678        {Opt_nodelalloc, EXT4_MOUNT_DELALLOC,
1679         MOPT_EXT4_ONLY | MOPT_CLEAR},
1680        {Opt_warn_on_error, EXT4_MOUNT_WARN_ON_ERROR, MOPT_SET},
1681        {Opt_nowarn_on_error, EXT4_MOUNT_WARN_ON_ERROR, MOPT_CLEAR},
1682        {Opt_nojournal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM,
1683         MOPT_EXT4_ONLY | MOPT_CLEAR},
1684        {Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM,
1685         MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT},
1686        {Opt_journal_async_commit, (EXT4_MOUNT_JOURNAL_ASYNC_COMMIT |
1687                                    EXT4_MOUNT_JOURNAL_CHECKSUM),
1688         MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT},
1689        {Opt_noload, EXT4_MOUNT_NOLOAD, MOPT_NO_EXT2 | MOPT_SET},
1690        {Opt_err_panic, EXT4_MOUNT_ERRORS_PANIC, MOPT_SET | MOPT_CLEAR_ERR},
1691        {Opt_err_ro, EXT4_MOUNT_ERRORS_RO, MOPT_SET | MOPT_CLEAR_ERR},
1692        {Opt_err_cont, EXT4_MOUNT_ERRORS_CONT, MOPT_SET | MOPT_CLEAR_ERR},
1693        {Opt_data_err_abort, EXT4_MOUNT_DATA_ERR_ABORT,
1694         MOPT_NO_EXT2},
1695        {Opt_data_err_ignore, EXT4_MOUNT_DATA_ERR_ABORT,
1696         MOPT_NO_EXT2},
1697        {Opt_barrier, EXT4_MOUNT_BARRIER, MOPT_SET},
1698        {Opt_nobarrier, EXT4_MOUNT_BARRIER, MOPT_CLEAR},
1699        {Opt_noauto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_SET},
1700        {Opt_auto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_CLEAR},
1701        {Opt_noinit_itable, EXT4_MOUNT_INIT_INODE_TABLE, MOPT_CLEAR},
1702        {Opt_commit, 0, MOPT_GTE0},
1703        {Opt_max_batch_time, 0, MOPT_GTE0},
1704        {Opt_min_batch_time, 0, MOPT_GTE0},
1705        {Opt_inode_readahead_blks, 0, MOPT_GTE0},
1706        {Opt_init_itable, 0, MOPT_GTE0},
1707        {Opt_dax, EXT4_MOUNT_DAX, MOPT_SET},
1708        {Opt_stripe, 0, MOPT_GTE0},
1709        {Opt_resuid, 0, MOPT_GTE0},
1710        {Opt_resgid, 0, MOPT_GTE0},
1711        {Opt_journal_dev, 0, MOPT_NO_EXT2 | MOPT_GTE0},
1712        {Opt_journal_path, 0, MOPT_NO_EXT2 | MOPT_STRING},
1713        {Opt_journal_ioprio, 0, MOPT_NO_EXT2 | MOPT_GTE0},
1714        {Opt_data_journal, EXT4_MOUNT_JOURNAL_DATA, MOPT_NO_EXT2 | MOPT_DATAJ},
1715        {Opt_data_ordered, EXT4_MOUNT_ORDERED_DATA, MOPT_NO_EXT2 | MOPT_DATAJ},
1716        {Opt_data_writeback, EXT4_MOUNT_WRITEBACK_DATA,
1717         MOPT_NO_EXT2 | MOPT_DATAJ},
1718        {Opt_user_xattr, EXT4_MOUNT_XATTR_USER, MOPT_SET},
1719        {Opt_nouser_xattr, EXT4_MOUNT_XATTR_USER, MOPT_CLEAR},
1720#ifdef CONFIG_EXT4_FS_POSIX_ACL
1721        {Opt_acl, EXT4_MOUNT_POSIX_ACL, MOPT_SET},
1722        {Opt_noacl, EXT4_MOUNT_POSIX_ACL, MOPT_CLEAR},
1723#else
1724        {Opt_acl, 0, MOPT_NOSUPPORT},
1725        {Opt_noacl, 0, MOPT_NOSUPPORT},
1726#endif
1727        {Opt_nouid32, EXT4_MOUNT_NO_UID32, MOPT_SET},
1728        {Opt_debug, EXT4_MOUNT_DEBUG, MOPT_SET},
1729        {Opt_debug_want_extra_isize, 0, MOPT_GTE0},
1730        {Opt_quota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA, MOPT_SET | MOPT_Q},
1731        {Opt_usrquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA,
1732                                                        MOPT_SET | MOPT_Q},
1733        {Opt_grpquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_GRPQUOTA,
1734                                                        MOPT_SET | MOPT_Q},
1735        {Opt_prjquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_PRJQUOTA,
1736                                                        MOPT_SET | MOPT_Q},
1737        {Opt_noquota, (EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA |
1738                       EXT4_MOUNT_GRPQUOTA | EXT4_MOUNT_PRJQUOTA),
1739                                                        MOPT_CLEAR | MOPT_Q},
1740        {Opt_usrjquota, 0, MOPT_Q},
1741        {Opt_grpjquota, 0, MOPT_Q},
1742        {Opt_offusrjquota, 0, MOPT_Q},
1743        {Opt_offgrpjquota, 0, MOPT_Q},
1744        {Opt_jqfmt_vfsold, QFMT_VFS_OLD, MOPT_QFMT},
1745        {Opt_jqfmt_vfsv0, QFMT_VFS_V0, MOPT_QFMT},
1746        {Opt_jqfmt_vfsv1, QFMT_VFS_V1, MOPT_QFMT},
1747        {Opt_max_dir_size_kb, 0, MOPT_GTE0},
1748        {Opt_test_dummy_encryption, 0, MOPT_GTE0},
1749        {Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET},
1750        {Opt_err, 0, 0}
1751};
1752
1753static int handle_mount_opt(struct super_block *sb, char *opt, int token,
1754                            substring_t *args, unsigned long *journal_devnum,
1755                            unsigned int *journal_ioprio, int is_remount)
1756{
1757        struct ext4_sb_info *sbi = EXT4_SB(sb);
1758        const struct mount_opts *m;
1759        kuid_t uid;
1760        kgid_t gid;
1761        int arg = 0;
1762
1763#ifdef CONFIG_QUOTA
1764        if (token == Opt_usrjquota)
1765                return set_qf_name(sb, USRQUOTA, &args[0]);
1766        else if (token == Opt_grpjquota)
1767                return set_qf_name(sb, GRPQUOTA, &args[0]);
1768        else if (token == Opt_offusrjquota)
1769                return clear_qf_name(sb, USRQUOTA);
1770        else if (token == Opt_offgrpjquota)
1771                return clear_qf_name(sb, GRPQUOTA);
1772#endif
1773        switch (token) {
1774        case Opt_noacl:
1775        case Opt_nouser_xattr:
1776                ext4_msg(sb, KERN_WARNING, deprecated_msg, opt, "3.5");
1777                break;
1778        case Opt_sb:
1779                return 1;       /* handled by get_sb_block() */
1780        case Opt_removed:
1781                ext4_msg(sb, KERN_WARNING, "Ignoring removed %s option", opt);
1782                return 1;
1783        case Opt_abort:
1784                sbi->s_mount_flags |= EXT4_MF_FS_ABORTED;
1785                return 1;
1786        case Opt_i_version:
1787                sb->s_flags |= SB_I_VERSION;
1788                return 1;
1789        case Opt_lazytime:
1790                sb->s_flags |= SB_LAZYTIME;
1791                return 1;
1792        case Opt_nolazytime:
1793                sb->s_flags &= ~SB_LAZYTIME;
1794                return 1;
1795        }
1796
1797        for (m = ext4_mount_opts; m->token != Opt_err; m++)
1798                if (token == m->token)
1799                        break;
1800
1801        if (m->token == Opt_err) {
1802                ext4_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" "
1803                         "or missing value", opt);
1804                return -1;
1805        }
1806
1807        if ((m->flags & MOPT_NO_EXT2) && IS_EXT2_SB(sb)) {
1808                ext4_msg(sb, KERN_ERR,
1809                         "Mount option \"%s\" incompatible with ext2", opt);
1810                return -1;
1811        }
1812        if ((m->flags & MOPT_NO_EXT3) && IS_EXT3_SB(sb)) {
1813                ext4_msg(sb, KERN_ERR,
1814                         "Mount option \"%s\" incompatible with ext3", opt);
1815                return -1;
1816        }
1817
1818        if (args->from && !(m->flags & MOPT_STRING) && match_int(args, &arg))
1819                return -1;
1820        if (args->from && (m->flags & MOPT_GTE0) && (arg < 0))
1821                return -1;
1822        if (m->flags & MOPT_EXPLICIT) {
1823                if (m->mount_opt & EXT4_MOUNT_DELALLOC) {
1824                        set_opt2(sb, EXPLICIT_DELALLOC);
1825                } else if (m->mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) {
1826                        set_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM);
1827                } else
1828                        return -1;
1829        }
1830        if (m->flags & MOPT_CLEAR_ERR)
1831                clear_opt(sb, ERRORS_MASK);
1832        if (token == Opt_noquota && sb_any_quota_loaded(sb)) {
1833                ext4_msg(sb, KERN_ERR, "Cannot change quota "
1834                         "options when quota turned on");
1835                return -1;
1836        }
1837
1838        if (m->flags & MOPT_NOSUPPORT) {
1839                ext4_msg(sb, KERN_ERR, "%s option not supported", opt);
1840        } else if (token == Opt_commit) {
1841                if (arg == 0)
1842                        arg = JBD2_DEFAULT_MAX_COMMIT_AGE;
1843                sbi->s_commit_interval = HZ * arg;
1844        } else if (token == Opt_debug_want_extra_isize) {
1845                sbi->s_want_extra_isize = arg;
1846        } else if (token == Opt_max_batch_time) {
1847                sbi->s_max_batch_time = arg;
1848        } else if (token == Opt_min_batch_time) {
1849                sbi->s_min_batch_time = arg;
1850        } else if (token == Opt_inode_readahead_blks) {
1851                if (arg && (arg > (1 << 30) || !is_power_of_2(arg))) {
1852                        ext4_msg(sb, KERN_ERR,
1853                                 "EXT4-fs: inode_readahead_blks must be "
1854                                 "0 or a power of 2 smaller than 2^31");
1855                        return -1;
1856                }
1857                sbi->s_inode_readahead_blks = arg;
1858        } else if (token == Opt_init_itable) {
1859                set_opt(sb, INIT_INODE_TABLE);
1860                if (!args->from)
1861                        arg = EXT4_DEF_LI_WAIT_MULT;
1862                sbi->s_li_wait_mult = arg;
1863        } else if (token == Opt_max_dir_size_kb) {
1864                sbi->s_max_dir_size_kb = arg;
1865        } else if (token == Opt_stripe) {
1866                sbi->s_stripe = arg;
1867        } else if (token == Opt_resuid) {
1868                uid = make_kuid(current_user_ns(), arg);
1869                if (!uid_valid(uid)) {
1870                        ext4_msg(sb, KERN_ERR, "Invalid uid value %d", arg);
1871                        return -1;
1872                }
1873                sbi->s_resuid = uid;
1874        } else if (token == Opt_resgid) {
1875                gid = make_kgid(current_user_ns(), arg);
1876                if (!gid_valid(gid)) {
1877                        ext4_msg(sb, KERN_ERR, "Invalid gid value %d", arg);
1878                        return -1;
1879                }
1880                sbi->s_resgid = gid;
1881        } else if (token == Opt_journal_dev) {
1882                if (is_remount) {
1883                        ext4_msg(sb, KERN_ERR,
1884                                 "Cannot specify journal on remount");
1885                        return -1;
1886                }
1887                *journal_devnum = arg;
1888        } else if (token == Opt_journal_path) {
1889                char *journal_path;
1890                struct inode *journal_inode;
1891                struct path path;
1892                int error;
1893
1894                if (is_remount) {
1895                        ext4_msg(sb, KERN_ERR,
1896                                 "Cannot specify journal on remount");
1897                        return -1;
1898                }
1899                journal_path = match_strdup(&args[0]);
1900                if (!journal_path) {
1901                        ext4_msg(sb, KERN_ERR, "error: could not dup "
1902                                "journal device string");
1903                        return -1;
1904                }
1905
1906                error = kern_path(journal_path, LOOKUP_FOLLOW, &path);
1907                if (error) {
1908                        ext4_msg(sb, KERN_ERR, "error: could not find "
1909                                "journal device path: error %d", error);
1910                        kfree(journal_path);
1911                        return -1;
1912                }
1913
1914                journal_inode = d_inode(path.dentry);
1915                if (!S_ISBLK(journal_inode->i_mode)) {
1916                        ext4_msg(sb, KERN_ERR, "error: journal path %s "
1917                                "is not a block device", journal_path);
1918                        path_put(&path);
1919                        kfree(journal_path);
1920                        return -1;
1921                }
1922
1923                *journal_devnum = new_encode_dev(journal_inode->i_rdev);
1924                path_put(&path);
1925                kfree(journal_path);
1926        } else if (token == Opt_journal_ioprio) {
1927                if (arg > 7) {
1928                        ext4_msg(sb, KERN_ERR, "Invalid journal IO priority"
1929                                 " (must be 0-7)");
1930                        return -1;
1931                }
1932                *journal_ioprio =
1933                        IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, arg);
1934        } else if (token == Opt_test_dummy_encryption) {
1935#ifdef CONFIG_EXT4_FS_ENCRYPTION
1936                sbi->s_mount_flags |= EXT4_MF_TEST_DUMMY_ENCRYPTION;
1937                ext4_msg(sb, KERN_WARNING,
1938                         "Test dummy encryption mode enabled");
1939#else
1940                ext4_msg(sb, KERN_WARNING,
1941                         "Test dummy encryption mount option ignored");
1942#endif
1943        } else if (m->flags & MOPT_DATAJ) {
1944                if (is_remount) {
1945                        if (!sbi->s_journal)
1946                                ext4_msg(sb, KERN_WARNING, "Remounting file system with no journal so ignoring journalled data option");
1947                        else if (test_opt(sb, DATA_FLAGS) != m->mount_opt) {
1948                                ext4_msg(sb, KERN_ERR,
1949                                         "Cannot change data mode on remount");
1950                                return -1;
1951                        }
1952                } else {
1953                        clear_opt(sb, DATA_FLAGS);
1954                        sbi->s_mount_opt |= m->mount_opt;
1955                }
1956#ifdef CONFIG_QUOTA
1957        } else if (m->flags & MOPT_QFMT) {
1958                if (sb_any_quota_loaded(sb) &&
1959                    sbi->s_jquota_fmt != m->mount_opt) {
1960                        ext4_msg(sb, KERN_ERR, "Cannot change journaled "
1961                                 "quota options when quota turned on");
1962                        return -1;
1963                }
1964                if (ext4_has_feature_quota(sb)) {
1965                        ext4_msg(sb, KERN_INFO,
1966                                 "Quota format mount options ignored "
1967                                 "when QUOTA feature is enabled");
1968                        return 1;
1969                }
1970                sbi->s_jquota_fmt = m->mount_opt;
1971#endif
1972        } else if (token == Opt_dax) {
1973#ifdef CONFIG_FS_DAX
1974                ext4_msg(sb, KERN_WARNING,
1975                "DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
1976                sbi->s_mount_opt |= m->mount_opt;
1977#else
1978                ext4_msg(sb, KERN_INFO, "dax option not supported");
1979                return -1;
1980#endif
1981        } else if (token == Opt_data_err_abort) {
1982                sbi->s_mount_opt |= m->mount_opt;
1983        } else if (token == Opt_data_err_ignore) {
1984                sbi->s_mount_opt &= ~m->mount_opt;
1985        } else {
1986                if (!args->from)
1987                        arg = 1;
1988                if (m->flags & MOPT_CLEAR)
1989                        arg = !arg;
1990                else if (unlikely(!(m->flags & MOPT_SET))) {
1991                        ext4_msg(sb, KERN_WARNING,
1992                                 "buggy handling of option %s", opt);
1993                        WARN_ON(1);
1994                        return -1;
1995                }
1996                if (arg != 0)
1997                        sbi->s_mount_opt |= m->mount_opt;
1998                else
1999                        sbi->s_mount_opt &= ~m->mount_opt;
2000        }

2001        return 1;
2002}
2003
2004static int parse_options(char *options, struct super_block *sb,
2005                         unsigned long *journal_devnum,
2006                         unsigned int *journal_ioprio,
2007                         int is_remount)
2008{
2009        struct ext4_sb_info *sbi = EXT4_SB(sb);
2010        char *p, __maybe_unused *usr_qf_name, __maybe_unused *grp_qf_name;
2011        substring_t args[MAX_OPT_ARGS];
2012        int token;
2013
2014        if (!options)
2015                return 1;
2016
2017        while ((p = strsep(&options, ",")) != NULL) {
2018                if (!*p)
2019                        continue;
2020                /*
2021                 * Initialize args struct so we know whether arg was
2022                 * found; some options take optional arguments.
2023                 */
2024                args[0].to = args[0].from = NULL;
2025                token = match_token(p, tokens, args);
2026                if (handle_mount_opt(sb, p, token, args, journal_devnum,
2027                                     journal_ioprio, is_remount) < 0)
2028                        return 0;
2029        }
2030#ifdef CONFIG_QUOTA
2031        /*
2032         * We do the test below only for project quotas. 'usrquota' and
2033         * 'grpquota' mount options are allowed even without quota feature
2034         * to support legacy quotas in quota files.
2035         */
2036        if (test_opt(sb, PRJQUOTA) && !ext4_has_feature_project(sb)) {
2037                ext4_msg(sb, KERN_ERR, "Project quota feature not enabled. "
2038                         "Cannot enable project quota enforcement.");
2039                return 0;
2040        }
2041        usr_qf_name = get_qf_name(sb, sbi, USRQUOTA);
2042        grp_qf_name = get_qf_name(sb, sbi, GRPQUOTA);
2043        if (usr_qf_name || grp_qf_name) {
2044                if (test_opt(sb, USRQUOTA) && usr_qf_name)
2045                        clear_opt(sb, USRQUOTA);
2046
2047                if (test_opt(sb, GRPQUOTA) && grp_qf_name)
2048                        clear_opt(sb, GRPQUOTA);
2049
2050                if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) {
2051                        ext4_msg(sb, KERN_ERR, "old and new quota "
2052                                        "format mixing");
2053                        return 0;
2054                }
2055
2056                if (!sbi->s_jquota_fmt) {
2057                        ext4_msg(sb, KERN_ERR, "journaled quota format "
2058                                        "not specified");
2059                        return 0;
2060                }
2061        }
2062#endif
2063        if (test_opt(sb, DIOREAD_NOLOCK)) {
2064                int blocksize =
2065                        BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size);
2066
2067                if (blocksize < PAGE_SIZE) {
2068                        ext4_msg(sb, KERN_ERR, "can't mount with "
2069                                 "dioread_nolock if block size != PAGE_SIZE");
2070                        return 0;
2071                }
2072        }
2073        return 1;
2074}
2075
/*
 * Append the journaled quota settings (format and quota file names) of
 * @sb to @seq.  Compiles to nothing without CONFIG_QUOTA.
 */
static inline void ext4_show_quota_options(struct seq_file *seq,
                                           struct super_block *sb)
{
#if defined(CONFIG_QUOTA)
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        char *usr_name, *grp_name;

        if (sbi->s_jquota_fmt) {
                /* Unknown format ids are printed with an empty name. */
                char *fmt = "";

                if (sbi->s_jquota_fmt == QFMT_VFS_OLD)
                        fmt = "vfsold";
                else if (sbi->s_jquota_fmt == QFMT_VFS_V0)
                        fmt = "vfsv0";
                else if (sbi->s_jquota_fmt == QFMT_VFS_V1)
                        fmt = "vfsv1";

                seq_printf(seq, ",jqfmt=%s", fmt);
        }

        /* The name strings are RCU protected; print them under the lock. */
        rcu_read_lock();
        usr_name = rcu_dereference(sbi->s_qf_names[USRQUOTA]);
        grp_name = rcu_dereference(sbi->s_qf_names[GRPQUOTA]);
        if (usr_name != NULL)
                seq_show_option(seq, "usrjquota", usr_name);
        if (grp_name != NULL)
                seq_show_option(seq, "grpjquota", grp_name);
        rcu_read_unlock();
#endif
}
2110
2111static const char *token2str(int token)
2112{
2113        const struct match_token *t;
2114
2115        for (t = tokens; t->token != Opt_err; t++)
2116                if (t->token == token && !strchr(t->pattern, '='))
2117                        break;
2118        return t->pattern;
2119}
2120
2121/*
2122 * Show an option if
2123 *  - it's set to a non-default value OR
2124 *  - if the per-sb default is different from the global default
2125 */
2126static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
2127                              int nodefs)
2128{
2129        struct ext4_sb_info *sbi = EXT4_SB(sb);
2130        struct ext4_super_block *es = sbi->s_es;
2131        int def_errors, def_mount_opt = sbi->s_def_mount_opt;
2132        const struct mount_opts *m;
2133        char sep = nodefs ? '\n' : ',';
2134
2135#define SEQ_OPTS_PUTS(str) seq_printf(seq, "%c" str, sep)
2136#define SEQ_OPTS_PRINT(str, arg) seq_printf(seq, "%c" str, sep, arg)
2137
2138        if (sbi->s_sb_block != 1)
2139                SEQ_OPTS_PRINT("sb=%llu", sbi->s_sb_block);
2140
2141        for (m = ext4_mount_opts; m->token != Opt_err; m++) {
2142                int want_set = m->flags & MOPT_SET;
2143                if (((m->flags & (MOPT_SET|MOPT_CLEAR)) == 0) ||
2144                    (m->flags & MOPT_CLEAR_ERR))
2145                        continue;
2146                if (!nodefs && !(m->mount_opt & (sbi->s_mount_opt ^ def_mount_opt)))
2147                        continue; /* skip if same as the default */
2148                if ((want_set &&
2149                     (sbi->s_mount_opt & m->mount_opt) != m->mount_opt) ||
2150                    (!want_set && (sbi->s_mount_opt & m->mount_opt)))
2151                        continue; /* select Opt_noFoo vs Opt_Foo */
2152                SEQ_OPTS_PRINT("%s", token2str(m->token));
2153        }
2154
2155        if (nodefs || !uid_eq(sbi->s_resuid, make_kuid(&init_user_ns, EXT4_DEF_RESUID)) ||
2156            le16_to_cpu(es->s_def_resuid) != EXT4_DEF_RESUID)
2157                SEQ_OPTS_PRINT("resuid=%u",
2158                                from_kuid_munged(&init_user_ns, sbi->s_resuid));
2159        if (nodefs || !gid_eq(sbi->s_resgid, make_kgid(&init_user_ns, EXT4_DEF_RESGID)) ||
2160            le16_to_cpu(es->s_def_resgid) != EXT4_DEF_RESGID)
2161                SEQ_OPTS_PRINT("resgid=%u",
2162                                from_kgid_munged(&init_user_ns, sbi->s_resgid));
2163        def_errors = nodefs ? -1 : le16_to_cpu(es->s_errors);
2164        if (test_opt(sb, ERRORS_RO) && def_errors != EXT4_ERRORS_RO)
2165                SEQ_OPTS_PUTS("errors=remount-ro");
2166        if (test_opt(sb, ERRORS_CONT) && def_errors != EXT4_ERRORS_CONTINUE)
2167                SEQ_OPTS_PUTS("errors=continue");
2168        if (test_opt(sb, ERRORS_PANIC) && def_errors != EXT4_ERRORS_PANIC)
2169                SEQ_OPTS_PUTS("errors=panic");
2170        if (nodefs || sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ)
2171                SEQ_OPTS_PRINT("commit=%lu", sbi->s_commit_interval / HZ);
2172        if (nodefs || sbi->s_min_batch_time != EXT4_DEF_MIN_BATCH_TIME)
2173                SEQ_OPTS_PRINT("min_batch_time=%u", sbi->s_min_batch_time);
2174        if (nodefs || sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME)
2175                SEQ_OPTS_PRINT("max_batch_time=%u", sbi->s_max_batch_time);
2176        if (sb->s_flags & SB_I_VERSION)
2177                SEQ_OPTS_PUTS("i_version");
2178        if (nodefs || sbi->s_stripe)
2179                SEQ_OPTS_PRINT("stripe=%lu", sbi->s_stripe);
2180        if (nodefs || EXT4_MOUNT_DATA_FLAGS &
2181                        (sbi->s_mount_opt ^ def_mount_opt)) {
2182                if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
2183                        SEQ_OPTS_PUTS("data=journal");
2184                else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
2185                        SEQ_OPTS_PUTS("data=ordered");
2186                else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
2187                        SEQ_OPTS_PUTS("data=writeback");
2188        }
2189        if (nodefs ||
2190            sbi->s_inode_readahead_blks != EXT4_DEF_INODE_READAHEAD_BLKS)
2191                SEQ_OPTS_PRINT("inode_readahead_blks=%u",
2192                               sbi->s_inode_readahead_blks);
2193
2194        if (test_opt(sb, INIT_INODE_TABLE) && (nodefs ||
2195                       (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT)))
2196                SEQ_OPTS_PRINT("init_itable=%u", sbi->s_li_wait_mult);
2197        if (nodefs || sbi->s_max_dir_size_kb)
2198                SEQ_OPTS_PRINT("max_dir_size_kb=%u", sbi->s_max_dir_size_kb);
2199        if (test_opt(sb, DATA_ERR_ABORT))
2200                SEQ_OPTS_PUTS("data_err=abort");
2201        if (DUMMY_ENCRYPTION_ENABLED(sbi))
2202                SEQ_OPTS_PUTS("test_dummy_encryption");
2203
2204        ext4_show_quota_options(seq, sb);
2205        return 0;
2206}
2207
2208static int ext4_show_options(struct seq_file *seq, struct dentry *root)
2209{
2210        return _ext4_show_options(seq, root->d_sb, 0);
2211}
2212
2213int ext4_seq_options_show(struct seq_file *seq, void *offset)
2214{
2215        struct super_block *sb = seq->private;
2216        int rc;
2217
2218        seq_puts(seq, sb_rdonly(sb) ? "ro" : "rw");
2219        rc = _ext4_show_options(seq, sb, 1);
2220        seq_puts(seq, "\n");
2221        return rc;
2222}
2223
2224static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
2225                            int read_only)
2226{
2227        struct ext4_sb_info *sbi = EXT4_SB(sb);
2228        int err = 0;
2229
2230        if (le32_to_cpu(es->s_rev_level) > EXT4_MAX_SUPP_REV) {
2231                ext4_msg(sb, KERN_ERR, "revision level too high, "
2232                         "forcing read-only mode");
2233                err = -EROFS;
2234        }
2235        if (read_only)
2236                goto done;
2237        if (!(sbi->s_mount_state & EXT4_VALID_FS))
2238                ext4_msg(sb, KERN_WARNING, "warning: mounting unchecked fs, "
2239                         "running e2fsck is recommended");
2240        else if (sbi->s_mount_state & EXT4_ERROR_FS)
2241                ext4_msg(sb, KERN_WARNING,
2242                         "warning: mounting fs with errors, "
2243                         "running e2fsck is recommended");
2244        else if ((__s16) le16_to_cpu(es->s_max_mnt_count) > 0 &&
2245                 le16_to_cpu(es->s_mnt_count) >=
2246                 (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count))
2247                ext4_msg(sb, KERN_WARNING,
2248                         "warning: maximal mount count reached, "
2249                         "running e2fsck is recommended");
2250        else if (le32_to_cpu(es->s_checkinterval) &&
2251                 (ext4_get_tstamp(es, s_lastcheck) +
2252                  le32_to_cpu(es->s_checkinterval) <= ktime_get_real_seconds()))
2253                ext4_msg(sb, KERN_WARNING,
2254                         "warning: checktime reached, "
2255                         "running e2fsck is recommended");
2256        if (!sbi->s_journal)
2257                es->s_state &= cpu_to_le16(~EXT4_VALID_FS);
2258        if (!(__s16) le16_to_cpu(es->s_max_mnt_count))
2259                es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT);
2260        le16_add_cpu(&es->s_mnt_count, 1);
2261        ext4_update_tstamp(es, s_mtime);
2262        if (sbi->s_journal)
2263                ext4_set_feature_journal_needs_recovery(sb);
2264
2265        err = ext4_commit_super(sb, 1);
2266done:
2267        if (test_opt(sb, DEBUG))
2268                printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, "
2269                                "bpg=%lu, ipg=%lu, mo=%04x, mo2=%04x]\n",
2270                        sb->s_blocksize,
2271                        sbi->s_groups_count,
2272                        EXT4_BLOCKS_PER_GROUP(sb),
2273                        EXT4_INODES_PER_GROUP(sb),
2274                        sbi->s_mount_opt, sbi->s_mount_opt2);
2275
2276        cleancache_init_fs(sb);
2277        return err;
2278}
2279
2280int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup)
2281{
2282        struct ext4_sb_info *sbi = EXT4_SB(sb);
2283        struct flex_groups *new_groups;
2284        int size;
2285
2286        if (!sbi->s_log_groups_per_flex)
2287                return 0;
2288
2289        size = ext4_flex_group(sbi, ngroup - 1) + 1;
2290        if (size <= sbi->s_flex_groups_allocated)
2291                return 0;
2292
2293        size = roundup_pow_of_two(size * sizeof(struct flex_groups));
2294        new_groups = kvzalloc(size, GFP_KERNEL);
2295        if (!new_groups) {
2296                ext4_msg(sb, KERN_ERR, "not enough memory for %d flex groups",
2297                         size / (int) sizeof(struct flex_groups));
2298                return -ENOMEM;
2299        }
2300
2301        if (sbi->s_flex_groups) {
2302                memcpy(new_groups, sbi->s_flex_groups,
2303                       (sbi->s_flex_groups_allocated *
2304                        sizeof(struct flex_groups)));
2305                kvfree(sbi->s_flex_groups);
2306        }
2307        sbi->s_flex_groups = new_groups;
2308        sbi->s_flex_groups_allocated = size / sizeof(struct flex_groups);
2309        return 0;
2310}
2311
2312static int ext4_fill_flex_info(struct super_block *sb)
2313{
2314        struct ext4_sb_info *sbi = EXT4_SB(sb);
2315        struct ext4_group_desc *gdp = NULL;
2316        ext4_group_t flex_group;
2317        int i, err;
2318
2319        sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
2320        if (sbi->s_log_groups_per_flex < 1 || sbi->s_log_groups_per_flex > 31) {
2321                sbi->s_log_groups_per_flex = 0;
2322                return 1;
2323        }
2324
2325        err = ext4_alloc_flex_bg_array(sb, sbi->s_groups_count);
2326        if (err)
2327                goto failed;
2328
2329        for (i = 0; i < sbi->s_groups_count; i++) {
2330                gdp = ext4_get_group_desc(sb, i, NULL);
2331
2332                flex_group = ext4_flex_group(sbi, i);
2333                atomic_add(ext4_free_inodes_count(sb, gdp),
2334                           &sbi->s_flex_groups[flex_group].free_inodes);
2335                atomic64_add(ext4_free_group_clusters(sb, gdp),
2336                             &sbi->s_flex_groups[flex_group].free_clusters);
2337                atomic_add(ext4_used_dirs_count(sb, gdp),
2338                           &sbi->s_flex_groups[flex_group].used_dirs);
2339        }
2340
2341        return 1;
2342failed:
2343        return 0;
2344}
2345
2346static __le16 ext4_group_desc_csum(struct super_block *sb, __u32 block_group,
2347                                   struct ext4_group_desc *gdp)
2348{
2349        int offset = offsetof(struct ext4_group_desc, bg_checksum);
2350        __u16 crc = 0;
2351        __le32 le_group = cpu_to_le32(block_group);
2352        struct ext4_sb_info *sbi = EXT4_SB(sb);
2353
2354        if (ext4_has_metadata_csum(sbi->s_sb)) {
2355                /* Use new metadata_csum algorithm */
2356                __u32 csum32;
2357                __u16 dummy_csum = 0;
2358
2359                csum32 = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&le_group,
2360                                     sizeof(le_group));
2361                csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp, offset);
2362                csum32 = ext4_chksum(sbi, csum32, (__u8 *)&dummy_csum,
2363                                     sizeof(dummy_csum));
2364                offset += sizeof(dummy_csum);
2365                if (offset < sbi->s_desc_size)
2366                        csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp + offset,
2367                                             sbi->s_desc_size - offset);
2368
2369                crc = csum32 & 0xFFFF;
2370                goto out;
2371        }
2372
2373        /* old crc16 code */
2374        if (!ext4_has_feature_gdt_csum(sb))
2375                return 0;
2376
2377        crc = crc16(~0, sbi->s_es->s_uuid, sizeof(sbi->s_es->s_uuid));
2378        crc = crc16(crc, (__u8 *)&le_group, sizeof(le_group));
2379        crc = crc16(crc, (__u8 *)gdp, offset);
2380        offset += sizeof(gdp->bg_checksum); /* skip checksum */
2381        /* for checksum of struct ext4_group_desc do the rest...*/
2382        if (ext4_has_feature_64bit(sb) &&
2383            offset < le16_to_cpu(sbi->s_es->s_desc_size))
2384                crc = crc16(crc, (__u8 *)gdp + offset,
2385                            le16_to_cpu(sbi->s_es->s_desc_size) -
2386                                offset);
2387
2388out:
2389        return cpu_to_le16(crc);
2390}
2391
2392int ext4_group_desc_csum_verify(struct super_block *sb, __u32 block_group,
2393                                struct ext4_group_desc *gdp)
2394{
2395        if (ext4_has_group_desc_csum(sb) &&
2396            (gdp->bg_checksum != ext4_group_desc_csum(sb, block_group, gdp)))
2397                return 0;
2398
2399        return 1;
2400}
2401
2402void ext4_group_desc_csum_set(struct super_block *sb, __u32 block_group,
2403                              struct ext4_group_desc *gdp)
2404{
2405        if (!ext4_has_group_desc_csum(sb))
2406                return;
2407        gdp->bg_checksum = ext4_group_desc_csum(sb, block_group, gdp);
2408}
2409
2410/* Called at mount-time, super-block is locked */
2411static int ext4_check_descriptors(struct super_block *sb,
2412                                  ext4_fsblk_t sb_block,
2413                                  ext4_group_t *first_not_zeroed)
2414{
2415        struct ext4_sb_info *sbi = EXT4_SB(sb);
2416        ext4_fsblk_t first_block = le32_to_cpu(sbi->s_es->s_first_data_block);
2417        ext4_fsblk_t last_block;
2418        ext4_fsblk_t last_bg_block = sb_block + ext4_bg_num_gdb(sb, 0);
2419        ext4_fsblk_t block_bitmap;
2420        ext4_fsblk_t inode_bitmap;
2421        ext4_fsblk_t inode_table;
2422        int flexbg_flag = 0;
2423        ext4_group_t i, grp = sbi->s_groups_count;
2424
2425        if (ext4_has_feature_flex_bg(sb))
2426                flexbg_flag = 1;
2427
2428        ext4_debug("Checking group descriptors");
2429
2430        for (i = 0; i < sbi->s_groups_count; i++) {
2431                struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL);
2432
2433                if (i == sbi->s_groups_count - 1 || flexbg_flag)
2434                        last_block = ext4_blocks_count(sbi->s_es) - 1;
2435                else
2436                        last_block = first_block +
2437                                (EXT4_BLOCKS_PER_GROUP(sb) - 1);
2438
2439                if ((grp == sbi->s_groups_count) &&
2440                   !(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
2441                        grp = i;
2442
2443                block_bitmap = ext4_block_bitmap(sb, gdp);
2444                if (block_bitmap == sb_block) {
2445                        ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
2446                                 "Block bitmap for group %u overlaps "
2447                                 "superblock", i);
2448                        if (!sb_rdonly(sb))
2449                                return 0;
2450                }
2451                if (block_bitmap >= sb_block + 1 &&
2452                    block_bitmap <= last_bg_block) {
2453                        ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
2454                                 "Block bitmap for group %u overlaps "
2455                                 "block group descriptors", i);
2456                        if (!sb_rdonly(sb))
2457                                return 0;
2458                }
2459                if (block_bitmap < first_block || block_bitmap > last_block) {
2460                        ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
2461                               "Block bitmap for group %u not in group "
2462                               "(block %llu)!", i, block_bitmap);
2463                        return 0;
2464                }
2465                inode_bitmap = ext4_inode_bitmap(sb, gdp);
2466                if (inode_bitmap == sb_block) {
2467                        ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
2468                                 "Inode bitmap for group %u overlaps "
2469                                 "superblock", i);
2470                        if (!sb_rdonly(sb))
2471                                return 0;
2472                }
2473                if (inode_bitmap >= sb_block + 1 &&
2474                    inode_bitmap <= last_bg_block) {
2475                        ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
2476                                 "Inode bitmap for group %u overlaps "
2477                                 "block group descriptors", i);
2478                        if (!sb_rdonly(sb))
2479                                return 0;
2480                }
2481                if (inode_bitmap < first_block || inode_bitmap > last_block) {
2482                        ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
2483                               "Inode bitmap for group %u not in group "
2484                               "(block %llu)!", i, inode_bitmap);
2485                        return 0;
2486                }
2487                inode_table = ext4_inode_table(sb, gdp);
2488                if (inode_table == sb_block) {
2489                        ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
2490                                 "Inode table for group %u overlaps "
2491                                 "superblock", i);
2492                        if (!sb_rdonly(sb))
2493                                return 0;
2494                }
2495                if (inode_table >= sb_block + 1 &&
2496                    inode_table <= last_bg_block) {
2497                        ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
2498                                 "Inode table for group %u overlaps "
2499                                 "block group descriptors", i);
2500                        if (!sb_rdonly(sb))
2501                                return 0;
2502                }
2503                if (inode_table < first_block ||
2504                    inode_table + sbi->s_itb_per_group - 1 > last_block) {
2505                        ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
2506                               "Inode table for group %u not in group "
2507                               "(block %llu)!", i, inode_table);
2508                        return 0;
2509                }
2510                ext4_lock_group(sb, i);
2511                if (!ext4_group_desc_csum_verify(sb, i, gdp)) {
2512                        ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
2513                                 "Checksum for group %u failed (%u!=%u)",
2514                                 i, le16_to_cpu(ext4_group_desc_csum(sb, i,
2515                                     gdp)), le16_to_cpu(gdp->bg_checksum));
2516                        if (!sb_rdonly(sb)) {
2517                                ext4_unlock_group(sb, i);
2518                                return 0;
2519                        }
2520                }
2521                ext4_unlock_group(sb, i);
2522                if (!flexbg_flag)
2523                        first_block += EXT4_BLOCKS_PER_GROUP(sb);
2524        }
2525        if (NULL != first_not_zeroed)
2526                *first_not_zeroed = grp;
2527        return 1;
2528}
2529
2530/* ext4_orphan_cleanup() walks a singly-linked list of inodes (starting at
2531 * the superblock) which were deleted from all directories, but held open by
2532 * a process at the time of a crash.  We walk the list and try to delete these
2533 * inodes at recovery time (only with a read-write filesystem).
2534 *
2535 * In order to keep the orphan inode chain consistent during traversal (in
2536 * case of crash during recovery), we link each inode into the superblock
2537 * orphan list_head and handle it the same way as an inode deletion during
2538 * normal operation (which journals the operations for us).
2539 *
2540 * We only do an iget() and an iput() on each inode, which is very safe if we
2541 * accidentally point at an in-use or already deleted inode.  The worst that
2542 * can happen in this case is that we get a "bit already cleared" message from
2543 * ext4_free_inode().  The only reason we would point at a wrong inode is if
2544 * e2fsck was run on this filesystem, and it must have already done the orphan
2545 * inode cleanup for us, so we can safely abort without any further action.
2546 */
2547static void ext4_orphan_cleanup(struct super_block *sb,
2548                                struct ext4_super_block *es)
2549{
2550        unsigned int s_flags = sb->s_flags;
2551        int ret, nr_orphans = 0, nr_truncates = 0;
2552#ifdef CONFIG_QUOTA
2553        int quota_update = 0;
2554        int i;
2555#endif
2556        if (!es->s_last_orphan) {
2557                jbd_debug(4, "no orphan inodes to clean up\n");
2558                return;
2559        }
2560
2561        if (bdev_read_only(sb->s_bdev)) {
2562                ext4_msg(sb, KERN_ERR, "write access "
2563                        "unavailable, skipping orphan cleanup");
2564                return;
2565        }
2566
2567        /* Check if feature set would not allow a r/w mount */
2568        if (!ext4_feature_set_ok(sb, 0)) {
2569                ext4_msg(sb, KERN_INFO, "Skipping orphan cleanup due to "
2570                         "unknown ROCOMPAT features");
2571                return;
2572        }
2573
2574        if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) {
2575                /* don't clear list on RO mount w/ errors */
2576                if (es->s_last_orphan && !(s_flags & SB_RDONLY)) {
2577                        ext4_msg(sb, KERN_INFO, "Errors on filesystem, "
2578                                  "clearing orphan list.\n");
2579                        es->s_last_orphan = 0;
2580                }
2581                jbd_debug(1, "Skipping orphan recovery on fs with errors.\n");
2582                return;
2583        }
2584
2585        if (s_flags & SB_RDONLY) {
2586                ext4_msg(sb, KERN_INFO, "orphan cleanup on readonly fs");
2587                sb->s_flags &= ~SB_RDONLY;
2588        }
2589#ifdef CONFIG_QUOTA
2590        /* Needed for iput() to work correctly and not trash data */
2591        sb->s_flags |= SB_ACTIVE;
2592
2593        /*
2594         * Turn on quotas which were not enabled for read-only mounts if
2595         * filesystem has quota feature, so that they are updated correctly.
2596         */
2597        if (ext4_has_feature_quota(sb) && (s_flags & SB_RDONLY)) {
2598                int ret = ext4_enable_quotas(sb);
2599
2600                if (!ret)
2601                        quota_update = 1;
2602                else
2603                        ext4_msg(sb, KERN_ERR,
2604                                "Cannot turn on quotas: error %d", ret);
2605        }
2606
2607        /* Turn on journaled quotas used for old sytle */
2608        for (i = 0; i < EXT4_MAXQUOTAS; i++) {
2609                if (EXT4_SB(sb)->s_qf_names[i]) {
2610                        int ret = ext4_quota_on_mount(sb, i);
2611
2612                        if (!ret)
2613                                quota_update = 1;
2614                        else
2615                                ext4_msg(sb, KERN_ERR,
2616                                        "Cannot turn on journaled "
2617                                        "quota: type %d: error %d", i, ret);
2618                }
2619        }
2620#endif
2621
2622        while (es->s_last_orphan) {
2623                struct inode *inode;
2624
2625                /*
2626                 * We may have encountered an error during cleanup; if
2627                 * so, skip the rest.
2628                 */
2629                if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) {
2630                        jbd_debug(1, "Skipping orphan recovery on fs with errors.\n");
2631                        es->s_last_orphan = 0;
2632                        break;
2633                }
2634
2635                inode = ext4_orphan_get(sb, le32_to_cpu(es->s_last_orphan));
2636                if (IS_ERR(inode)) {
2637                        es->s_last_orphan = 0;
2638                        break;
2639                }
2640
2641                list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
2642                dquot_initialize(inode);
2643                if (inode->i_nlink) {
2644                        if (test_opt(sb, DEBUG))
2645                                ext4_msg(sb, KERN_DEBUG,
2646                                        "%s: truncating inode %lu to %lld bytes",
2647                                        __func__, inode->i_ino, inode->i_size);
2648                        jbd_debug(2, "truncating inode %lu to %lld bytes\n",
2649                                  inode->i_ino, inode->i_size);
2650                        inode_lock(inode);
2651                        truncate_inode_pages(inode->i_mapping, inode->i_size);
2652                        ret = ext4_truncate(inode);
2653                        if (ret)
2654                                ext4_std_error(inode->i_sb, ret);
2655                        inode_unlock(inode);
2656                        nr_truncates++;
2657                } else {
2658                        if (test_opt(sb, DEBUG))
2659                                ext4_msg(sb, KERN_DEBUG,
2660                                        "%s: deleting unreferenced inode %lu",
2661                                        __func__, inode->i_ino);
2662                        jbd_debug(2, "deleting unreferenced inode %lu\n",
2663                                  inode->i_ino);
2664                        nr_orphans++;
2665                }
2666                iput(inode);  /* The delete magic happens here! */
2667        }
2668
2669#define PLURAL(x) (x), ((x) == 1) ? "" : "s"
2670
2671        if (nr_orphans)
2672                ext4_msg(sb, KERN_INFO, "%d orphan inode%s deleted",
2673                       PLURAL(nr_orphans));
2674        if (nr_truncates)
2675                ext4_msg(sb, KERN_INFO, "%d truncate%s cleaned up",
2676                       PLURAL(nr_truncates));
2677#ifdef CONFIG_QUOTA
2678        /* Turn off quotas if they were enabled for orphan cleanup */
2679        if (quota_update) {
2680                for (i = 0; i < EXT4_MAXQUOTAS; i++) {
2681                        if (sb_dqopt(sb)->files[i])
2682                                dquot_quota_off(sb, i);
2683                }
2684        }
2685#endif
2686        sb->s_flags = s_flags; /* Restore SB_RDONLY status */
2687}
2688
2689/*
2690 * Maximal extent format file size.
2691 * Resulting logical blkno at s_maxbytes must fit in our on-disk
2692 * extent format containers, within a sector_t, and within i_blocks
2693 * in the vfs.  ext4 inode has 48 bits of i_block in fsblock units,
2694 * so that won't be a limiting factor.
2695 *
2696 * However there is other limiting factor. We do store extents in the form
2697 * of starting block and length, hence the resulting length of the extent
2698 * covering maximum file size must fit into on-disk format containers as
2699 * well. Given that length is always by 1 unit bigger than max unit (because
2700 * we count 0 as well) we have to lower the s_maxbytes by one fs block.
2701 *
2702 * Note, this does *not* consider any metadata overhead for vfs i_blocks.
2703 */
2704static loff_t ext4_max_size(int blkbits, int has_huge_files)
2705{
2706        loff_t res;
2707        loff_t upper_limit = MAX_LFS_FILESIZE;
2708
2709        BUILD_BUG_ON(sizeof(blkcnt_t) < sizeof(u64));
2710
2711        if (!has_huge_files) {
2712                upper_limit = (1LL << 32) - 1;
2713
2714                /* total blocks in file system block size */
2715                upper_limit >>= (blkbits - 9);
2716                upper_limit <<= blkbits;
2717        }
2718
2719        /*
2720         * 32-bit extent-start container, ee_block. We lower the maxbytes
2721         * by one fs block, so ee_len can cover the extent of maximum file
2722         * size
2723         */
2724        res = (1LL << 32) - 1;
2725        res <<= blkbits;
2726
2727        /* Sanity check against vm- & vfs- imposed limits */
2728        if (res > upper_limit)
2729                res = upper_limit;
2730
2731        return res;
2732}
2733
2734/*
2735 * Maximal bitmap file size.  There is a direct, and {,double-,triple-}indirect
2736 * block limit, and also a limit of (2^48 - 1) 512-byte sectors in i_blocks.
2737 * We need to be 1 filesystem block less than the 2^48 sector limit.
2738 */
2739static loff_t ext4_max_bitmap_size(int bits, int has_huge_files)
2740{
2741        loff_t res = EXT4_NDIR_BLOCKS;
2742        int meta_blocks;
2743        loff_t upper_limit;
2744        /* This is calculated to be the largest file size for a dense, block
2745         * mapped file such that the file's total number of 512-byte sectors,
2746         * including data and all indirect blocks, does not exceed (2^48 - 1).
2747         *
2748         * __u32 i_blocks_lo and _u16 i_blocks_high represent the total
2749         * number of 512-byte sectors of the file.
2750         */
2751
2752        if (!has_huge_files) {
2753                /*
2754                 * !has_huge_files or implies that the inode i_block field
2755                 * represents total file blocks in 2^32 512-byte sectors ==
2756                 * size of vfs inode i_blocks * 8
2757                 */
2758                upper_limit = (1LL << 32) - 1;
2759
2760                /* total blocks in file system block size */
2761                upper_limit >>= (bits - 9);
2762
2763        } else {
2764                /*
2765                 * We use 48 bit ext4_inode i_blocks
2766                 * With EXT4_HUGE_FILE_FL set the i_blocks
2767                 * represent total number of blocks in
2768                 * file system block size
2769                 */
2770                upper_limit = (1LL << 48) - 1;
2771
2772        }
2773
2774        /* indirect blocks */
2775        meta_blocks = 1;
2776        /* double indirect blocks */
2777        meta_blocks += 1 + (1LL << (bits-2));
2778        /* tripple indirect blocks */
2779        meta_blocks += 1 + (1LL << (bits-2)) + (1LL << (2*(bits-2)));
2780
2781        upper_limit -= meta_blocks;
2782        upper_limit <<= bits;
2783
2784        res += 1LL << (bits-2);
2785        res += 1LL << (2*(bits-2));
2786        res += 1LL << (3*(bits-2));
2787        res <<= bits;
2788        if (res > upper_limit)
2789                res = upper_limit;
2790
2791        if (res > MAX_LFS_FILESIZE)
2792                res = MAX_LFS_FILESIZE;
2793
2794        return res;
2795}
2796
2797static ext4_fsblk_t descriptor_loc(struct super_block *sb,
2798                                   ext4_fsblk_t logical_sb_block, int nr)
2799{
2800        struct ext4_sb_info *sbi = EXT4_SB(sb);
2801        ext4_group_t bg, first_meta_bg;
2802        int has_super = 0;
2803
2804        first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg);
2805
2806        if (!ext4_has_feature_meta_bg(sb) || nr < first_meta_bg)
2807                return logical_sb_block + nr + 1;
2808        bg = sbi->s_desc_per_block * nr;
2809        if (ext4_bg_has_super(sb, bg))
2810                has_super = 1;
2811
2812        /*
2813         * If we have a meta_bg fs with 1k blocks, group 0's GDT is at
2814         * block 2, not 1.  If s_first_data_block == 0 (bigalloc is enabled
2815         * on modern mke2fs or blksize > 1k on older mke2fs) then we must
2816         * compensate.
2817         */
2818        if (sb->s_blocksize == 1024 && nr == 0 &&
2819            le32_to_cpu(sbi->s_es->s_first_data_block) == 0)
2820                has_super++;
2821
2822        return (has_super + ext4_group_first_block_no(sb, bg));
2823}
2824
2825/**
2826 * ext4_get_stripe_size: Get the stripe size.
2827 * @sbi: In memory super block info
2828 *
2829 * If we have specified it via mount option, then
2830 * use the mount option value. If the value specified at mount time is
2831 * greater than the blocks per group use the super block value.
2832 * If the super block value is greater than blocks per group return 0.
2833 * Allocator needs it be less than blocks per group.
2834 *
2835 */
2836static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi)
2837{
2838        unsigned long stride = le16_to_cpu(sbi->s_es->s_raid_stride);
2839        unsigned long stripe_width =
2840                        le32_to_cpu(sbi->s_es->s_raid_stripe_width);
2841        int ret;
2842
2843        if (sbi->s_stripe && sbi->s_stripe <= sbi->s_blocks_per_group)
2844                ret = sbi->s_stripe;
2845        else if (stripe_width && stripe_width <= sbi->s_blocks_per_group)
2846                ret = stripe_width;
2847        else if (stride && stride <= sbi->s_blocks_per_group)
2848                ret = stride;
2849        else
2850                ret = 0;
2851
2852        /*
2853         * If the stripe width is 1, this makes no sense and
2854         * we set it to 0 to turn off stripe handling code.
2855         */
2856        if (ret <= 1)
2857                ret = 0;
2858
2859        return ret;
2860}
2861
2862/*
2863 * Check whether this filesystem can be mounted based on
2864 * the features present and the RDONLY/RDWR mount requested.
2865 * Returns 1 if this filesystem can be mounted as requested,
2866 * 0 if it cannot be.
2867 */
2868static int ext4_feature_set_ok(struct super_block *sb, int readonly)
2869{
2870        if (ext4_has_unknown_ext4_incompat_features(sb)) {
2871                ext4_msg(sb, KERN_ERR,
2872                        "Couldn't mount because of "
2873                        "unsupported optional features (%x)",
2874                        (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) &
2875                        ~EXT4_FEATURE_INCOMPAT_SUPP));
2876                return 0;
2877        }
2878
2879        if (readonly)
2880                return 1;
2881
2882        if (ext4_has_feature_readonly(sb)) {
2883                ext4_msg(sb, KERN_INFO, "filesystem is read-only");
2884                sb->s_flags |= SB_RDONLY;
2885                return 1;
2886        }
2887
2888        /* Check that feature set is OK for a read-write mount */
2889        if (ext4_has_unknown_ext4_ro_compat_features(sb)) {
2890                ext4_msg(sb, KERN_ERR, "couldn't mount RDWR because of "
2891                         "unsupported optional features (%x)",
2892                         (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) &
2893                                ~EXT4_FEATURE_RO_COMPAT_SUPP));
2894                return 0;
2895        }
2896        if (ext4_has_feature_bigalloc(sb) && !ext4_has_feature_extents(sb)) {
2897                ext4_msg(sb, KERN_ERR,
2898                         "Can't support bigalloc feature without "
2899                         "extents feature\n");
2900                return 0;
2901        }
2902
2903#ifndef CONFIG_QUOTA
2904        if (ext4_has_feature_quota(sb) && !readonly) {
2905                ext4_msg(sb, KERN_ERR,
2906                         "Filesystem with quota feature cannot be mounted RDWR "
2907                         "without CONFIG_QUOTA");
2908                return 0;
2909        }
2910        if (ext4_has_feature_project(sb) && !readonly) {
2911                ext4_msg(sb, KERN_ERR,
2912                         "Filesystem with project quota feature cannot be mounted RDWR "
2913                         "without CONFIG_QUOTA");
2914                return 0;
2915        }
2916#endif  /* CONFIG_QUOTA */
2917        return 1;
2918}
2919
2920/*
2921 * This function is called once a day if we have errors logged
2922 * on the file system
2923 */
2924static void print_daily_error_info(struct timer_list *t)
2925{
2926        struct ext4_sb_info *sbi = from_timer(sbi, t, s_err_report);
2927        struct super_block *sb = sbi->s_sb;
2928        struct ext4_super_block *es = sbi->s_es;
2929
2930        if (es->s_error_count)
2931                /* fsck newer than v1.41.13 is needed to clean this condition. */
2932                ext4_msg(sb, KERN_NOTICE, "error count since last fsck: %u",
2933                         le32_to_cpu(es->s_error_count));
2934        if (es->s_first_error_time) {
2935                printk(KERN_NOTICE "EXT4-fs (%s): initial error at time %llu: %.*s:%d",
2936                       sb->s_id,
2937                       ext4_get_tstamp(es, s_first_error_time),
2938                       (int) sizeof(es->s_first_error_func),
2939                       es->s_first_error_func,
2940                       le32_to_cpu(es->s_first_error_line));
2941                if (es->s_first_error_ino)
2942                        printk(KERN_CONT ": inode %u",
2943                               le32_to_cpu(es->s_first_error_ino));
2944                if (es->s_first_error_block)
2945                        printk(KERN_CONT ": block %llu", (unsigned long long)
2946                               le64_to_cpu(es->s_first_error_block));
2947                printk(KERN_CONT "\n");
2948        }
2949        if (es->s_last_error_time) {
2950                printk(KERN_NOTICE "EXT4-fs (%s): last error at time %llu: %.*s:%d",
2951                       sb->s_id,
2952                       ext4_get_tstamp(es, s_last_error_time),
2953                       (int) sizeof(es->s_last_error_func),
2954                       es->s_last_error_func,
2955                       le32_to_cpu(es->s_last_error_line));
2956                if (es->s_last_error_ino)
2957                        printk(KERN_CONT ": inode %u",
2958                               le32_to_cpu(es->s_last_error_ino));
2959                if (es->s_last_error_block)
2960                        printk(KERN_CONT ": block %llu", (unsigned long long)
2961                               le64_to_cpu(es->s_last_error_block));
2962                printk(KERN_CONT "\n");
2963        }
2964        mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ);  /* Once a day */
2965}
2966
2967/* Find next suitable group and run ext4_init_inode_table */
2968static int ext4_run_li_request(struct ext4_li_request *elr)
2969{
2970        struct ext4_group_desc *gdp = NULL;
2971        ext4_group_t group, ngroups;
2972        struct super_block *sb;
2973        unsigned long timeout = 0;
2974        int ret = 0;
2975
2976        sb = elr->lr_super;
2977        ngroups = EXT4_SB(sb)->s_groups_count;
2978
2979        for (group = elr->lr_next_group; group < ngroups; group++) {
2980                gdp = ext4_get_group_desc(sb, group, NULL);
2981                if (!gdp) {
2982                        ret = 1;
2983                        break;
2984                }
2985
2986                if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
2987                        break;
2988        }
2989
2990        if (group >= ngroups)
2991                ret = 1;
2992
2993        if (!ret) {
2994                timeout = jiffies;
2995                ret = ext4_init_inode_table(sb, group,
2996                                            elr->lr_timeout ? 0 : 1);
2997                if (elr->lr_timeout == 0) {
2998                        timeout = (jiffies - timeout) *
2999                                  elr->lr_sbi->s_li_wait_mult;
3000                        elr->lr_timeout = timeout;

3001                }
3002                elr->lr_next_sched = jiffies + elr->lr_timeout;
3003                elr->lr_next_group = group + 1;
3004        }
3005        return ret;
3006}
3007
3008/*
3009 * Remove lr_request from the list_request and free the
3010 * request structure. Should be called with li_list_mtx held
3011 */
3012static void ext4_remove_li_request(struct ext4_li_request *elr)
3013{
3014        struct ext4_sb_info *sbi;
3015
3016        if (!elr)
3017                return;
3018
3019        sbi = elr->lr_sbi;
3020
3021        list_del(&elr->lr_request);
3022        sbi->s_li_request = NULL;
3023        kfree(elr);
3024}
3025
3026static void ext4_unregister_li_request(struct super_block *sb)
3027{
3028        mutex_lock(&ext4_li_mtx);
3029        if (!ext4_li_info) {
3030                mutex_unlock(&ext4_li_mtx);
3031                return;
3032        }
3033
3034        mutex_lock(&ext4_li_info->li_list_mtx);
3035        ext4_remove_li_request(EXT4_SB(sb)->s_li_request);
3036        mutex_unlock(&ext4_li_info->li_list_mtx);
3037        mutex_unlock(&ext4_li_mtx);
3038}
3039
/*
 * Task handle of the "ext4lazyinit" kernel thread: assigned from
 * kthread_run() and later handed to kthread_stop().
 */
3040static struct task_struct *ext4_lazyinit_task;
3041
3042/*
3043 * Body of the ext4lazyinit kernel thread. @arg is the struct
3044 * ext4_lazy_init whose request list is walked. Each request whose
3045 * lr_next_sched has arrived is run via ext4_run_li_request(), which
3046 * keeps track of the time spent and uses it to compute the request's
3047 * next schedule time. Once the walk is complete, the thread computes
3048 * its next wakeup time and puts itself to sleep. The thread exits,
3049 * freeing ext4_li_info, when the list is empty; it always returns 0.
3050 */
3051static int ext4_lazyinit_thread(void *arg)
3052{
3053        struct ext4_lazy_init *eli = (struct ext4_lazy_init *)arg;
3054        struct list_head *pos, *n;
3055        struct ext4_li_request *elr;
3056        unsigned long next_wakeup, cur;
3057
3058        BUG_ON(NULL == eli);
3059
3060cont_thread:
3061        while (true) {
                    /* Sentinel: lowered below to the earliest lr_next_sched seen. */
3062                next_wakeup = MAX_JIFFY_OFFSET;
3063
3064                mutex_lock(&eli->li_list_mtx);
                    /* No requests left: go and try to tear the thread down. */
3065                if (list_empty(&eli->li_request_list)) {
3066                        mutex_unlock(&eli->li_list_mtx);
3067                        goto exit_thread;
3068                }
3069                list_for_each_safe(pos, n, &eli->li_request_list) {
3070                        int err = 0;
3071                        int progress = 0;
3072                        elr = list_entry(pos, struct ext4_li_request,
3073                                         lr_request);
3074
                            /* Not due yet: only account for it in next_wakeup. */
3075                        if (time_before(jiffies, elr->lr_next_sched)) {
3076                                if (time_before(elr->lr_next_sched, next_wakeup))
3077                                        next_wakeup = elr->lr_next_sched;
3078                                continue;
3079                        }
                            /*
                             * Both locks are only tried, never waited for; if
                             * either attempt fails, err and progress stay 0.
                             */
3080                        if (down_read_trylock(&elr->lr_super->s_umount)) {
3081                                if (sb_start_write_trylock(elr->lr_super)) {
3082                                        progress = 1;
3083                                        /*
3084                                         * We hold sb->s_umount, sb can not
3085                                         * be removed from the list, it is
3086                                         * now safe to drop li_list_mtx
3087                                         */
3088                                        mutex_unlock(&eli->li_list_mtx);
3089                                        err = ext4_run_li_request(elr);
3090                                        sb_end_write(elr->lr_super);
3091                                        mutex_lock(&eli->li_list_mtx);
                                            /*
                                             * The list mutex was dropped above, so
                                             * re-read the successor of this entry.
                                             */
3092                                        n = pos->next;
3093                                }
3094                                up_read((&elr->lr_super->s_umount));
3095                        }
3096                        /* error, remove the lazy_init job */
3097                        if (err) {
3098                                ext4_remove_li_request(elr);
3099                                continue;
3100                        }
                            /* The request was not run: retry after a random delay. */
3101                        if (!progress) {
3102                                elr->lr_next_sched = jiffies +
3103                                        (prandom_u32()
3104                                         % (EXT4_DEF_LI_MAX_START_DELAY * HZ));
3105                        }
3106                        if (time_before(elr->lr_next_sched, next_wakeup))
3107                                next_wakeup = elr->lr_next_sched;
3108                }
3109                mutex_unlock(&eli->li_list_mtx);
3110
3111                try_to_freeze();
3112
3113                cur = jiffies;
                    /*
                     * The wakeup time has already passed, or no request lowered
                     * next_wakeup from its sentinel: rescan without sleeping.
                     */
3114                if ((time_after_eq(cur, next_wakeup)) ||
3115                    (MAX_JIFFY_OFFSET == next_wakeup)) {
3116                        cond_resched();
3117                        continue;
3118                }
3119
3120                schedule_timeout_interruptible(next_wakeup - cur);
3121
                    /* Asked to stop: drop every queued request, then exit. */
3122                if (kthread_should_stop()) {
3123                        ext4_clear_request_list();
3124                        goto exit_thread;
3125                }
3126        }
3127
3128exit_thread:
3129        /*
3130         * It looks like the request list is empty, but we need
3131         * to check it under the li_list_mtx lock, to prevent any
3132         * additions into it, and of course we should lock ext4_li_mtx
3133         * to atomically free the list and ext4_li_info, because at
3134         * this point another ext4 filesystem could be registering
3135         * new one.
3136         */
3137        mutex_lock(&ext4_li_mtx);
3138        mutex_lock(&eli->li_list_mtx);
            /* A request was added in the meantime: resume the main loop. */
3139        if (!list_empty(&eli->li_request_list)) {
3140                mutex_unlock(&eli->li_list_mtx);
3141                mutex_unlock(&ext4_li_mtx);
3142                goto cont_thread;
3143        }
3144        mutex_unlock(&eli->li_list_mtx);
3145        kfree(ext4_li_info);
3146        ext4_li_info = NULL;
3147        mutex_unlock(&ext4_li_mtx);
3148
3149        return 0;
3150}
3151
3152static void ext4_clear_request_list(void)
3153{
3154        struct list_head *pos, *n;
3155        struct ext4_li_request *elr;
3156
3157        mutex_lock(&ext4_li_info->li_list_mtx);
3158        list_for_each_safe(pos, n, &ext4_li_info->li_request_list) {
3159                elr = list_entry(pos, struct ext4_li_request,
3160                                 lr_request);
3161                ext4_remove_li_request(elr);
3162        }
3163        mutex_unlock(&ext4_li_info->li_list_mtx);
3164}
3165
3166static int ext4_run_lazyinit_thread(void)
3167{
3168        ext4_lazyinit_task = kthread_run(ext4_lazyinit_thread,
3169                                         ext4_li_info, "ext4lazyinit");
3170        if (IS_ERR(ext4_lazyinit_task)) {
3171                int err = PTR_ERR(ext4_lazyinit_task);
3172                ext4_clear_request_list();
3173                kfree(ext4_li_info);
3174                ext4_li_info = NULL;
3175                printk(KERN_CRIT "EXT4-fs: error %d creating inode table "
3176                                 "initialization thread\n",
3177                                 err);
3178                return err;
3179        }
3180        ext4_li_info->li_state |= EXT4_LAZYINIT_RUNNING;
3181        return 0;
3182}
3183
3184/*
3185 * Check whether it make sense to run itable init. thread or not.
3186 * If there is at least one uninitialized inode table, return
3187 * corresponding group number, else the loop goes through all
3188 * groups and return total number of groups.
3189 */
3190static ext4_group_t ext4_has_uninit_itable(struct super_block *sb)
3191{
3192        ext4_group_t group, ngroups = EXT4_SB(sb)->s_groups_count;
3193        struct ext4_group_desc *gdp = NULL;
3194
3195        if (!ext4_has_group_desc_csum(sb))
3196                return ngroups;
3197
3198        for (group = 0; group < ngroups; group++) {
3199                gdp = ext4_get_group_desc(sb, group, NULL);
3200                if (!gdp)
3201                        continue;
3202
3203                if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
3204                        break;
3205        }
3206
3207        return group;
3208}
3209
3210static int ext4_li_info_new(void)
3211{
3212        struct ext4_lazy_init *eli = NULL;
3213
3214        eli = kzalloc(sizeof(*eli), GFP_KERNEL);
3215        if (!eli)
3216                return -ENOMEM;
3217
3218        INIT_LIST_HEAD(&eli->li_request_list);
3219        mutex_init(&eli->li_list_mtx);
3220
3221        eli->li_state |= EXT4_LAZYINIT_QUIT;
3222
3223        ext4_li_info = eli;
3224
3225        return 0;
3226}
3227
3228static struct ext4_li_request *ext4_li_request_new(struct super_block *sb,
3229                                            ext4_group_t start)
3230{
3231        struct ext4_sb_info *sbi = EXT4_SB(sb);
3232        struct ext4_li_request *elr;
3233
3234        elr = kzalloc(sizeof(*elr), GFP_KERNEL);
3235        if (!elr)
3236                return NULL;
3237
3238        elr->lr_super = sb;
3239        elr->lr_sbi = sbi;
3240        elr->lr_next_group = start;
3241
3242        /*
3243         * Randomize first schedule time of the request to
3244         * spread the inode table initialization requests
3245         * better.
3246         */
3247        elr->lr_next_sched = jiffies + (prandom_u32() %
3248                                (EXT4_DEF_LI_MAX_START_DELAY * HZ));
3249        return elr;
3250}
3251
3252int ext4_register_li_request(struct super_block *sb,
3253                             ext4_group_t first_not_zeroed)
3254{
3255        struct ext4_sb_info *sbi = EXT4_SB(sb);
3256        struct ext4_li_request *elr = NULL;
3257        ext4_group_t ngroups = sbi->s_groups_count;
3258        int ret = 0;
3259
3260        mutex_lock(&ext4_li_mtx);
3261        if (sbi->s_li_request != NULL) {
3262                /*
3263                 * Reset timeout so it can be computed again, because
3264                 * s_li_wait_mult might have changed.
3265                 */
3266                sbi->s_li_request->lr_timeout = 0;
3267                goto out;
3268        }
3269
3270        if (first_not_zeroed == ngroups || sb_rdonly(sb) ||
3271            !test_opt(sb, INIT_INODE_TABLE))
3272                goto out;
3273
3274        elr = ext4_li_request_new(sb, first_not_zeroed);
3275        if (!elr) {
3276                ret = -ENOMEM;
3277                goto out;
3278        }
3279
3280        if (NULL == ext4_li_info) {
3281                ret = ext4_li_info_new();
3282                if (ret)
3283                        goto out;
3284        }
3285
3286        mutex_lock(&ext4_li_info->li_list_mtx);
3287        list_add(&elr->lr_request, &ext4_li_info->li_request_list);
3288        mutex_unlock(&ext4_li_info->li_list_mtx);
3289
3290        sbi->s_li_request = elr;
3291        /*
3292         * set elr to NULL here since it has been inserted to
3293         * the request_list and the removal and free of it is
3294         * handled by ext4_clear_request_list from now on.
3295         */
3296        elr = NULL;
3297
3298        if (!(ext4_li_info->li_state & EXT4_LAZYINIT_RUNNING)) {
3299                ret = ext4_run_lazyinit_thread();
3300                if (ret)
3301                        goto out;
3302        }
3303out:
3304        mutex_unlock(&ext4_li_mtx);
3305        if (ret)
3306                kfree(elr);
3307        return ret;
3308}
3309
3310/*
3311 * We do not need to lock anything since this is called on
3312 * module unload.
3313 */
3314static void ext4_destroy_lazyinit_thread(void)
3315{
3316        /*
3317         * If thread exited earlier
3318         * there's nothing to be done.
3319         */
3320        if (!ext4_li_info || !ext4_lazyinit_task)
3321                return;
3322
3323        kthread_stop(ext4_lazyinit_task);
3324}
3325
3326static int set_journal_csum_feature_set(struct super_block *sb)
3327{
3328        int ret = 1;
3329        int compat, incompat;
3330        struct ext4_sb_info *sbi = EXT4_SB(sb);
3331
3332        if (ext4_has_metadata_csum(sb)) {
3333                /* journal checksum v3 */
3334                compat = 0;
3335                incompat = JBD2_FEATURE_INCOMPAT_CSUM_V3;
3336        } else {
3337                /* journal checksum v1 */
3338                compat = JBD2_FEATURE_COMPAT_CHECKSUM;
3339                incompat = 0;
3340        }
3341
3342        jbd2_journal_clear_features(sbi->s_journal,
3343                        JBD2_FEATURE_COMPAT_CHECKSUM, 0,
3344                        JBD2_FEATURE_INCOMPAT_CSUM_V3 |
3345                        JBD2_FEATURE_INCOMPAT_CSUM_V2);
3346        if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
3347                ret = jbd2_journal_set_features(sbi->s_journal,
3348                                compat, 0,
3349                                JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT |
3350                                incompat);
3351        } else if (test_opt(sb, JOURNAL_CHECKSUM)) {
3352                ret = jbd2_journal_set_features(sbi->s_journal,
3353                                compat, 0,
3354                                incompat);
3355                jbd2_journal_clear_features(sbi->s_journal, 0, 0,
3356                                JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
3357        } else {
3358                jbd2_journal_clear_features(sbi->s_journal, 0, 0,
3359                                JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
3360        }
3361
3362        return ret;
3363}
3364
3365/*
3366 * Note: calculating the overhead so we can be compatible with
3367 * historical BSD practice is quite difficult in the face of
3368 * clusters/bigalloc.  This is because multiple metadata blocks from
3369 * different block group can end up in the same allocation cluster.
3370 * Calculating the exact overhead in the face of clustered allocation
3371 * requires either O(all block bitmaps) in memory or O(number of block
3372 * groups**2) in time.  We will still calculate the superblock for
3373 * older file systems --- and if we come across with a bigalloc file
3374 * system with zero in s_overhead_clusters the estimate will be close to
3375 * correct especially for very large cluster sizes --- but for newer
3376 * file systems, it's better to calculate this figure once at mkfs
3377 * time, and store it in the superblock.  If the superblock value is
3378 * present (even for non-bigalloc file systems), we will use it.
3379 */
3380static int count_overhead(struct super_block *sb, ext4_group_t grp,
3381                          char *buf)
3382{
3383        struct ext4_sb_info     *sbi = EXT4_SB(sb);
3384        struct ext4_group_desc  *gdp;
3385        ext4_fsblk_t            first_block, last_block, b;
3386        ext4_group_t            i, ngroups = ext4_get_groups_count(sb);
3387        int                     s, j, count = 0;
3388
3389        if (!ext4_has_feature_bigalloc(sb))
3390                return (ext4_bg_has_super(sb, grp) + ext4_bg_num_gdb(sb, grp) +
3391                        sbi->s_itb_per_group + 2);
3392
3393        first_block = le32_to_cpu(sbi->s_es->s_first_data_block) +
3394                (grp * EXT4_BLOCKS_PER_GROUP(sb));
3395        last_block = first_block + EXT4_BLOCKS_PER_GROUP(sb) - 1;
3396        for (i = 0; i < ngroups; i++) {
3397                gdp = ext4_get_group_desc(sb, i, NULL);
3398                b = ext4_block_bitmap(sb, gdp);
3399                if (b >= first_block && b <= last_block) {
3400                        ext4_set_bit(EXT4_B2C(sbi, b - first_block), buf);
3401                        count++;
3402                }
3403                b = ext4_inode_bitmap(sb, gdp);
3404                if (b >= first_block && b <= last_block) {
3405                        ext4_set_bit(EXT4_B2C(sbi, b - first_block), buf);
3406                        count++;
3407                }
3408                b = ext4_inode_table(sb, gdp);
3409                if (b >= first_block && b + sbi->s_itb_per_group <= last_block)
3410                        for (j = 0; j < sbi->s_itb_per_group; j++, b++) {
3411                                int c = EXT4_B2C(sbi, b - first_block);
3412                                ext4_set_bit(c, buf);
3413                                count++;
3414                        }
3415                if (i != grp)
3416                        continue;
3417                s = 0;
3418                if (ext4_bg_has_super(sb, grp)) {
3419                        ext4_set_bit(s++, buf);
3420                        count++;
3421                }
3422                j = ext4_bg_num_gdb(sb, grp);
3423                if (s + j > EXT4_BLOCKS_PER_GROUP(sb)) {
3424                        ext4_error(sb, "Invalid number of block group "
3425                                   "descriptor blocks: %d", j);
3426                        j = EXT4_BLOCKS_PER_GROUP(sb) - s;
3427                }
3428                count += j;
3429                for (; j > 0; j--)
3430                        ext4_set_bit(EXT4_B2C(sbi, s++), buf);
3431        }
3432        if (!count)
3433                return 0;
3434        return EXT4_CLUSTERS_PER_GROUP(sb) -
3435                ext4_count_free(buf, EXT4_CLUSTERS_PER_GROUP(sb) / 8);
3436}
3437
3438/*
3439 * Compute the overhead and stash it in sbi->s_overhead
3440 */
3441int ext4_calculate_overhead(struct super_block *sb)
3442{
3443        struct ext4_sb_info *sbi = EXT4_SB(sb);
3444        struct ext4_super_block *es = sbi->s_es;
3445        struct inode *j_inode;
3446        unsigned int j_blocks, j_inum = le32_to_cpu(es->s_journal_inum);
3447        ext4_group_t i, ngroups = ext4_get_groups_count(sb);
3448        ext4_fsblk_t overhead = 0;
3449        char *buf = (char *) get_zeroed_page(GFP_NOFS);
3450
3451        if (!buf)
3452                return -ENOMEM;
3453
3454        /*
3455         * Compute the overhead (FS structures).  This is constant
3456         * for a given filesystem unless the number of block groups
3457         * changes so we cache the previous value until it does.
3458         */
3459
3460        /*
3461         * All of the blocks before first_data_block are overhead
3462         */
3463        overhead = EXT4_B2C(sbi, le32_to_cpu(es->s_first_data_block));
3464
3465        /*
3466         * Add the overhead found in each block group
3467         */
3468        for (i = 0; i < ngroups; i++) {
3469                int blks;
3470
3471                blks = count_overhead(sb, i, buf);
3472                overhead += blks;
3473                if (blks)
3474                        memset(buf, 0, PAGE_SIZE);
3475                cond_resched();
3476        }
3477
3478        /*
3479         * Add the internal journal blocks whether the journal has been
3480         * loaded or not
3481         */
3482        if (sbi->s_journal && !sbi->journal_bdev)
3483                overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_maxlen);
3484        else if (ext4_has_feature_journal(sb) && !sbi->s_journal) {
3485                j_inode = ext4_get_journal_inode(sb, j_inum);
3486                if (j_inode) {
3487                        j_blocks = j_inode->i_size >> sb->s_blocksize_bits;
3488                        overhead += EXT4_NUM_B2C(sbi, j_blocks);
3489                        iput(j_inode);
3490                } else {
3491                        ext4_msg(sb, KERN_ERR, "can't get journal size");
3492                }
3493        }
3494        sbi->s_overhead = overhead;
3495        smp_wmb();
3496        free_page((unsigned long) buf);
3497        return 0;
3498}
3499
3500static void ext4_clamp_want_extra_isize(struct super_block *sb)
3501{
3502        struct ext4_sb_info *sbi = EXT4_SB(sb);
3503        struct ext4_super_block *es = sbi->s_es;
3504
3505        /* determine the minimum size of new large inodes, if present */
3506        if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE &&
3507            sbi->s_want_extra_isize == 0) {
3508                sbi->s_want_extra_isize = sizeof(struct ext4_inode) -
3509                                                     EXT4_GOOD_OLD_INODE_SIZE;
3510                if (ext4_has_feature_extra_isize(sb)) {
3511                        if (sbi->s_want_extra_isize <
3512                            le16_to_cpu(es->s_want_extra_isize))
3513                                sbi->s_want_extra_isize =
3514                                        le16_to_cpu(es->s_want_extra_isize);
3515                        if (sbi->s_want_extra_isize <
3516                            le16_to_cpu(es->s_min_extra_isize))
3517                                sbi->s_want_extra_isize =
3518                                        le16_to_cpu(es->s_min_extra_isize);
3519                }
3520        }
3521        /* Check if enough inode space is available */
3522        if (EXT4_GOOD_OLD_INODE_SIZE + sbi->s_want_extra_isize >
3523                                                        sbi->s_inode_size) {
3524                sbi->s_want_extra_isize = sizeof(struct ext4_inode) -
3525                                                       EXT4_GOOD_OLD_INODE_SIZE;
3526                ext4_msg(sb, KERN_INFO,
3527                         "required extra inode space not available");
3528        }
3529}
3530
3531static void ext4_set_resv_clusters(struct super_block *sb)
3532{
3533        ext4_fsblk_t resv_clusters;
3534        struct ext4_sb_info *sbi = EXT4_SB(sb);
3535
3536        /*
3537         * There's no need to reserve anything when we aren't using extents.
3538         * The space estimates are exact, there are no unwritten extents,
3539         * hole punching doesn't need new metadata... This is needed especially
3540         * to keep ext2/3 backward compatibility.
3541         */
3542        if (!ext4_has_feature_extents(sb))
3543                return;
3544        /*
3545         * By default we reserve 2% or 4096 clusters, whichever is smaller.
3546         * This should cover the situations where we can not afford to run
3547         * out of space like for example punch hole, or converting
3548         * unwritten extents in delalloc path. In most cases such
3549         * allocation would require 1, or 2 blocks, higher numbers are
3550         * very rare.
3551         */
3552        resv_clusters = (ext4_blocks_count(sbi->s_es) >>
3553                         sbi->s_cluster_bits);
3554
3555        do_div(resv_clusters, 50);
3556        resv_clusters = min_t(ext4_fsblk_t, resv_clusters, 4096);
3557
3558        atomic64_set(&sbi->s_resv_clusters, resv_clusters);
3559}
3560
3561static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3562{
3563        struct dax_device *dax_dev = fs_dax_get_by_bdev(sb->s_bdev);
3564        char *orig_data = kstrdup(data, GFP_KERNEL);
3565        struct buffer_head *bh;
3566        struct ext4_super_block *es = NULL;
3567        struct ext4_sb_info *sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
3568        ext4_fsblk_t block;
3569        ext4_fsblk_t sb_block = get_sb_block(&data);
3570        ext4_fsblk_t logical_sb_block;
3571        unsigned long offset = 0;
3572        unsigned long journal_devnum = 0;
3573        unsigned long def_mount_opts;
3574        struct inode *root;
3575        const char *descr;
3576        int ret = -ENOMEM;
3577        int blocksize, clustersize;
3578        unsigned int db_count;
3579        unsigned int i;
3580        int needs_recovery, has_huge_files, has_bigalloc;
3581        __u64 blocks_count;
3582        int err = 0;
3583        unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
3584        ext4_group_t first_not_zeroed;
3585
3586        if ((data && !orig_data) || !sbi)
3587                goto out_free_base;
3588
3589        sbi->s_daxdev = dax_dev;
3590        sbi->s_blockgroup_lock =
3591                kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
3592        if (!sbi->s_blockgroup_lock)
3593                goto out_free_base;
3594
3595        sb->s_fs_info = sbi;
3596        sbi->s_sb = sb;
3597        sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS;
3598        sbi->s_sb_block = sb_block;
3599        if (sb->s_bdev->bd_part)
3600                sbi->s_sectors_written_start =
3601                        part_stat_read(sb->s_bdev->bd_part, sectors[STAT_WRITE]);
3602
3603        /* Cleanup superblock name */
3604        strreplace(sb->s_id, '/', '!');
3605
3606        /* -EINVAL is default */
3607        ret = -EINVAL;
3608        blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE);
3609        if (!blocksize) {
3610                ext4_msg(sb, KERN_ERR, "unable to set blocksize");
3611                goto out_fail;
3612        }
3613
3614        /*
3615         * The ext4 superblock will not be buffer aligned for other than 1kB
3616         * block sizes.  We need to calculate the offset from buffer start.
3617         */
3618        if (blocksize != EXT4_MIN_BLOCK_SIZE) {
3619                logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE;
3620                offset = do_div(logical_sb_block, blocksize);
3621        } else {
3622                logical_sb_block = sb_block;
3623        }
3624
3625        if (!(bh = sb_bread_unmovable(sb, logical_sb_block))) {
3626                ext4_msg(sb, KERN_ERR, "unable to read superblock");
3627                goto out_fail;
3628        }
3629        /*
3630         * Note: s_es must be initialized as soon as possible because
3631         *       some ext4 macro-instructions depend on its value
3632         */
3633        es = (struct ext4_super_block *) (bh->b_data + offset);
3634        sbi->s_es = es;
3635        sb->s_magic = le16_to_cpu(es->s_magic);
3636        if (sb->s_magic != EXT4_SUPER_MAGIC)
3637                goto cantfind_ext4;
3638        sbi->s_kbytes_written = le64_to_cpu(es->s_kbytes_written);
3639
3640        /* Warn if metadata_csum and gdt_csum are both set. */
3641        if (ext4_has_feature_metadata_csum(sb) &&
3642            ext4_has_feature_gdt_csum(sb))
3643                ext4_warning(sb, "metadata_csum and uninit_bg are "
3644                             "redundant flags; please run fsck.");
3645
3646        /* Check for a known checksum algorithm */
3647        if (!ext4_verify_csum_type(sb, es)) {
3648                ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with "
3649                         "unknown checksum algorithm.");
3650                silent = 1;
3651                goto cantfind_ext4;
3652        }
3653
3654        /* Load the checksum driver */
3655        sbi->s_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
3656        if (IS_ERR(sbi->s_chksum_driver)) {
3657                ext4_msg(sb, KERN_ERR, "Cannot load crc32c driver.");
3658                ret = PTR_ERR(sbi->s_chksum_driver);
3659                sbi->s_chksum_driver = NULL;
3660                goto failed_mount;
3661        }
3662
3663        /* Check superblock checksum */
3664        if (!ext4_superblock_csum_verify(sb, es)) {
3665                ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with "
3666                         "invalid superblock checksum.  Run e2fsck?");
3667                silent = 1;
3668                ret = -EFSBADCRC;
3669                goto cantfind_ext4;
3670        }
3671
3672        /* Precompute checksum seed for all metadata */
3673        if (ext4_has_feature_csum_seed(sb))
3674                sbi->s_csum_seed = le32_to_cpu(es->s_checksum_seed);
3675        else if (ext4_has_metadata_csum(sb) || ext4_has_feature_ea_inode(sb))
3676                sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid,
3677                                               sizeof(es->s_uuid));
3678
3679        /* Set defaults before we parse the mount options */
3680        def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
3681        set_opt(sb, INIT_INODE_TABLE);
3682        if (def_mount_opts & EXT4_DEFM_DEBUG)
3683                set_opt(sb, DEBUG);
3684        if (def_mount_opts & EXT4_DEFM_BSDGROUPS)
3685                set_opt(sb, GRPID);
3686        if (def_mount_opts & EXT4_DEFM_UID16)
3687                set_opt(sb, NO_UID32);
3688        /* xattr user namespace & acls are now defaulted on */
3689        set_opt(sb, XATTR_USER);
3690#ifdef CONFIG_EXT4_FS_POSIX_ACL
3691        set_opt(sb, POSIX_ACL);
3692#endif
3693        /* don't forget to enable journal_csum when metadata_csum is enabled. */
3694        if (ext4_has_metadata_csum(sb))
3695                set_opt(sb, JOURNAL_CHECKSUM);
3696
3697        if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA)
3698                set_opt(sb, JOURNAL_DATA);
3699        else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED)
3700                set_opt(sb, ORDERED_DATA);
3701        else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK)
3702                set_opt(sb, WRITEBACK_DATA);
3703
3704        if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC)
3705                set_opt(sb, ERRORS_PANIC);
3706        else if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_CONTINUE)
3707                set_opt(sb, ERRORS_CONT);
3708        else
3709                set_opt(sb, ERRORS_RO);
3710        /* block_validity enabled by default; disable with noblock_validity */
3711        set_opt(sb, BLOCK_VALIDITY);
3712        if (def_mount_opts & EXT4_DEFM_DISCARD)
3713                set_opt(sb, DISCARD);
3714
3715        sbi->s_resuid = make_kuid(&init_user_ns, le16_to_cpu(es->s_def_resuid));
3716        sbi->s_resgid = make_kgid(&init_user_ns, le16_to_cpu(es->s_def_resgid));
3717        sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ;
3718        sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
3719        sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
3720
3721        if ((def_mount_opts & EXT4_DEFM_NOBARRIER) == 0)
3722                set_opt(sb, BARRIER);
3723
3724        /*
3725         * enable delayed allocation by default
3726         * Use -o nodelalloc to turn it off
3727         */
3728        if (!IS_EXT3_SB(sb) && !IS_EXT2_SB(sb) &&
3729            ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0))
3730                set_opt(sb, DELALLOC);
3731
3732        /*
3733         * set default s_li_wait_mult for lazyinit, for the case there is
3734         * no mount option specified.
3735         */
3736        sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT;
3737
3738        if (sbi->s_es->s_mount_opts[0]) {
3739                char *s_mount_opts = kstrndup(sbi->s_es->s_mount_opts,
3740                                              sizeof(sbi->s_es->s_mount_opts),
3741                                              GFP_KERNEL);
3742                if (!s_mount_opts)
3743                        goto failed_mount;
3744                if (!parse_options(s_mount_opts, sb, &journal_devnum,
3745                                   &journal_ioprio, 0)) {
3746                        ext4_msg(sb, KERN_WARNING,
3747                                 "failed to parse options in superblock: %s",
3748                                 s_mount_opts);
3749                }
3750                kfree(s_mount_opts);
3751        }
3752        sbi->s_def_mount_opt = sbi->s_mount_opt;
3753        if (!parse_options((char *) data, sb, &journal_devnum,
3754                           &journal_ioprio, 0))
3755                goto failed_mount;
3756
3757        if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
3758                printk_once(KERN_WARNING "EXT4-fs: Warning: mounting "
3759                            "with data=journal disables delayed "
3760                            "allocation and O_DIRECT support!\n");
3761                if (test_opt2(sb, EXPLICIT_DELALLOC)) {
3762                        ext4_msg(sb, KERN_ERR, "can't mount with "
3763                                 "both data=journal and delalloc");
3764                        goto failed_mount;
3765                }
3766                if (test_opt(sb, DIOREAD_NOLOCK)) {
3767                        ext4_msg(sb, KERN_ERR, "can't mount with "
3768                                 "both data=journal and dioread_nolock");
3769                        goto failed_mount;
3770                }
3771                if (test_opt(sb, DAX)) {
3772                        ext4_msg(sb, KERN_ERR, "can't mount with "
3773                                 "both data=journal and dax");
3774                        goto failed_mount;
3775                }
3776                if (ext4_has_feature_encrypt(sb)) {
3777                        ext4_msg(sb, KERN_WARNING,
3778                                 "encrypted files will use data=ordered "
3779                                 "instead of data journaling mode");
3780                }
3781                if (test_opt(sb, DELALLOC))
3782                        clear_opt(sb, DELALLOC);
3783        } else {
3784                sb->s_iflags |= SB_I_CGROUPWB;
3785        }
3786
3787        sb->s_flags = (sb->s_flags & ~SB_POSIXACL) |
3788                (test_opt(sb, POSIX_ACL) ? SB_POSIXACL : 0);
3789
3790        if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV &&
3791            (ext4_has_compat_features(sb) ||
3792             ext4_has_ro_compat_features(sb) ||
3793             ext4_has_incompat_features(sb)))
3794                ext4_msg(sb, KERN_WARNING,
3795                       "feature flags set on rev 0 fs, "
3796                       "running e2fsck is recommended");
3797
3798        if (es->s_creator_os == cpu_to_le32(EXT4_OS_HURD)) {
3799                set_opt2(sb, HURD_COMPAT);
3800                if (ext4_has_feature_64bit(sb)) {
3801                        ext4_msg(sb, KERN_ERR,
3802                                 "The Hurd can't support 64-bit file systems");
3803                        goto failed_mount;
3804                }
3805
3806                /*
3807                 * ea_inode feature uses l_i_version field which is not
3808                 * available in HURD_COMPAT mode.
3809                 */
3810                if (ext4_has_feature_ea_inode(sb)) {
3811                        ext4_msg(sb, KERN_ERR,
3812                                 "ea_inode feature is not supported for Hurd");
3813                        goto failed_mount;
3814                }
3815        }
3816
3817        if (IS_EXT2_SB(sb)) {
3818                if (ext2_feature_set_ok(sb))
3819                        ext4_msg(sb, KERN_INFO, "mounting ext2 file system "
3820                                 "using the ext4 subsystem");
3821                else {
3822                        /*
3823                         * If we're probing be silent, if this looks like
3824                         * it's actually an ext[34] filesystem.
3825                         */
3826                        if (silent && ext4_feature_set_ok(sb, sb_rdonly(sb)))
3827                                goto failed_mount;
3828                        ext4_msg(sb, KERN_ERR, "couldn't mount as ext2 due "
3829                                 "to feature incompatibilities");
3830                        goto failed_mount;
3831                }
3832        }
3833
3834        if (IS_EXT3_SB(sb)) {
3835                if (ext3_feature_set_ok(sb))
3836                        ext4_msg(sb, KERN_INFO, "mounting ext3 file system "
3837                                 "using the ext4 subsystem");
3838                else {
3839                        /*
3840                         * If we're probing be silent, if this looks like
3841                         * it's actually an ext4 filesystem.
3842                         */
3843                        if (silent && ext4_feature_set_ok(sb, sb_rdonly(sb)))
3844                                goto failed_mount;
3845                        ext4_msg(sb, KERN_ERR, "couldn't mount as ext3 due "
3846                                 "to feature incompatibilities");
3847                        goto failed_mount;
3848                }
3849        }
3850
3851        /*
3852         * Check feature flags regardless of the revision level, since we
3853         * previously didn't change the revision level when setting the flags,
3854         * so there is a chance incompat flags are set on a rev 0 filesystem.
3855         */
3856        if (!ext4_feature_set_ok(sb, (sb_rdonly(sb))))
3857                goto failed_mount;
3858
3859        blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
3860        if (blocksize < EXT4_MIN_BLOCK_SIZE ||
3861            blocksize > EXT4_MAX_BLOCK_SIZE) {
3862                ext4_msg(sb, KERN_ERR,
3863                       "Unsupported filesystem blocksize %d (%d log_block_size)",
3864                         blocksize, le32_to_cpu(es->s_log_block_size));
3865                goto failed_mount;
3866        }
3867        if (le32_to_cpu(es->s_log_block_size) >
3868            (EXT4_MAX_BLOCK_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) {
3869                ext4_msg(sb, KERN_ERR,
3870                         "Invalid log block size: %u",
3871                         le32_to_cpu(es->s_log_block_size));
3872                goto failed_mount;
3873        }
3874        if (le32_to_cpu(es->s_log_cluster_size) >
3875            (EXT4_MAX_CLUSTER_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) {
3876                ext4_msg(sb, KERN_ERR,
3877                         "Invalid log cluster size: %u",
3878                         le32_to_cpu(es->s_log_cluster_size));
3879                goto failed_mount;
3880        }
3881
3882        if (le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) > (blocksize / 4)) {
3883                ext4_msg(sb, KERN_ERR,
3884                         "Number of reserved GDT blocks insanely large: %d",
3885                         le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks));
3886                goto failed_mount;
3887        }
3888
3889        if (sbi->s_mount_opt & EXT4_MOUNT_DAX) {
3890                static bool printed = false;
3891                if (ext4_has_feature_inline_data(sb)) {
3892                        ext4_msg(sb, KERN_ERR, "Cannot use DAX on a filesystem"
3893                                        " that may contain inline data");
3894                        goto failed_mount;
3895                }
3896                if (!bdev_dax_supported(sb->s_bdev, blocksize)) {
3897                        ext4_msg(sb, KERN_ERR,
3898                                "DAX unsupported by block device.");
3899                        goto failed_mount;
3900                }
3901                if (!printed) {
3902                        mark_tech_preview("ext4 direct access (dax)", NULL);
3903                        printed = true;
3904                }
3905        }
3906
3907        if (ext4_has_feature_encrypt(sb) && es->s_encryption_level) {
3908                ext4_msg(sb, KERN_ERR, "Unsupported encryption level %d",
3909                         es->s_encryption_level);
3910                goto failed_mount;
3911        }
3912
3913        if (sb->s_blocksize != blocksize) {
3914                /* Validate the filesystem blocksize */
3915                if (!sb_set_blocksize(sb, blocksize)) {
3916                        ext4_msg(sb, KERN_ERR, "bad block size %d",
3917                                        blocksize);
3918                        goto failed_mount;
3919                }
3920
3921                brelse(bh);
3922                logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE;
3923                offset = do_div(logical_sb_block, blocksize);
3924                bh = sb_bread_unmovable(sb, logical_sb_block);
3925                if (!bh) {
3926                        ext4_msg(sb, KERN_ERR,
3927                               "Can't read superblock on 2nd try");
3928                        goto failed_mount;
3929                }
3930                es = (struct ext4_super_block *)(bh->b_data + offset);
3931                sbi->s_es = es;
3932                if (es->s_magic != cpu_to_le16(EXT4_SUPER_MAGIC)) {
3933                        ext4_msg(sb, KERN_ERR,
3934                               "Magic mismatch, very weird!");
3935                        goto failed_mount;
3936                }
3937        }
3938
3939        has_huge_files = ext4_has_feature_huge_file(sb);
3940        sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits,
3941                                                      has_huge_files);
3942        sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files);
3943
3944        if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) {
3945                sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE;
3946                sbi->s_first_ino = EXT4_GOOD_OLD_FIRST_INO;
3947        } else {
3948                sbi->s_inode_size = le16_to_cpu(es->s_inode_size);
3949                sbi->s_first_ino = le32_to_cpu(es->s_first_ino);
3950                if (sbi->s_first_ino < EXT4_GOOD_OLD_FIRST_INO) {
3951                        ext4_msg(sb, KERN_ERR, "invalid first ino: %u",
3952                                 sbi->s_first_ino);
3953                        goto failed_mount;
3954                }
3955                if ((sbi->s_inode_size < EXT4_GOOD_OLD_INODE_SIZE) ||
3956                    (!is_power_of_2(sbi->s_inode_size)) ||
3957                    (sbi->s_inode_size > blocksize)) {
3958                        ext4_msg(sb, KERN_ERR,
3959                               "unsupported inode size: %d",
3960                               sbi->s_inode_size);
3961                        goto failed_mount;
3962                }
3963                if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE)
3964                        sb->s_time_gran = 1 << (EXT4_EPOCH_BITS - 2);
3965        }
3966
3967        sbi->s_desc_size = le16_to_cpu(es->s_desc_size);
3968        if (ext4_has_feature_64bit(sb)) {
3969                if (sbi->s_desc_size < EXT4_MIN_DESC_SIZE_64BIT ||
3970                    sbi->s_desc_size > EXT4_MAX_DESC_SIZE ||
3971                    !is_power_of_2(sbi->s_desc_size)) {
3972                        ext4_msg(sb, KERN_ERR,
3973                               "unsupported descriptor size %lu",
3974                               sbi->s_desc_size);
3975                        goto failed_mount;
3976                }
3977        } else
3978                sbi->s_desc_size = EXT4_MIN_DESC_SIZE;
3979
3980        sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group);
3981        sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group);
3982
3983        sbi->s_inodes_per_block = blocksize / EXT4_INODE_SIZE(sb);
3984        if (sbi->s_inodes_per_block == 0)
3985                goto cantfind_ext4;
3986        if (sbi->s_inodes_per_group < sbi->s_inodes_per_block ||
3987            sbi->s_inodes_per_group > blocksize * 8) {
3988                ext4_msg(sb, KERN_ERR, "invalid inodes per group: %lu\n",
3989                         sbi->s_blocks_per_group);
3990                goto failed_mount;
3991        }
3992        sbi->s_itb_per_group = sbi->s_inodes_per_group /
3993                                        sbi->s_inodes_per_block;
3994        sbi->s_desc_per_block = blocksize / EXT4_DESC_SIZE(sb);
3995        sbi->s_sbh = bh;
3996        sbi->s_mount_state = le16_to_cpu(es->s_state);
3997        sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb));
3998        sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb));
3999
4000        for (i = 0; i < 4; i++)

4001                sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
4002        sbi->s_def_hash_version = es->s_def_hash_version;
4003        if (ext4_has_feature_dir_index(sb)) {
4004                i = le32_to_cpu(es->s_flags);
4005                if (i & EXT2_FLAGS_UNSIGNED_HASH)
4006                        sbi->s_hash_unsigned = 3;
4007                else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) {
4008#ifdef __CHAR_UNSIGNED__
4009                        if (!sb_rdonly(sb))
4010                                es->s_flags |=
4011                                        cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH);
4012                        sbi->s_hash_unsigned = 3;
4013#else
4014                        if (!sb_rdonly(sb))
4015                                es->s_flags |=
4016                                        cpu_to_le32(EXT2_FLAGS_SIGNED_HASH);
4017#endif
4018                }
4019        }
4020
4021        /* Handle clustersize */
4022        clustersize = BLOCK_SIZE << le32_to_cpu(es->s_log_cluster_size);
4023        has_bigalloc = ext4_has_feature_bigalloc(sb);
4024        if (has_bigalloc) {
4025                if (clustersize < blocksize) {
4026                        ext4_msg(sb, KERN_ERR,
4027                                 "cluster size (%d) smaller than "
4028                                 "block size (%d)", clustersize, blocksize);
4029                        goto failed_mount;
4030                }
4031                sbi->s_cluster_bits = le32_to_cpu(es->s_log_cluster_size) -
4032                        le32_to_cpu(es->s_log_block_size);
4033                sbi->s_clusters_per_group =
4034                        le32_to_cpu(es->s_clusters_per_group);
4035                if (sbi->s_clusters_per_group > blocksize * 8) {
4036                        ext4_msg(sb, KERN_ERR,
4037                                 "#clusters per group too big: %lu",
4038                                 sbi->s_clusters_per_group);
4039                        goto failed_mount;
4040                }
4041                if (sbi->s_blocks_per_group !=
4042                    (sbi->s_clusters_per_group * (clustersize / blocksize))) {
4043                        ext4_msg(sb, KERN_ERR, "blocks per group (%lu) and "
4044                                 "clusters per group (%lu) inconsistent",
4045                                 sbi->s_blocks_per_group,
4046                                 sbi->s_clusters_per_group);
4047                        goto failed_mount;
4048                }
4049        } else {
4050                if (clustersize != blocksize) {
4051                        ext4_msg(sb, KERN_ERR,
4052                                 "fragment/cluster size (%d) != "
4053                                 "block size (%d)", clustersize, blocksize);
4054                        goto failed_mount;
4055                }
4056                if (sbi->s_blocks_per_group > blocksize * 8) {
4057                        ext4_msg(sb, KERN_ERR,
4058                                 "#blocks per group too big: %lu",
4059                                 sbi->s_blocks_per_group);
4060                        goto failed_mount;
4061                }
4062                sbi->s_clusters_per_group = sbi->s_blocks_per_group;
4063                sbi->s_cluster_bits = 0;
4064        }
4065        sbi->s_cluster_ratio = clustersize / blocksize;
4066
4067        /* Do we have standard group size of clustersize * 8 blocks ? */
4068        if (sbi->s_blocks_per_group == clustersize << 3)
4069                set_opt2(sb, STD_GROUP_SIZE);
4070
4071        /*
4072         * Test whether we have more sectors than will fit in sector_t,
4073         * and whether the max offset is addressable by the page cache.
4074         */
4075        err = generic_check_addressable(sb->s_blocksize_bits,
4076                                        ext4_blocks_count(es));
4077        if (err) {
4078                ext4_msg(sb, KERN_ERR, "filesystem"
4079                         " too large to mount safely on this system");
4080                goto failed_mount;
4081        }
4082
4083        if (EXT4_BLOCKS_PER_GROUP(sb) == 0)
4084                goto cantfind_ext4;
4085
4086        /* check blocks count against device size */
4087        blocks_count = sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits;
4088        if (blocks_count && ext4_blocks_count(es) > blocks_count) {
4089                ext4_msg(sb, KERN_WARNING, "bad geometry: block count %llu "
4090                       "exceeds size of device (%llu blocks)",
4091                       ext4_blocks_count(es), blocks_count);
4092                goto failed_mount;
4093        }
4094
4095        /*
4096         * It makes no sense for the first data block to be beyond the end
4097         * of the filesystem.
4098         */
4099        if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) {
4100                ext4_msg(sb, KERN_WARNING, "bad geometry: first data "
4101                         "block %u is beyond end of filesystem (%llu)",
4102                         le32_to_cpu(es->s_first_data_block),
4103                         ext4_blocks_count(es));
4104                goto failed_mount;
4105        }
4106        if ((es->s_first_data_block == 0) && (es->s_log_block_size == 0) &&
4107            (sbi->s_cluster_ratio == 1)) {
4108                ext4_msg(sb, KERN_WARNING, "bad geometry: first data "
4109                         "block is 0 with a 1k block and cluster size");
4110                goto failed_mount;
4111        }
4112
4113        blocks_count = (ext4_blocks_count(es) -
4114                        le32_to_cpu(es->s_first_data_block) +
4115                        EXT4_BLOCKS_PER_GROUP(sb) - 1);
4116        do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb));
4117        if (blocks_count > ((uint64_t)1<<32) - EXT4_DESC_PER_BLOCK(sb)) {
4118                ext4_msg(sb, KERN_WARNING, "groups count too large: %u "
4119                       "(block count %llu, first data block %u, "
4120                       "blocks per group %lu)", sbi->s_groups_count,
4121                       ext4_blocks_count(es),
4122                       le32_to_cpu(es->s_first_data_block),
4123                       EXT4_BLOCKS_PER_GROUP(sb));
4124                goto failed_mount;
4125        }
4126        sbi->s_groups_count = blocks_count;
4127        sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count,
4128                        (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb)));
4129        if (((u64)sbi->s_groups_count * sbi->s_inodes_per_group) !=
4130            le32_to_cpu(es->s_inodes_count)) {
4131                ext4_msg(sb, KERN_ERR, "inodes count not valid: %u vs %llu",
4132                         le32_to_cpu(es->s_inodes_count),
4133                         ((u64)sbi->s_groups_count * sbi->s_inodes_per_group));
4134                ret = -EINVAL;
4135                goto failed_mount;
4136        }
4137        db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
4138                   EXT4_DESC_PER_BLOCK(sb);
4139        if (ext4_has_feature_meta_bg(sb)) {
4140                if (le32_to_cpu(es->s_first_meta_bg) > db_count) {
4141                        ext4_msg(sb, KERN_WARNING,
4142                                 "first meta block group too large: %u "
4143                                 "(group descriptor block count %u)",
4144                                 le32_to_cpu(es->s_first_meta_bg), db_count);
4145                        goto failed_mount;
4146                }
4147        }
4148        sbi->s_group_desc = kvmalloc_array(db_count,
4149                                           sizeof(struct buffer_head *),
4150                                           GFP_KERNEL);
4151        if (sbi->s_group_desc == NULL) {
4152                ext4_msg(sb, KERN_ERR, "not enough memory");
4153                ret = -ENOMEM;
4154                goto failed_mount;
4155        }
4156
4157        bgl_lock_init(sbi->s_blockgroup_lock);
4158
4159        /* Pre-read the descriptors into the buffer cache */
4160        for (i = 0; i < db_count; i++) {
4161                block = descriptor_loc(sb, logical_sb_block, i);
4162                sb_breadahead(sb, block);
4163        }
4164
4165        for (i = 0; i < db_count; i++) {
4166                block = descriptor_loc(sb, logical_sb_block, i);
4167                sbi->s_group_desc[i] = sb_bread_unmovable(sb, block);
4168                if (!sbi->s_group_desc[i]) {
4169                        ext4_msg(sb, KERN_ERR,
4170                               "can't read group descriptor %d", i);
4171                        db_count = i;
4172                        goto failed_mount2;
4173                }
4174        }
4175        sbi->s_gdb_count = db_count;
4176        if (!ext4_check_descriptors(sb, logical_sb_block, &first_not_zeroed)) {
4177                ext4_msg(sb, KERN_ERR, "group descriptors corrupted!");
4178                ret = -EFSCORRUPTED;
4179                goto failed_mount2;
4180        }
4181
4182        timer_setup(&sbi->s_err_report, print_daily_error_info, 0);
4183
4184        /* Register extent status tree shrinker */
4185        if (ext4_es_register_shrinker(sbi))
4186                goto failed_mount3;
4187
4188        sbi->s_stripe = ext4_get_stripe_size(sbi);
4189        sbi->s_extent_max_zeroout_kb = 32;
4190
4191        /*
4192         * set up enough so that it can read an inode
4193         */
4194        sb->s_op = &ext4_sops;
4195        sb->s_export_op = &ext4_export_ops;
4196        sb->s_xattr = ext4_xattr_handlers;
4197#ifdef CONFIG_EXT4_FS_ENCRYPTION
4198        sb->s_cop = &ext4_cryptops;
4199#endif
4200#ifdef CONFIG_QUOTA
4201        sb->dq_op = &ext4_quota_operations;
4202        if (ext4_has_feature_quota(sb))
4203                sb->s_qcop = &dquot_quotactl_sysfile_ops;
4204        else
4205                sb->s_qcop = &ext4_qctl_operations;
4206        sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ;
4207#endif
4208        memcpy(&sb->s_uuid, es->s_uuid, sizeof(es->s_uuid));
4209
4210        INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
4211        mutex_init(&sbi->s_orphan_lock);
4212
4213        sb->s_root = NULL;
4214
4215        needs_recovery = (es->s_last_orphan != 0 ||
4216                          ext4_has_feature_journal_needs_recovery(sb));
4217
4218        if (ext4_has_feature_mmp(sb) && !sb_rdonly(sb))
4219                if (ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block)))
4220                        goto failed_mount3a;
4221
4222        /*
4223         * The first inode we look at is the journal inode.  Don't try
4224         * root first: it may be modified in the journal!
4225         */
4226        if (!test_opt(sb, NOLOAD) && ext4_has_feature_journal(sb)) {
4227                err = ext4_load_journal(sb, es, journal_devnum);
4228                if (err)
4229                        goto failed_mount3a;
4230        } else if (test_opt(sb, NOLOAD) && !sb_rdonly(sb) &&
4231                   ext4_has_feature_journal_needs_recovery(sb)) {
4232                ext4_msg(sb, KERN_ERR, "required journal recovery "
4233                       "suppressed and not mounted read-only");
4234                goto failed_mount_wq;
4235        } else {
4236                /* Nojournal mode, all journal mount options are illegal */
4237                if (test_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM)) {
4238                        ext4_msg(sb, KERN_ERR, "can't mount with "
4239                                 "journal_checksum, fs mounted w/o journal");
4240                        goto failed_mount_wq;
4241                }
4242                if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
4243                        ext4_msg(sb, KERN_ERR, "can't mount with "
4244                                 "journal_async_commit, fs mounted w/o journal");
4245                        goto failed_mount_wq;
4246                }
4247                if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) {
4248                        ext4_msg(sb, KERN_ERR, "can't mount with "
4249                                 "commit=%lu, fs mounted w/o journal",
4250                                 sbi->s_commit_interval / HZ);
4251                        goto failed_mount_wq;
4252                }
4253                if (EXT4_MOUNT_DATA_FLAGS &
4254                    (sbi->s_mount_opt ^ sbi->s_def_mount_opt)) {
4255                        ext4_msg(sb, KERN_ERR, "can't mount with "
4256                                 "data=, fs mounted w/o journal");
4257                        goto failed_mount_wq;
4258                }
4259                sbi->s_def_mount_opt &= ~EXT4_MOUNT_JOURNAL_CHECKSUM;
4260                clear_opt(sb, JOURNAL_CHECKSUM);
4261                clear_opt(sb, DATA_FLAGS);
4262                sbi->s_journal = NULL;
4263                needs_recovery = 0;
4264                goto no_journal;
4265        }
4266
4267        if (ext4_has_feature_64bit(sb) &&
4268            !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0,
4269                                       JBD2_FEATURE_INCOMPAT_64BIT)) {
4270                ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature");
4271                goto failed_mount_wq;
4272        }
4273
4274        if (!set_journal_csum_feature_set(sb)) {
4275                ext4_msg(sb, KERN_ERR, "Failed to set journal checksum "
4276                         "feature set");
4277                goto failed_mount_wq;
4278        }
4279
4280        /* We have now updated the journal if required, so we can
4281         * validate the data journaling mode. */
4282        switch (test_opt(sb, DATA_FLAGS)) {
4283        case 0:
4284                /* No mode set, assume a default based on the journal
4285                 * capabilities: ORDERED_DATA if the journal can
4286                 * cope, else JOURNAL_DATA
4287                 */
4288                if (jbd2_journal_check_available_features
4289                    (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) {
4290                        set_opt(sb, ORDERED_DATA);
4291                        sbi->s_def_mount_opt |= EXT4_MOUNT_ORDERED_DATA;
4292                } else {
4293                        set_opt(sb, JOURNAL_DATA);
4294                        sbi->s_def_mount_opt |= EXT4_MOUNT_JOURNAL_DATA;
4295                }
4296                break;
4297
4298        case EXT4_MOUNT_ORDERED_DATA:
4299        case EXT4_MOUNT_WRITEBACK_DATA:
4300                if (!jbd2_journal_check_available_features
4301                    (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) {
4302                        ext4_msg(sb, KERN_ERR, "Journal does not support "
4303                               "requested data journaling mode");
4304                        goto failed_mount_wq;
4305                }
4306        default:
4307                break;
4308        }
4309
4310        if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA &&
4311            test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
4312                ext4_msg(sb, KERN_ERR, "can't mount with "
4313                        "journal_async_commit in data=ordered mode");
4314                goto failed_mount_wq;
4315        }
4316
4317        set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
4318
4319        sbi->s_journal->j_commit_callback = ext4_journal_commit_callback;
4320
4321no_journal:
4322        if (!test_opt(sb, NO_MBCACHE)) {
4323                sbi->s_ea_block_cache = ext4_xattr_create_cache();
4324                if (!sbi->s_ea_block_cache) {
4325                        ext4_msg(sb, KERN_ERR,
4326                                 "Failed to create ea_block_cache");
4327                        goto failed_mount_wq;
4328                }
4329
4330                if (ext4_has_feature_ea_inode(sb)) {
4331                        sbi->s_ea_inode_cache = ext4_xattr_create_cache();
4332                        if (!sbi->s_ea_inode_cache) {
4333                                ext4_msg(sb, KERN_ERR,
4334                                         "Failed to create ea_inode_cache");
4335                                goto failed_mount_wq;
4336                        }
4337                }
4338        }
4339
4340        if ((DUMMY_ENCRYPTION_ENABLED(sbi) || ext4_has_feature_encrypt(sb)) &&
4341            (blocksize != PAGE_SIZE)) {
4342                ext4_msg(sb, KERN_ERR,
4343                         "Unsupported blocksize for fs encryption");
4344                goto failed_mount_wq;
4345        }
4346
4347        if (DUMMY_ENCRYPTION_ENABLED(sbi) && !sb_rdonly(sb) &&
4348            !ext4_has_feature_encrypt(sb)) {
4349                ext4_set_feature_encrypt(sb);
4350                ext4_commit_super(sb, 1);
4351        }
4352
4353        /*
4354         * Get the # of file system overhead blocks from the
4355         * superblock if present.
4356         */
4357        if (es->s_overhead_clusters)
4358                sbi->s_overhead = le32_to_cpu(es->s_overhead_clusters);
4359        else {
4360                err = ext4_calculate_overhead(sb);
4361                if (err)
4362                        goto failed_mount_wq;
4363        }
4364
4365        /*
4366         * The maximum number of concurrent works can be high and
4367         * concurrency isn't really necessary.  Limit it to 1.
4368         */
4369        EXT4_SB(sb)->rsv_conversion_wq =
4370                alloc_workqueue("ext4-rsv-conversion", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
4371        if (!EXT4_SB(sb)->rsv_conversion_wq) {
4372                printk(KERN_ERR "EXT4-fs: failed to create workqueue\n");
4373                ret = -ENOMEM;
4374                goto failed_mount4;
4375        }
4376
4377        /*
4378         * The jbd2_journal_load will have done any necessary log recovery,
4379         * so we can safely mount the rest of the filesystem now.
4380         */
4381
4382        root = ext4_iget(sb, EXT4_ROOT_INO, EXT4_IGET_SPECIAL);
4383        if (IS_ERR(root)) {
4384                ext4_msg(sb, KERN_ERR, "get root inode failed");
4385                ret = PTR_ERR(root);
4386                root = NULL;
4387                goto failed_mount4;
4388        }
4389        if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
4390                ext4_msg(sb, KERN_ERR, "corrupt root inode, run e2fsck");
4391                iput(root);
4392                goto failed_mount4;
4393        }
4394        sb->s_root = d_make_root(root);
4395        if (!sb->s_root) {
4396                ext4_msg(sb, KERN_ERR, "get root dentry failed");
4397                ret = -ENOMEM;
4398                goto failed_mount4;
4399        }
4400
4401        ret = ext4_setup_super(sb, es, sb_rdonly(sb));
4402        if (ret == -EROFS) {
4403                sb->s_flags |= SB_RDONLY;
4404                ret = 0;
4405        } else if (ret)
4406                goto failed_mount4a;
4407
4408        ext4_clamp_want_extra_isize(sb);
4409
4410        ext4_set_resv_clusters(sb);
4411
4412        err = ext4_setup_system_zone(sb);
4413        if (err) {
4414                ext4_msg(sb, KERN_ERR, "failed to initialize system "
4415                         "zone (%d)", err);
4416                goto failed_mount4a;
4417        }
4418
4419        ext4_ext_init(sb);
4420        err = ext4_mb_init(sb);
4421        if (err) {
4422                ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)",
4423                         err);
4424                goto failed_mount5;
4425        }
4426
4427        block = ext4_count_free_clusters(sb);
4428        ext4_free_blocks_count_set(sbi->s_es, 
4429                                   EXT4_C2B(sbi, block));
4430        ext4_superblock_csum_set(sb);
4431        err = percpu_counter_init(&sbi->s_freeclusters_counter, block,
4432                                  GFP_KERNEL);
4433        if (!err) {
4434                unsigned long freei = ext4_count_free_inodes(sb);
4435                sbi->s_es->s_free_inodes_count = cpu_to_le32(freei);
4436                ext4_superblock_csum_set(sb);
4437                err = percpu_counter_init(&sbi->s_freeinodes_counter, freei,
4438                                          GFP_KERNEL);
4439        }
4440        if (!err)
4441                err = percpu_counter_init(&sbi->s_dirs_counter,
4442                                          ext4_count_dirs(sb), GFP_KERNEL);
4443        if (!err)
4444                err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0,
4445                                          GFP_KERNEL);
4446        if (!err)
4447                err = percpu_init_rwsem(&sbi->s_journal_flag_rwsem);
4448
4449        if (err) {
4450                ext4_msg(sb, KERN_ERR, "insufficient memory");
4451                goto failed_mount6;
4452        }
4453
4454        if (ext4_has_feature_flex_bg(sb))
4455                if (!ext4_fill_flex_info(sb)) {
4456                        ext4_msg(sb, KERN_ERR,
4457                               "unable to initialize "
4458                               "flex_bg meta info!");
4459                        goto failed_mount6;
4460                }
4461
4462        err = ext4_register_li_request(sb, first_not_zeroed);
4463        if (err)
4464                goto failed_mount6;
4465
4466        err = ext4_register_sysfs(sb);
4467        if (err)
4468                goto failed_mount7;
4469
4470#ifdef CONFIG_QUOTA
4471        /* Enable quota usage during mount. */
4472        if (ext4_has_feature_quota(sb) && !sb_rdonly(sb)) {
4473                err = ext4_enable_quotas(sb);
4474                if (err)
4475                        goto failed_mount8;
4476        }
4477#endif  /* CONFIG_QUOTA */
4478
4479        EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS;
4480        ext4_orphan_cleanup(sb, es);
4481        EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS;
4482        if (needs_recovery) {
4483                ext4_msg(sb, KERN_INFO, "recovery complete");
4484                ext4_mark_recovery_complete(sb, es);
4485        }
4486        if (EXT4_SB(sb)->s_journal) {
4487                if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
4488                        descr = " journalled data mode";
4489                else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
4490                        descr = " ordered data mode";
4491                else
4492                        descr = " writeback data mode";
4493        } else
4494                descr = "out journal";
4495
4496        if (test_opt(sb, DISCARD)) {
4497                struct request_queue *q = bdev_get_queue(sb->s_bdev);
4498                if (!blk_queue_discard(q))
4499                        ext4_msg(sb, KERN_WARNING,
4500                                 "mounting with \"discard\" option, but "
4501                                 "the device does not support discard");
4502        }
4503
4504        if (___ratelimit(&ext4_mount_msg_ratelimit, "EXT4-fs mount"))
4505                ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. "
4506                         "Opts: %.*s%s%s", descr,
4507                         (int) sizeof(sbi->s_es->s_mount_opts),
4508                         sbi->s_es->s_mount_opts,
4509                         *sbi->s_es->s_mount_opts ? "; " : "", orig_data);
4510
4511        if (es->s_error_count)
4512                mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */
4513
4514        /* Enable message ratelimiting. Default is 10 messages per 5 secs. */
4515        ratelimit_state_init(&sbi->s_err_ratelimit_state, 5 * HZ, 10);
4516        ratelimit_state_init(&sbi->s_warning_ratelimit_state, 5 * HZ, 10);
4517        ratelimit_state_init(&sbi->s_msg_ratelimit_state, 5 * HZ, 10);
4518
4519        kfree(orig_data);
4520        return 0;
4521
4522cantfind_ext4:
4523        if (!silent)
4524                ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem");
4525        goto failed_mount;
4526
4527#ifdef CONFIG_QUOTA
4528failed_mount8:
4529        ext4_unregister_sysfs(sb);
4530#endif
4531failed_mount7:
4532        ext4_unregister_li_request(sb);
4533failed_mount6:
4534        ext4_mb_release(sb);
4535        if (sbi->s_flex_groups)
4536                kvfree(sbi->s_flex_groups);
4537        percpu_counter_destroy(&sbi->s_freeclusters_counter);
4538        percpu_counter_destroy(&sbi->s_freeinodes_counter);
4539        percpu_counter_destroy(&sbi->s_dirs_counter);
4540        percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
4541        percpu_free_rwsem(&sbi->s_journal_flag_rwsem);
4542failed_mount5:
4543        ext4_ext_release(sb);
4544        ext4_release_system_zone(sb);
4545failed_mount4a:
4546        dput(sb->s_root);
4547        sb->s_root = NULL;
4548failed_mount4:
4549        ext4_msg(sb, KERN_ERR, "mount failed");
4550        if (EXT4_SB(sb)->rsv_conversion_wq)
4551                destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq);
4552failed_mount_wq:
4553        ext4_xattr_destroy_cache(sbi->s_ea_inode_cache);
4554        sbi->s_ea_inode_cache = NULL;
4555
4556        ext4_xattr_destroy_cache(sbi->s_ea_block_cache);
4557        sbi->s_ea_block_cache = NULL;
4558
4559        if (sbi->s_journal) {
4560                jbd2_journal_destroy(sbi->s_journal);
4561                sbi->s_journal = NULL;
4562        }
4563failed_mount3a:
4564        ext4_es_unregister_shrinker(sbi);
4565failed_mount3:
4566        del_timer_sync(&sbi->s_err_report);
4567        if (sbi->s_mmp_tsk)
4568                kthread_stop(sbi->s_mmp_tsk);
4569failed_mount2:
4570        for (i = 0; i < db_count; i++)
4571                brelse(sbi->s_group_desc[i]);
4572        kvfree(sbi->s_group_desc);
4573failed_mount:
4574        if (sbi->s_chksum_driver)
4575                crypto_free_shash(sbi->s_chksum_driver);
4576#ifdef CONFIG_QUOTA
4577        for (i = 0; i < EXT4_MAXQUOTAS; i++)
4578                kfree(sbi->s_qf_names[i]);
4579#endif
4580        ext4_blkdev_remove(sbi);
4581        brelse(bh);
4582out_fail:
4583        sb->s_fs_info = NULL;
4584        kfree(sbi->s_blockgroup_lock);
4585out_free_base:
4586        kfree(sbi);
4587        kfree(orig_data);
4588        fs_put_dax(dax_dev);
4589        return err ? err : ret;
4590}
4591
4592/*
4593 * Setup any per-fs journal parameters now.  We'll do this both on
4594 * initial mount, once the journal has been initialised but before we've
4595 * done any recovery; and again on any subsequent remount.
4596 */
4597static void ext4_init_journal_params(struct super_block *sb, journal_t *journal)
4598{
4599        struct ext4_sb_info *sbi = EXT4_SB(sb);
4600
4601        journal->j_commit_interval = sbi->s_commit_interval;
4602        journal->j_min_batch_time = sbi->s_min_batch_time;
4603        journal->j_max_batch_time = sbi->s_max_batch_time;
4604
4605        write_lock(&journal->j_state_lock);
4606        if (test_opt(sb, BARRIER))
4607                journal->j_flags |= JBD2_BARRIER;
4608        else
4609                journal->j_flags &= ~JBD2_BARRIER;
4610        if (test_opt(sb, DATA_ERR_ABORT))
4611                journal->j_flags |= JBD2_ABORT_ON_SYNCDATA_ERR;
4612        else
4613                journal->j_flags &= ~JBD2_ABORT_ON_SYNCDATA_ERR;
4614        write_unlock(&journal->j_state_lock);
4615}
4616
4617static struct inode *ext4_get_journal_inode(struct super_block *sb,
4618                                             unsigned int journal_inum)
4619{
4620        struct inode *journal_inode;
4621
4622        /*
4623         * Test for the existence of a valid inode on disk.  Bad things
4624         * happen if we iget() an unused inode, as the subsequent iput()
4625         * will try to delete it.
4626         */
4627        journal_inode = ext4_iget(sb, journal_inum, EXT4_IGET_SPECIAL);
4628        if (IS_ERR(journal_inode)) {
4629                ext4_msg(sb, KERN_ERR, "no journal found");
4630                return NULL;
4631        }
4632        if (!journal_inode->i_nlink) {
4633                make_bad_inode(journal_inode);
4634                iput(journal_inode);
4635                ext4_msg(sb, KERN_ERR, "journal inode is deleted");
4636                return NULL;
4637        }
4638
4639        jbd_debug(2, "Journal inode found at %p: %lld bytes\n",
4640                  journal_inode, journal_inode->i_size);
4641        if (!S_ISREG(journal_inode->i_mode)) {
4642                ext4_msg(sb, KERN_ERR, "invalid journal inode");
4643                iput(journal_inode);
4644                return NULL;
4645        }
4646        return journal_inode;
4647}
4648
4649static journal_t *ext4_get_journal(struct super_block *sb,
4650                                   unsigned int journal_inum)
4651{
4652        struct inode *journal_inode;
4653        journal_t *journal;
4654
4655        BUG_ON(!ext4_has_feature_journal(sb));
4656
4657        journal_inode = ext4_get_journal_inode(sb, journal_inum);
4658        if (!journal_inode)
4659                return NULL;
4660
4661        journal = jbd2_journal_init_inode(journal_inode);
4662        if (!journal) {
4663                ext4_msg(sb, KERN_ERR, "Could not load journal inode");
4664                iput(journal_inode);
4665                return NULL;
4666        }
4667        journal->j_private = sb;
4668        ext4_init_journal_params(sb, journal);
4669        return journal;
4670}
4671
4672static journal_t *ext4_get_dev_journal(struct super_block *sb,
4673                                       dev_t j_dev)
4674{
4675        struct buffer_head *bh;
4676        journal_t *journal;
4677        ext4_fsblk_t start;
4678        ext4_fsblk_t len;
4679        int hblock, blocksize;
4680        ext4_fsblk_t sb_block;
4681        unsigned long offset;
4682        struct ext4_super_block *es;
4683        struct block_device *bdev;
4684
4685        BUG_ON(!ext4_has_feature_journal(sb));
4686
4687        bdev = ext4_blkdev_get(j_dev, sb);
4688        if (bdev == NULL)
4689                return NULL;
4690
4691        blocksize = sb->s_blocksize;
4692        hblock = bdev_logical_block_size(bdev);
4693        if (blocksize < hblock) {
4694                ext4_msg(sb, KERN_ERR,
4695                        "blocksize too small for journal device");
4696                goto out_bdev;
4697        }
4698
4699        sb_block = EXT4_MIN_BLOCK_SIZE / blocksize;
4700        offset = EXT4_MIN_BLOCK_SIZE % blocksize;
4701        set_blocksize(bdev, blocksize);
4702        if (!(bh = __bread(bdev, sb_block, blocksize))) {
4703                ext4_msg(sb, KERN_ERR, "couldn't read superblock of "
4704                       "external journal");
4705                goto out_bdev;
4706        }
4707
4708        es = (struct ext4_super_block *) (bh->b_data + offset);
4709        if ((le16_to_cpu(es->s_magic) != EXT4_SUPER_MAGIC) ||
4710            !(le32_to_cpu(es->s_feature_incompat) &
4711              EXT4_FEATURE_INCOMPAT_JOURNAL_DEV)) {
4712                ext4_msg(sb, KERN_ERR, "external journal has "
4713                                        "bad superblock");
4714                brelse(bh);
4715                goto out_bdev;
4716        }
4717
4718        if ((le32_to_cpu(es->s_feature_ro_compat) &
4719             EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) &&
4720            es->s_checksum != ext4_superblock_csum(sb, es)) {
4721                ext4_msg(sb, KERN_ERR, "external journal has "
4722                                       "corrupt superblock");
4723                brelse(bh);
4724                goto out_bdev;
4725        }
4726
4727        if (memcmp(EXT4_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) {
4728                ext4_msg(sb, KERN_ERR, "journal UUID does not match");
4729                brelse(bh);
4730                goto out_bdev;
4731        }
4732
4733        len = ext4_blocks_count(es);
4734        start = sb_block + 1;
4735        brelse(bh);     /* we're done with the superblock */
4736
4737        journal = jbd2_journal_init_dev(bdev, sb->s_bdev,
4738                                        start, len, blocksize);
4739        if (!journal) {
4740                ext4_msg(sb, KERN_ERR, "failed to create device journal");
4741                goto out_bdev;
4742        }
4743        journal->j_private = sb;
4744        ll_rw_block(REQ_OP_READ, REQ_META | REQ_PRIO, 1, &journal->j_sb_buffer);
4745        wait_on_buffer(journal->j_sb_buffer);
4746        if (!buffer_uptodate(journal->j_sb_buffer)) {
4747                ext4_msg(sb, KERN_ERR, "I/O error on journal device");
4748                goto out_journal;
4749        }
4750        if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) {
4751                ext4_msg(sb, KERN_ERR, "External journal has more than one "
4752                                        "user (unsupported) - %d",
4753                        be32_to_cpu(journal->j_superblock->s_nr_users));
4754                goto out_journal;
4755        }
4756        EXT4_SB(sb)->journal_bdev = bdev;
4757        ext4_init_journal_params(sb, journal);
4758        return journal;
4759
4760out_journal:
4761        jbd2_journal_destroy(journal);
4762out_bdev:
4763        ext4_blkdev_put(bdev);
4764        return NULL;
4765}
4766
4767static int ext4_load_journal(struct super_block *sb,
4768                             struct ext4_super_block *es,
4769                             unsigned long journal_devnum)
4770{
4771        journal_t *journal;
4772        unsigned int journal_inum = le32_to_cpu(es->s_journal_inum);
4773        dev_t journal_dev;
4774        int err = 0;
4775        int really_read_only;
4776
4777        BUG_ON(!ext4_has_feature_journal(sb));
4778
4779        if (journal_devnum &&
4780            journal_devnum != le32_to_cpu(es->s_journal_dev)) {
4781                ext4_msg(sb, KERN_INFO, "external journal device major/minor "
4782                        "numbers have changed");
4783                journal_dev = new_decode_dev(journal_devnum);
4784        } else
4785                journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev));
4786
4787        really_read_only = bdev_read_only(sb->s_bdev);
4788
4789        /*
4790         * Are we loading a blank journal or performing recovery after a
4791         * crash?  For recovery, we need to check in advance whether we
4792         * can get read-write access to the device.
4793         */
4794        if (ext4_has_feature_journal_needs_recovery(sb)) {
4795                if (sb_rdonly(sb)) {
4796                        ext4_msg(sb, KERN_INFO, "INFO: recovery "
4797                                        "required on readonly filesystem");
4798                        if (really_read_only) {
4799                                ext4_msg(sb, KERN_ERR, "write access "
4800                                        "unavailable, cannot proceed "
4801                                        "(try mounting with noload)");
4802                                return -EROFS;
4803                        }
4804                        ext4_msg(sb, KERN_INFO, "write access will "
4805                               "be enabled during recovery");
4806                }
4807        }
4808
4809        if (journal_inum && journal_dev) {
4810                ext4_msg(sb, KERN_ERR, "filesystem has both journal "
4811                       "and inode journals!");
4812                return -EINVAL;
4813        }
4814
4815        if (journal_inum) {
4816                if (!(journal = ext4_get_journal(sb, journal_inum)))
4817                        return -EINVAL;
4818        } else {
4819                if (!(journal = ext4_get_dev_journal(sb, journal_dev)))
4820                        return -EINVAL;
4821        }
4822
4823        if (!(journal->j_flags & JBD2_BARRIER))
4824                ext4_msg(sb, KERN_INFO, "barriers disabled");
4825
4826        if (!ext4_has_feature_journal_needs_recovery(sb))
4827                err = jbd2_journal_wipe(journal, !really_read_only);
4828        if (!err) {
4829                char *save = kmalloc(EXT4_S_ERR_LEN, GFP_KERNEL);
4830                if (save)
4831                        memcpy(save, ((char *) es) +
4832                               EXT4_S_ERR_START, EXT4_S_ERR_LEN);
4833                err = jbd2_journal_load(journal);
4834                if (save)
4835                        memcpy(((char *) es) + EXT4_S_ERR_START,
4836                               save, EXT4_S_ERR_LEN);
4837                kfree(save);
4838        }
4839
4840        if (err) {
4841                ext4_msg(sb, KERN_ERR, "error loading journal");
4842                jbd2_journal_destroy(journal);
4843                return err;
4844        }
4845
4846        EXT4_SB(sb)->s_journal = journal;
4847        ext4_clear_journal_err(sb, es);
4848
4849        if (!really_read_only && journal_devnum &&
4850            journal_devnum != le32_to_cpu(es->s_journal_dev)) {
4851                es->s_journal_dev = cpu_to_le32(journal_devnum);
4852
4853                /* Make sure we flush the recovery flag to disk. */
4854                ext4_commit_super(sb, 1);
4855        }
4856
4857        return 0;
4858}
4859
4860static int ext4_commit_super(struct super_block *sb, int sync)
4861{
4862        struct ext4_super_block *es = EXT4_SB(sb)->s_es;
4863        struct buffer_head *sbh = EXT4_SB(sb)->s_sbh;
4864        int error = 0;
4865
4866        if (!sbh || block_device_ejected(sb))
4867                return error;
4868
4869        /*
4870         * The superblock bh should be mapped, but it might not be if the
4871         * device was hot-removed. Not much we can do but fail the I/O.
4872         */
4873        if (!buffer_mapped(sbh))
4874                return error;
4875
4876        /*
4877         * If the file system is mounted read-only, don't update the
4878         * superblock write time.  This avoids updating the superblock
4879         * write time when we are mounting the root file system
4880         * read/only but we need to replay the journal; at that point,
4881         * for people who are east of GMT and who make their clock
4882         * tick in localtime for Windows bug-for-bug compatibility,
4883         * the clock is set in the future, and this will cause e2fsck
4884         * to complain and force a full file system check.
4885         */
4886        if (!(sb->s_flags & SB_RDONLY))
4887                ext4_update_tstamp(es, s_wtime);
4888        if (sb->s_bdev->bd_part)
4889                es->s_kbytes_written =
4890                        cpu_to_le64(EXT4_SB(sb)->s_kbytes_written +
4891                            ((part_stat_read(sb->s_bdev->bd_part,
4892                                             sectors[STAT_WRITE]) -
4893                              EXT4_SB(sb)->s_sectors_written_start) >> 1));
4894        else
4895                es->s_kbytes_written =
4896                        cpu_to_le64(EXT4_SB(sb)->s_kbytes_written);
4897        if (percpu_counter_initialized(&EXT4_SB(sb)->s_freeclusters_counter))
4898                ext4_free_blocks_count_set(es,
4899                        EXT4_C2B(EXT4_SB(sb), percpu_counter_sum_positive(
4900                                &EXT4_SB(sb)->s_freeclusters_counter)));
4901        if (percpu_counter_initialized(&EXT4_SB(sb)->s_freeinodes_counter))
4902                es->s_free_inodes_count =
4903                        cpu_to_le32(percpu_counter_sum_positive(
4904                                &EXT4_SB(sb)->s_freeinodes_counter));
4905        BUFFER_TRACE(sbh, "marking dirty");
4906        ext4_superblock_csum_set(sb);
4907        if (sync)
4908                lock_buffer(sbh);
4909        if (buffer_write_io_error(sbh) || !buffer_uptodate(sbh)) {
4910                /*
4911                 * Oh, dear.  A previous attempt to write the
4912                 * superblock failed.  This could happen because the
4913                 * USB device was yanked out.  Or it could happen to
4914                 * be a transient write error and maybe the block will
4915                 * be remapped.  Nothing we can do but to retry the
4916                 * write and hope for the best.
4917                 */
4918                ext4_msg(sb, KERN_ERR, "previous I/O error to "
4919                       "superblock detected");
4920                clear_buffer_write_io_error(sbh);
4921                set_buffer_uptodate(sbh);
4922        }
4923        mark_buffer_dirty(sbh);
4924        if (sync) {
4925                unlock_buffer(sbh);
4926                error = __sync_dirty_buffer(sbh,
4927                        REQ_SYNC | (test_opt(sb, BARRIER) ? REQ_FUA : 0));
4928                if (buffer_write_io_error(sbh)) {
4929                        ext4_msg(sb, KERN_ERR, "I/O error while writing "
4930                               "superblock");
4931                        clear_buffer_write_io_error(sbh);
4932                        set_buffer_uptodate(sbh);
4933                }
4934        }
4935        return error;
4936}
4937
4938/*
4939 * Have we just finished recovery?  If so, and if we are mounting (or
4940 * remounting) the filesystem readonly, then we will end up with a
4941 * consistent fs on disk.  Record that fact.
4942 */
4943static void ext4_mark_recovery_complete(struct super_block *sb,
4944                                        struct ext4_super_block *es)
4945{
4946        journal_t *journal = EXT4_SB(sb)->s_journal;
4947
4948        if (!ext4_has_feature_journal(sb)) {
4949                BUG_ON(journal != NULL);
4950                return;
4951        }
4952        jbd2_journal_lock_updates(journal);
4953        if (jbd2_journal_flush(journal) < 0)
4954                goto out;
4955
4956        if (ext4_has_feature_journal_needs_recovery(sb) && sb_rdonly(sb)) {
4957                ext4_clear_feature_journal_needs_recovery(sb);
4958                ext4_commit_super(sb, 1);
4959        }
4960
4961out:
4962        jbd2_journal_unlock_updates(journal);
4963}
4964
4965/*
4966 * If we are mounting (or read-write remounting) a filesystem whose journal
4967 * has recorded an error from a previous lifetime, move that error to the
4968 * main filesystem now.
4969 */
4970static void ext4_clear_journal_err(struct super_block *sb,
4971                                   struct ext4_super_block *es)
4972{
4973        journal_t *journal;
4974        int j_errno;
4975        const char *errstr;
4976
4977        BUG_ON(!ext4_has_feature_journal(sb));
4978
4979        journal = EXT4_SB(sb)->s_journal;
4980
4981        /*
4982         * Now check for any error status which may have been recorded in the
4983         * journal by a prior ext4_error() or ext4_abort()
4984         */
4985
4986        j_errno = jbd2_journal_errno(journal);
4987        if (j_errno) {
4988                char nbuf[16];
4989
4990                errstr = ext4_decode_error(sb, j_errno, nbuf);
4991                ext4_warning(sb, "Filesystem error recorded "
4992                             "from previous mount: %s", errstr);
4993                ext4_warning(sb, "Marking fs in need of filesystem check.");
4994
4995                EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
4996                es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
4997                ext4_commit_super(sb, 1);
4998
4999                jbd2_journal_clear_err(journal);
5000                jbd2_journal_update_sb_errno(journal);

5001        }
5002}
5003
5004/*
5005 * Force the running and committing transactions to commit,
5006 * and wait on the commit.
5007 */
5008int ext4_force_commit(struct super_block *sb)
5009{
5010        journal_t *journal;
5011
5012        if (sb_rdonly(sb))
5013                return 0;
5014
5015        journal = EXT4_SB(sb)->s_journal;
5016        return ext4_journal_force_commit(journal);
5017}
5018
5019static int ext4_sync_fs(struct super_block *sb, int wait)
5020{
5021        int ret = 0;
5022        tid_t target;
5023        bool needs_barrier = false;
5024        struct ext4_sb_info *sbi = EXT4_SB(sb);
5025
5026        if (unlikely(ext4_forced_shutdown(sbi)))
5027                return 0;
5028
5029        trace_ext4_sync_fs(sb, wait);
5030        flush_workqueue(sbi->rsv_conversion_wq);
5031        /*
5032         * Writeback quota in non-journalled quota case - journalled quota has
5033         * no dirty dquots
5034         */
5035        dquot_writeback_dquots(sb, -1);
5036        /*
5037         * Data writeback is possible w/o journal transaction, so barrier must
5038         * being sent at the end of the function. But we can skip it if
5039         * transaction_commit will do it for us.
5040         */
5041        if (sbi->s_journal) {
5042                target = jbd2_get_latest_transaction(sbi->s_journal);
5043                if (wait && sbi->s_journal->j_flags & JBD2_BARRIER &&
5044                    !jbd2_trans_will_send_data_barrier(sbi->s_journal, target))
5045                        needs_barrier = true;
5046
5047                if (jbd2_journal_start_commit(sbi->s_journal, &target)) {
5048                        if (wait)
5049                                ret = jbd2_log_wait_commit(sbi->s_journal,
5050                                                           target);
5051                }
5052        } else if (wait && test_opt(sb, BARRIER))
5053                needs_barrier = true;
5054        if (needs_barrier) {
5055                int err;
5056                err = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL);
5057                if (!ret)
5058                        ret = err;
5059        }
5060
5061        return ret;
5062}
5063
5064/*
5065 * LVM calls this function before a (read-only) snapshot is created.  This
5066 * gives us a chance to flush the journal completely and mark the fs clean.
5067 *
5068 * Note that only this function cannot bring a filesystem to be in a clean
5069 * state independently. It relies on upper layer to stop all data & metadata
5070 * modifications.
5071 */
5072static int ext4_freeze(struct super_block *sb)
5073{
5074        int error = 0;
5075        journal_t *journal;
5076
5077        if (sb_rdonly(sb))
5078                return 0;
5079
5080        journal = EXT4_SB(sb)->s_journal;
5081
5082        if (journal) {
5083                /* Now we set up the journal barrier. */
5084                jbd2_journal_lock_updates(journal);
5085
5086                /*
5087                 * Don't clear the needs_recovery flag if we failed to
5088                 * flush the journal.
5089                 */
5090                error = jbd2_journal_flush(journal);
5091                if (error < 0)
5092                        goto out;
5093
5094                /* Journal blocked and flushed, clear needs_recovery flag. */
5095                ext4_clear_feature_journal_needs_recovery(sb);
5096        }
5097
5098        error = ext4_commit_super(sb, 1);
5099out:
5100        if (journal)
5101                /* we rely on upper layer to stop further updates */
5102                jbd2_journal_unlock_updates(journal);
5103        return error;
5104}
5105
5106/*
5107 * Called by LVM after the snapshot is done.  We need to reset the RECOVER
5108 * flag here, even though the filesystem is not technically dirty yet.
5109 */
5110static int ext4_unfreeze(struct super_block *sb)
5111{
5112        if (sb_rdonly(sb) || ext4_forced_shutdown(EXT4_SB(sb)))
5113                return 0;
5114
5115        if (EXT4_SB(sb)->s_journal) {
5116                /* Reset the needs_recovery flag before the fs is unlocked. */
5117                ext4_set_feature_journal_needs_recovery(sb);
5118        }
5119
5120        ext4_commit_super(sb, 1);
5121        return 0;
5122}
5123
5124/*
5125 * Structure to save mount options for ext4_remount's benefit
5126 */
5127struct ext4_mount_options {
5128        unsigned long s_mount_opt;
5129        unsigned long s_mount_opt2;
5130        kuid_t s_resuid;
5131        kgid_t s_resgid;
5132        unsigned long s_commit_interval;
5133        u32 s_min_batch_time, s_max_batch_time;
5134#ifdef CONFIG_QUOTA
5135        int s_jquota_fmt;
5136        char *s_qf_names[EXT4_MAXQUOTAS];
5137#endif
5138};
5139
5140static int ext4_remount(struct super_block *sb, int *flags, char *data)
5141{
5142        struct ext4_super_block *es;
5143        struct ext4_sb_info *sbi = EXT4_SB(sb);
5144        unsigned long old_sb_flags;
5145        struct ext4_mount_options old_opts;
5146        int enable_quota = 0;
5147        ext4_group_t g;
5148        unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
5149        int err = 0;
5150#ifdef CONFIG_QUOTA
5151        int i, j;
5152        char *to_free[EXT4_MAXQUOTAS];
5153#endif
5154        char *orig_data = kstrdup(data, GFP_KERNEL);
5155
5156        if (data && !orig_data)
5157                return -ENOMEM;
5158
5159        /* Store the original options */
5160        old_sb_flags = sb->s_flags;
5161        old_opts.s_mount_opt = sbi->s_mount_opt;
5162        old_opts.s_mount_opt2 = sbi->s_mount_opt2;
5163        old_opts.s_resuid = sbi->s_resuid;
5164        old_opts.s_resgid = sbi->s_resgid;
5165        old_opts.s_commit_interval = sbi->s_commit_interval;
5166        old_opts.s_min_batch_time = sbi->s_min_batch_time;
5167        old_opts.s_max_batch_time = sbi->s_max_batch_time;
5168#ifdef CONFIG_QUOTA
5169        old_opts.s_jquota_fmt = sbi->s_jquota_fmt;
5170        for (i = 0; i < EXT4_MAXQUOTAS; i++)
5171                if (sbi->s_qf_names[i]) {
5172                        char *qf_name = get_qf_name(sb, sbi, i);
5173
5174                        old_opts.s_qf_names[i] = kstrdup(qf_name, GFP_KERNEL);
5175                        if (!old_opts.s_qf_names[i]) {
5176                                for (j = 0; j < i; j++)
5177                                        kfree(old_opts.s_qf_names[j]);
5178                                kfree(orig_data);
5179                                return -ENOMEM;
5180                        }
5181                } else
5182                        old_opts.s_qf_names[i] = NULL;
5183#endif
5184        if (sbi->s_journal && sbi->s_journal->j_task->io_context)
5185                journal_ioprio = sbi->s_journal->j_task->io_context->ioprio;
5186
5187        if (!parse_options(data, sb, NULL, &journal_ioprio, 1)) {
5188                err = -EINVAL;
5189                goto restore_opts;
5190        }
5191
5192        ext4_clamp_want_extra_isize(sb);
5193
5194        if ((old_opts.s_mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) ^
5195            test_opt(sb, JOURNAL_CHECKSUM)) {
5196                ext4_msg(sb, KERN_ERR, "changing journal_checksum "
5197                         "during remount not supported; ignoring");
5198                sbi->s_mount_opt ^= EXT4_MOUNT_JOURNAL_CHECKSUM;
5199        }
5200
5201        if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
5202                if (test_opt2(sb, EXPLICIT_DELALLOC)) {
5203                        ext4_msg(sb, KERN_ERR, "can't mount with "
5204                                 "both data=journal and delalloc");
5205                        err = -EINVAL;
5206                        goto restore_opts;
5207                }
5208                if (test_opt(sb, DIOREAD_NOLOCK)) {
5209                        ext4_msg(sb, KERN_ERR, "can't mount with "
5210                                 "both data=journal and dioread_nolock");
5211                        err = -EINVAL;
5212                        goto restore_opts;
5213                }
5214                if (test_opt(sb, DAX)) {
5215                        ext4_msg(sb, KERN_ERR, "can't mount with "
5216                                 "both data=journal and dax");
5217                        err = -EINVAL;
5218                        goto restore_opts;
5219                }
5220        } else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) {
5221                if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
5222                        ext4_msg(sb, KERN_ERR, "can't mount with "
5223                                "journal_async_commit in data=ordered mode");
5224                        err = -EINVAL;
5225                        goto restore_opts;
5226                }
5227        }
5228
5229        if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_NO_MBCACHE) {
5230                ext4_msg(sb, KERN_ERR, "can't enable nombcache during remount");
5231                err = -EINVAL;
5232                goto restore_opts;
5233        }
5234
5235        if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_DAX) {
5236                ext4_msg(sb, KERN_WARNING, "warning: refusing change of "
5237                        "dax flag with busy inodes while remounting");
5238                sbi->s_mount_opt ^= EXT4_MOUNT_DAX;
5239        }
5240
5241        if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED)
5242                ext4_abort(sb, "Abort forced by user");
5243
5244        sb->s_flags = (sb->s_flags & ~SB_POSIXACL) |
5245                (test_opt(sb, POSIX_ACL) ? SB_POSIXACL : 0);
5246
5247        es = sbi->s_es;
5248
5249        if (sbi->s_journal) {
5250                ext4_init_journal_params(sb, sbi->s_journal);
5251                set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
5252        }
5253
5254        if (*flags & SB_LAZYTIME)
5255                sb->s_flags |= SB_LAZYTIME;
5256
5257        if ((bool)(*flags & SB_RDONLY) != sb_rdonly(sb)) {
5258                if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) {
5259                        err = -EROFS;
5260                        goto restore_opts;
5261                }
5262
5263                if (*flags & SB_RDONLY) {
5264                        err = sync_filesystem(sb);
5265                        if (err < 0)
5266                                goto restore_opts;
5267                        err = dquot_suspend(sb, -1);
5268                        if (err < 0)
5269                                goto restore_opts;
5270
5271                        /*
5272                         * First of all, the unconditional stuff we have to do
5273                         * to disable replay of the journal when we next remount
5274                         */
5275                        sb->s_flags |= SB_RDONLY;
5276
5277                        /*
5278                         * OK, test if we are remounting a valid rw partition
5279                         * readonly, and if so set the rdonly flag and then
5280                         * mark the partition as valid again.
5281                         */
5282                        if (!(es->s_state & cpu_to_le16(EXT4_VALID_FS)) &&
5283                            (sbi->s_mount_state & EXT4_VALID_FS))
5284                                es->s_state = cpu_to_le16(sbi->s_mount_state);
5285
5286                        if (sbi->s_journal)
5287                                ext4_mark_recovery_complete(sb, es);
5288                        if (sbi->s_mmp_tsk)
5289                                kthread_stop(sbi->s_mmp_tsk);
5290                } else {
5291                        /* Make sure we can mount this feature set readwrite */
5292                        if (ext4_has_feature_readonly(sb) ||
5293                            !ext4_feature_set_ok(sb, 0)) {
5294                                err = -EROFS;
5295                                goto restore_opts;
5296                        }
5297                        /*
5298                         * Make sure the group descriptor checksums
5299                         * are sane.  If they aren't, refuse to remount r/w.
5300                         */
5301                        for (g = 0; g < sbi->s_groups_count; g++) {
5302                                struct ext4_group_desc *gdp =
5303                                        ext4_get_group_desc(sb, g, NULL);
5304
5305                                if (!ext4_group_desc_csum_verify(sb, g, gdp)) {
5306                                        ext4_msg(sb, KERN_ERR,
5307               "ext4_remount: Checksum for group %u failed (%u!=%u)",
5308                g, le16_to_cpu(ext4_group_desc_csum(sb, g, gdp)),
5309                                               le16_to_cpu(gdp->bg_checksum));
5310                                        err = -EFSBADCRC;
5311                                        goto restore_opts;
5312                                }
5313                        }
5314
5315                        /*
5316                         * If we have an unprocessed orphan list hanging
5317                         * around from a previously readonly bdev mount,
5318                         * require a full umount/remount for now.
5319                         */
5320                        if (es->s_last_orphan) {
5321                                ext4_msg(sb, KERN_WARNING, "Couldn't "
5322                                       "remount RDWR because of unprocessed "
5323                                       "orphan inode list.  Please "
5324                                       "umount/remount instead");
5325                                err = -EINVAL;
5326                                goto restore_opts;
5327                        }
5328
5329                        /*
5330                         * Mounting a RDONLY partition read-write, so reread
5331                         * and store the current valid flag.  (It may have
5332                         * been changed by e2fsck since we originally mounted
5333                         * the partition.)
5334                         */
5335                        if (sbi->s_journal)
5336                                ext4_clear_journal_err(sb, es);
5337                        sbi->s_mount_state = le16_to_cpu(es->s_state);
5338
5339                        err = ext4_setup_super(sb, es, 0);
5340                        if (err)
5341                                goto restore_opts;
5342
5343                        sb->s_flags &= ~SB_RDONLY;
5344                        if (ext4_has_feature_mmp(sb))
5345                                if (ext4_multi_mount_protect(sb,
5346                                                le64_to_cpu(es->s_mmp_block))) {
5347                                        err = -EROFS;
5348                                        goto restore_opts;
5349                                }
5350                        enable_quota = 1;
5351                }
5352        }
5353
5354        /*
5355         * Reinitialize lazy itable initialization thread based on
5356         * current settings
5357         */
5358        if (sb_rdonly(sb) || !test_opt(sb, INIT_INODE_TABLE))
5359                ext4_unregister_li_request(sb);
5360        else {
5361                ext4_group_t first_not_zeroed;
5362                first_not_zeroed = ext4_has_uninit_itable(sb);
5363                ext4_register_li_request(sb, first_not_zeroed);
5364        }
5365
5366        ext4_setup_system_zone(sb);
5367        if (sbi->s_journal == NULL && !(old_sb_flags & SB_RDONLY)) {
5368                err = ext4_commit_super(sb, 1);
5369                if (err)
5370                        goto restore_opts;
5371        }
5372
5373#ifdef CONFIG_QUOTA
5374        /* Release old quota file names */
5375        for (i = 0; i < EXT4_MAXQUOTAS; i++)
5376                kfree(old_opts.s_qf_names[i]);
5377        if (enable_quota) {
5378                if (sb_any_quota_suspended(sb))
5379                        dquot_resume(sb, -1);
5380                else if (ext4_has_feature_quota(sb)) {
5381                        err = ext4_enable_quotas(sb);
5382                        if (err)
5383                                goto restore_opts;
5384                }
5385        }
5386#endif
5387
5388        *flags = (*flags & ~SB_LAZYTIME) | (sb->s_flags & SB_LAZYTIME);
5389        ext4_msg(sb, KERN_INFO, "re-mounted. Opts: %s", orig_data);
5390        kfree(orig_data);
5391        return 0;
5392
5393restore_opts:
5394        sb->s_flags = old_sb_flags;
5395        sbi->s_mount_opt = old_opts.s_mount_opt;
5396        sbi->s_mount_opt2 = old_opts.s_mount_opt2;
5397        sbi->s_resuid = old_opts.s_resuid;
5398        sbi->s_resgid = old_opts.s_resgid;
5399        sbi->s_commit_interval = old_opts.s_commit_interval;
5400        sbi->s_min_batch_time = old_opts.s_min_batch_time;
5401        sbi->s_max_batch_time = old_opts.s_max_batch_time;
5402#ifdef CONFIG_QUOTA
5403        sbi->s_jquota_fmt = old_opts.s_jquota_fmt;
5404        for (i = 0; i < EXT4_MAXQUOTAS; i++) {
5405                to_free[i] = get_qf_name(sb, sbi, i);
5406                rcu_assign_pointer(sbi->s_qf_names[i], old_opts.s_qf_names[i]);
5407        }
5408        synchronize_rcu();
5409        for (i = 0; i < EXT4_MAXQUOTAS; i++)
5410                kfree(to_free[i]);
5411#endif
5412        kfree(orig_data);
5413        return err;
5414}
5415
5416#ifdef CONFIG_QUOTA
5417static int ext4_statfs_project(struct super_block *sb,
5418                               kprojid_t projid, struct kstatfs *buf)
5419{
5420        struct kqid qid;
5421        struct dquot *dquot;
5422        u64 limit;
5423        u64 curblock;
5424
5425        qid = make_kqid_projid(projid);
5426        dquot = dqget(sb, qid);
5427        if (IS_ERR(dquot))
5428                return PTR_ERR(dquot);
5429        spin_lock(&dquot->dq_dqb_lock);
5430
5431        limit = (dquot->dq_dqb.dqb_bsoftlimit ?
5432                 dquot->dq_dqb.dqb_bsoftlimit :
5433                 dquot->dq_dqb.dqb_bhardlimit) >> sb->s_blocksize_bits;
5434        if (limit && buf->f_blocks > limit) {
5435                curblock = (dquot->dq_dqb.dqb_curspace +
5436                            dquot->dq_dqb.dqb_rsvspace) >> sb->s_blocksize_bits;
5437                buf->f_blocks = limit;
5438                buf->f_bfree = buf->f_bavail =
5439                        (buf->f_blocks > curblock) ?
5440                         (buf->f_blocks - curblock) : 0;
5441        }
5442
5443        limit = dquot->dq_dqb.dqb_isoftlimit ?
5444                dquot->dq_dqb.dqb_isoftlimit :
5445                dquot->dq_dqb.dqb_ihardlimit;
5446        if (limit && buf->f_files > limit) {
5447                buf->f_files = limit;
5448                buf->f_ffree =
5449                        (buf->f_files > dquot->dq_dqb.dqb_curinodes) ?
5450                         (buf->f_files - dquot->dq_dqb.dqb_curinodes) : 0;
5451        }
5452
5453        spin_unlock(&dquot->dq_dqb_lock);
5454        dqput(dquot);
5455        return 0;
5456}
5457#endif
5458
5459static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
5460{
5461        struct super_block *sb = dentry->d_sb;
5462        struct ext4_sb_info *sbi = EXT4_SB(sb);
5463        struct ext4_super_block *es = sbi->s_es;
5464        ext4_fsblk_t overhead = 0, resv_blocks;
5465        u64 fsid;
5466        s64 bfree;
5467        resv_blocks = EXT4_C2B(sbi, atomic64_read(&sbi->s_resv_clusters));
5468
5469        if (!test_opt(sb, MINIX_DF))
5470                overhead = sbi->s_overhead;
5471
5472        buf->f_type = EXT4_SUPER_MAGIC;
5473        buf->f_bsize = sb->s_blocksize;
5474        buf->f_blocks = ext4_blocks_count(es) - EXT4_C2B(sbi, overhead);
5475        bfree = percpu_counter_sum_positive(&sbi->s_freeclusters_counter) -
5476                percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter);
5477        /* prevent underflow in case that few free space is available */
5478        buf->f_bfree = EXT4_C2B(sbi, max_t(s64, bfree, 0));
5479        buf->f_bavail = buf->f_bfree -
5480                        (ext4_r_blocks_count(es) + resv_blocks);
5481        if (buf->f_bfree < (ext4_r_blocks_count(es) + resv_blocks))
5482                buf->f_bavail = 0;
5483        buf->f_files = le32_to_cpu(es->s_inodes_count);
5484        buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter);
5485        buf->f_namelen = EXT4_NAME_LEN;
5486        fsid = le64_to_cpup((void *)es->s_uuid) ^
5487               le64_to_cpup((void *)es->s_uuid + sizeof(u64));
5488        buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL;
5489        buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL;
5490
5491#ifdef CONFIG_QUOTA
5492        if (ext4_test_inode_flag(dentry->d_inode, EXT4_INODE_PROJINHERIT) &&
5493            sb_has_quota_limits_enabled(sb, PRJQUOTA))
5494                ext4_statfs_project(sb, EXT4_I(dentry->d_inode)->i_projid, buf);
5495#endif
5496        return 0;
5497}
5498
5499
5500#ifdef CONFIG_QUOTA
5501
5502/*
5503 * Helper functions so that transaction is started before we acquire dqio_sem
5504 * to keep correct lock ordering of transaction > dqio_sem
5505 */
5506static inline struct inode *dquot_to_inode(struct dquot *dquot)
5507{
5508        return sb_dqopt(dquot->dq_sb)->files[dquot->dq_id.type];
5509}
5510
5511static int ext4_write_dquot(struct dquot *dquot)
5512{
5513        int ret, err;
5514        handle_t *handle;
5515        struct inode *inode;
5516
5517        inode = dquot_to_inode(dquot);
5518        handle = ext4_journal_start(inode, EXT4_HT_QUOTA,
5519                                    EXT4_QUOTA_TRANS_BLOCKS(dquot->dq_sb));
5520        if (IS_ERR(handle))
5521                return PTR_ERR(handle);
5522        ret = dquot_commit(dquot);
5523        err = ext4_journal_stop(handle);
5524        if (!ret)
5525                ret = err;
5526        return ret;
5527}
5528
5529static int ext4_acquire_dquot(struct dquot *dquot)
5530{
5531        int ret, err;
5532        handle_t *handle;
5533
5534        handle = ext4_journal_start(dquot_to_inode(dquot), EXT4_HT_QUOTA,
5535                                    EXT4_QUOTA_INIT_BLOCKS(dquot->dq_sb));
5536        if (IS_ERR(handle))
5537                return PTR_ERR(handle);
5538        ret = dquot_acquire(dquot);
5539        err = ext4_journal_stop(handle);
5540        if (!ret)
5541                ret = err;
5542        return ret;
5543}
5544
5545static int ext4_release_dquot(struct dquot *dquot)
5546{
5547        int ret, err;
5548        handle_t *handle;
5549
5550        handle = ext4_journal_start(dquot_to_inode(dquot), EXT4_HT_QUOTA,
5551                                    EXT4_QUOTA_DEL_BLOCKS(dquot->dq_sb));
5552        if (IS_ERR(handle)) {
5553                /* Release dquot anyway to avoid endless cycle in dqput() */
5554                dquot_release(dquot);
5555                return PTR_ERR(handle);
5556        }
5557        ret = dquot_release(dquot);
5558        err = ext4_journal_stop(handle);
5559        if (!ret)
5560                ret = err;
5561        return ret;
5562}
5563
5564static int ext4_mark_dquot_dirty(struct dquot *dquot)
5565{
5566        struct super_block *sb = dquot->dq_sb;
5567        struct ext4_sb_info *sbi = EXT4_SB(sb);
5568
5569        /* Are we journaling quotas? */
5570        if (ext4_has_feature_quota(sb) ||
5571            sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
5572                dquot_mark_dquot_dirty(dquot);
5573                return ext4_write_dquot(dquot);
5574        } else {
5575                return dquot_mark_dquot_dirty(dquot);
5576        }
5577}
5578
5579static int ext4_write_info(struct super_block *sb, int type)
5580{
5581        int ret, err;
5582        handle_t *handle;
5583
5584        /* Data block + inode block */
5585        handle = ext4_journal_start(d_inode(sb->s_root), EXT4_HT_QUOTA, 2);
5586        if (IS_ERR(handle))
5587                return PTR_ERR(handle);
5588        ret = dquot_commit_info(sb, type);
5589        err = ext4_journal_stop(handle);
5590        if (!ret)
5591                ret = err;
5592        return ret;
5593}
5594
5595/*
5596 * Turn on quotas during mount time - we need to find
5597 * the quota file and such...
5598 */
5599static int ext4_quota_on_mount(struct super_block *sb, int type)
5600{
5601        return dquot_quota_on_mount(sb, get_qf_name(sb, EXT4_SB(sb), type),
5602                                        EXT4_SB(sb)->s_jquota_fmt, type);
5603}
5604
/*
 * Set the lockdep subclass of @inode's i_data_sem to @subclass.  Callers in
 * this file pass I_DATA_SEM_QUOTA when an inode is about to be used as a
 * quota file and I_DATA_SEM_NORMAL when that use ends or fails.
 */
5605static void lockdep_set_quota_inode(struct inode *inode, int subclass)
5606{
5607        struct ext4_inode_info *ei = EXT4_I(inode);
5608
5609        /* The first argument of lockdep_set_subclass has to be
5610         * *exactly* the same as the argument to init_rwsem() --- in
5611         * this case, in init_once() --- or lockdep gets unhappy
5612         * because the name of the lock is set using the
5613         * stringification of the argument to init_rwsem().
5614         */
5615        (void) ei;      /* shut up clang warning if !CONFIG_LOCKDEP */
5616        lockdep_set_subclass(&ei->i_data_sem, subclass);
5617}
5618
5619/*
5620 * Standard function to be called on quota_on
5621 */
5622static int ext4_quota_on(struct super_block *sb, int type, int format_id,
5623                         const struct path *path)
5624{
5625        int err;
5626
5627        if (!test_opt(sb, QUOTA))
5628                return -EINVAL;
5629
5630        /* Quotafile not on the same filesystem? */
5631        if (path->dentry->d_sb != sb)
5632                return -EXDEV;
5633        /* Journaling quota? */
5634        if (EXT4_SB(sb)->s_qf_names[type]) {
5635                /* Quotafile not in fs root? */
5636                if (path->dentry->d_parent != sb->s_root)
5637                        ext4_msg(sb, KERN_WARNING,
5638                                "Quota file not on filesystem root. "
5639                                "Journaled quota will not work");
5640                sb_dqopt(sb)->flags |= DQUOT_NOLIST_DIRTY;
5641        } else {
5642                /*
5643                 * Clear the flag just in case mount options changed since
5644                 * last time.
5645                 */
5646                sb_dqopt(sb)->flags &= ~DQUOT_NOLIST_DIRTY;
5647        }
5648
5649        /*
5650         * When we journal data on quota file, we have to flush journal to see
5651         * all updates to the file when we bypass pagecache...
5652         */
5653        if (EXT4_SB(sb)->s_journal &&
5654            ext4_should_journal_data(d_inode(path->dentry))) {
5655                /*
5656                 * We don't need to lock updates but journal_flush() could
5657                 * otherwise be livelocked...
5658                 */
5659                jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
5660                err = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
5661                jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
5662                if (err)
5663                        return err;
5664        }
5665
5666        lockdep_set_quota_inode(path->dentry->d_inode, I_DATA_SEM_QUOTA);
5667        err = dquot_quota_on(sb, type, format_id, path);
5668        if (err) {
5669                lockdep_set_quota_inode(path->dentry->d_inode,
5670                                             I_DATA_SEM_NORMAL);
5671        } else {
5672                struct inode *inode = d_inode(path->dentry);
5673                handle_t *handle;
5674
5675                /*
5676                 * Set inode flags to prevent userspace from messing with quota
5677                 * files. If this fails, we return success anyway since quotas
5678                 * are already enabled and this is not a hard failure.
5679                 */
5680                inode_lock(inode);
5681                handle = ext4_journal_start(inode, EXT4_HT_QUOTA, 1);
5682                if (IS_ERR(handle))
5683                        goto unlock_inode;
5684                EXT4_I(inode)->i_flags |= EXT4_NOATIME_FL | EXT4_IMMUTABLE_FL;
5685                inode_set_flags(inode, S_NOATIME | S_IMMUTABLE,
5686                                S_NOATIME | S_IMMUTABLE);
5687                ext4_mark_inode_dirty(handle, inode);
5688                ext4_journal_stop(handle);
5689        unlock_inode:
5690                inode_unlock(inode);
5691        }
5692        return err;
5693}
5694
5695static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
5696                             unsigned int flags)
5697{
5698        int err;
5699        struct inode *qf_inode;
5700        unsigned long qf_inums[EXT4_MAXQUOTAS] = {
5701                le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum),
5702                le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum),
5703                le32_to_cpu(EXT4_SB(sb)->s_es->s_prj_quota_inum)
5704        };
5705
5706        BUG_ON(!ext4_has_feature_quota(sb));
5707
5708        if (!qf_inums[type])
5709                return -EPERM;
5710
5711        qf_inode = ext4_iget(sb, qf_inums[type], EXT4_IGET_SPECIAL);
5712        if (IS_ERR(qf_inode)) {
5713                ext4_error(sb, "Bad quota inode # %lu", qf_inums[type]);
5714                return PTR_ERR(qf_inode);
5715        }
5716
5717        /* Don't account quota for quota files to avoid recursion */
5718        qf_inode->i_flags |= S_NOQUOTA;
5719        lockdep_set_quota_inode(qf_inode, I_DATA_SEM_QUOTA);
5720        err = dquot_enable(qf_inode, type, format_id, flags);
5721        if (err)
5722                lockdep_set_quota_inode(qf_inode, I_DATA_SEM_NORMAL);
5723        iput(qf_inode);
5724
5725        return err;
5726}
5727
5728/* Enable usage tracking for all quota types. */
5729static int ext4_enable_quotas(struct super_block *sb)
5730{
5731        int type, err = 0;
5732        unsigned long qf_inums[EXT4_MAXQUOTAS] = {
5733                le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum),
5734                le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum),
5735                le32_to_cpu(EXT4_SB(sb)->s_es->s_prj_quota_inum)
5736        };
5737        bool quota_mopt[EXT4_MAXQUOTAS] = {
5738                test_opt(sb, USRQUOTA),
5739                test_opt(sb, GRPQUOTA),
5740                test_opt(sb, PRJQUOTA),
5741        };
5742
5743        sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NOLIST_DIRTY;
5744        for (type = 0; type < EXT4_MAXQUOTAS; type++) {
5745                if (qf_inums[type]) {
5746                        err = ext4_quota_enable(sb, type, QFMT_VFS_V1,
5747                                DQUOT_USAGE_ENABLED |
5748                                (quota_mopt[type] ? DQUOT_LIMITS_ENABLED : 0));
5749                        if (err) {
5750                                ext4_warning(sb,
5751                                        "Failed to enable quota tracking "
5752                                        "(type=%d, err=%d). Please run "
5753                                        "e2fsck to fix.", type, err);
5754                                for (type--; type >= 0; type--)
5755                                        dquot_quota_off(sb, type);
5756
5757                                return err;
5758                        }
5759                }
5760        }
5761        return 0;
5762}
5763
5764static int ext4_quota_off(struct super_block *sb, int type)
5765{
5766        struct inode *inode = sb_dqopt(sb)->files[type];
5767        handle_t *handle;
5768        int err;
5769
5770        /* Force all delayed allocation blocks to be allocated.
5771         * Caller already holds s_umount sem */
5772        if (test_opt(sb, DELALLOC))
5773                sync_filesystem(sb);
5774
5775        if (!inode || !igrab(inode))
5776                goto out;
5777
5778        err = dquot_quota_off(sb, type);
5779        if (err || ext4_has_feature_quota(sb))
5780                goto out_put;
5781
5782        inode_lock(inode);
5783        /*
5784         * Update modification times of quota files when userspace can
5785         * start looking at them. If we fail, we return success anyway since
5786         * this is not a hard failure and quotas are already disabled.
5787         */
5788        handle = ext4_journal_start(inode, EXT4_HT_QUOTA, 1);
5789        if (IS_ERR(handle))
5790                goto out_unlock;
5791        EXT4_I(inode)->i_flags &= ~(EXT4_NOATIME_FL | EXT4_IMMUTABLE_FL);
5792        inode_set_flags(inode, 0, S_NOATIME | S_IMMUTABLE);
5793        inode->i_mtime = inode->i_ctime = current_time(inode);
5794        ext4_mark_inode_dirty(handle, inode);
5795        ext4_journal_stop(handle);
5796out_unlock:
5797        inode_unlock(inode);
5798out_put:
5799        lockdep_set_quota_inode(inode, I_DATA_SEM_NORMAL);
5800        iput(inode);
5801        return err;
5802out:
5803        return dquot_quota_off(sb, type);
5804}
5805
5806/* Read data from quotafile - avoid pagecache and such because we cannot afford
5807 * acquiring the locks... As quota files are never truncated and quota code
5808 * itself serializes the operations (and no one else should touch the files)
5809 * we don't have to be afraid of races */
5810static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
5811                               size_t len, loff_t off)
5812{
5813        struct inode *inode = sb_dqopt(sb)->files[type];
5814        ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb);
5815        int offset = off & (sb->s_blocksize - 1);
5816        int tocopy;
5817        size_t toread;
5818        struct buffer_head *bh;
5819        loff_t i_size = i_size_read(inode);
5820
5821        if (off > i_size)
5822                return 0;
5823        if (off+len > i_size)
5824                len = i_size-off;
5825        toread = len;
5826        while (toread > 0) {
5827                tocopy = sb->s_blocksize - offset < toread ?
5828                                sb->s_blocksize - offset : toread;
5829                bh = ext4_bread(NULL, inode, blk, 0);
5830                if (IS_ERR(bh))
5831                        return PTR_ERR(bh);
5832                if (!bh)        /* A hole? */
5833                        memset(data, 0, tocopy);
5834                else
5835                        memcpy(data, bh->b_data+offset, tocopy);
5836                brelse(bh);
5837                offset = 0;
5838                toread -= tocopy;
5839                data += tocopy;
5840                blk++;
5841        }
5842        return len;
5843}
5844
5845/* Write to quotafile (we know the transaction is already started and has
5846 * enough credits) */
5847static ssize_t ext4_quota_write(struct super_block *sb, int type,
5848                                const char *data, size_t len, loff_t off)
5849{
5850        struct inode *inode = sb_dqopt(sb)->files[type];
5851        ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb);
5852        int err, offset = off & (sb->s_blocksize - 1);
5853        int retries = 0;
5854        struct buffer_head *bh;
5855        handle_t *handle = journal_current_handle();
5856
5857        if (EXT4_SB(sb)->s_journal && !handle) {
5858                ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)"
5859                        " cancelled because transaction is not started",
5860                        (unsigned long long)off, (unsigned long long)len);
5861                return -EIO;
5862        }
5863        /*
5864         * Since we account only one data block in transaction credits,
5865         * then it is impossible to cross a block boundary.
5866         */
5867        if (sb->s_blocksize - offset < len) {
5868                ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)"
5869                        " cancelled because not block aligned",
5870                        (unsigned long long)off, (unsigned long long)len);
5871                return -EIO;
5872        }
5873
5874        do {
5875                bh = ext4_bread(handle, inode, blk,
5876                                EXT4_GET_BLOCKS_CREATE |
5877                                EXT4_GET_BLOCKS_METADATA_NOFAIL);
5878        } while (IS_ERR(bh) && (PTR_ERR(bh) == -ENOSPC) &&
5879                 ext4_should_retry_alloc(inode->i_sb, &retries));
5880        if (IS_ERR(bh))
5881                return PTR_ERR(bh);
5882        if (!bh)
5883                goto out;
5884        BUFFER_TRACE(bh, "get write access");
5885        err = ext4_journal_get_write_access(handle, bh);
5886        if (err) {
5887                brelse(bh);
5888                return err;
5889        }
5890        lock_buffer(bh);
5891        memcpy(bh->b_data+offset, data, len);
5892        flush_dcache_page(bh->b_page);
5893        unlock_buffer(bh);
5894        err = ext4_handle_dirty_metadata(handle, NULL, bh);
5895        brelse(bh);
5896out:
5897        if (inode->i_size < off + len) {
5898                i_size_write(inode, off + len);
5899                EXT4_I(inode)->i_disksize = inode->i_size;
5900                ext4_mark_inode_dirty(handle, inode);
5901        }
5902        return len;
5903}
5904
5905static int ext4_get_next_id(struct super_block *sb, struct kqid *qid)
5906{
5907        const struct quota_format_ops   *ops;
5908
5909        if (!sb_has_quota_loaded(sb, qid->type))
5910                return -ESRCH;
5911        ops = sb_dqopt(sb)->ops[qid->type];
5912        if (!ops || !ops->get_next_id)
5913                return -ENOSYS;
5914        return dquot_get_next_id(sb, qid);
5915}
5916#endif
5917
5918static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
5919                       const char *dev_name, void *data)
5920{
5921        return mount_bdev(fs_type, flags, dev_name, data, ext4_fill_super);
5922}
5923
#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT2)
/* Register ext2_fs_type; a failure is only logged, never propagated. */
static inline void register_as_ext2(void)
{
        int err;

        err = register_filesystem(&ext2_fs_type);
        if (!err)
                return;

        printk(KERN_WARNING
               "EXT4-fs: Unable to register as ext2 (%d)\n", err);
}

/* Undo register_as_ext2(). */
static inline void unregister_as_ext2(void)
{
        unregister_filesystem(&ext2_fs_type);
}

/*
 * Return 1 when @sb carries no unknown ext2 incompat features and is either
 * read-only or free of unknown ext2 ro_compat features; 0 otherwise.
 */
static inline int ext2_feature_set_ok(struct super_block *sb)
{
        if (ext4_has_unknown_ext2_incompat_features(sb))
                return 0;

        return sb_rdonly(sb) ||
               !ext4_has_unknown_ext2_ro_compat_features(sb);
}
#else
/* ext2 is not handled by this driver in this configuration: no-op stubs. */
static inline void register_as_ext2(void) { }
static inline void unregister_as_ext2(void) { }
static inline int ext2_feature_set_ok(struct super_block *sb) { return 0; }
#endif
5953
5954static inline void register_as_ext3(void)
5955{
5956        int err = register_filesystem(&ext3_fs_type);
5957        if (err)
5958                printk(KERN_WARNING
5959                       "EXT4-fs: Unable to register as ext3 (%d)\n", err);
5960}
5961
5962static inline void unregister_as_ext3(void)
5963{
5964        unregister_filesystem(&ext3_fs_type);
5965}
5966
/*
 * Return 1 when @sb has the journal feature, carries no unknown ext3
 * incompat features and is either read-only or free of unknown ext3
 * ro_compat features; 0 otherwise.
 */
static inline int ext3_feature_set_ok(struct super_block *sb)
{
        if (ext4_has_unknown_ext3_incompat_features(sb) ||
            !ext4_has_feature_journal(sb))
                return 0;

        return sb_rdonly(sb) ||
               !ext4_has_unknown_ext3_ro_compat_features(sb);
}
5979
/*
 * Filesystem type descriptor for "ext4".  It is registered by ext4_init_fs()
 * and unregistered by ext4_exit_fs(); mounts are routed through ext4_mount()
 * and superblocks are torn down with kill_block_super().
 */
5980static struct file_system_type ext4_fs_type = {
5981        .owner          = THIS_MODULE,
5982        .name           = "ext4",
5983        .mount          = ext4_mount,
5984        .kill_sb        = kill_block_super,
5985        .fs_flags       = FS_REQUIRES_DEV,
5986};
5987MODULE_ALIAS_FS("ext4");
5988
5989/*
 * Shared across all ext4 file systems.  Each of the EXT4_WQ_HASH_SZ wait
 * queue heads is initialised with init_waitqueue_head() in ext4_init_fs().
 * The array has external linkage (it is not static).
 */
5990wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];
5991
5992static int __init ext4_init_fs(void)
5993{
5994        int i, err;
5995
5996        ratelimit_state_init(&ext4_mount_msg_ratelimit, 30 * HZ, 64);
5997        ext4_li_info = NULL;
5998        mutex_init(&ext4_li_mtx);
5999
6000        /* Build-time check for flags consistency */

6001        ext4_check_flag_values();
6002
6003        for (i = 0; i < EXT4_WQ_HASH_SZ; i++)
6004                init_waitqueue_head(&ext4__ioend_wq[i]);
6005
6006        err = ext4_init_es();
6007        if (err)
6008                return err;
6009
6010        err = ext4_init_pending();
6011        if (err)
6012                goto out6;
6013
6014        err = ext4_init_pageio();
6015        if (err)
6016                goto out5;
6017
6018        err = ext4_init_system_zone();
6019        if (err)
6020                goto out4;
6021
6022        err = ext4_init_sysfs();
6023        if (err)
6024                goto out3;
6025
6026        err = ext4_init_mballoc();
6027        if (err)
6028                goto out2;
6029        err = init_inodecache();
6030        if (err)
6031                goto out1;
6032        register_as_ext3();
6033        register_as_ext2();
6034        err = register_filesystem(&ext4_fs_type);
6035        if (err)
6036                goto out;
6037
6038        return 0;
6039out:
6040        unregister_as_ext2();
6041        unregister_as_ext3();
6042        destroy_inodecache();
6043out1:
6044        ext4_exit_mballoc();
6045out2:
6046        ext4_exit_sysfs();
6047out3:
6048        ext4_exit_system_zone();
6049out4:
6050        ext4_exit_pageio();
6051out5:
6052        ext4_exit_pending();
6053out6:
6054        ext4_exit_es();
6055
6056        return err;
6057}
6058
/*
 * Module exit.  Calls ext4_destroy_lazyinit_thread(), unregisters the
 * ext2/ext3 aliases and ext4 itself, then runs the exit routine matching
 * each subsystem that ext4_init_fs() initialised.
 */
6059static void __exit ext4_exit_fs(void)
6060{
6061        ext4_destroy_lazyinit_thread();
        /* Drop the filesystem registrations before tearing down subsystems. */
6062        unregister_as_ext2();
6063        unregister_as_ext3();
6064        unregister_filesystem(&ext4_fs_type);
6065        destroy_inodecache();
6066        ext4_exit_mballoc();
6067        ext4_exit_sysfs();
6068        ext4_exit_system_zone();
6069        ext4_exit_pageio();
6070        ext4_exit_es();
6071        ext4_exit_pending();
6072}
6073
/*
 * Module metadata and entry points: ext4_init_fs() runs at module init and
 * ext4_exit_fs() at module exit.  The soft dependency line names crc32c as
 * a "pre" dependency.
 */
6074MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
6075MODULE_DESCRIPTION("Fourth Extended Filesystem");
6076MODULE_LICENSE("GPL");
6077MODULE_SOFTDEP("pre: crc32c");
6078module_init(ext4_init_fs)
6079module_exit(ext4_exit_fs)
6080