linux/fs/overlayfs/super.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 *
   4 * Copyright (C) 2011 Novell Inc.
   5 */
   6
   7#include <uapi/linux/magic.h>
   8#include <linux/fs.h>
   9#include <linux/namei.h>
  10#include <linux/xattr.h>
  11#include <linux/mount.h>
  12#include <linux/parser.h>
  13#include <linux/module.h>
  14#include <linux/statfs.h>
  15#include <linux/seq_file.h>
  16#include <linux/posix_acl_xattr.h>
  17#include <linux/exportfs.h>
  18#include "overlayfs.h"
  19
  20MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>");
  21MODULE_DESCRIPTION("Overlay filesystem");
  22MODULE_LICENSE("GPL");
  23
  24
/* Forward declaration; the structure is defined outside this part of the file. */
struct ovl_dir_cache;

/* NOTE(review): presumably the upper bound on stacked lower layers - confirm at the use site. */
#define OVL_MAX_STACK 500

/*
 * Module parameters.  Each default is taken from the matching Kconfig
 * symbol via IS_ENABLED() and is exposed as a bool module parameter with
 * mode 0644.
 */
static bool ovl_redirect_dir_def = IS_ENABLED(CONFIG_OVERLAY_FS_REDIRECT_DIR);
module_param_named(redirect_dir, ovl_redirect_dir_def, bool, 0644);
MODULE_PARM_DESC(redirect_dir,
                 "Default to on or off for the redirect_dir feature");

static bool ovl_redirect_always_follow =
        IS_ENABLED(CONFIG_OVERLAY_FS_REDIRECT_ALWAYS_FOLLOW);
module_param_named(redirect_always_follow, ovl_redirect_always_follow,
                   bool, 0644);
MODULE_PARM_DESC(redirect_always_follow,
                 "Follow redirects even if redirect_dir feature is turned off");

static bool ovl_index_def = IS_ENABLED(CONFIG_OVERLAY_FS_INDEX);
module_param_named(index, ovl_index_def, bool, 0644);
MODULE_PARM_DESC(index,
                 "Default to on or off for the inodes index feature");

static bool ovl_nfs_export_def = IS_ENABLED(CONFIG_OVERLAY_FS_NFS_EXPORT);
module_param_named(nfs_export, ovl_nfs_export_def, bool, 0644);
MODULE_PARM_DESC(nfs_export,
                 "Default to on or off for the NFS export feature");

/* Selects between OVL_XINO_AUTO and OVL_XINO_OFF in ovl_xino_def(). */
static bool ovl_xino_auto_def = IS_ENABLED(CONFIG_OVERLAY_FS_XINO_AUTO);
module_param_named(xino_auto, ovl_xino_auto_def, bool, 0644);
MODULE_PARM_DESC(xino_auto,
                 "Auto enable xino feature");
  55
  56static void ovl_entry_stack_free(struct ovl_entry *oe)
  57{
  58        unsigned int i;
  59
  60        for (i = 0; i < oe->numlower; i++)
  61                dput(oe->lowerstack[i].dentry);
  62}
  63
/*
 * Default for the metacopy feature, taken from Kconfig and exposed as the
 * "metacopy" bool module parameter with mode 0644.
 */
static bool ovl_metacopy_def = IS_ENABLED(CONFIG_OVERLAY_FS_METACOPY);
module_param_named(metacopy, ovl_metacopy_def, bool, 0644);
MODULE_PARM_DESC(metacopy,
                 "Default to on or off for the metadata only copy up feature");
  68
  69static void ovl_dentry_release(struct dentry *dentry)
  70{
  71        struct ovl_entry *oe = dentry->d_fsdata;
  72
  73        if (oe) {
  74                ovl_entry_stack_free(oe);
  75                kfree_rcu(oe, rcu);
  76        }
  77}
  78
/*
 * ->d_real: map an overlay dentry to the dentry backing it.
 *
 * @dentry: overlay dentry
 * @inode:  inode the caller is looking for, or NULL for "no particular one"
 *
 * Returns @dentry itself when @inode is the overlay inode or when @dentry
 * is not a regular file.  Otherwise tries the upper dentry, then the lower
 * data dentry (recursing through d_real()).  If nothing matches, a warning
 * is emitted and @dentry is returned.
 */
static struct dentry *ovl_d_real(struct dentry *dentry,
                                 const struct inode *inode)
{
        struct dentry *real;

        /* It's an overlay file */
        if (inode && d_inode(dentry) == inode)
                return dentry;

        /* Non-regular files are only valid with no inode or the overlay inode */
        if (!d_is_reg(dentry)) {
                if (!inode || inode == d_inode(dentry))
                        return dentry;
                goto bug;
        }

        /* Upper dentry wins when it carries exactly the requested inode */
        real = ovl_dentry_upper(dentry);
        if (real && (inode == d_inode(real)))
                return real;

        /* No specific inode requested: use upper only if ovl_has_upperdata() says so */
        if (real && !inode && ovl_has_upperdata(d_inode(dentry)))
                return real;

        real = ovl_dentry_lowerdata(dentry);
        if (!real)
                goto bug;

        /* Handle recursion */
        real = d_real(real, inode);

        if (!inode || inode == d_inode(real))
                return real;
bug:
        WARN(1, "ovl_d_real(%pd4, %s:%lu): real dentry not found\n", dentry,
             inode ? inode->i_sb->s_id : "NULL", inode ? inode->i_ino : 0);
        return dentry;
}
 115
 116static int ovl_revalidate_real(struct dentry *d, unsigned int flags, bool weak)
 117{
 118        int ret = 1;
 119
 120        if (weak) {
 121                if (d->d_flags & DCACHE_OP_WEAK_REVALIDATE)
 122                        ret =  d->d_op->d_weak_revalidate(d, flags);
 123        } else if (d->d_flags & DCACHE_OP_REVALIDATE) {
 124                ret = d->d_op->d_revalidate(d, flags);
 125                if (!ret) {
 126                        if (!(flags & LOOKUP_RCU))
 127                                d_invalidate(d);
 128                        ret = -ESTALE;
 129                }
 130        }
 131        return ret;
 132}
 133
 134static int ovl_dentry_revalidate_common(struct dentry *dentry,
 135                                        unsigned int flags, bool weak)
 136{
 137        struct ovl_entry *oe = dentry->d_fsdata;
 138        struct dentry *upper;
 139        unsigned int i;
 140        int ret = 1;
 141
 142        upper = ovl_dentry_upper(dentry);
 143        if (upper)
 144                ret = ovl_revalidate_real(upper, flags, weak);
 145
 146        for (i = 0; ret > 0 && i < oe->numlower; i++) {
 147                ret = ovl_revalidate_real(oe->lowerstack[i].dentry, flags,
 148                                          weak);
 149        }
 150        return ret;
 151}
 152
 153static int ovl_dentry_revalidate(struct dentry *dentry, unsigned int flags)
 154{
 155        return ovl_dentry_revalidate_common(dentry, flags, false);
 156}
 157
 158static int ovl_dentry_weak_revalidate(struct dentry *dentry, unsigned int flags)
 159{
 160        return ovl_dentry_revalidate_common(dentry, flags, true);
 161}
 162
/* Dentry operations table wiring up the overlayfs callbacks defined above. */
static const struct dentry_operations ovl_dentry_operations = {
        .d_release = ovl_dentry_release,
        .d_real = ovl_d_real,
        .d_revalidate = ovl_dentry_revalidate,
        .d_weak_revalidate = ovl_dentry_weak_revalidate,
};
 169
 170static struct kmem_cache *ovl_inode_cachep;
 171
 172static struct inode *ovl_alloc_inode(struct super_block *sb)
 173{
 174        struct ovl_inode *oi = kmem_cache_alloc(ovl_inode_cachep, GFP_KERNEL);
 175
 176        if (!oi)
 177                return NULL;
 178
 179        oi->cache = NULL;
 180        oi->redirect = NULL;
 181        oi->version = 0;
 182        oi->flags = 0;
 183        oi->__upperdentry = NULL;
 184        oi->lower = NULL;
 185        oi->lowerdata = NULL;
 186        mutex_init(&oi->lock);
 187
 188        return &oi->vfs_inode;
 189}
 190
 191static void ovl_free_inode(struct inode *inode)
 192{
 193        struct ovl_inode *oi = OVL_I(inode);
 194
 195        kfree(oi->redirect);
 196        mutex_destroy(&oi->lock);
 197        kmem_cache_free(ovl_inode_cachep, oi);
 198}
 199
 200static void ovl_destroy_inode(struct inode *inode)
 201{
 202        struct ovl_inode *oi = OVL_I(inode);
 203
 204        dput(oi->__upperdentry);
 205        iput(oi->lower);
 206        if (S_ISDIR(inode->i_mode))
 207                ovl_dir_cache_free(inode);
 208        else
 209                iput(oi->lowerdata);
 210}
 211
/*
 * Release every resource referenced by @ofs and free @ofs itself:
 * trap inodes, work/index dentries, in-use locks, layer mounts, per-fs
 * anonymous devices, the duplicated option strings and the creator
 * credentials.
 */
static void ovl_free_fs(struct ovl_fs *ofs)
{
        struct vfsmount **mounts;
        unsigned i;

        iput(ofs->workbasedir_trap);
        iput(ofs->indexdir_trap);
        iput(ofs->workdir_trap);
        dput(ofs->whiteout);
        dput(ofs->indexdir);
        dput(ofs->workdir);
        /* Unlock workbasedir before dropping the reference to it */
        if (ofs->workdir_locked)
                ovl_inuse_unlock(ofs->workbasedir);
        dput(ofs->workbasedir);
        if (ofs->upperdir_locked)
                ovl_inuse_unlock(ovl_upper_mnt(ofs)->mnt_root);

        /* Hack!  Reuse ofs->layers as a vfsmount array before freeing it */
        mounts = (struct vfsmount **) ofs->layers;
        for (i = 0; i < ofs->numlayer; i++) {
                /*
                 * layers[i] is read before mounts[i] is stored into the
                 * same buffer.
                 * NOTE(review): relies on a layer entry being at least
                 * pointer-sized so slot i never overwrites an unread layer.
                 */
                iput(ofs->layers[i].trap);
                mounts[i] = ofs->layers[i].mnt;
        }
        kern_unmount_array(mounts, ofs->numlayer);
        kfree(ofs->layers);
        for (i = 0; i < ofs->numfs; i++)
                free_anon_bdev(ofs->fs[i].pseudo_dev);
        kfree(ofs->fs);

        /* Option strings duplicated while parsing mount options */
        kfree(ofs->config.lowerdir);
        kfree(ofs->config.upperdir);
        kfree(ofs->config.workdir);
        kfree(ofs->config.redirect_mode);
        if (ofs->creator_cred)
                put_cred(ofs->creator_cred);
        kfree(ofs);
}
 249
 250static void ovl_put_super(struct super_block *sb)
 251{
 252        struct ovl_fs *ofs = sb->s_fs_info;
 253
 254        ovl_free_fs(ofs);
 255}
 256
 257/* Sync real dirty inodes in upper filesystem (if it exists) */
 258static int ovl_sync_fs(struct super_block *sb, int wait)
 259{
 260        struct ovl_fs *ofs = sb->s_fs_info;
 261        struct super_block *upper_sb;
 262        int ret;
 263
 264        if (!ovl_upper_mnt(ofs))
 265                return 0;
 266
 267        /*
 268         * Not called for sync(2) call or an emergency sync (SB_I_SKIP_SYNC).
 269         * All the super blocks will be iterated, including upper_sb.
 270         *
 271         * If this is a syncfs(2) call, then we do need to call
 272         * sync_filesystem() on upper_sb, but enough if we do it when being
 273         * called with wait == 1.
 274         */
 275        if (!wait)
 276                return 0;
 277
 278        upper_sb = ovl_upper_mnt(ofs)->mnt_sb;
 279
 280        down_read(&upper_sb->s_umount);
 281        ret = sync_filesystem(upper_sb);
 282        up_read(&upper_sb->s_umount);
 283
 284        return ret;
 285}
 286
/**
 * ovl_statfs
 * @dentry: A dentry on the overlayfs super block being queried
 * @buf: The struct kstatfs to fill in with stats
 *
 * Get the filesystem statistics.  As writes always target the upper layer
 * filesystem pass the statfs to the upper filesystem (if it exists)
 *
 * The real path of the overlay root is queried with vfs_statfs().  On
 * success f_namelen and f_type are overridden with the overlay's own
 * values.  Returns the vfs_statfs() result.
 */
static int ovl_statfs(struct dentry *dentry, struct kstatfs *buf)
{
        struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
        struct dentry *root_dentry = dentry->d_sb->s_root;
        struct path path;
        int err;

        ovl_path_real(root_dentry, &path);

        err = vfs_statfs(&path, buf);
        if (!err) {
                buf->f_namelen = ofs->namelen;
                buf->f_type = OVERLAYFS_SUPER_MAGIC;
        }

        return err;
}
 312
 313/* Will this overlay be forced to mount/remount ro? */
 314static bool ovl_force_readonly(struct ovl_fs *ofs)
 315{
 316        return (!ovl_upper_mnt(ofs) || !ofs->workdir);
 317}
 318
 319static const char *ovl_redirect_mode_def(void)
 320{
 321        return ovl_redirect_dir_def ? "on" : "off";
 322}
 323
/*
 * Option strings for the xino mode, indexed by config.xino in
 * ovl_show_options().
 * NOTE(review): the order presumably mirrors the OVL_XINO_* constants -
 * confirm against their definition.
 */
static const char * const ovl_xino_str[] = {
        "off",
        "auto",
        "on",
};
 329
 330static inline int ovl_xino_def(void)
 331{
 332        return ovl_xino_auto_def ? OVL_XINO_AUTO : OVL_XINO_OFF;
 333}
 334
 335/**
 336 * ovl_show_options
 337 *
 338 * Prints the mount options for a given superblock.
 339 * Returns zero; does not fail.
 340 */
 341static int ovl_show_options(struct seq_file *m, struct dentry *dentry)
 342{
 343        struct super_block *sb = dentry->d_sb;
 344        struct ovl_fs *ofs = sb->s_fs_info;
 345
 346        seq_show_option(m, "lowerdir", ofs->config.lowerdir);
 347        if (ofs->config.upperdir) {
 348                seq_show_option(m, "upperdir", ofs->config.upperdir);
 349                seq_show_option(m, "workdir", ofs->config.workdir);
 350        }
 351        if (ofs->config.default_permissions)
 352                seq_puts(m, ",default_permissions");
 353        if (strcmp(ofs->config.redirect_mode, ovl_redirect_mode_def()) != 0)
 354                seq_printf(m, ",redirect_dir=%s", ofs->config.redirect_mode);
 355        if (ofs->config.index != ovl_index_def)
 356                seq_printf(m, ",index=%s", ofs->config.index ? "on" : "off");
 357        if (ofs->config.nfs_export != ovl_nfs_export_def)
 358                seq_printf(m, ",nfs_export=%s", ofs->config.nfs_export ?
 359                                                "on" : "off");
 360        if (ofs->config.xino != ovl_xino_def() && !ovl_same_fs(sb))
 361                seq_printf(m, ",xino=%s", ovl_xino_str[ofs->config.xino]);
 362        if (ofs->config.metacopy != ovl_metacopy_def)
 363                seq_printf(m, ",metacopy=%s",
 364                           ofs->config.metacopy ? "on" : "off");
 365        return 0;
 366}
 367
 368static int ovl_remount(struct super_block *sb, int *flags, char *data)
 369{
 370        struct ovl_fs *ofs = sb->s_fs_info;
 371        struct super_block *upper_sb;
 372        int ret = 0;
 373
 374        if (!(*flags & SB_RDONLY) && ovl_force_readonly(ofs))
 375                return -EROFS;
 376
 377        if (*flags & SB_RDONLY && !sb_rdonly(sb)) {
 378                upper_sb = ovl_upper_mnt(ofs)->mnt_sb;
 379                down_read(&upper_sb->s_umount);
 380                ret = sync_filesystem(upper_sb);
 381                up_read(&upper_sb->s_umount);
 382        }
 383
 384        return ret;
 385}
 386
/*
 * Superblock operations table.  Apart from generic_delete_inode, all
 * callbacks are the overlayfs functions defined above.
 */
static const struct super_operations ovl_super_operations = {
        .alloc_inode    = ovl_alloc_inode,
        .free_inode     = ovl_free_inode,
        .destroy_inode  = ovl_destroy_inode,
        .drop_inode     = generic_delete_inode,
        .put_super      = ovl_put_super,
        .sync_fs        = ovl_sync_fs,
        .statfs         = ovl_statfs,
        .show_options   = ovl_show_options,
        .remount_fs     = ovl_remount,
};
 398
/*
 * Mount option token identifiers, paired with their patterns in
 * ovl_tokens.  OPT_ERR is the entry with a NULL pattern that ends the
 * table.
 */
enum {
        OPT_LOWERDIR,
        OPT_UPPERDIR,
        OPT_WORKDIR,
        OPT_DEFAULT_PERMISSIONS,
        OPT_REDIRECT_DIR,
        OPT_INDEX_ON,
        OPT_INDEX_OFF,
        OPT_NFS_EXPORT_ON,
        OPT_NFS_EXPORT_OFF,
        OPT_XINO_ON,
        OPT_XINO_OFF,
        OPT_XINO_AUTO,
        OPT_METACOPY_ON,
        OPT_METACOPY_OFF,
        OPT_ERR,
};
 416
/*
 * Pattern table handed to match_token() in ovl_parse_opt().  Patterns
 * with "%s" capture a string argument; on/off style options have one
 * literal pattern per accepted value.
 */
static const match_table_t ovl_tokens = {
        {OPT_LOWERDIR,                  "lowerdir=%s"},
        {OPT_UPPERDIR,                  "upperdir=%s"},
        {OPT_WORKDIR,                   "workdir=%s"},
        {OPT_DEFAULT_PERMISSIONS,       "default_permissions"},
        {OPT_REDIRECT_DIR,              "redirect_dir=%s"},
        {OPT_INDEX_ON,                  "index=on"},
        {OPT_INDEX_OFF,                 "index=off"},
        {OPT_NFS_EXPORT_ON,             "nfs_export=on"},
        {OPT_NFS_EXPORT_OFF,            "nfs_export=off"},
        {OPT_XINO_ON,                   "xino=on"},
        {OPT_XINO_OFF,                  "xino=off"},
        {OPT_XINO_AUTO,                 "xino=auto"},
        {OPT_METACOPY_ON,               "metacopy=on"},
        {OPT_METACOPY_OFF,              "metacopy=off"},
        {OPT_ERR,                       NULL}
};
 434
 435static char *ovl_next_opt(char **s)
 436{
 437        char *sbegin = *s;
 438        char *p;
 439
 440        if (sbegin == NULL)
 441                return NULL;
 442
 443        for (p = sbegin; *p; p++) {
 444                if (*p == '\\') {
 445                        p++;
 446                        if (!*p)
 447                                break;
 448                } else if (*p == ',') {
 449                        *p = '\0';
 450                        *s = p + 1;
 451                        return sbegin;
 452                }
 453        }
 454        *s = NULL;
 455        return sbegin;
 456}
 457
 458static int ovl_parse_redirect_mode(struct ovl_config *config, const char *mode)
 459{
 460        if (strcmp(mode, "on") == 0) {
 461                config->redirect_dir = true;
 462                /*
 463                 * Does not make sense to have redirect creation without
 464                 * redirect following.
 465                 */
 466                config->redirect_follow = true;
 467        } else if (strcmp(mode, "follow") == 0) {
 468                config->redirect_follow = true;
 469        } else if (strcmp(mode, "off") == 0) {
 470                if (ovl_redirect_always_follow)
 471                        config->redirect_follow = true;
 472        } else if (strcmp(mode, "nofollow") != 0) {
 473                pr_err("bad mount option \"redirect_dir=%s\"\n",
 474                       mode);
 475                return -EINVAL;
 476        }
 477
 478        return 0;
 479}
 480
/*
 * Parse the comma-separated mount option string @opt into @config and
 * resolve dependencies between options.
 *
 * The *_opt booleans record which options were given explicitly; they
 * decide whether a conflict is an error (both sides explicit), a silent
 * fallback (one side explicit) or an automatic enable (neither explicit).
 *
 * Returns 0 on success, -ENOMEM if a string could not be duplicated, or
 * -EINVAL for an unrecognized or conflicting option.  Strings already
 * stored in @config are not freed here on error.
 * NOTE(review): presumably the caller releases them - confirm.
 */
static int ovl_parse_opt(char *opt, struct ovl_config *config)
{
        char *p;
        int err;
        bool metacopy_opt = false, redirect_opt = false;
        bool nfs_export_opt = false, index_opt = false;

        /* Start from the default mode; an explicit redirect_dir= replaces it */
        config->redirect_mode = kstrdup(ovl_redirect_mode_def(), GFP_KERNEL);
        if (!config->redirect_mode)
                return -ENOMEM;

        while ((p = ovl_next_opt(&opt)) != NULL) {
                int token;
                substring_t args[MAX_OPT_ARGS];

                /* Skip empty options, e.g. from consecutive commas */
                if (!*p)
                        continue;

                token = match_token(p, ovl_tokens, args);
                switch (token) {
                case OPT_UPPERDIR:
                        /* A repeated option replaces the earlier value */
                        kfree(config->upperdir);
                        config->upperdir = match_strdup(&args[0]);
                        if (!config->upperdir)
                                return -ENOMEM;
                        break;

                case OPT_LOWERDIR:
                        kfree(config->lowerdir);
                        config->lowerdir = match_strdup(&args[0]);
                        if (!config->lowerdir)
                                return -ENOMEM;
                        break;

                case OPT_WORKDIR:
                        kfree(config->workdir);
                        config->workdir = match_strdup(&args[0]);
                        if (!config->workdir)
                                return -ENOMEM;
                        break;

                case OPT_DEFAULT_PERMISSIONS:
                        config->default_permissions = true;
                        break;

                case OPT_REDIRECT_DIR:
                        kfree(config->redirect_mode);
                        config->redirect_mode = match_strdup(&args[0]);
                        if (!config->redirect_mode)
                                return -ENOMEM;
                        redirect_opt = true;
                        break;

                case OPT_INDEX_ON:
                        config->index = true;
                        index_opt = true;
                        break;

                case OPT_INDEX_OFF:
                        config->index = false;
                        index_opt = true;
                        break;

                case OPT_NFS_EXPORT_ON:
                        config->nfs_export = true;
                        nfs_export_opt = true;
                        break;

                case OPT_NFS_EXPORT_OFF:
                        config->nfs_export = false;
                        nfs_export_opt = true;
                        break;

                case OPT_XINO_ON:
                        config->xino = OVL_XINO_ON;
                        break;

                case OPT_XINO_OFF:
                        config->xino = OVL_XINO_OFF;
                        break;

                case OPT_XINO_AUTO:
                        config->xino = OVL_XINO_AUTO;
                        break;

                case OPT_METACOPY_ON:
                        config->metacopy = true;
                        metacopy_opt = true;
                        break;

                case OPT_METACOPY_OFF:
                        config->metacopy = false;
                        metacopy_opt = true;
                        break;

                default:
                        pr_err("unrecognized mount option \"%s\" or missing value\n",
                                        p);
                        return -EINVAL;
                }
        }

        /* Workdir/index are useless in non-upper mount */
        if (!config->upperdir) {
                if (config->workdir) {
                        pr_info("option \"workdir=%s\" is useless in a non-upper mount, ignore\n",
                                config->workdir);
                        kfree(config->workdir);
                        config->workdir = NULL;
                }
                if (config->index && index_opt) {
                        pr_info("option \"index=on\" is useless in a non-upper mount, ignore\n");
                        /* Treat the ignored index=on as if it was never given */
                        index_opt = false;
                }
                config->index = false;
        }

        err = ovl_parse_redirect_mode(config, config->redirect_mode);
        if (err)
                return err;

        /*
         * This is to make the logic below simpler.  It doesn't make any other
         * difference, since config->redirect_dir is only used for upper.
         */
        if (!config->upperdir && config->redirect_follow)
                config->redirect_dir = true;

        /* Resolve metacopy -> redirect_dir dependency */
        if (config->metacopy && !config->redirect_dir) {
                if (metacopy_opt && redirect_opt) {
                        pr_err("conflicting options: metacopy=on,redirect_dir=%s\n",
                               config->redirect_mode);
                        return -EINVAL;
                }
                if (redirect_opt) {
                        /*
                         * There was an explicit redirect_dir=... that resulted
                         * in this conflict.
                         */
                        pr_info("disabling metacopy due to redirect_dir=%s\n",
                                config->redirect_mode);
                        config->metacopy = false;
                } else {
                        /* Automatically enable redirect otherwise. */
                        config->redirect_follow = config->redirect_dir = true;
                }
        }

        /* Resolve nfs_export -> index dependency */
        if (config->nfs_export && !config->index) {
                if (!config->upperdir && config->redirect_follow) {
                        pr_info("NFS export requires \"redirect_dir=nofollow\" on non-upper mount, falling back to nfs_export=off.\n");
                        config->nfs_export = false;
                } else if (nfs_export_opt && index_opt) {
                        pr_err("conflicting options: nfs_export=on,index=off\n");
                        return -EINVAL;
                } else if (index_opt) {
                        /*
                         * There was an explicit index=off that resulted
                         * in this conflict.
                         */
                        pr_info("disabling nfs_export due to index=off\n");
                        config->nfs_export = false;
                } else {
                        /* Automatically enable index otherwise. */
                        config->index = true;
                }
        }

        /* Resolve nfs_export -> !metacopy dependency */
        if (config->nfs_export && config->metacopy) {
                if (nfs_export_opt && metacopy_opt) {
                        pr_err("conflicting options: nfs_export=on,metacopy=on\n");
                        return -EINVAL;
                }
                if (metacopy_opt) {
                        /*
                         * There was an explicit metacopy=on that resulted
                         * in this conflict.
                         */
                        pr_info("disabling nfs_export due to metacopy=on\n");
                        config->nfs_export = false;
                } else {
                        /*
                         * There was an explicit nfs_export=on that resulted
                         * in this conflict.
                         */
                        pr_info("disabling metacopy due to nfs_export=on\n");
                        config->metacopy = false;
                }
        }

        return 0;
}
 676
/* Directory names; NOTE(review): presumably passed as @name to ovl_workdir_create() - confirm at call sites. */
#define OVL_WORKDIR_NAME "work"
#define OVL_INDEXDIR_NAME "index"

/*
 * Look up or create the directory @name under ofs->workbasedir.
 *
 * @persist: if true, an already existing directory is returned as is;
 *           if false, an existing directory is cleaned up once via
 *           ovl_workdir_cleanup() and the lookup is retried.
 *
 * A newly created directory has its POSIX ACL xattrs removed and its
 * mode set to S_IFDIR | 0.
 *
 * Returns the directory dentry, or NULL on failure after logging a
 * warning.  The workbasedir inode lock is held for the whole operation
 * and released before returning.
 */
static struct dentry *ovl_workdir_create(struct ovl_fs *ofs,
                                         const char *name, bool persist)
{
        struct inode *dir =  ofs->workbasedir->d_inode;
        struct vfsmount *mnt = ovl_upper_mnt(ofs);
        struct dentry *work;
        int err;
        bool retried = false;

        inode_lock_nested(dir, I_MUTEX_PARENT);
retry:
        work = lookup_one_len(name, ofs->workbasedir, strlen(name));

        if (!IS_ERR(work)) {
                struct iattr attr = {
                        .ia_valid = ATTR_MODE,
                        .ia_mode = S_IFDIR | 0,
                };

                if (work->d_inode) {
                        /* Still present after one cleanup attempt: give up */
                        err = -EEXIST;
                        if (retried)
                                goto out_dput;

                        /* Persistent directories are reused untouched */
                        if (persist)
                                goto out_unlock;

                        retried = true;
                        ovl_workdir_cleanup(dir, mnt, work, 0);
                        dput(work);
                        goto retry;
                }

                work = ovl_create_real(dir, work, OVL_CATTR(attr.ia_mode));
                err = PTR_ERR(work);
                if (IS_ERR(work))
                        goto out_err;

                /*
                 * Try to remove POSIX ACL xattrs from workdir.  We are good if:
                 *
                 * a) success (there was a POSIX ACL xattr and was removed)
                 * b) -ENODATA (there was no POSIX ACL xattr)
                 * c) -EOPNOTSUPP (POSIX ACL xattrs are not supported)
                 *
                 * There are various other error values that could effectively
                 * mean that the xattr doesn't exist (e.g. -ERANGE is returned
                 * if the xattr name is too long), but the set of filesystems
                 * allowed as upper are limited to "normal" ones, where checking
                 * for the above two errors is sufficient.
                 */
                err = vfs_removexattr(work, XATTR_NAME_POSIX_ACL_DEFAULT);
                if (err && err != -ENODATA && err != -EOPNOTSUPP)
                        goto out_dput;

                err = vfs_removexattr(work, XATTR_NAME_POSIX_ACL_ACCESS);
                if (err && err != -ENODATA && err != -EOPNOTSUPP)
                        goto out_dput;

                /* Clear any inherited mode bits */
                inode_lock(work->d_inode);
                err = notify_change(work, &attr, NULL);
                inode_unlock(work->d_inode);
                if (err)
                        goto out_dput;
        } else {
                err = PTR_ERR(work);
                goto out_err;
        }
out_unlock:
        inode_unlock(dir);
        return work;

        /* Error paths: drop the dentry if we hold one, warn, return NULL */
out_dput:
        dput(work);
out_err:
        pr_warn("failed to create directory %s/%s (errno: %i); mounting read-only\n",
                ofs->config.workdir, name, -err);
        work = NULL;
        goto out_unlock;
}
 761
 762static void ovl_unescape(char *s)
 763{
 764        char *d = s;
 765
 766        for (;; s++, d++) {
 767                if (*s == '\\')
 768                        s++;
 769                *d = *s;
 770                if (!*s)
 771                        break;
 772        }
 773}
 774
 775static int ovl_mount_dir_noesc(const char *name, struct path *path)
 776{
 777        int err = -EINVAL;
 778
 779        if (!*name) {
 780                pr_err("empty lowerdir\n");
 781                goto out;
 782        }
 783        err = kern_path(name, LOOKUP_FOLLOW, path);
 784        if (err) {
 785                pr_err("failed to resolve '%s': %i\n", name, err);
 786                goto out;
 787        }
 788        err = -EINVAL;
 789        if (ovl_dentry_weird(path->dentry)) {
 790                pr_err("filesystem on '%s' not supported\n", name);
 791                goto out_put;
 792        }
 793        if (!d_is_dir(path->dentry)) {
 794                pr_err("'%s' not a directory\n", name);
 795                goto out_put;
 796        }
 797        return 0;
 798
 799out_put:
 800        path_put_init(path);
 801out:
 802        return err;
 803}
 804
 805static int ovl_mount_dir(const char *name, struct path *path)
 806{
 807        int err = -ENOMEM;
 808        char *tmp = kstrdup(name, GFP_KERNEL);
 809
 810        if (tmp) {
 811                ovl_unescape(tmp);
 812                err = ovl_mount_dir_noesc(tmp, path);
 813
 814                if (!err && path->dentry->d_flags & DCACHE_OP_REAL) {
 815                        pr_err("filesystem on '%s' not supported as upperdir\n",
 816                               tmp);
 817                        path_put_init(path);
 818                        err = -EINVAL;
 819                }
 820                kfree(tmp);
 821        }
 822        return err;
 823}
 824
 825static int ovl_check_namelen(struct path *path, struct ovl_fs *ofs,
 826                             const char *name)
 827{
 828        struct kstatfs statfs;
 829        int err = vfs_statfs(path, &statfs);
 830
 831        if (err)
 832                pr_err("statfs failed on '%s'\n", name);
 833        else
 834                ofs->namelen = max(ofs->namelen, statfs.f_namelen);
 835
 836        return err;
 837}
 838
 839static int ovl_lower_dir(const char *name, struct path *path,
 840                         struct ovl_fs *ofs, int *stack_depth)
 841{
 842        int fh_type;
 843        int err;
 844
 845        err = ovl_mount_dir_noesc(name, path);
 846        if (err)
 847                return err;
 848
 849        err = ovl_check_namelen(path, ofs, name);
 850        if (err)
 851                return err;
 852
 853        *stack_depth = max(*stack_depth, path->mnt->mnt_sb->s_stack_depth);
 854
 855        /*
 856         * The inodes index feature and NFS export need to encode and decode
 857         * file handles, so they require that all layers support them.
 858         */
 859        fh_type = ovl_can_decode_fh(path->dentry->d_sb);
 860        if ((ofs->config.nfs_export ||
 861             (ofs->config.index && ofs->config.upperdir)) && !fh_type) {
 862                ofs->config.index = false;
 863                ofs->config.nfs_export = false;
 864                pr_warn("fs on '%s' does not support file handles, falling back to index=off,nfs_export=off.\n",
 865                        name);
 866        }
 867
 868        /* Check if lower fs has 32bit inode numbers */
 869        if (fh_type != FILEID_INO32_GEN)
 870                ofs->xino_mode = -1;
 871
 872        return 0;
 873}
 874
 875/* Workdir should not be subdir of upperdir and vice versa */
 876static bool ovl_workdir_ok(struct dentry *workdir, struct dentry *upperdir)
 877{
 878        bool ok = false;
 879
 880        if (workdir != upperdir) {
 881                ok = (lock_rename(workdir, upperdir) == NULL);
 882                unlock_rename(workdir, upperdir);
 883        }
 884        return ok;
 885}
 886
 887static unsigned int ovl_split_lowerdirs(char *str)
 888{
 889        unsigned int ctr = 1;
 890        char *s, *d;
 891
 892        for (s = d = str;; s++, d++) {
 893                if (*s == '\\') {
 894                        s++;
 895                } else if (*s == ':') {
 896                        *d = '\0';
 897                        ctr++;
 898                        continue;
 899                }
 900                *d = *s;
 901                if (!*s)
 902                        break;
 903        }
 904        return ctr;
 905}
 906
 907static int __maybe_unused
 908ovl_posix_acl_xattr_get(const struct xattr_handler *handler,
 909                        struct dentry *dentry, struct inode *inode,
 910                        const char *name, void *buffer, size_t size)
 911{
 912        return ovl_xattr_get(dentry, inode, handler->name, buffer, size);
 913}
 914
 915static int __maybe_unused
 916ovl_posix_acl_xattr_set(const struct xattr_handler *handler,
 917                        struct dentry *dentry, struct inode *inode,
 918                        const char *name, const void *value,
 919                        size_t size, int flags)
 920{
 921        struct dentry *workdir = ovl_workdir(dentry);
 922        struct inode *realinode = ovl_inode_real(inode);
 923        struct posix_acl *acl = NULL;
 924        int err;
 925
 926        /* Check that everything is OK before copy-up */
 927        if (value) {
 928                acl = posix_acl_from_xattr(&init_user_ns, value, size);
 929                if (IS_ERR(acl))
 930                        return PTR_ERR(acl);
 931        }
 932        err = -EOPNOTSUPP;
 933        if (!IS_POSIXACL(d_inode(workdir)))
 934                goto out_acl_release;
 935        if (!realinode->i_op->set_acl)
 936                goto out_acl_release;
 937        if (handler->flags == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode)) {
 938                err = acl ? -EACCES : 0;
 939                goto out_acl_release;
 940        }
 941        err = -EPERM;
 942        if (!inode_owner_or_capable(inode))
 943                goto out_acl_release;
 944
 945        posix_acl_release(acl);
 946
 947        /*
 948         * Check if sgid bit needs to be cleared (actual setacl operation will
 949         * be done with mounter's capabilities and so that won't do it for us).
 950         */
 951        if (unlikely(inode->i_mode & S_ISGID) &&
 952            handler->flags == ACL_TYPE_ACCESS &&
 953            !in_group_p(inode->i_gid) &&
 954            !capable_wrt_inode_uidgid(inode, CAP_FSETID)) {
 955                struct iattr iattr = { .ia_valid = ATTR_KILL_SGID };
 956
 957                err = ovl_setattr(dentry, &iattr);
 958                if (err)
 959                        return err;
 960        }
 961
 962        err = ovl_xattr_set(dentry, inode, handler->name, value, size, flags);
 963        if (!err)
 964                ovl_copyattr(ovl_inode_real(inode), inode);
 965
 966        return err;
 967
 968out_acl_release:
 969        posix_acl_release(acl);
 970        return err;
 971}
 972
 973static int ovl_own_xattr_get(const struct xattr_handler *handler,
 974                             struct dentry *dentry, struct inode *inode,
 975                             const char *name, void *buffer, size_t size)
 976{
 977        return -EOPNOTSUPP;
 978}
 979
 980static int ovl_own_xattr_set(const struct xattr_handler *handler,
 981                             struct dentry *dentry, struct inode *inode,
 982                             const char *name, const void *value,
 983                             size_t size, int flags)
 984{
 985        return -EOPNOTSUPP;
 986}
 987
 988static int ovl_other_xattr_get(const struct xattr_handler *handler,
 989                               struct dentry *dentry, struct inode *inode,
 990                               const char *name, void *buffer, size_t size)
 991{
 992        return ovl_xattr_get(dentry, inode, name, buffer, size);
 993}
 994
 995static int ovl_other_xattr_set(const struct xattr_handler *handler,
 996                               struct dentry *dentry, struct inode *inode,
 997                               const char *name, const void *value,
 998                               size_t size, int flags)
 999{
1000        return ovl_xattr_set(dentry, inode, name, value, size, flags);
1001}
1002
1003static const struct xattr_handler __maybe_unused
1004ovl_posix_acl_access_xattr_handler = {
1005        .name = XATTR_NAME_POSIX_ACL_ACCESS,
1006        .flags = ACL_TYPE_ACCESS,
1007        .get = ovl_posix_acl_xattr_get,
1008        .set = ovl_posix_acl_xattr_set,
1009};
1010
1011static const struct xattr_handler __maybe_unused
1012ovl_posix_acl_default_xattr_handler = {
1013        .name = XATTR_NAME_POSIX_ACL_DEFAULT,
1014        .flags = ACL_TYPE_DEFAULT,
1015        .get = ovl_posix_acl_xattr_get,
1016        .set = ovl_posix_acl_xattr_set,
1017};
1018
1019static const struct xattr_handler ovl_own_xattr_handler = {
1020        .prefix = OVL_XATTR_PREFIX,
1021        .get = ovl_own_xattr_get,
1022        .set = ovl_own_xattr_set,
1023};
1024
1025static const struct xattr_handler ovl_other_xattr_handler = {
1026        .prefix = "", /* catch all */
1027        .get = ovl_other_xattr_get,
1028        .set = ovl_other_xattr_set,
1029};
1030
1031static const struct xattr_handler *ovl_xattr_handlers[] = {
1032#ifdef CONFIG_FS_POSIX_ACL
1033        &ovl_posix_acl_access_xattr_handler,
1034        &ovl_posix_acl_default_xattr_handler,
1035#endif
1036        &ovl_own_xattr_handler,
1037        &ovl_other_xattr_handler,
1038        NULL
1039};
1040
1041static int ovl_setup_trap(struct super_block *sb, struct dentry *dir,
1042                          struct inode **ptrap, const char *name)
1043{
1044        struct inode *trap;
1045        int err;
1046
1047        trap = ovl_get_trap_inode(sb, dir);
1048        err = PTR_ERR_OR_ZERO(trap);
1049        if (err) {
1050                if (err == -ELOOP)
1051                        pr_err("conflicting %s path\n", name);
1052                return err;
1053        }
1054
1055        *ptrap = trap;
1056        return 0;
1057}
1058
1059/*
1060 * Determine how we treat concurrent use of upperdir/workdir based on the
1061 * index feature. This is papering over mount leaks of container runtimes,
1062 * for example, an old overlay mount is leaked and now its upperdir is
1063 * attempted to be used as a lower layer in a new overlay mount.
1064 */
1065static int ovl_report_in_use(struct ovl_fs *ofs, const char *name)
1066{
1067        if (ofs->config.index) {
1068                pr_err("%s is in-use as upperdir/workdir of another mount, mount with '-o index=off' to override exclusive upperdir protection.\n",
1069                       name);
1070                return -EBUSY;
1071        } else {
1072                pr_warn("%s is in-use as upperdir/workdir of another mount, accessing files from both mounts will result in undefined behavior.\n",
1073                        name);
1074                return 0;
1075        }
1076}
1077
1078static int ovl_get_upper(struct super_block *sb, struct ovl_fs *ofs,
1079                         struct ovl_layer *upper_layer, struct path *upperpath)
1080{
1081        struct vfsmount *upper_mnt;
1082        int err;
1083
1084        err = ovl_mount_dir(ofs->config.upperdir, upperpath);
1085        if (err)
1086                goto out;
1087
1088        /* Upper fs should not be r/o */
1089        if (sb_rdonly(upperpath->mnt->mnt_sb)) {
1090                pr_err("upper fs is r/o, try multi-lower layers mount\n");
1091                err = -EINVAL;
1092                goto out;
1093        }
1094
1095        err = ovl_check_namelen(upperpath, ofs, ofs->config.upperdir);
1096        if (err)
1097                goto out;
1098
1099        err = ovl_setup_trap(sb, upperpath->dentry, &upper_layer->trap,
1100                             "upperdir");
1101        if (err)
1102                goto out;
1103
1104        upper_mnt = clone_private_mount(upperpath);
1105        err = PTR_ERR(upper_mnt);
1106        if (IS_ERR(upper_mnt)) {
1107                pr_err("failed to clone upperpath\n");
1108                goto out;
1109        }
1110
1111        /* Don't inherit atime flags */
1112        upper_mnt->mnt_flags &= ~(MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME);
1113        upper_layer->mnt = upper_mnt;
1114        upper_layer->idx = 0;
1115        upper_layer->fsid = 0;
1116
1117        /*
1118         * Inherit SB_NOSEC flag from upperdir.
1119         *
1120         * This optimization changes behavior when a security related attribute
1121         * (suid/sgid/security.*) is changed on an underlying layer.  This is
1122         * okay because we don't yet have guarantees in that case, but it will
1123         * need careful treatment once we want to honour changes to underlying
1124         * filesystems.
1125         */
1126        if (upper_mnt->mnt_sb->s_flags & SB_NOSEC)
1127                sb->s_flags |= SB_NOSEC;
1128
1129        if (ovl_inuse_trylock(ovl_upper_mnt(ofs)->mnt_root)) {
1130                ofs->upperdir_locked = true;
1131        } else {
1132                err = ovl_report_in_use(ofs, "upperdir");
1133                if (err)
1134                        goto out;
1135        }
1136
1137        err = 0;
1138out:
1139        return err;
1140}
1141
1142/*
1143 * Returns 1 if RENAME_WHITEOUT is supported, 0 if not supported and
1144 * negative values if error is encountered.
1145 */
1146static int ovl_check_rename_whiteout(struct dentry *workdir)
1147{
1148        struct inode *dir = d_inode(workdir);
1149        struct dentry *temp;
1150        struct dentry *dest;
1151        struct dentry *whiteout;
1152        struct name_snapshot name;
1153        int err;
1154
1155        inode_lock_nested(dir, I_MUTEX_PARENT);
1156
1157        temp = ovl_create_temp(workdir, OVL_CATTR(S_IFREG | 0));
1158        err = PTR_ERR(temp);
1159        if (IS_ERR(temp))
1160                goto out_unlock;
1161
1162        dest = ovl_lookup_temp(workdir);
1163        err = PTR_ERR(dest);
1164        if (IS_ERR(dest)) {
1165                dput(temp);
1166                goto out_unlock;
1167        }
1168
1169        /* Name is inline and stable - using snapshot as a copy helper */
1170        take_dentry_name_snapshot(&name, temp);
1171        err = ovl_do_rename(dir, temp, dir, dest, RENAME_WHITEOUT);
1172        if (err) {
1173                if (err == -EINVAL)
1174                        err = 0;
1175                goto cleanup_temp;
1176        }
1177
1178        whiteout = lookup_one_len(name.name.name, workdir, name.name.len);
1179        err = PTR_ERR(whiteout);
1180        if (IS_ERR(whiteout))
1181                goto cleanup_temp;
1182
1183        err = ovl_is_whiteout(whiteout);
1184
1185        /* Best effort cleanup of whiteout and temp file */
1186        if (err)
1187                ovl_cleanup(dir, whiteout);
1188        dput(whiteout);
1189
1190cleanup_temp:
1191        ovl_cleanup(dir, temp);
1192        release_dentry_name_snapshot(&name);
1193        dput(temp);
1194        dput(dest);
1195
1196out_unlock:
1197        inode_unlock(dir);
1198
1199        return err;
1200}
1201
1202static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs,
1203                            struct path *workpath)
1204{
1205        struct vfsmount *mnt = ovl_upper_mnt(ofs);
1206        struct dentry *temp;
1207        bool rename_whiteout;
1208        bool d_type;
1209        int fh_type;
1210        int err;
1211
1212        err = mnt_want_write(mnt);
1213        if (err)
1214                return err;
1215
1216        ofs->workdir = ovl_workdir_create(ofs, OVL_WORKDIR_NAME, false);
1217        if (!ofs->workdir)
1218                goto out;
1219
1220        err = ovl_setup_trap(sb, ofs->workdir, &ofs->workdir_trap, "workdir");
1221        if (err)
1222                goto out;
1223
1224        /*
1225         * Upper should support d_type, else whiteouts are visible.  Given
1226         * workdir and upper are on same fs, we can do iterate_dir() on
1227         * workdir. This check requires successful creation of workdir in
1228         * previous step.
1229         */
1230        err = ovl_check_d_type_supported(workpath);
1231        if (err < 0)
1232                goto out;
1233
1234        d_type = err;
1235        if (!d_type)
1236                pr_warn("upper fs needs to support d_type.\n");
1237
1238        /* Check if upper/work fs supports O_TMPFILE */
1239        temp = ovl_do_tmpfile(ofs->workdir, S_IFREG | 0);
1240        ofs->tmpfile = !IS_ERR(temp);
1241        if (ofs->tmpfile)
1242                dput(temp);
1243        else
1244                pr_warn("upper fs does not support tmpfile.\n");
1245
1246
1247        /* Check if upper/work fs supports RENAME_WHITEOUT */
1248        err = ovl_check_rename_whiteout(ofs->workdir);
1249        if (err < 0)
1250                goto out;
1251
1252        rename_whiteout = err;
1253        if (!rename_whiteout)
1254                pr_warn("upper fs does not support RENAME_WHITEOUT.\n");
1255
1256        /*
1257         * Check if upper/work fs supports trusted.overlay.* xattr
1258         */
1259        err = ovl_do_setxattr(ofs->workdir, OVL_XATTR_OPAQUE, "0", 1, 0);
1260        if (err) {
1261                ofs->noxattr = true;
1262                ofs->config.index = false;
1263                ofs->config.metacopy = false;
1264                pr_warn("upper fs does not support xattr, falling back to index=off and metacopy=off.\n");
1265                err = 0;
1266        } else {
1267                vfs_removexattr(ofs->workdir, OVL_XATTR_OPAQUE);
1268        }
1269
1270        /*
1271         * We allowed sub-optimal upper fs configuration and don't want to break
1272         * users over kernel upgrade, but we never allowed remote upper fs, so
1273         * we can enforce strict requirements for remote upper fs.
1274         */
1275        if (ovl_dentry_remote(ofs->workdir) &&
1276            (!d_type || !rename_whiteout || ofs->noxattr)) {
1277                pr_err("upper fs missing required features.\n");
1278                err = -EINVAL;
1279                goto out;
1280        }
1281
1282        /* Check if upper/work fs supports file handles */
1283        fh_type = ovl_can_decode_fh(ofs->workdir->d_sb);
1284        if (ofs->config.index && !fh_type) {
1285                ofs->config.index = false;
1286                pr_warn("upper fs does not support file handles, falling back to index=off.\n");
1287        }
1288
1289        /* Check if upper fs has 32bit inode numbers */
1290        if (fh_type != FILEID_INO32_GEN)
1291                ofs->xino_mode = -1;
1292
1293        /* NFS export of r/w mount depends on index */
1294        if (ofs->config.nfs_export && !ofs->config.index) {
1295                pr_warn("NFS export requires \"index=on\", falling back to nfs_export=off.\n");
1296                ofs->config.nfs_export = false;
1297        }
1298out:
1299        mnt_drop_write(mnt);
1300        return err;
1301}
1302
1303static int ovl_get_workdir(struct super_block *sb, struct ovl_fs *ofs,
1304                           struct path *upperpath)
1305{
1306        int err;
1307        struct path workpath = { };
1308
1309        err = ovl_mount_dir(ofs->config.workdir, &workpath);
1310        if (err)
1311                goto out;
1312
1313        err = -EINVAL;
1314        if (upperpath->mnt != workpath.mnt) {
1315                pr_err("workdir and upperdir must reside under the same mount\n");
1316                goto out;
1317        }
1318        if (!ovl_workdir_ok(workpath.dentry, upperpath->dentry)) {
1319                pr_err("workdir and upperdir must be separate subtrees\n");
1320                goto out;
1321        }
1322
1323        ofs->workbasedir = dget(workpath.dentry);
1324
1325        if (ovl_inuse_trylock(ofs->workbasedir)) {
1326                ofs->workdir_locked = true;
1327        } else {
1328                err = ovl_report_in_use(ofs, "workdir");
1329                if (err)
1330                        goto out;
1331        }
1332
1333        err = ovl_setup_trap(sb, ofs->workbasedir, &ofs->workbasedir_trap,
1334                             "workdir");
1335        if (err)
1336                goto out;
1337
1338        err = ovl_make_workdir(sb, ofs, &workpath);
1339
1340out:
1341        path_put(&workpath);
1342
1343        return err;
1344}
1345
1346static int ovl_get_indexdir(struct super_block *sb, struct ovl_fs *ofs,
1347                            struct ovl_entry *oe, struct path *upperpath)
1348{
1349        struct vfsmount *mnt = ovl_upper_mnt(ofs);
1350        int err;
1351
1352        err = mnt_want_write(mnt);
1353        if (err)
1354                return err;
1355
1356        /* Verify lower root is upper root origin */
1357        err = ovl_verify_origin(upperpath->dentry, oe->lowerstack[0].dentry,
1358                                true);
1359        if (err) {
1360                pr_err("failed to verify upper root origin\n");
1361                goto out;
1362        }
1363
1364        /* index dir will act also as workdir */
1365        iput(ofs->workdir_trap);
1366        ofs->workdir_trap = NULL;
1367        dput(ofs->workdir);
1368        ofs->workdir = NULL;
1369        ofs->indexdir = ovl_workdir_create(ofs, OVL_INDEXDIR_NAME, true);
1370        if (ofs->indexdir) {
1371                ofs->workdir = dget(ofs->indexdir);
1372
1373                err = ovl_setup_trap(sb, ofs->indexdir, &ofs->indexdir_trap,
1374                                     "indexdir");
1375                if (err)
1376                        goto out;
1377
1378                /*
1379                 * Verify upper root is exclusively associated with index dir.
1380                 * Older kernels stored upper fh in "trusted.overlay.origin"
1381                 * xattr. If that xattr exists, verify that it is a match to
1382                 * upper dir file handle. In any case, verify or set xattr
1383                 * "trusted.overlay.upper" to indicate that index may have
1384                 * directory entries.
1385                 */
1386                if (ovl_check_origin_xattr(ofs->indexdir)) {
1387                        err = ovl_verify_set_fh(ofs->indexdir, OVL_XATTR_ORIGIN,
1388                                                upperpath->dentry, true, false);
1389                        if (err)
1390                                pr_err("failed to verify index dir 'origin' xattr\n");
1391                }
1392                err = ovl_verify_upper(ofs->indexdir, upperpath->dentry, true);
1393                if (err)
1394                        pr_err("failed to verify index dir 'upper' xattr\n");
1395
1396                /* Cleanup bad/stale/orphan index entries */
1397                if (!err)
1398                        err = ovl_indexdir_cleanup(ofs);
1399        }
1400        if (err || !ofs->indexdir)
1401                pr_warn("try deleting index dir or mounting with '-o index=off' to disable inodes index.\n");
1402
1403out:
1404        mnt_drop_write(mnt);
1405        return err;
1406}
1407
1408static bool ovl_lower_uuid_ok(struct ovl_fs *ofs, const uuid_t *uuid)
1409{
1410        unsigned int i;
1411
1412        if (!ofs->config.nfs_export && !ovl_upper_mnt(ofs))
1413                return true;
1414
1415        /*
1416         * We allow using single lower with null uuid for index and nfs_export
1417         * for example to support those features with single lower squashfs.
1418         * To avoid regressions in setups of overlay with re-formatted lower
1419         * squashfs, do not allow decoding origin with lower null uuid unless
1420         * user opted-in to one of the new features that require following the
1421         * lower inode of non-dir upper.
1422         */
1423        if (!ofs->config.index && !ofs->config.metacopy && !ofs->config.xino &&
1424            uuid_is_null(uuid))
1425                return false;
1426
1427        for (i = 0; i < ofs->numfs; i++) {
1428                /*
1429                 * We use uuid to associate an overlay lower file handle with a
1430                 * lower layer, so we can accept lower fs with null uuid as long
1431                 * as all lower layers with null uuid are on the same fs.
1432                 * if we detect multiple lower fs with the same uuid, we
1433                 * disable lower file handle decoding on all of them.
1434                 */
1435                if (ofs->fs[i].is_lower &&
1436                    uuid_equal(&ofs->fs[i].sb->s_uuid, uuid)) {
1437                        ofs->fs[i].bad_uuid = true;
1438                        return false;
1439                }
1440        }
1441        return true;
1442}
1443
1444/* Get a unique fsid for the layer */
1445static int ovl_get_fsid(struct ovl_fs *ofs, const struct path *path)
1446{
1447        struct super_block *sb = path->mnt->mnt_sb;
1448        unsigned int i;
1449        dev_t dev;
1450        int err;
1451        bool bad_uuid = false;
1452
1453        for (i = 0; i < ofs->numfs; i++) {
1454                if (ofs->fs[i].sb == sb)
1455                        return i;
1456        }
1457
1458        if (!ovl_lower_uuid_ok(ofs, &sb->s_uuid)) {
1459                bad_uuid = true;
1460                if (ofs->config.index || ofs->config.nfs_export) {
1461                        ofs->config.index = false;
1462                        ofs->config.nfs_export = false;
1463                        pr_warn("%s uuid detected in lower fs '%pd2', falling back to index=off,nfs_export=off.\n",
1464                                uuid_is_null(&sb->s_uuid) ? "null" :
1465                                                            "conflicting",
1466                                path->dentry);
1467                }
1468        }
1469
1470        err = get_anon_bdev(&dev);
1471        if (err) {
1472                pr_err("failed to get anonymous bdev for lowerpath\n");
1473                return err;
1474        }
1475
1476        ofs->fs[ofs->numfs].sb = sb;
1477        ofs->fs[ofs->numfs].pseudo_dev = dev;
1478        ofs->fs[ofs->numfs].bad_uuid = bad_uuid;
1479
1480        return ofs->numfs++;
1481}
1482
1483static int ovl_get_layers(struct super_block *sb, struct ovl_fs *ofs,
1484                          struct path *stack, unsigned int numlower,
1485                          struct ovl_layer *layers)
1486{
1487        int err;
1488        unsigned int i;
1489
1490        err = -ENOMEM;
1491        ofs->fs = kcalloc(numlower + 1, sizeof(struct ovl_sb), GFP_KERNEL);
1492        if (ofs->fs == NULL)
1493                goto out;
1494
1495        /* idx/fsid 0 are reserved for upper fs even with lower only overlay */
1496        ofs->numfs++;
1497
1498        /*
1499         * All lower layers that share the same fs as upper layer, use the same
1500         * pseudo_dev as upper layer.  Allocate fs[0].pseudo_dev even for lower
1501         * only overlay to simplify ovl_fs_free().
1502         * is_lower will be set if upper fs is shared with a lower layer.
1503         */
1504        err = get_anon_bdev(&ofs->fs[0].pseudo_dev);
1505        if (err) {
1506                pr_err("failed to get anonymous bdev for upper fs\n");
1507                goto out;
1508        }
1509
1510        if (ovl_upper_mnt(ofs)) {
1511                ofs->fs[0].sb = ovl_upper_mnt(ofs)->mnt_sb;
1512                ofs->fs[0].is_lower = false;
1513        }
1514
1515        for (i = 0; i < numlower; i++) {
1516                struct vfsmount *mnt;
1517                struct inode *trap;
1518                int fsid;
1519
1520                err = fsid = ovl_get_fsid(ofs, &stack[i]);
1521                if (err < 0)
1522                        goto out;
1523
1524                /*
1525                 * Check if lower root conflicts with this overlay layers before
1526                 * checking if it is in-use as upperdir/workdir of "another"
1527                 * mount, because we do not bother to check in ovl_is_inuse() if
1528                 * the upperdir/workdir is in fact in-use by our
1529                 * upperdir/workdir.
1530                 */
1531                err = ovl_setup_trap(sb, stack[i].dentry, &trap, "lowerdir");
1532                if (err)
1533                        goto out;
1534
1535                if (ovl_is_inuse(stack[i].dentry)) {
1536                        err = ovl_report_in_use(ofs, "lowerdir");
1537                        if (err) {
1538                                iput(trap);
1539                                goto out;
1540                        }
1541                }
1542
1543                mnt = clone_private_mount(&stack[i]);
1544                err = PTR_ERR(mnt);
1545                if (IS_ERR(mnt)) {
1546                        pr_err("failed to clone lowerpath\n");
1547                        iput(trap);
1548                        goto out;
1549                }
1550
1551                /*
1552                 * Make lower layers R/O.  That way fchmod/fchown on lower file
1553                 * will fail instead of modifying lower fs.
1554                 */
1555                mnt->mnt_flags |= MNT_READONLY | MNT_NOATIME;
1556
1557                layers[ofs->numlayer].trap = trap;
1558                layers[ofs->numlayer].mnt = mnt;
1559                layers[ofs->numlayer].idx = ofs->numlayer;
1560                layers[ofs->numlayer].fsid = fsid;
1561                layers[ofs->numlayer].fs = &ofs->fs[fsid];
1562                ofs->numlayer++;
1563                ofs->fs[fsid].is_lower = true;
1564        }
1565
1566        /*
1567         * When all layers on same fs, overlay can use real inode numbers.
1568         * With mount option "xino=<on|auto>", mounter declares that there are
1569         * enough free high bits in underlying fs to hold the unique fsid.
1570         * If overlayfs does encounter underlying inodes using the high xino
1571         * bits reserved for fsid, it emits a warning and uses the original
1572         * inode number or a non persistent inode number allocated from a
1573         * dedicated range.
1574         */
1575        if (ofs->numfs - !ovl_upper_mnt(ofs) == 1) {
1576                if (ofs->config.xino == OVL_XINO_ON)
1577                        pr_info("\"xino=on\" is useless with all layers on same fs, ignore.\n");
1578                ofs->xino_mode = 0;
1579        } else if (ofs->config.xino == OVL_XINO_OFF) {
1580                ofs->xino_mode = -1;
1581        } else if (ofs->xino_mode < 0) {
1582                /*
1583                 * This is a roundup of number of bits needed for encoding
1584                 * fsid, where fsid 0 is reserved for upper fs (even with
1585                 * lower only overlay) +1 extra bit is reserved for the non
1586                 * persistent inode number range that is used for resolving
1587                 * xino lower bits overflow.
1588                 */
1589                BUILD_BUG_ON(ilog2(OVL_MAX_STACK) > 30);
1590                ofs->xino_mode = ilog2(ofs->numfs - 1) + 2;
1591        }
1592
1593        if (ofs->xino_mode > 0) {
1594                pr_info("\"xino\" feature enabled using %d upper inode bits.\n",
1595                        ofs->xino_mode);
1596        }
1597
1598        err = 0;
1599out:
1600        return err;
1601}
1602
1603static struct ovl_entry *ovl_get_lowerstack(struct super_block *sb,
1604                                const char *lower, unsigned int numlower,
1605                                struct ovl_fs *ofs, struct ovl_layer *layers)
1606{
1607        int err;
1608        struct path *stack = NULL;
1609        unsigned int i;
1610        struct ovl_entry *oe;
1611
1612        if (!ofs->config.upperdir && numlower == 1) {
1613                pr_err("at least 2 lowerdir are needed while upperdir nonexistent\n");
1614                return ERR_PTR(-EINVAL);
1615        }
1616
1617        stack = kcalloc(numlower, sizeof(struct path), GFP_KERNEL);
1618        if (!stack)
1619                return ERR_PTR(-ENOMEM);
1620
1621        err = -EINVAL;
1622        for (i = 0; i < numlower; i++) {
1623                err = ovl_lower_dir(lower, &stack[i], ofs, &sb->s_stack_depth);
1624                if (err)
1625                        goto out_err;
1626
1627                lower = strchr(lower, '\0') + 1;
1628        }
1629
1630        err = -EINVAL;
1631        sb->s_stack_depth++;
1632        if (sb->s_stack_depth > FILESYSTEM_MAX_STACK_DEPTH) {
1633                pr_err("maximum fs stacking depth exceeded\n");
1634                goto out_err;
1635        }
1636
1637        err = ovl_get_layers(sb, ofs, stack, numlower, layers);
1638        if (err)
1639                goto out_err;
1640
1641        err = -ENOMEM;
1642        oe = ovl_alloc_entry(numlower);
1643        if (!oe)
1644                goto out_err;
1645
1646        for (i = 0; i < numlower; i++) {
1647                oe->lowerstack[i].dentry = dget(stack[i].dentry);
1648                oe->lowerstack[i].layer = &ofs->layers[i+1];
1649        }
1650
1651out:
1652        for (i = 0; i < numlower; i++)
1653                path_put(&stack[i]);
1654        kfree(stack);
1655
1656        return oe;
1657
1658out_err:
1659        oe = ERR_PTR(err);
1660        goto out;
1661}
1662
1663/*
1664 * Check if this layer root is a descendant of:
1665 * - another layer of this overlayfs instance
1666 * - upper/work dir of any overlayfs instance
1667 */
1668static int ovl_check_layer(struct super_block *sb, struct ovl_fs *ofs,
1669                           struct dentry *dentry, const char *name)
1670{
1671        struct dentry *next = dentry, *parent;
1672        int err = 0;
1673
1674        if (!dentry)
1675                return 0;
1676
1677        parent = dget_parent(next);
1678
1679        /* Walk back ancestors to root (inclusive) looking for traps */
1680        while (!err && parent != next) {
1681                if (ovl_lookup_trap_inode(sb, parent)) {
1682                        err = -ELOOP;
1683                        pr_err("overlapping %s path\n", name);
1684                } else if (ovl_is_inuse(parent)) {
1685                        err = ovl_report_in_use(ofs, name);
1686                }
1687                next = parent;
1688                parent = dget_parent(next);
1689                dput(next);
1690        }
1691
1692        dput(parent);
1693
1694        return err;
1695}
1696
1697/*
1698 * Check if any of the layers or work dirs overlap.
1699 */
1700static int ovl_check_overlapping_layers(struct super_block *sb,
1701                                        struct ovl_fs *ofs)
1702{
1703        int i, err;
1704
1705        if (ovl_upper_mnt(ofs)) {
1706                err = ovl_check_layer(sb, ofs, ovl_upper_mnt(ofs)->mnt_root,
1707                                      "upperdir");
1708                if (err)
1709                        return err;
1710
1711                /*
1712                 * Checking workbasedir avoids hitting ovl_is_inuse(parent) of
1713                 * this instance and covers overlapping work and index dirs,
1714                 * unless work or index dir have been moved since created inside
1715                 * workbasedir.  In that case, we already have their traps in
1716                 * inode cache and we will catch that case on lookup.
1717                 */
1718                err = ovl_check_layer(sb, ofs, ofs->workbasedir, "workdir");
1719                if (err)
1720                        return err;
1721        }
1722
1723        for (i = 1; i < ofs->numlayer; i++) {
1724                err = ovl_check_layer(sb, ofs,
1725                                      ofs->layers[i].mnt->mnt_root,
1726                                      "lowerdir");
1727                if (err)
1728                        return err;
1729        }
1730
1731        return 0;
1732}
1733
1734static struct dentry *ovl_get_root(struct super_block *sb,
1735                                   struct dentry *upperdentry,
1736                                   struct ovl_entry *oe)
1737{
1738        struct dentry *root;
1739        struct ovl_path *lowerpath = &oe->lowerstack[0];
1740        unsigned long ino = d_inode(lowerpath->dentry)->i_ino;
1741        int fsid = lowerpath->layer->fsid;
1742        struct ovl_inode_params oip = {
1743                .upperdentry = upperdentry,
1744                .lowerpath = lowerpath,
1745        };
1746
1747        root = d_make_root(ovl_new_inode(sb, S_IFDIR, 0));
1748        if (!root)
1749                return NULL;
1750
1751        root->d_fsdata = oe;
1752
1753        if (upperdentry) {
1754                /* Root inode uses upper st_ino/i_ino */
1755                ino = d_inode(upperdentry)->i_ino;
1756                fsid = 0;
1757                ovl_dentry_set_upper_alias(root);
1758                if (ovl_is_impuredir(upperdentry))
1759                        ovl_set_flag(OVL_IMPURE, d_inode(root));
1760        }
1761
1762        /* Root is always merge -> can have whiteouts */
1763        ovl_set_flag(OVL_WHITEOUTS, d_inode(root));
1764        ovl_dentry_set_flag(OVL_E_CONNECTED, root);
1765        ovl_set_upperdata(d_inode(root));
1766        ovl_inode_init(d_inode(root), &oip, ino, fsid);
1767        ovl_dentry_update_reval(root, upperdentry, DCACHE_OP_WEAK_REVALIDATE);
1768
1769        return root;
1770}
1771
1772static int ovl_fill_super(struct super_block *sb, void *data, int silent)
1773{
1774        struct path upperpath = { };
1775        struct dentry *root_dentry;
1776        struct ovl_entry *oe;
1777        struct ovl_fs *ofs;
1778        struct ovl_layer *layers;
1779        struct cred *cred;
1780        char *splitlower = NULL;
1781        unsigned int numlower;
1782        int err;
1783
1784        sb->s_d_op = &ovl_dentry_operations;
1785
1786        err = -ENOMEM;
1787        ofs = kzalloc(sizeof(struct ovl_fs), GFP_KERNEL);
1788        if (!ofs)
1789                goto out;
1790
1791        ofs->creator_cred = cred = prepare_creds();
1792        if (!cred)
1793                goto out_err;
1794
1795        /* Is there a reason anyone would want not to share whiteouts? */
1796        ofs->share_whiteout = true;
1797
1798        ofs->config.index = ovl_index_def;
1799        ofs->config.nfs_export = ovl_nfs_export_def;
1800        ofs->config.xino = ovl_xino_def();
1801        ofs->config.metacopy = ovl_metacopy_def;
1802        err = ovl_parse_opt((char *) data, &ofs->config);
1803        if (err)
1804                goto out_err;
1805
1806        err = -EINVAL;
1807        if (!ofs->config.lowerdir) {
1808                if (!silent)
1809                        pr_err("missing 'lowerdir'\n");
1810                goto out_err;
1811        }
1812
1813        err = -ENOMEM;
1814        splitlower = kstrdup(ofs->config.lowerdir, GFP_KERNEL);
1815        if (!splitlower)
1816                goto out_err;
1817
1818        numlower = ovl_split_lowerdirs(splitlower);
1819        if (numlower > OVL_MAX_STACK) {
1820                pr_err("too many lower directories, limit is %d\n",
1821                       OVL_MAX_STACK);
1822                goto out_err;
1823        }
1824
1825        layers = kcalloc(numlower + 1, sizeof(struct ovl_layer), GFP_KERNEL);
1826        if (!layers)
1827                goto out_err;
1828
1829        ofs->layers = layers;
1830        /* Layer 0 is reserved for upper even if there's no upper */
1831        ofs->numlayer = 1;
1832
1833        sb->s_stack_depth = 0;
1834        sb->s_maxbytes = MAX_LFS_FILESIZE;
1835        atomic_long_set(&ofs->last_ino, 1);
1836        /* Assume underlaying fs uses 32bit inodes unless proven otherwise */
1837        if (ofs->config.xino != OVL_XINO_OFF) {
1838                ofs->xino_mode = BITS_PER_LONG - 32;
1839                if (!ofs->xino_mode) {
1840                        pr_warn("xino not supported on 32bit kernel, falling back to xino=off.\n");
1841                        ofs->config.xino = OVL_XINO_OFF;
1842                }
1843        }
1844
1845        /* alloc/destroy_inode needed for setting up traps in inode cache */
1846        sb->s_op = &ovl_super_operations;
1847
1848        if (ofs->config.upperdir) {
1849                if (!ofs->config.workdir) {
1850                        pr_err("missing 'workdir'\n");
1851                        goto out_err;
1852                }
1853
1854                err = ovl_get_upper(sb, ofs, &layers[0], &upperpath);
1855                if (err)
1856                        goto out_err;
1857
1858                err = ovl_get_workdir(sb, ofs, &upperpath);
1859                if (err)
1860                        goto out_err;
1861
1862                if (!ofs->workdir)
1863                        sb->s_flags |= SB_RDONLY;
1864
1865                sb->s_stack_depth = ovl_upper_mnt(ofs)->mnt_sb->s_stack_depth;
1866                sb->s_time_gran = ovl_upper_mnt(ofs)->mnt_sb->s_time_gran;
1867
1868        }
1869        oe = ovl_get_lowerstack(sb, splitlower, numlower, ofs, layers);
1870        err = PTR_ERR(oe);
1871        if (IS_ERR(oe))
1872                goto out_err;
1873
1874        /* If the upper fs is nonexistent, we mark overlayfs r/o too */
1875        if (!ovl_upper_mnt(ofs))
1876                sb->s_flags |= SB_RDONLY;
1877
1878        if (!ovl_force_readonly(ofs) && ofs->config.index) {
1879                err = ovl_get_indexdir(sb, ofs, oe, &upperpath);
1880                if (err)
1881                        goto out_free_oe;
1882
1883                /* Force r/o mount with no index dir */
1884                if (!ofs->indexdir)
1885                        sb->s_flags |= SB_RDONLY;
1886        }
1887
1888        err = ovl_check_overlapping_layers(sb, ofs);
1889        if (err)
1890                goto out_free_oe;
1891
1892        /* Show index=off in /proc/mounts for forced r/o mount */
1893        if (!ofs->indexdir) {
1894                ofs->config.index = false;
1895                if (ovl_upper_mnt(ofs) && ofs->config.nfs_export) {
1896                        pr_warn("NFS export requires an index dir, falling back to nfs_export=off.\n");
1897                        ofs->config.nfs_export = false;
1898                }
1899        }
1900
1901        if (ofs->config.metacopy && ofs->config.nfs_export) {
1902                pr_warn("NFS export is not supported with metadata only copy up, falling back to nfs_export=off.\n");
1903                ofs->config.nfs_export = false;
1904        }
1905
1906        if (ofs->config.nfs_export)
1907                sb->s_export_op = &ovl_export_operations;
1908
1909        /* Never override disk quota limits or use reserved space */
1910        cap_lower(cred->cap_effective, CAP_SYS_RESOURCE);
1911
1912        sb->s_magic = OVERLAYFS_SUPER_MAGIC;
1913        sb->s_xattr = ovl_xattr_handlers;
1914        sb->s_fs_info = ofs;
1915        sb->s_flags |= SB_POSIXACL;
1916        sb->s_iflags |= SB_I_SKIP_SYNC;
1917
1918        err = -ENOMEM;
1919        root_dentry = ovl_get_root(sb, upperpath.dentry, oe);
1920        if (!root_dentry)
1921                goto out_free_oe;
1922
1923        mntput(upperpath.mnt);
1924        kfree(splitlower);
1925
1926        sb->s_root = root_dentry;
1927
1928        return 0;
1929
1930out_free_oe:
1931        ovl_entry_stack_free(oe);
1932        kfree(oe);
1933out_err:
1934        kfree(splitlower);
1935        path_put(&upperpath);
1936        ovl_free_fs(ofs);
1937out:
1938        return err;
1939}
1940
1941static struct dentry *ovl_mount(struct file_system_type *fs_type, int flags,
1942                                const char *dev_name, void *raw_data)
1943{
1944        return mount_nodev(fs_type, flags, raw_data, ovl_fill_super);
1945}
1946
1947static struct file_system_type ovl_fs_type = {
1948        .owner          = THIS_MODULE,
1949        .name           = "overlay",
1950        .mount          = ovl_mount,
1951        .kill_sb        = kill_anon_super,
1952};
1953MODULE_ALIAS_FS("overlay");
1954
1955static void ovl_inode_init_once(void *foo)
1956{
1957        struct ovl_inode *oi = foo;
1958
1959        inode_init_once(&oi->vfs_inode);
1960}
1961
1962static int __init ovl_init(void)
1963{
1964        int err;
1965
1966        ovl_inode_cachep = kmem_cache_create("ovl_inode",
1967                                             sizeof(struct ovl_inode), 0,
1968                                             (SLAB_RECLAIM_ACCOUNT|
1969                                              SLAB_MEM_SPREAD|SLAB_ACCOUNT),
1970                                             ovl_inode_init_once);
1971        if (ovl_inode_cachep == NULL)
1972                return -ENOMEM;
1973
1974        err = ovl_aio_request_cache_init();
1975        if (!err) {
1976                err = register_filesystem(&ovl_fs_type);
1977                if (!err)
1978                        return 0;
1979
1980                ovl_aio_request_cache_destroy();
1981        }
1982        kmem_cache_destroy(ovl_inode_cachep);
1983
1984        return err;
1985}
1986
1987static void __exit ovl_exit(void)
1988{
1989        unregister_filesystem(&ovl_fs_type);
1990
1991        /*
1992         * Make sure all delayed rcu free inodes are flushed before we
1993         * destroy cache.
1994         */
1995        rcu_barrier();
1996        kmem_cache_destroy(ovl_inode_cachep);
1997        ovl_aio_request_cache_destroy();
1998}
1999
/* Hook ovl_init()/ovl_exit() up as the module's init and exit entry points */
module_init(ovl_init);
module_exit(ovl_exit);
2002