linux/fs/namei.c
<<
>>
Prefs
   1/*
   2 *  linux/fs/namei.c
   3 *
   4 *  Copyright (C) 1991, 1992  Linus Torvalds
   5 */
   6
   7/*
   8 * Some corrections by tytso.
   9 */
  10
  11/* [Feb 1997 T. Schoebel-Theuer] Complete rewrite of the pathname
  12 * lookup logic.
  13 */
  14/* [Feb-Apr 2000, AV] Rewrite to the new namespace architecture.
  15 */
  16
  17#include <linux/init.h>
  18#include <linux/export.h>
  19#include <linux/kernel.h>
  20#include <linux/slab.h>
  21#include <linux/fs.h>
  22#include <linux/namei.h>
  23#include <linux/pagemap.h>
  24#include <linux/fsnotify.h>
  25#include <linux/personality.h>
  26#include <linux/security.h>
  27#include <linux/ima.h>
  28#include <linux/syscalls.h>
  29#include <linux/mount.h>
  30#include <linux/audit.h>
  31#include <linux/capability.h>
  32#include <linux/file.h>
  33#include <linux/fcntl.h>
  34#include <linux/device_cgroup.h>
  35#include <linux/fs_struct.h>
  36#include <linux/posix_acl.h>
  37#include <linux/hash.h>
  38#include <linux/bitops.h>
  39#include <linux/init_task.h>
  40#include <asm/uaccess.h>
  41
  42#include "internal.h"
  43#include "mount.h"
  44
  45/* [Feb-1997 T. Schoebel-Theuer]
  46 * Fundamental changes in the pathname lookup mechanisms (namei)
  47 * were necessary because of omirr.  The reason is that omirr needs
  48 * to know the _real_ pathname, not the user-supplied one, in case
  49 * of symlinks (and also when transname replacements occur).
  50 *
  51 * The new code replaces the old recursive symlink resolution with
  52 * an iterative one (in case of non-nested symlink chains).  It does
  53 * this with calls to <fs>_follow_link().
  54 * As a side effect, dir_namei(), _namei() and follow_link() are now 
  55 * replaced with a single function lookup_dentry() that can handle all 
  56 * the special cases of the former code.
  57 *
  58 * With the new dcache, the pathname is stored at each inode, at least as
  59 * long as the refcount of the inode is positive.  As a side effect, the
  60 * size of the dcache depends on the inode cache and thus is dynamic.
  61 *
  62 * [29-Apr-1998 C. Scott Ananian] Updated above description of symlink
  63 * resolution to correspond with current state of the code.
  64 *
  65 * Note that the symlink resolution is not *completely* iterative.
  66 * There is still a significant amount of tail- and mid- recursion in
  67 * the algorithm.  Also, note that <fs>_readlink() is not used in
  68 * lookup_dentry(): lookup_dentry() on the result of <fs>_readlink()
  69 * may return different results than <fs>_follow_link().  Many virtual
  70 * filesystems (including /proc) exhibit this behavior.
  71 */
  72
  73/* [24-Feb-97 T. Schoebel-Theuer] Side effects caused by new implementation:
  74 * New symlink semantics: when open() is called with flags O_CREAT | O_EXCL
  75 * and the name already exists in form of a symlink, try to create the new
  76 * name indicated by the symlink. The old code always complained that the
  77 * name already exists, due to not following the symlink even if its target
  78 * is nonexistent.  The new semantics affects also mknod() and link() when
  79 * the name is a symlink pointing to a non-existent name.
  80 *
  81 * I don't know which semantics is the right one, since I have no access
  82 * to standards. But I found by trial that HP-UX 9.0 has the full "new"
  83 * semantics implemented, while SunOS 4.1.1 and Solaris (SunOS 5.4) have the
  84 * "old" one. Personally, I think the new semantics is much more logical.
  85 * Note that "ln old new" where "new" is a symlink pointing to a non-existing
   86 * file does succeed in both HP-UX and SunOS, but not in Solaris
  87 * and in the old Linux semantics.
  88 */
  89
  90/* [16-Dec-97 Kevin Buhr] For security reasons, we change some symlink
  91 * semantics.  See the comments in "open_namei" and "do_link" below.
  92 *
  93 * [10-Sep-98 Alan Modra] Another symlink change.
  94 */
  95
  96/* [Feb-Apr 2000 AV] Complete rewrite. Rules for symlinks:
  97 *      inside the path - always follow.
  98 *      in the last component in creation/removal/renaming - never follow.
  99 *      if LOOKUP_FOLLOW passed - follow.
 100 *      if the pathname has trailing slashes - follow.
 101 *      otherwise - don't follow.
 102 * (applied in that order).
 103 *
 104 * [Jun 2000 AV] Inconsistent behaviour of open() in case if flags==O_CREAT
 105 * restored for 2.4. This is the last surviving part of old 4.2BSD bug.
 106 * During the 2.4 we need to fix the userland stuff depending on it -
 107 * hopefully we will be able to get rid of that wart in 2.5. So far only
 108 * XEmacs seems to be relying on it...
 109 */
 110/*
 111 * [Sep 2001 AV] Single-semaphore locking scheme (kudos to David Holland)
 112 * implemented.  Let's see if raised priority of ->s_vfs_rename_mutex gives
 113 * any extra contention...
 114 */
 115
 116/* In order to reduce some races, while at the same time doing additional
 117 * checking and hopefully speeding things up, we copy filenames to the
 118 * kernel data space before using them..
 119 *
 120 * POSIX.1 2.4: an empty pathname is invalid (ENOENT).
 121 * PATH_MAX includes the nul terminator --RR.
 122 */
 123
 124#define EMBEDDED_NAME_MAX       (PATH_MAX - offsetof(struct filename, iname))
 125
 126struct filename *
 127getname_flags(const char __user *filename, int flags, int *empty)
 128{
 129        struct filename *result;
 130        char *kname;
 131        int len;
 132
 133        result = audit_reusename(filename);
 134        if (result)
 135                return result;
 136
 137        result = __getname();
 138        if (unlikely(!result))
 139                return ERR_PTR(-ENOMEM);
 140
 141        /*
 142         * First, try to embed the struct filename inside the names_cache
 143         * allocation
 144         */
 145        kname = (char *)result->iname;
 146        result->name = kname;
 147
 148        len = strncpy_from_user(kname, filename, EMBEDDED_NAME_MAX);
 149        if (unlikely(len < 0)) {
 150                __putname(result);
 151                return ERR_PTR(len);
 152        }
 153
 154        /*
 155         * Uh-oh. We have a name that's approaching PATH_MAX. Allocate a
 156         * separate struct filename so we can dedicate the entire
 157         * names_cache allocation for the pathname, and re-do the copy from
 158         * userland.
 159         */
 160        if (unlikely(len == EMBEDDED_NAME_MAX)) {
 161                const size_t size = offsetof(struct filename, iname[1]);
 162                kname = (char *)result;
 163
 164                /*
 165                 * size is chosen that way we to guarantee that
 166                 * result->iname[0] is within the same object and that
 167                 * kname can't be equal to result->iname, no matter what.
 168                 */
 169                result = kzalloc(size, GFP_KERNEL);
 170                if (unlikely(!result)) {
 171                        __putname(kname);
 172                        return ERR_PTR(-ENOMEM);
 173                }
 174                result->name = kname;
 175                len = strncpy_from_user(kname, filename, PATH_MAX);
 176                if (unlikely(len < 0)) {
 177                        __putname(kname);
 178                        kfree(result);
 179                        return ERR_PTR(len);
 180                }
 181                if (unlikely(len == PATH_MAX)) {
 182                        __putname(kname);
 183                        kfree(result);
 184                        return ERR_PTR(-ENAMETOOLONG);
 185                }
 186        }
 187
 188        result->refcnt = 1;
 189        /* The empty path is special. */
 190        if (unlikely(!len)) {
 191                if (empty)
 192                        *empty = 1;
 193                if (!(flags & LOOKUP_EMPTY)) {
 194                        putname(result);
 195                        return ERR_PTR(-ENOENT);
 196                }
 197        }
 198
 199        result->uptr = filename;
 200        result->aname = NULL;
 201        audit_getname(result);
 202        return result;
 203}
 204
 205struct filename *
 206getname(const char __user * filename)
 207{
 208        return getname_flags(filename, 0, NULL);
 209}
 210
 211struct filename *
 212getname_kernel(const char * filename)
 213{
 214        struct filename *result;
 215        int len = strlen(filename) + 1;
 216
 217        result = __getname();
 218        if (unlikely(!result))
 219                return ERR_PTR(-ENOMEM);
 220
 221        if (len <= EMBEDDED_NAME_MAX) {
 222                result->name = (char *)result->iname;
 223        } else if (len <= PATH_MAX) {
 224                struct filename *tmp;
 225
 226                tmp = kmalloc(sizeof(*tmp), GFP_KERNEL);
 227                if (unlikely(!tmp)) {
 228                        __putname(result);
 229                        return ERR_PTR(-ENOMEM);
 230                }
 231                tmp->name = (char *)result;
 232                result = tmp;
 233        } else {
 234                __putname(result);
 235                return ERR_PTR(-ENAMETOOLONG);
 236        }
 237        memcpy((char *)result->name, filename, len);
 238        result->uptr = NULL;
 239        result->aname = NULL;
 240        result->refcnt = 1;
 241        audit_getname(result);
 242
 243        return result;
 244}
 245
 246void putname(struct filename *name)
 247{
 248        BUG_ON(name->refcnt <= 0);
 249
 250        if (--name->refcnt > 0)
 251                return;
 252
 253        if (name->name != name->iname) {
 254                __putname(name->name);
 255                kfree(name);
 256        } else
 257                __putname(name);
 258}
 259
 260static int check_acl(struct inode *inode, int mask)
 261{
 262#ifdef CONFIG_FS_POSIX_ACL
 263        struct posix_acl *acl;
 264
 265        if (mask & MAY_NOT_BLOCK) {
 266                acl = get_cached_acl_rcu(inode, ACL_TYPE_ACCESS);
 267                if (!acl)
 268                        return -EAGAIN;
 269                /* no ->get_acl() calls in RCU mode... */
 270                if (is_uncached_acl(acl))
 271                        return -ECHILD;
 272                return posix_acl_permission(inode, acl, mask & ~MAY_NOT_BLOCK);
 273        }
 274
 275        acl = get_acl(inode, ACL_TYPE_ACCESS);
 276        if (IS_ERR(acl))
 277                return PTR_ERR(acl);
 278        if (acl) {
 279                int error = posix_acl_permission(inode, acl, mask);
 280                posix_acl_release(acl);
 281                return error;
 282        }
 283#endif
 284
 285        return -EAGAIN;
 286}
 287
 288/*
 289 * This does the basic permission checking
 290 */
 291static int acl_permission_check(struct inode *inode, int mask)
 292{
 293        unsigned int mode = inode->i_mode;
 294
 295        if (likely(uid_eq(current_fsuid(), inode->i_uid)))
 296                mode >>= 6;
 297        else {
 298                if (IS_POSIXACL(inode) && (mode & S_IRWXG)) {
 299                        int error = check_acl(inode, mask);
 300                        if (error != -EAGAIN)
 301                                return error;
 302                }
 303
 304                if (in_group_p(inode->i_gid))
 305                        mode >>= 3;
 306        }
 307
 308        /*
 309         * If the DACs are ok we don't need any capability check.
 310         */
 311        if ((mask & ~mode & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0)
 312                return 0;
 313        return -EACCES;
 314}
 315
 316/**
 317 * generic_permission -  check for access rights on a Posix-like filesystem
 318 * @inode:      inode to check access rights for
 319 * @mask:       right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC, ...)
 320 *
 321 * Used to check for read/write/execute permissions on a file.
 322 * We use "fsuid" for this, letting us set arbitrary permissions
 323 * for filesystem access without changing the "normal" uids which
 324 * are used for other things.
 325 *
 326 * generic_permission is rcu-walk aware. It returns -ECHILD in case an rcu-walk
 327 * request cannot be satisfied (eg. requires blocking or too much complexity).
 328 * It would then be called again in ref-walk mode.
 329 */
 330int generic_permission(struct inode *inode, int mask)
 331{
 332        int ret;
 333
 334        /*
 335         * Do the basic permission checks.
 336         */
 337        ret = acl_permission_check(inode, mask);
 338        if (ret != -EACCES)
 339                return ret;
 340
 341        if (S_ISDIR(inode->i_mode)) {
 342                /* DACs are overridable for directories */
 343                if (capable_wrt_inode_uidgid(inode, CAP_DAC_OVERRIDE))
 344                        return 0;
 345                if (!(mask & MAY_WRITE))
 346                        if (capable_wrt_inode_uidgid(inode,
 347                                                     CAP_DAC_READ_SEARCH))
 348                                return 0;
 349                return -EACCES;
 350        }
 351        /*
 352         * Read/write DACs are always overridable.
 353         * Executable DACs are overridable when there is
 354         * at least one exec bit set.
 355         */
 356        if (!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO))
 357                if (capable_wrt_inode_uidgid(inode, CAP_DAC_OVERRIDE))
 358                        return 0;
 359
 360        /*
 361         * Searching includes executable on directories, else just read.
 362         */
 363        mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
 364        if (mask == MAY_READ)
 365                if (capable_wrt_inode_uidgid(inode, CAP_DAC_READ_SEARCH))
 366                        return 0;
 367
 368        return -EACCES;
 369}
 370EXPORT_SYMBOL(generic_permission);
 371
 372/*
 373 * We _really_ want to just do "generic_permission()" without
 374 * even looking at the inode->i_op values. So we keep a cache
 375 * flag in inode->i_opflags, that says "this has not special
 376 * permission function, use the fast case".
 377 */
 378static inline int do_inode_permission(struct inode *inode, int mask)
 379{
 380        if (unlikely(!(inode->i_opflags & IOP_FASTPERM))) {
 381                if (likely(inode->i_op->permission))
 382                        return inode->i_op->permission(inode, mask);
 383
 384                /* This gets set once for the inode lifetime */
 385                spin_lock(&inode->i_lock);
 386                inode->i_opflags |= IOP_FASTPERM;
 387                spin_unlock(&inode->i_lock);
 388        }
 389        return generic_permission(inode, mask);
 390}
 391
 392/**
 393 * __inode_permission - Check for access rights to a given inode
 394 * @inode: Inode to check permission on
 395 * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
 396 *
 397 * Check for read/write/execute permissions on an inode.
 398 *
 399 * When checking for MAY_APPEND, MAY_WRITE must also be set in @mask.
 400 *
 401 * This does not check for a read-only file system.  You probably want
 402 * inode_permission().
 403 */
 404int __inode_permission(struct inode *inode, int mask)
 405{
 406        int retval;
 407
 408        if (unlikely(mask & MAY_WRITE)) {
 409                /*
 410                 * Nobody gets write access to an immutable file.
 411                 */
 412                if (IS_IMMUTABLE(inode))
 413                        return -EPERM;
 414
 415                /*
 416                 * Updating mtime will likely cause i_uid and i_gid to be
 417                 * written back improperly if their true value is unknown
 418                 * to the vfs.
 419                 */
 420                if (HAS_UNMAPPED_ID(inode))
 421                        return -EACCES;
 422        }
 423
 424        retval = do_inode_permission(inode, mask);
 425        if (retval)
 426                return retval;
 427
 428        retval = devcgroup_inode_permission(inode, mask);
 429        if (retval)
 430                return retval;
 431
 432        return security_inode_permission(inode, mask);
 433}
 434EXPORT_SYMBOL(__inode_permission);
 435
 436/**
 437 * sb_permission - Check superblock-level permissions
 438 * @sb: Superblock of inode to check permission on
 439 * @inode: Inode to check permission on
 440 * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
 441 *
 442 * Separate out file-system wide checks from inode-specific permission checks.
 443 */
 444static int sb_permission(struct super_block *sb, struct inode *inode, int mask)
 445{
 446        if (unlikely(mask & MAY_WRITE)) {
 447                umode_t mode = inode->i_mode;
 448
 449                /* Nobody gets write access to a read-only fs. */
 450                if ((sb->s_flags & MS_RDONLY) &&
 451                    (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
 452                        return -EROFS;
 453        }
 454        return 0;
 455}
 456
 457/**
 458 * inode_permission - Check for access rights to a given inode
 459 * @inode: Inode to check permission on
 460 * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
 461 *
 462 * Check for read/write/execute permissions on an inode.  We use fs[ug]id for
 463 * this, letting us set arbitrary permissions for filesystem access without
 464 * changing the "normal" UIDs which are used for other things.
 465 *
 466 * When checking for MAY_APPEND, MAY_WRITE must also be set in @mask.
 467 */
 468int inode_permission(struct inode *inode, int mask)
 469{
 470        int retval;
 471
 472        retval = sb_permission(inode->i_sb, inode, mask);
 473        if (retval)
 474                return retval;
 475        return __inode_permission(inode, mask);
 476}
 477EXPORT_SYMBOL(inode_permission);
 478
 479/**
 480 * path_get - get a reference to a path
 481 * @path: path to get the reference to
 482 *
 483 * Given a path increment the reference count to the dentry and the vfsmount.
 484 */
 485void path_get(const struct path *path)
 486{
 487        mntget(path->mnt);
 488        dget(path->dentry);
 489}
 490EXPORT_SYMBOL(path_get);
 491
 492/**
 493 * path_put - put a reference to a path
 494 * @path: path to put the reference to
 495 *
 496 * Given a path decrement the reference count to the dentry and the vfsmount.
 497 */
 498void path_put(const struct path *path)
 499{
 500        dput(path->dentry);
 501        mntput(path->mnt);
 502}
 503EXPORT_SYMBOL(path_put);
 504
 505#define EMBEDDED_LEVELS 2
 506struct nameidata {
 507        struct path     path;
 508        struct qstr     last;
 509        struct path     root;
 510        struct inode    *inode; /* path.dentry.d_inode */
 511        unsigned int    flags;
 512        unsigned        seq, m_seq;
 513        int             last_type;
 514        unsigned        depth;
 515        int             total_link_count;
 516        struct saved {
 517                struct path link;
 518                struct delayed_call done;
 519                const char *name;
 520                unsigned seq;
 521        } *stack, internal[EMBEDDED_LEVELS];
 522        struct filename *name;
 523        struct nameidata *saved;
 524        struct inode    *link_inode;
 525        unsigned        root_seq;
 526        int             dfd;
 527};
 528
 529static void set_nameidata(struct nameidata *p, int dfd, struct filename *name)
 530{
 531        struct nameidata *old = current->nameidata;
 532        p->stack = p->internal;
 533        p->dfd = dfd;
 534        p->name = name;
 535        p->total_link_count = old ? old->total_link_count : 0;
 536        p->saved = old;
 537        current->nameidata = p;
 538}
 539
 540static void restore_nameidata(void)
 541{
 542        struct nameidata *now = current->nameidata, *old = now->saved;
 543
 544        current->nameidata = old;
 545        if (old)
 546                old->total_link_count = now->total_link_count;
 547        if (now->stack != now->internal)
 548                kfree(now->stack);
 549}
 550
 551static int __nd_alloc_stack(struct nameidata *nd)
 552{
 553        struct saved *p;
 554
 555        if (nd->flags & LOOKUP_RCU) {
 556                p= kmalloc(MAXSYMLINKS * sizeof(struct saved),
 557                                  GFP_ATOMIC);
 558                if (unlikely(!p))
 559                        return -ECHILD;
 560        } else {
 561                p= kmalloc(MAXSYMLINKS * sizeof(struct saved),
 562                                  GFP_KERNEL);
 563                if (unlikely(!p))
 564                        return -ENOMEM;
 565        }
 566        memcpy(p, nd->internal, sizeof(nd->internal));
 567        nd->stack = p;
 568        return 0;
 569}
 570
 571/**
 572 * path_connected - Verify that a path->dentry is below path->mnt.mnt_root
 573 * @path: nameidate to verify
 574 *
 575 * Rename can sometimes move a file or directory outside of a bind
 576 * mount, path_connected allows those cases to be detected.
 577 */
 578static bool path_connected(const struct path *path)
 579{
 580        struct vfsmount *mnt = path->mnt;
 581
 582        /* Only bind mounts can have disconnected paths */
 583        if (mnt->mnt_root == mnt->mnt_sb->s_root)
 584                return true;
 585
 586        return is_subdir(path->dentry, mnt->mnt_root);
 587}
 588
 589static inline int nd_alloc_stack(struct nameidata *nd)
 590{
 591        if (likely(nd->depth != EMBEDDED_LEVELS))
 592                return 0;
 593        if (likely(nd->stack != nd->internal))
 594                return 0;
 595        return __nd_alloc_stack(nd);
 596}
 597
 598static void drop_links(struct nameidata *nd)
 599{
 600        int i = nd->depth;
 601        while (i--) {
 602                struct saved *last = nd->stack + i;
 603                do_delayed_call(&last->done);
 604                clear_delayed_call(&last->done);
 605        }
 606}
 607
 608static void terminate_walk(struct nameidata *nd)
 609{
 610        drop_links(nd);
 611        if (!(nd->flags & LOOKUP_RCU)) {
 612                int i;
 613                path_put(&nd->path);
 614                for (i = 0; i < nd->depth; i++)
 615                        path_put(&nd->stack[i].link);
 616                if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
 617                        path_put(&nd->root);
 618                        nd->root.mnt = NULL;
 619                }
 620        } else {
 621                nd->flags &= ~LOOKUP_RCU;
 622                if (!(nd->flags & LOOKUP_ROOT))
 623                        nd->root.mnt = NULL;
 624                rcu_read_unlock();
 625        }
 626        nd->depth = 0;
 627}
 628
 629/* path_put is needed afterwards regardless of success or failure */
 630static bool legitimize_path(struct nameidata *nd,
 631                            struct path *path, unsigned seq)
 632{
 633        int res = __legitimize_mnt(path->mnt, nd->m_seq);
 634        if (unlikely(res)) {
 635                if (res > 0)
 636                        path->mnt = NULL;
 637                path->dentry = NULL;
 638                return false;
 639        }
 640        if (unlikely(!lockref_get_not_dead(&path->dentry->d_lockref))) {
 641                path->dentry = NULL;
 642                return false;
 643        }
 644        return !read_seqcount_retry(&path->dentry->d_seq, seq);
 645}
 646
 647static bool legitimize_links(struct nameidata *nd)
 648{
 649        int i;
 650        for (i = 0; i < nd->depth; i++) {
 651                struct saved *last = nd->stack + i;
 652                if (unlikely(!legitimize_path(nd, &last->link, last->seq))) {
 653                        drop_links(nd);
 654                        nd->depth = i + 1;
 655                        return false;
 656                }
 657        }
 658        return true;
 659}
 660
 661/*
 662 * Path walking has 2 modes, rcu-walk and ref-walk (see
 663 * Documentation/filesystems/path-lookup.txt).  In situations when we can't
 664 * continue in RCU mode, we attempt to drop out of rcu-walk mode and grab
 665 * normal reference counts on dentries and vfsmounts to transition to ref-walk
 666 * mode.  Refcounts are grabbed at the last known good point before rcu-walk
 667 * got stuck, so ref-walk may continue from there. If this is not successful
 668 * (eg. a seqcount has changed), then failure is returned and it's up to caller
 669 * to restart the path walk from the beginning in ref-walk mode.
 670 */
 671
 672/**
 673 * unlazy_walk - try to switch to ref-walk mode.
 674 * @nd: nameidata pathwalk data
 675 * @dentry: child of nd->path.dentry or NULL
 676 * @seq: seq number to check dentry against
 677 * Returns: 0 on success, -ECHILD on failure
 678 *
 679 * unlazy_walk attempts to legitimize the current nd->path, nd->root and dentry
 680 * for ref-walk mode.  @dentry must be a path found by a do_lookup call on
 681 * @nd or NULL.  Must be called from rcu-walk context.
 682 * Nothing should touch nameidata between unlazy_walk() failure and
 683 * terminate_walk().
 684 */
 685static int unlazy_walk(struct nameidata *nd, struct dentry *dentry, unsigned seq)
 686{
 687        struct dentry *parent = nd->path.dentry;
 688
 689        BUG_ON(!(nd->flags & LOOKUP_RCU));
 690
 691        nd->flags &= ~LOOKUP_RCU;
 692        if (unlikely(!legitimize_links(nd)))
 693                goto out2;
 694        if (unlikely(!legitimize_mnt(nd->path.mnt, nd->m_seq)))
 695                goto out2;
 696        if (unlikely(!lockref_get_not_dead(&parent->d_lockref)))
 697                goto out1;
 698
 699        /*
 700         * For a negative lookup, the lookup sequence point is the parents
 701         * sequence point, and it only needs to revalidate the parent dentry.
 702         *
 703         * For a positive lookup, we need to move both the parent and the
 704         * dentry from the RCU domain to be properly refcounted. And the
 705         * sequence number in the dentry validates *both* dentry counters,
 706         * since we checked the sequence number of the parent after we got
 707         * the child sequence number. So we know the parent must still
 708         * be valid if the child sequence number is still valid.
 709         */
 710        if (!dentry) {
 711                if (read_seqcount_retry(&parent->d_seq, nd->seq))
 712                        goto out;
 713                BUG_ON(nd->inode != parent->d_inode);
 714        } else {
 715                if (!lockref_get_not_dead(&dentry->d_lockref))
 716                        goto out;
 717                if (read_seqcount_retry(&dentry->d_seq, seq))
 718                        goto drop_dentry;
 719        }
 720
 721        /*
 722         * Sequence counts matched. Now make sure that the root is
 723         * still valid and get it if required.
 724         */
 725        if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
 726                if (unlikely(!legitimize_path(nd, &nd->root, nd->root_seq))) {
 727                        rcu_read_unlock();
 728                        dput(dentry);
 729                        return -ECHILD;
 730                }
 731        }
 732
 733        rcu_read_unlock();
 734        return 0;
 735
 736drop_dentry:
 737        rcu_read_unlock();
 738        dput(dentry);
 739        goto drop_root_mnt;
 740out2:
 741        nd->path.mnt = NULL;
 742out1:
 743        nd->path.dentry = NULL;
 744out:
 745        rcu_read_unlock();
 746drop_root_mnt:
 747        if (!(nd->flags & LOOKUP_ROOT))
 748                nd->root.mnt = NULL;
 749        return -ECHILD;
 750}
 751
 752static int unlazy_link(struct nameidata *nd, struct path *link, unsigned seq)
 753{
 754        if (unlikely(!legitimize_path(nd, link, seq))) {
 755                drop_links(nd);
 756                nd->depth = 0;
 757                nd->flags &= ~LOOKUP_RCU;
 758                nd->path.mnt = NULL;
 759                nd->path.dentry = NULL;
 760                if (!(nd->flags & LOOKUP_ROOT))
 761                        nd->root.mnt = NULL;
 762                rcu_read_unlock();
 763        } else if (likely(unlazy_walk(nd, NULL, 0)) == 0) {
 764                return 0;
 765        }
 766        path_put(link);
 767        return -ECHILD;
 768}
 769
 770static inline int d_revalidate(struct dentry *dentry, unsigned int flags)
 771{
 772        return dentry->d_op->d_revalidate(dentry, flags);
 773}
 774
 775/**
 776 * complete_walk - successful completion of path walk
 777 * @nd:  pointer nameidata
 778 *
 779 * If we had been in RCU mode, drop out of it and legitimize nd->path.
 780 * Revalidate the final result, unless we'd already done that during
 781 * the path walk or the filesystem doesn't ask for it.  Return 0 on
 782 * success, -error on failure.  In case of failure caller does not
 783 * need to drop nd->path.
 784 */
 785static int complete_walk(struct nameidata *nd)
 786{
 787        struct dentry *dentry = nd->path.dentry;
 788        int status;
 789
 790        if (nd->flags & LOOKUP_RCU) {
 791                if (!(nd->flags & LOOKUP_ROOT))
 792                        nd->root.mnt = NULL;
 793                if (unlikely(unlazy_walk(nd, NULL, 0)))
 794                        return -ECHILD;
 795        }
 796
 797        if (likely(!(nd->flags & LOOKUP_JUMPED)))
 798                return 0;
 799
 800        if (likely(!(dentry->d_flags & DCACHE_OP_WEAK_REVALIDATE)))
 801                return 0;
 802
 803        status = dentry->d_op->d_weak_revalidate(dentry, nd->flags);
 804        if (status > 0)
 805                return 0;
 806
 807        if (!status)
 808                status = -ESTALE;
 809
 810        return status;
 811}
 812
 813static void set_root(struct nameidata *nd)
 814{
 815        struct fs_struct *fs = current->fs;
 816
 817        if (nd->flags & LOOKUP_RCU) {
 818                unsigned seq;
 819
 820                do {
 821                        seq = read_seqcount_begin(&fs->seq);
 822                        nd->root = fs->root;
 823                        nd->root_seq = __read_seqcount_begin(&nd->root.dentry->d_seq);
 824                } while (read_seqcount_retry(&fs->seq, seq));
 825        } else {
 826                get_fs_root(fs, &nd->root);
 827        }
 828}
 829
 830static void path_put_conditional(struct path *path, struct nameidata *nd)
 831{
 832        dput(path->dentry);
 833        if (path->mnt != nd->path.mnt)
 834                mntput(path->mnt);
 835}
 836
 837static inline void path_to_nameidata(const struct path *path,
 838                                        struct nameidata *nd)
 839{
 840        if (!(nd->flags & LOOKUP_RCU)) {
 841                dput(nd->path.dentry);
 842                if (nd->path.mnt != path->mnt)
 843                        mntput(nd->path.mnt);
 844        }
 845        nd->path.mnt = path->mnt;
 846        nd->path.dentry = path->dentry;
 847}
 848
 849static int nd_jump_root(struct nameidata *nd)
 850{
 851        if (nd->flags & LOOKUP_RCU) {
 852                struct dentry *d;
 853                nd->path = nd->root;
 854                d = nd->path.dentry;
 855                nd->inode = d->d_inode;
 856                nd->seq = nd->root_seq;
 857                if (unlikely(read_seqcount_retry(&d->d_seq, nd->seq)))
 858                        return -ECHILD;
 859        } else {
 860                path_put(&nd->path);
 861                nd->path = nd->root;
 862                path_get(&nd->path);
 863                nd->inode = nd->path.dentry->d_inode;
 864        }
 865        nd->flags |= LOOKUP_JUMPED;
 866        return 0;
 867}
 868
 869/*
 870 * Helper to directly jump to a known parsed path from ->get_link,
 871 * caller must have taken a reference to path beforehand.
 872 */
 873void nd_jump_link(struct path *path)
 874{
 875        struct nameidata *nd = current->nameidata;
 876        path_put(&nd->path);
 877
 878        nd->path = *path;
 879        nd->inode = nd->path.dentry->d_inode;
 880        nd->flags |= LOOKUP_JUMPED;
 881}
 882
 883static inline void put_link(struct nameidata *nd)
 884{
 885        struct saved *last = nd->stack + --nd->depth;
 886        do_delayed_call(&last->done);
 887        if (!(nd->flags & LOOKUP_RCU))
 888                path_put(&last->link);
 889}
 890
 891int sysctl_protected_symlinks __read_mostly = 0;
 892int sysctl_protected_hardlinks __read_mostly = 0;
 893
 894/**
 895 * may_follow_link - Check symlink following for unsafe situations
 896 * @nd: nameidata pathwalk data
 897 *
 898 * In the case of the sysctl_protected_symlinks sysctl being enabled,
 899 * CAP_DAC_OVERRIDE needs to be specifically ignored if the symlink is
 900 * in a sticky world-writable directory. This is to protect privileged
 901 * processes from failing races against path names that may change out
 902 * from under them by way of other users creating malicious symlinks.
 903 * It will permit symlinks to be followed only when outside a sticky
 904 * world-writable directory, or when the uid of the symlink and follower
 905 * match, or when the directory owner matches the symlink's owner.
 906 *
 907 * Returns 0 if following the symlink is allowed, -ve on error.
 908 */
 909static inline int may_follow_link(struct nameidata *nd)
 910{
 911        const struct inode *inode;
 912        const struct inode *parent;
 913        kuid_t puid;
 914
 915        if (!sysctl_protected_symlinks)
 916                return 0;
 917
 918        /* Allowed if owner and follower match. */
 919        inode = nd->link_inode;
 920        if (uid_eq(current_cred()->fsuid, inode->i_uid))
 921                return 0;
 922
 923        /* Allowed if parent directory not sticky and world-writable. */
 924        parent = nd->inode;
 925        if ((parent->i_mode & (S_ISVTX|S_IWOTH)) != (S_ISVTX|S_IWOTH))
 926                return 0;
 927
 928        /* Allowed if parent directory and link owner match. */
 929        puid = parent->i_uid;
 930        if (uid_valid(puid) && uid_eq(puid, inode->i_uid))
 931                return 0;
 932
 933        if (nd->flags & LOOKUP_RCU)
 934                return -ECHILD;
 935
 936        audit_log_link_denied("follow_link", &nd->stack[0].link);
 937        return -EACCES;
 938}
 939
 940/**
 941 * safe_hardlink_source - Check for safe hardlink conditions
 942 * @inode: the source inode to hardlink from
 943 *
 944 * Return false if at least one of the following conditions:
 945 *    - inode is not a regular file
 946 *    - inode is setuid
 947 *    - inode is setgid and group-exec
 948 *    - access failure for read and write
 949 *
 950 * Otherwise returns true.
 951 */
 952static bool safe_hardlink_source(struct inode *inode)
 953{
 954        umode_t mode = inode->i_mode;
 955
 956        /* Special files should not get pinned to the filesystem. */
 957        if (!S_ISREG(mode))
 958                return false;
 959
 960        /* Setuid files should not get pinned to the filesystem. */
 961        if (mode & S_ISUID)
 962                return false;
 963
 964        /* Executable setgid files should not get pinned to the filesystem. */
 965        if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))
 966                return false;
 967
 968        /* Hardlinking to unreadable or unwritable sources is dangerous. */
 969        if (inode_permission(inode, MAY_READ | MAY_WRITE))
 970                return false;
 971
 972        return true;
 973}
 974
 975/**
 976 * may_linkat - Check permissions for creating a hardlink
 977 * @link: the source to hardlink from
 978 *
 979 * Block hardlink when all of:
 980 *  - sysctl_protected_hardlinks enabled
 981 *  - fsuid does not match inode
 982 *  - hardlink source is unsafe (see safe_hardlink_source() above)
 983 *  - not CAP_FOWNER in a namespace with the inode owner uid mapped
 984 *
 985 * Returns 0 if successful, -ve on error.
 986 */
 987static int may_linkat(struct path *link)
 988{
 989        struct inode *inode;
 990
 991        if (!sysctl_protected_hardlinks)
 992                return 0;
 993
 994        inode = link->dentry->d_inode;
 995
 996        /* Source inode owner (or CAP_FOWNER) can hardlink all they like,
 997         * otherwise, it must be a safe source.
 998         */
 999        if (inode_owner_or_capable(inode) || safe_hardlink_source(inode))
1000                return 0;
1001
1002        audit_log_link_denied("linkat", link);
1003        return -EPERM;
1004}
1005
1006static __always_inline
1007const char *get_link(struct nameidata *nd)
1008{
1009        struct saved *last = nd->stack + nd->depth - 1;
1010        struct dentry *dentry = last->link.dentry;
1011        struct inode *inode = nd->link_inode;
1012        int error;
1013        const char *res;
1014
1015        if (!(nd->flags & LOOKUP_RCU)) {
1016                touch_atime(&last->link);
1017                cond_resched();
1018        } else if (atime_needs_update_rcu(&last->link, inode)) {
1019                if (unlikely(unlazy_walk(nd, NULL, 0)))
1020                        return ERR_PTR(-ECHILD);
1021                touch_atime(&last->link);
1022        }
1023
1024        error = security_inode_follow_link(dentry, inode,
1025                                           nd->flags & LOOKUP_RCU);
1026        if (unlikely(error))
1027                return ERR_PTR(error);
1028
1029        nd->last_type = LAST_BIND;
1030        res = inode->i_link;
1031        if (!res) {
1032                const char * (*get)(struct dentry *, struct inode *,
1033                                struct delayed_call *);
1034                get = inode->i_op->get_link;
1035                if (nd->flags & LOOKUP_RCU) {
1036                        res = get(NULL, inode, &last->done);
1037                        if (res == ERR_PTR(-ECHILD)) {
1038                                if (unlikely(unlazy_walk(nd, NULL, 0)))
1039                                        return ERR_PTR(-ECHILD);
1040                                res = get(dentry, inode, &last->done);
1041                        }
1042                } else {
1043                        res = get(dentry, inode, &last->done);
1044                }
1045                if (IS_ERR_OR_NULL(res))
1046                        return res;
1047        }
1048        if (*res == '/') {
1049                if (!nd->root.mnt)
1050                        set_root(nd);
1051                if (unlikely(nd_jump_root(nd)))
1052                        return ERR_PTR(-ECHILD);
1053                while (unlikely(*++res == '/'))
1054                        ;
1055        }
1056        if (!*res)
1057                res = NULL;
1058        return res;
1059}
1060
1061/*
1062 * follow_up - Find the mountpoint of path's vfsmount
1063 *
1064 * Given a path, find the mountpoint of its source file system.
1065 * Replace @path with the path of the mountpoint in the parent mount.
1066 * Up is towards /.
1067 *
1068 * Return 1 if we went up a level and 0 if we were already at the
1069 * root.
1070 */
1071int follow_up(struct path *path)
1072{
1073        struct mount *mnt = real_mount(path->mnt);
1074        struct mount *parent;
1075        struct dentry *mountpoint;
1076
1077        read_seqlock_excl(&mount_lock);
1078        parent = mnt->mnt_parent;
1079        if (parent == mnt) {
1080                read_sequnlock_excl(&mount_lock);
1081                return 0;
1082        }
1083        mntget(&parent->mnt);
1084        mountpoint = dget(mnt->mnt_mountpoint);
1085        read_sequnlock_excl(&mount_lock);
1086        dput(path->dentry);
1087        path->dentry = mountpoint;
1088        mntput(path->mnt);
1089        path->mnt = &parent->mnt;
1090        return 1;
1091}
1092EXPORT_SYMBOL(follow_up);
1093
1094/*
1095 * Perform an automount
1096 * - return -EISDIR to tell follow_managed() to stop and return the path we
1097 *   were called with.
1098 */
1099static int follow_automount(struct path *path, struct nameidata *nd,
1100                            bool *need_mntput)
1101{
1102        struct vfsmount *mnt;
1103        const struct cred *old_cred;
1104        int err;
1105
1106        if (!path->dentry->d_op || !path->dentry->d_op->d_automount)
1107                return -EREMOTE;
1108
1109        /* We don't want to mount if someone's just doing a stat -
1110         * unless they're stat'ing a directory and appended a '/' to
1111         * the name.
1112         *
1113         * We do, however, want to mount if someone wants to open or
1114         * create a file of any type under the mountpoint, wants to
1115         * traverse through the mountpoint or wants to open the
1116         * mounted directory.  Also, autofs may mark negative dentries
1117         * as being automount points.  These will need the attentions
1118         * of the daemon to instantiate them before they can be used.
1119         */
1120        if (!(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY |
1121                           LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) &&
1122            path->dentry->d_inode)
1123                return -EISDIR;
1124
1125        if (path->dentry->d_sb->s_user_ns != &init_user_ns)
1126                return -EACCES;
1127
1128        nd->total_link_count++;
1129        if (nd->total_link_count >= 40)
1130                return -ELOOP;
1131
1132        old_cred = override_creds(&init_cred);
1133        mnt = path->dentry->d_op->d_automount(path);
1134        revert_creds(old_cred);
1135        if (IS_ERR(mnt)) {
1136                /*
1137                 * The filesystem is allowed to return -EISDIR here to indicate
1138                 * it doesn't want to automount.  For instance, autofs would do
1139                 * this so that its userspace daemon can mount on this dentry.
1140                 *
1141                 * However, we can only permit this if it's a terminal point in
1142                 * the path being looked up; if it wasn't then the remainder of
1143                 * the path is inaccessible and we should say so.
1144                 */
1145                if (PTR_ERR(mnt) == -EISDIR && (nd->flags & LOOKUP_PARENT))
1146                        return -EREMOTE;
1147                return PTR_ERR(mnt);
1148        }
1149
1150        if (!mnt) /* mount collision */
1151                return 0;
1152
1153        if (!*need_mntput) {
1154                /* lock_mount() may release path->mnt on error */
1155                mntget(path->mnt);
1156                *need_mntput = true;
1157        }
1158        err = finish_automount(mnt, path);
1159
1160        switch (err) {
1161        case -EBUSY:
1162                /* Someone else made a mount here whilst we were busy */
1163                return 0;
1164        case 0:
1165                path_put(path);
1166                path->mnt = mnt;
1167                path->dentry = dget(mnt->mnt_root);
1168                return 0;
1169        default:
1170                return err;
1171        }
1172
1173}
1174
1175/*
1176 * Handle a dentry that is managed in some way.
1177 * - Flagged for transit management (autofs)
1178 * - Flagged as mountpoint
1179 * - Flagged as automount point
1180 *
1181 * This may only be called in refwalk mode.
1182 *
1183 * Serialization is taken care of in namespace.c
1184 */
1185static int follow_managed(struct path *path, struct nameidata *nd)
1186{
1187        struct vfsmount *mnt = path->mnt; /* held by caller, must be left alone */
1188        unsigned managed;
1189        bool need_mntput = false;
1190        int ret = 0;
1191
1192        /* Given that we're not holding a lock here, we retain the value in a
1193         * local variable for each dentry as we look at it so that we don't see
1194         * the components of that value change under us */
1195        while (managed = ACCESS_ONCE(path->dentry->d_flags),
1196               managed &= DCACHE_MANAGED_DENTRY,
1197               unlikely(managed != 0)) {
1198                /* Allow the filesystem to manage the transit without i_mutex
1199                 * being held. */
1200                if (managed & DCACHE_MANAGE_TRANSIT) {
1201                        BUG_ON(!path->dentry->d_op);
1202                        BUG_ON(!path->dentry->d_op->d_manage);
1203                        ret = path->dentry->d_op->d_manage(path->dentry, false);
1204                        if (ret < 0)
1205                                break;
1206                }
1207
1208                /* Transit to a mounted filesystem. */
1209                if (managed & DCACHE_MOUNTED) {
1210                        struct vfsmount *mounted = lookup_mnt(path);
1211                        if (mounted) {
1212                                dput(path->dentry);
1213                                if (need_mntput)
1214                                        mntput(path->mnt);
1215                                path->mnt = mounted;
1216                                path->dentry = dget(mounted->mnt_root);
1217                                need_mntput = true;
1218                                continue;
1219                        }
1220
1221                        /* Something is mounted on this dentry in another
1222                         * namespace and/or whatever was mounted there in this
1223                         * namespace got unmounted before lookup_mnt() could
1224                         * get it */
1225                }
1226
1227                /* Handle an automount point */
1228                if (managed & DCACHE_NEED_AUTOMOUNT) {
1229                        ret = follow_automount(path, nd, &need_mntput);
1230                        if (ret < 0)
1231                                break;
1232                        continue;
1233                }
1234
1235                /* We didn't change the current path point */
1236                break;
1237        }
1238
1239        if (need_mntput && path->mnt == mnt)
1240                mntput(path->mnt);
1241        if (ret == -EISDIR || !ret)
1242                ret = 1;
1243        if (need_mntput)
1244                nd->flags |= LOOKUP_JUMPED;
1245        if (unlikely(ret < 0))
1246                path_put_conditional(path, nd);
1247        return ret;
1248}
1249
1250int follow_down_one(struct path *path)
1251{
1252        struct vfsmount *mounted;
1253
1254        mounted = lookup_mnt(path);
1255        if (mounted) {
1256                dput(path->dentry);
1257                mntput(path->mnt);
1258                path->mnt = mounted;
1259                path->dentry = dget(mounted->mnt_root);
1260                return 1;
1261        }
1262        return 0;
1263}
1264EXPORT_SYMBOL(follow_down_one);
1265
1266static inline int managed_dentry_rcu(struct dentry *dentry)
1267{
1268        return (dentry->d_flags & DCACHE_MANAGE_TRANSIT) ?
1269                dentry->d_op->d_manage(dentry, true) : 0;
1270}
1271
1272/*
1273 * Try to skip to top of mountpoint pile in rcuwalk mode.  Fail if
1274 * we meet a managed dentry that would need blocking.
1275 */
1276static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
1277                               struct inode **inode, unsigned *seqp)
1278{
1279        for (;;) {
1280                struct mount *mounted;
1281                /*
1282                 * Don't forget we might have a non-mountpoint managed dentry
1283                 * that wants to block transit.
1284                 */
1285                switch (managed_dentry_rcu(path->dentry)) {
1286                case -ECHILD:
1287                default:
1288                        return false;
1289                case -EISDIR:
1290                        return true;
1291                case 0:
1292                        break;
1293                }
1294
1295                if (!d_mountpoint(path->dentry))
1296                        return !(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT);
1297
1298                mounted = __lookup_mnt(path->mnt, path->dentry);
1299                if (!mounted)
1300                        break;
1301                path->mnt = &mounted->mnt;
1302                path->dentry = mounted->mnt.mnt_root;
1303                nd->flags |= LOOKUP_JUMPED;
1304                *seqp = read_seqcount_begin(&path->dentry->d_seq);
1305                /*
1306                 * Update the inode too. We don't need to re-check the
1307                 * dentry sequence number here after this d_inode read,
1308                 * because a mount-point is always pinned.
1309                 */
1310                *inode = path->dentry->d_inode;
1311        }
1312        return !read_seqretry(&mount_lock, nd->m_seq) &&
1313                !(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT);
1314}
1315
1316static int follow_dotdot_rcu(struct nameidata *nd)
1317{
1318        struct inode *inode = nd->inode;
1319
1320        while (1) {
1321                if (path_equal(&nd->path, &nd->root))
1322                        break;
1323                if (nd->path.dentry != nd->path.mnt->mnt_root) {
1324                        struct dentry *old = nd->path.dentry;
1325                        struct dentry *parent = old->d_parent;
1326                        unsigned seq;
1327
1328                        inode = parent->d_inode;
1329                        seq = read_seqcount_begin(&parent->d_seq);
1330                        if (unlikely(read_seqcount_retry(&old->d_seq, nd->seq)))
1331                                return -ECHILD;
1332                        nd->path.dentry = parent;
1333                        nd->seq = seq;
1334                        if (unlikely(!path_connected(&nd->path)))
1335                                return -ENOENT;
1336                        break;
1337                } else {
1338                        struct mount *mnt = real_mount(nd->path.mnt);
1339                        struct mount *mparent = mnt->mnt_parent;
1340                        struct dentry *mountpoint = mnt->mnt_mountpoint;
1341                        struct inode *inode2 = mountpoint->d_inode;
1342                        unsigned seq = read_seqcount_begin(&mountpoint->d_seq);
1343                        if (unlikely(read_seqretry(&mount_lock, nd->m_seq)))
1344                                return -ECHILD;
1345                        if (&mparent->mnt == nd->path.mnt)
1346                                break;
1347                        /* we know that mountpoint was pinned */
1348                        nd->path.dentry = mountpoint;
1349                        nd->path.mnt = &mparent->mnt;
1350                        inode = inode2;
1351                        nd->seq = seq;
1352                }
1353        }
1354        while (unlikely(d_mountpoint(nd->path.dentry))) {
1355                struct mount *mounted;
1356                mounted = __lookup_mnt(nd->path.mnt, nd->path.dentry);
1357                if (unlikely(read_seqretry(&mount_lock, nd->m_seq)))
1358                        return -ECHILD;
1359                if (!mounted)
1360                        break;
1361                nd->path.mnt = &mounted->mnt;
1362                nd->path.dentry = mounted->mnt.mnt_root;
1363                inode = nd->path.dentry->d_inode;
1364                nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
1365        }
1366        nd->inode = inode;
1367        return 0;
1368}
1369
1370/*
1371 * Follow down to the covering mount currently visible to userspace.  At each
1372 * point, the filesystem owning that dentry may be queried as to whether the
1373 * caller is permitted to proceed or not.
1374 */
1375int follow_down(struct path *path)
1376{
1377        unsigned managed;
1378        int ret;
1379
1380        while (managed = ACCESS_ONCE(path->dentry->d_flags),
1381               unlikely(managed & DCACHE_MANAGED_DENTRY)) {
1382                /* Allow the filesystem to manage the transit without i_mutex
1383                 * being held.
1384                 *
1385                 * We indicate to the filesystem if someone is trying to mount
1386                 * something here.  This gives autofs the chance to deny anyone
1387                 * other than its daemon the right to mount on its
1388                 * superstructure.
1389                 *
1390                 * The filesystem may sleep at this point.
1391                 */
1392                if (managed & DCACHE_MANAGE_TRANSIT) {
1393                        BUG_ON(!path->dentry->d_op);
1394                        BUG_ON(!path->dentry->d_op->d_manage);
1395                        ret = path->dentry->d_op->d_manage(
1396                                path->dentry, false);
1397                        if (ret < 0)
1398                                return ret == -EISDIR ? 0 : ret;
1399                }
1400
1401                /* Transit to a mounted filesystem. */
1402                if (managed & DCACHE_MOUNTED) {
1403                        struct vfsmount *mounted = lookup_mnt(path);
1404                        if (!mounted)
1405                                break;
1406                        dput(path->dentry);
1407                        mntput(path->mnt);
1408                        path->mnt = mounted;
1409                        path->dentry = dget(mounted->mnt_root);
1410                        continue;
1411                }
1412
1413                /* Don't handle automount points here */
1414                break;
1415        }
1416        return 0;
1417}
1418EXPORT_SYMBOL(follow_down);
1419
1420/*
1421 * Skip to top of mountpoint pile in refwalk mode for follow_dotdot()
1422 */
1423static void follow_mount(struct path *path)
1424{
1425        while (d_mountpoint(path->dentry)) {
1426                struct vfsmount *mounted = lookup_mnt(path);
1427                if (!mounted)
1428                        break;
1429                dput(path->dentry);
1430                mntput(path->mnt);
1431                path->mnt = mounted;
1432                path->dentry = dget(mounted->mnt_root);
1433        }
1434}
1435
1436static int path_parent_directory(struct path *path)
1437{
1438        struct dentry *old = path->dentry;
1439        /* rare case of legitimate dget_parent()... */
1440        path->dentry = dget_parent(path->dentry);
1441        dput(old);
1442        if (unlikely(!path_connected(path)))
1443                return -ENOENT;
1444        return 0;
1445}
1446
1447static int follow_dotdot(struct nameidata *nd)
1448{
1449        while(1) {
1450                if (nd->path.dentry == nd->root.dentry &&
1451                    nd->path.mnt == nd->root.mnt) {
1452                        break;
1453                }
1454                if (nd->path.dentry != nd->path.mnt->mnt_root) {
1455                        int ret = path_parent_directory(&nd->path);
1456                        if (ret)
1457                                return ret;
1458                        break;
1459                }
1460                if (!follow_up(&nd->path))
1461                        break;
1462        }
1463        follow_mount(&nd->path);
1464        nd->inode = nd->path.dentry->d_inode;
1465        return 0;
1466}
1467
1468/*
1469 * This looks up the name in dcache and possibly revalidates the found dentry.
1470 * NULL is returned if the dentry does not exist in the cache.
1471 */
1472static struct dentry *lookup_dcache(const struct qstr *name,
1473                                    struct dentry *dir,
1474                                    unsigned int flags)
1475{
1476        struct dentry *dentry;
1477        int error;
1478
1479        dentry = d_lookup(dir, name);
1480        if (dentry) {
1481                if (dentry->d_flags & DCACHE_OP_REVALIDATE) {
1482                        error = d_revalidate(dentry, flags);
1483                        if (unlikely(error <= 0)) {
1484                                if (!error)
1485                                        d_invalidate(dentry);
1486                                dput(dentry);
1487                                return ERR_PTR(error);
1488                        }
1489                }
1490        }
1491        return dentry;
1492}
1493
1494/*
1495 * Call i_op->lookup on the dentry.  The dentry must be negative and
1496 * unhashed.
1497 *
1498 * dir->d_inode->i_mutex must be held
1499 */
1500static struct dentry *lookup_real(struct inode *dir, struct dentry *dentry,
1501                                  unsigned int flags)
1502{
1503        struct dentry *old;
1504
1505        /* Don't create child dentry for a dead directory. */
1506        if (unlikely(IS_DEADDIR(dir))) {
1507                dput(dentry);
1508                return ERR_PTR(-ENOENT);
1509        }
1510
1511        old = dir->i_op->lookup(dir, dentry, flags);
1512        if (unlikely(old)) {
1513                dput(dentry);
1514                dentry = old;
1515        }
1516        return dentry;
1517}
1518
1519static struct dentry *__lookup_hash(const struct qstr *name,
1520                struct dentry *base, unsigned int flags)
1521{
1522        struct dentry *dentry = lookup_dcache(name, base, flags);
1523
1524        if (dentry)
1525                return dentry;
1526
1527        dentry = d_alloc(base, name);
1528        if (unlikely(!dentry))
1529                return ERR_PTR(-ENOMEM);
1530
1531        return lookup_real(base->d_inode, dentry, flags);
1532}
1533
1534static int lookup_fast(struct nameidata *nd,
1535                       struct path *path, struct inode **inode,
1536                       unsigned *seqp)
1537{
1538        struct vfsmount *mnt = nd->path.mnt;
1539        struct dentry *dentry, *parent = nd->path.dentry;
1540        int status = 1;
1541        int err;
1542
1543        /*
1544         * Rename seqlock is not required here because in the off chance
1545         * of a false negative due to a concurrent rename, the caller is
1546         * going to fall back to non-racy lookup.
1547         */
1548        if (nd->flags & LOOKUP_RCU) {
1549                unsigned seq;
1550                bool negative;
1551                dentry = __d_lookup_rcu(parent, &nd->last, &seq);
1552                if (unlikely(!dentry)) {
1553                        if (unlazy_walk(nd, NULL, 0))
1554                                return -ECHILD;
1555                        return 0;
1556                }
1557
1558                /*
1559                 * This sequence count validates that the inode matches
1560                 * the dentry name information from lookup.
1561                 */
1562                *inode = d_backing_inode(dentry);
1563                negative = d_is_negative(dentry);
1564                if (unlikely(read_seqcount_retry(&dentry->d_seq, seq)))
1565                        return -ECHILD;
1566
1567                /*
1568                 * This sequence count validates that the parent had no
1569                 * changes while we did the lookup of the dentry above.
1570                 *
1571                 * The memory barrier in read_seqcount_begin of child is
1572                 *  enough, we can use __read_seqcount_retry here.
1573                 */
1574                if (unlikely(__read_seqcount_retry(&parent->d_seq, nd->seq)))
1575                        return -ECHILD;
1576
1577                *seqp = seq;
1578                if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE))
1579                        status = d_revalidate(dentry, nd->flags);
1580                if (unlikely(status <= 0)) {
1581                        if (unlazy_walk(nd, dentry, seq))
1582                                return -ECHILD;
1583                        if (status == -ECHILD)
1584                                status = d_revalidate(dentry, nd->flags);
1585                } else {
1586                        /*
1587                         * Note: do negative dentry check after revalidation in
1588                         * case that drops it.
1589                         */
1590                        if (unlikely(negative))
1591                                return -ENOENT;
1592                        path->mnt = mnt;
1593                        path->dentry = dentry;
1594                        if (likely(__follow_mount_rcu(nd, path, inode, seqp)))
1595                                return 1;
1596                        if (unlazy_walk(nd, dentry, seq))
1597                                return -ECHILD;
1598                }
1599        } else {
1600                dentry = __d_lookup(parent, &nd->last);
1601                if (unlikely(!dentry))
1602                        return 0;
1603                if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE))
1604                        status = d_revalidate(dentry, nd->flags);
1605        }
1606        if (unlikely(status <= 0)) {
1607                if (!status)
1608                        d_invalidate(dentry);
1609                dput(dentry);
1610                return status;
1611        }
1612        if (unlikely(d_is_negative(dentry))) {
1613                dput(dentry);
1614                return -ENOENT;
1615        }
1616
1617        path->mnt = mnt;
1618        path->dentry = dentry;
1619        err = follow_managed(path, nd);
1620        if (likely(err > 0))
1621                *inode = d_backing_inode(path->dentry);
1622        return err;
1623}
1624
1625/* Fast lookup failed, do it the slow way */
1626static struct dentry *lookup_slow(const struct qstr *name,
1627                                  struct dentry *dir,
1628                                  unsigned int flags)
1629{
1630        struct dentry *dentry = ERR_PTR(-ENOENT), *old;
1631        struct inode *inode = dir->d_inode;
1632        DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
1633
1634        inode_lock_shared(inode);
1635        /* Don't go there if it's already dead */
1636        if (unlikely(IS_DEADDIR(inode)))
1637                goto out;
1638again:
1639        dentry = d_alloc_parallel(dir, name, &wq);
1640        if (IS_ERR(dentry))
1641                goto out;
1642        if (unlikely(!d_in_lookup(dentry))) {
1643                if ((dentry->d_flags & DCACHE_OP_REVALIDATE) &&
1644                    !(flags & LOOKUP_NO_REVAL)) {
1645                        int error = d_revalidate(dentry, flags);
1646                        if (unlikely(error <= 0)) {
1647                                if (!error) {
1648                                        d_invalidate(dentry);
1649                                        dput(dentry);
1650                                        goto again;
1651                                }
1652                                dput(dentry);
1653                                dentry = ERR_PTR(error);
1654                        }
1655                }
1656        } else {
1657                old = inode->i_op->lookup(inode, dentry, flags);
1658                d_lookup_done(dentry);
1659                if (unlikely(old)) {
1660                        dput(dentry);
1661                        dentry = old;
1662                }
1663        }
1664out:
1665        inode_unlock_shared(inode);
1666        return dentry;
1667}
1668
1669static inline int may_lookup(struct nameidata *nd)
1670{
1671        if (nd->flags & LOOKUP_RCU) {
1672                int err = inode_permission(nd->inode, MAY_EXEC|MAY_NOT_BLOCK);
1673                if (err != -ECHILD)
1674                        return err;
1675                if (unlazy_walk(nd, NULL, 0))
1676                        return -ECHILD;
1677        }
1678        return inode_permission(nd->inode, MAY_EXEC);
1679}
1680
1681static inline int handle_dots(struct nameidata *nd, int type)
1682{
1683        if (type == LAST_DOTDOT) {
1684                if (!nd->root.mnt)
1685                        set_root(nd);
1686                if (nd->flags & LOOKUP_RCU) {
1687                        return follow_dotdot_rcu(nd);
1688                } else
1689                        return follow_dotdot(nd);
1690        }
1691        return 0;
1692}
1693
1694static int pick_link(struct nameidata *nd, struct path *link,
1695                     struct inode *inode, unsigned seq)
1696{
1697        int error;
1698        struct saved *last;
1699        if (unlikely(nd->total_link_count++ >= MAXSYMLINKS)) {
1700                path_to_nameidata(link, nd);
1701                return -ELOOP;
1702        }
1703        if (!(nd->flags & LOOKUP_RCU)) {
1704                if (link->mnt == nd->path.mnt)
1705                        mntget(link->mnt);
1706        }
1707        error = nd_alloc_stack(nd);
1708        if (unlikely(error)) {
1709                if (error == -ECHILD) {
1710                        if (unlikely(unlazy_link(nd, link, seq)))
1711                                return -ECHILD;
1712                        error = nd_alloc_stack(nd);
1713                }
1714                if (error) {
1715                        path_put(link);
1716                        return error;
1717                }
1718        }
1719
1720        last = nd->stack + nd->depth++;
1721        last->link = *link;
1722        clear_delayed_call(&last->done);
1723        nd->link_inode = inode;
1724        last->seq = seq;
1725        return 1;
1726}
1727
1728/*
1729 * Do we need to follow links? We _really_ want to be able
1730 * to do this check without having to look at inode->i_op,
1731 * so we keep a cache of "no, this doesn't need follow_link"
1732 * for the common case.
1733 */
1734static inline int should_follow_link(struct nameidata *nd, struct path *link,
1735                                     int follow,
1736                                     struct inode *inode, unsigned seq)
1737{
1738        if (likely(!d_is_symlink(link->dentry)))
1739                return 0;
1740        if (!follow)
1741                return 0;
1742        /* make sure that d_is_symlink above matches inode */
1743        if (nd->flags & LOOKUP_RCU) {
1744                if (read_seqcount_retry(&link->dentry->d_seq, seq))
1745                        return -ECHILD;
1746        }
1747        return pick_link(nd, link, inode, seq);
1748}
1749
1750enum {WALK_GET = 1, WALK_PUT = 2};
1751
1752static int walk_component(struct nameidata *nd, int flags)
1753{
1754        struct path path;
1755        struct inode *inode;
1756        unsigned seq;
1757        int err;
1758        /*
1759         * "." and ".." are special - ".." especially so because it has
1760         * to be able to know about the current root directory and
1761         * parent relationships.
1762         */
1763        if (unlikely(nd->last_type != LAST_NORM)) {
1764                err = handle_dots(nd, nd->last_type);
1765                if (flags & WALK_PUT)
1766                        put_link(nd);
1767                return err;
1768        }
1769        err = lookup_fast(nd, &path, &inode, &seq);
1770        if (unlikely(err <= 0)) {
1771                if (err < 0)
1772                        return err;
1773                path.dentry = lookup_slow(&nd->last, nd->path.dentry,
1774                                          nd->flags);
1775                if (IS_ERR(path.dentry))
1776                        return PTR_ERR(path.dentry);
1777
1778                path.mnt = nd->path.mnt;
1779                err = follow_managed(&path, nd);
1780                if (unlikely(err < 0))
1781                        return err;
1782
1783                if (unlikely(d_is_negative(path.dentry))) {
1784                        path_to_nameidata(&path, nd);
1785                        return -ENOENT;
1786                }
1787
1788                seq = 0;        /* we are already out of RCU mode */
1789                inode = d_backing_inode(path.dentry);
1790        }
1791
1792        if (flags & WALK_PUT)
1793                put_link(nd);
1794        err = should_follow_link(nd, &path, flags & WALK_GET, inode, seq);
1795        if (unlikely(err))
1796                return err;
1797        path_to_nameidata(&path, nd);
1798        nd->inode = inode;
1799        nd->seq = seq;
1800        return 0;
1801}
1802
/*
 * We can do the critical dentry name comparison and hashing
 * operations one word at a time, but we are limited to:
 *
 * - Architectures with fast unaligned word accesses. We could
 *   do a "get_unaligned()" if this helps and is sufficiently
 *   fast.
 *
 * - non-CONFIG_DEBUG_PAGEALLOC configurations (so that we
 *   do not trap on the (extremely unlikely) case of a page
 *   crossing operation).
 *
 * - Furthermore, we need an efficient 64-bit compile for the
 *   64-bit case in order to generate the "number of bytes in
 *   the final mask". Again, that could be replaced with an
 *   efficient population count instruction or similar.
 */
1820#ifdef CONFIG_DCACHE_WORD_ACCESS
1821
1822#include <asm/word-at-a-time.h>
1823
1824#ifdef HASH_MIX
1825
1826/* Architecture provides HASH_MIX and fold_hash() in <asm/hash.h> */
1827
1828#elif defined(CONFIG_64BIT)
/*
 * 64-bit mixing step.
 *
 * Register pressure in the mixing function is an issue, particularly
 * on 32-bit x86, but almost any function requires one state value and
 * one temporary.  Instead, use a function designed for two state values
 * and no temporaries.
 *
 * This function cannot create a collision in only two iterations, so
 * we have two iterations to achieve avalanche.  In those two iterations,
 * we have six layers of mixing, which is enough to spread one bit's
 * influence out to 2^6 = 64 state bits.
 *
 * Rotate constants are scored by considering either 64 one-bit input
 * deltas or 64*63/2 = 2016 two-bit input deltas, and finding the
 * probability of that delta causing a change to each of the 128 output
 * bits, using a sample of random initial states.
 *
 * The Shannon entropy of the computed probabilities is then summed
 * to produce a score.  Ideally, any input change has a 50% chance of
 * toggling any given output bit.
 *
 * Mixing scores (in bits) for (12,45):
 * Input delta: 1-bit      2-bit
 * 1 round:     713.3    42542.6
 * 2 rounds:   2753.7   140389.8
 * 3 rounds:   5954.1   233458.2
 * 4 rounds:   7862.6   256672.2
 * Perfect:    8192     258048
 *            (64*128) (64*63/2 * 128)
 *
 * Note: x and y are modified in place and are named several times in
 * the expansion, so they must be plain lvalues without side effects.
 */
#define HASH_MIX(x, y, a)		\
	(	x ^= (a),		\
		y ^= x,			\
		x = rol64(x,12),	\
		x += y,			\
		y = rol64(y,45),	\
		y *= 9			)
1863
1864/*
1865 * Fold two longs into one 32-bit hash value.  This must be fast, but
1866 * latency isn't quite as critical, as there is a fair bit of additional
1867 * work done before the hash value is used.
1868 */
1869static inline unsigned int fold_hash(unsigned long x, unsigned long y)
1870{
1871        y ^= x * GOLDEN_RATIO_64;
1872        y *= GOLDEN_RATIO_64;
1873        return y >> 32;
1874}
1875
1876#else   /* 32-bit case */
1877
/*
 * 32-bit mixing step; same structure as the 64-bit variant, with the
 * rotate constants (7,20).
 *
 * Mixing scores (in bits) for (7,20):
 * Input delta: 1-bit      2-bit
 * 1 round:     330.3     9201.6
 * 2 rounds:   1246.4    25475.4
 * 3 rounds:   1907.1    31295.1
 * 4 rounds:   2042.3    31718.6
 * Perfect:    2048      31744
 *            (32*64)   (32*31/2 * 64)
 *
 * Note: x and y are modified in place and are named several times in
 * the expansion, so they must be plain lvalues without side effects.
 */
#define HASH_MIX(x, y, a)		\
	(	x ^= (a),		\
		y ^= x,			\
		x = rol32(x, 7),	\
		x += y,			\
		y = rol32(y,20),	\
		y *= 9			)
1893
/*
 * Fold the two state words into one 32-bit hash value by hashing x,
 * xor-ing the result into y and hashing again.
 */
static inline unsigned int fold_hash(unsigned long x, unsigned long y)
{
	/* Use arch-optimized multiply if one exists */
	unsigned int hashed_x = __hash_32(x);

	return __hash_32(y ^ hashed_x);
}
1899
1900#endif
1901
/*
 * Return the hash of a string of known length.  This is carefully
 * designed to match hash_name(), which is the more critical function.
 * In particular, we must end by hashing a final word containing 0..7
 * payload bytes, to match the way that hash_name() iterates until it
 * finds the delimiter after the name.
 *
 * @salt is used as the initial value of one of the two state words.
 */
unsigned int full_name_hash(const void *salt, const char *name, unsigned int len)
{
	unsigned long x = 0, y = (unsigned long)salt;
	unsigned long word;

	/* Mix in every complete word of the name. */
	while (len >= sizeof(unsigned long)) {
		word = load_unaligned_zeropad(name);
		HASH_MIX(x, y, word);
		name += sizeof(unsigned long);
		len -= sizeof(unsigned long);
	}

	/* Fold in the trailing partial word, keeping only its first len bytes. */
	if (len) {
		word = load_unaligned_zeropad(name);
		x ^= word & bytemask_from_count(len);
	}

	return fold_hash(x, y);
}
EXPORT_SYMBOL(full_name_hash);
1928
1929/* Return the "hash_len" (hash and length) of a null-terminated string */
1930u64 hashlen_string(const void *salt, const char *name)
1931{
1932        unsigned long a = 0, x = 0, y = (unsigned long)salt;
1933        unsigned long adata, mask, len;
1934        const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
1935
1936        len = 0;
1937        goto inside;
1938
1939        do {
1940                HASH_MIX(x, y, a);
1941                len += sizeof(unsigned long);
1942inside:
1943                a = load_unaligned_zeropad(name+len);
1944        } while (!has_zero(a, &adata, &constants));
1945
1946        adata = prep_zero_mask(a, adata, &constants);
1947        mask = create_zero_mask(adata);
1948        x ^= a & zero_bytemask(mask);
1949
1950        return hashlen_create(fold_hash(x, y), len + find_zero(mask));
1951}
1952EXPORT_SYMBOL(hashlen_string);
1953
1954/*
1955 * Calculate the length and hash of the path component, and
1956 * return the "hash_len" as the result.
1957 */
1958static inline u64 hash_name(const void *salt, const char *name)
1959{
1960        unsigned long a = 0, b, x = 0, y = (unsigned long)salt;
1961        unsigned long adata, bdata, mask, len;
1962        const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
1963
1964        len = 0;
1965        goto inside;
1966
1967        do {
1968                HASH_MIX(x, y, a);
1969                len += sizeof(unsigned long);
1970inside:
1971                a = load_unaligned_zeropad(name+len);
1972                b = a ^ REPEAT_BYTE('/');
1973        } while (!(has_zero(a, &adata, &constants) | has_zero(b, &bdata, &constants)));
1974
1975        adata = prep_zero_mask(a, adata, &constants);
1976        bdata = prep_zero_mask(b, bdata, &constants);
1977        mask = create_zero_mask(adata | bdata);
1978        x ^= a & zero_bytemask(mask);
1979
1980        return hashlen_create(fold_hash(x, y), len + find_zero(mask));
1981}
1982
1983#else   /* !CONFIG_DCACHE_WORD_ACCESS: Slow, byte-at-a-time version */
1984
/*
 * Return the hash of a string of known length, feeding the first @len
 * bytes of @name one at a time through partial_name_hash().
 */
unsigned int full_name_hash(const void *salt, const char *name, unsigned int len)
{
	unsigned long hash = init_name_hash(salt);
	unsigned int i;

	for (i = 0; i < len; i++)
		hash = partial_name_hash((unsigned char)name[i], hash);

	return end_name_hash(hash);
}
EXPORT_SYMBOL(full_name_hash);
1994
1995/* Return the "hash_len" (hash and length) of a null-terminated string */
1996u64 hashlen_string(const void *salt, const char *name)
1997{
1998        unsigned long hash = init_name_hash(salt);
1999        unsigned long len = 0, c;
2000
2001        c = (unsigned char)*name;
2002        while (c) {
2003                len++;
2004                hash = partial_name_hash(c, hash);
2005                c = (unsigned char)name[len];
2006        }
2007        return hashlen_create(end_name_hash(hash), len);
2008}
2009EXPORT_SYMBOL(hashlen_string);
2010
2011/*
2012 * We know there's a real path component here of at least
2013 * one character.
2014 */
2015static inline u64 hash_name(const void *salt, const char *name)
2016{
2017        unsigned long hash = init_name_hash(salt);
2018        unsigned long len = 0, c;
2019
2020        c = (unsigned char)*name;
2021        do {
2022                len++;
2023                hash = partial_name_hash(c, hash);
2024                c = (unsigned char)name[len];
2025        } while (c && c != '/');
2026        return hashlen_create(end_name_hash(hash), len);
2027}
2028
2029#endif
2030
2031/*
2032 * Name resolution.
2033 * This is the basic name resolution function, turning a pathname into
2034 * the final dentry. We expect 'base' to be positive and a directory.
2035 *
2036 * Returns 0 and nd will have valid dentry and mnt on success.
2037 * Returns error and drops reference to input namei data on failure.
2038 */
2039static int link_path_walk(const char *name, struct nameidata *nd)
2040{
2041        int err;
2042
2043        while (*name=='/')
2044                name++;
2045        if (!*name)
2046                return 0;
2047
2048        /* At this point we know we have a real path component. */
2049        for(;;) {
2050                u64 hash_len;
2051                int type;
2052
2053                err = may_lookup(nd);
2054                if (err)
2055                        return err;
2056
2057                hash_len = hash_name(nd->path.dentry, name);
2058
2059                type = LAST_NORM;
2060                if (name[0] == '.') switch (hashlen_len(hash_len)) {
2061                        case 2:
2062                                if (name[1] == '.') {
2063                                        type = LAST_DOTDOT;
2064                                        nd->flags |= LOOKUP_JUMPED;
2065                                }
2066                                break;
2067                        case 1:
2068                                type = LAST_DOT;
2069                }
2070                if (likely(type == LAST_NORM)) {
2071                        struct dentry *parent = nd->path.dentry;
2072                        nd->flags &= ~LOOKUP_JUMPED;
2073                        if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
2074                                struct qstr this = { { .hash_len = hash_len }, .name = name };
2075                                err = parent->d_op->d_hash(parent, &this);
2076                                if (err < 0)
2077                                        return err;
2078                                hash_len = this.hash_len;
2079                                name = this.name;
2080                        }
2081                }
2082
2083                nd->last.hash_len = hash_len;
2084                nd->last.name = name;
2085                nd->last_type = type;
2086
2087                name += hashlen_len(hash_len);
2088                if (!*name)
2089                        goto OK;
2090                /*
2091                 * If it wasn't NUL, we know it was '/'. Skip that
2092                 * slash, and continue until no more slashes.
2093                 */
2094                do {
2095                        name++;
2096                } while (unlikely(*name == '/'));
2097                if (unlikely(!*name)) {
2098OK:
2099                        /* pathname body, done */
2100                        if (!nd->depth)
2101                                return 0;
2102                        name = nd->stack[nd->depth - 1].name;
2103                        /* trailing symlink, done */
2104                        if (!name)
2105                                return 0;
2106                        /* last component of nested symlink */
2107                        err = walk_component(nd, WALK_GET | WALK_PUT);
2108                } else {
2109                        err = walk_component(nd, WALK_GET);
2110                }
2111                if (err < 0)
2112                        return err;
2113
2114                if (err) {
2115                        const char *s = get_link(nd);
2116
2117                        if (IS_ERR(s))
2118                                return PTR_ERR(s);
2119                        err = 0;
2120                        if (unlikely(!s)) {
2121                                /* jumped */
2122                                put_link(nd);
2123                        } else {
2124                                nd->stack[nd->depth - 1].name = name;
2125                                name = s;
2126                                continue;
2127                        }
2128                }
2129                if (unlikely(!d_can_lookup(nd->path.dentry))) {
2130                        if (nd->flags & LOOKUP_RCU) {
2131                                if (unlazy_walk(nd, NULL, 0))
2132                                        return -ECHILD;
2133                        }
2134                        return -ENOTDIR;
2135                }
2136        }
2137}
2138
2139static const char *path_init(struct nameidata *nd, unsigned flags)
2140{
2141        int retval = 0;
2142        const char *s = nd->name->name;
2143
2144        nd->last_type = LAST_ROOT; /* if there are only slashes... */
2145        nd->flags = flags | LOOKUP_JUMPED | LOOKUP_PARENT;
2146        nd->depth = 0;
2147        if (flags & LOOKUP_ROOT) {
2148                struct dentry *root = nd->root.dentry;
2149                struct inode *inode = root->d_inode;
2150                if (*s) {
2151                        if (!d_can_lookup(root))
2152                                return ERR_PTR(-ENOTDIR);
2153                        retval = inode_permission(inode, MAY_EXEC);
2154                        if (retval)
2155                                return ERR_PTR(retval);
2156                }
2157                nd->path = nd->root;
2158                nd->inode = inode;
2159                if (flags & LOOKUP_RCU) {
2160                        rcu_read_lock();
2161                        nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
2162                        nd->root_seq = nd->seq;
2163                        nd->m_seq = read_seqbegin(&mount_lock);
2164                } else {
2165                        path_get(&nd->path);
2166                }
2167                return s;
2168        }
2169
2170        nd->root.mnt = NULL;
2171        nd->path.mnt = NULL;
2172        nd->path.dentry = NULL;
2173
2174        nd->m_seq = read_seqbegin(&mount_lock);
2175        if (*s == '/') {
2176                if (flags & LOOKUP_RCU)
2177                        rcu_read_lock();
2178                set_root(nd);
2179                if (likely(!nd_jump_root(nd)))
2180                        return s;
2181                nd->root.mnt = NULL;
2182                rcu_read_unlock();
2183                return ERR_PTR(-ECHILD);
2184        } else if (nd->dfd == AT_FDCWD) {
2185                if (flags & LOOKUP_RCU) {
2186                        struct fs_struct *fs = current->fs;
2187                        unsigned seq;
2188
2189                        rcu_read_lock();
2190
2191                        do {
2192                                seq = read_seqcount_begin(&fs->seq);
2193                                nd->path = fs->pwd;
2194                                nd->inode = nd->path.dentry->d_inode;
2195                                nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
2196                        } while (read_seqcount_retry(&fs->seq, seq));
2197                } else {
2198                        get_fs_pwd(current->fs, &nd->path);
2199                        nd->inode = nd->path.dentry->d_inode;
2200                }
2201                return s;
2202        } else {
2203                /* Caller must check execute permissions on the starting path component */
2204                struct fd f = fdget_raw(nd->dfd);
2205                struct dentry *dentry;
2206
2207                if (!f.file)
2208                        return ERR_PTR(-EBADF);
2209
2210                dentry = f.file->f_path.dentry;
2211
2212                if (*s) {
2213                        if (!d_can_lookup(dentry)) {
2214                                fdput(f);
2215                                return ERR_PTR(-ENOTDIR);
2216                        }
2217                }
2218
2219                nd->path = f.file->f_path;
2220                if (flags & LOOKUP_RCU) {
2221                        rcu_read_lock();
2222                        nd->inode = nd->path.dentry->d_inode;
2223                        nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
2224                } else {
2225                        path_get(&nd->path);
2226                        nd->inode = nd->path.dentry->d_inode;
2227                }
2228                fdput(f);
2229                return s;
2230        }
2231}
2232
2233static const char *trailing_symlink(struct nameidata *nd)
2234{
2235        const char *s;
2236        int error = may_follow_link(nd);
2237        if (unlikely(error))
2238                return ERR_PTR(error);
2239        nd->flags |= LOOKUP_PARENT;
2240        nd->stack[0].name = NULL;
2241        s = get_link(nd);
2242        return s ? s : "";
2243}
2244
2245static inline int lookup_last(struct nameidata *nd)
2246{
2247        if (nd->last_type == LAST_NORM && nd->last.name[nd->last.len])
2248                nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
2249
2250        nd->flags &= ~LOOKUP_PARENT;
2251        return walk_component(nd,
2252                        nd->flags & LOOKUP_FOLLOW
2253                                ? nd->depth
2254                                        ? WALK_PUT | WALK_GET
2255                                        : WALK_GET
2256                                : 0);
2257}
2258
2259/* Returns 0 and nd will be valid on success; Retuns error, otherwise. */
2260static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path)
2261{
2262        const char *s = path_init(nd, flags);
2263        int err;
2264
2265        if (IS_ERR(s))
2266                return PTR_ERR(s);
2267        while (!(err = link_path_walk(s, nd))
2268                && ((err = lookup_last(nd)) > 0)) {
2269                s = trailing_symlink(nd);
2270                if (IS_ERR(s)) {
2271                        err = PTR_ERR(s);
2272                        break;
2273                }
2274        }
2275        if (!err)
2276                err = complete_walk(nd);
2277
2278        if (!err && nd->flags & LOOKUP_DIRECTORY)
2279                if (!d_can_lookup(nd->path.dentry))
2280                        err = -ENOTDIR;
2281        if (!err) {
2282                *path = nd->path;
2283                nd->path.mnt = NULL;
2284                nd->path.dentry = NULL;
2285        }
2286        terminate_walk(nd);
2287        return err;
2288}
2289
2290static int filename_lookup(int dfd, struct filename *name, unsigned flags,
2291                           struct path *path, struct path *root)
2292{
2293        int retval;
2294        struct nameidata nd;
2295        if (IS_ERR(name))
2296                return PTR_ERR(name);
2297        if (unlikely(root)) {
2298                nd.root = *root;
2299                flags |= LOOKUP_ROOT;
2300        }
2301        set_nameidata(&nd, dfd, name);
2302        retval = path_lookupat(&nd, flags | LOOKUP_RCU, path);
2303        if (unlikely(retval == -ECHILD))
2304                retval = path_lookupat(&nd, flags, path);
2305        if (unlikely(retval == -ESTALE))
2306                retval = path_lookupat(&nd, flags | LOOKUP_REVAL, path);
2307
2308        if (likely(!retval))
2309                audit_inode(name, path->dentry, flags & LOOKUP_PARENT);
2310        restore_nameidata();
2311        putname(name);
2312        return retval;
2313}
2314
2315/* Returns 0 and nd will be valid on success; Retuns error, otherwise. */
2316static int path_parentat(struct nameidata *nd, unsigned flags,
2317                                struct path *parent)
2318{
2319        const char *s = path_init(nd, flags);
2320        int err;
2321        if (IS_ERR(s))
2322                return PTR_ERR(s);
2323        err = link_path_walk(s, nd);
2324        if (!err)
2325                err = complete_walk(nd);
2326        if (!err) {
2327                *parent = nd->path;
2328                nd->path.mnt = NULL;
2329                nd->path.dentry = NULL;
2330        }
2331        terminate_walk(nd);
2332        return err;
2333}
2334
2335static struct filename *filename_parentat(int dfd, struct filename *name,
2336                                unsigned int flags, struct path *parent,
2337                                struct qstr *last, int *type)
2338{
2339        int retval;
2340        struct nameidata nd;
2341
2342        if (IS_ERR(name))
2343                return name;
2344        set_nameidata(&nd, dfd, name);
2345        retval = path_parentat(&nd, flags | LOOKUP_RCU, parent);
2346        if (unlikely(retval == -ECHILD))
2347                retval = path_parentat(&nd, flags, parent);
2348        if (unlikely(retval == -ESTALE))
2349                retval = path_parentat(&nd, flags | LOOKUP_REVAL, parent);
2350        if (likely(!retval)) {
2351                *last = nd.last;
2352                *type = nd.last_type;
2353                audit_inode(name, parent->dentry, LOOKUP_PARENT);
2354        } else {
2355                putname(name);
2356                name = ERR_PTR(retval);
2357        }
2358        restore_nameidata();
2359        return name;
2360}
2361
2362/* does lookup, returns the object with parent locked */
2363struct dentry *kern_path_locked(const char *name, struct path *path)
2364{
2365        struct filename *filename;
2366        struct dentry *d;
2367        struct qstr last;
2368        int type;
2369
2370        filename = filename_parentat(AT_FDCWD, getname_kernel(name), 0, path,
2371                                    &last, &type);
2372        if (IS_ERR(filename))
2373                return ERR_CAST(filename);
2374        if (unlikely(type != LAST_NORM)) {
2375                path_put(path);
2376                putname(filename);
2377                return ERR_PTR(-EINVAL);
2378        }
2379        inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT);
2380        d = __lookup_hash(&last, path->dentry, 0);
2381        if (IS_ERR(d)) {
2382                inode_unlock(path->dentry->d_inode);
2383                path_put(path);
2384        }
2385        putname(filename);
2386        return d;
2387}
2388
2389int kern_path(const char *name, unsigned int flags, struct path *path)
2390{
2391        return filename_lookup(AT_FDCWD, getname_kernel(name),
2392                               flags, path, NULL);
2393}
2394EXPORT_SYMBOL(kern_path);
2395
2396/**
2397 * vfs_path_lookup - lookup a file path relative to a dentry-vfsmount pair
2398 * @dentry:  pointer to dentry of the base directory
2399 * @mnt: pointer to vfs mount of the base directory
2400 * @name: pointer to file name
2401 * @flags: lookup flags
2402 * @path: pointer to struct path to fill
2403 */
2404int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
2405                    const char *name, unsigned int flags,
2406                    struct path *path)
2407{
2408        struct path root = {.mnt = mnt, .dentry = dentry};
2409        /* the first argument of filename_lookup() is ignored with root */
2410        return filename_lookup(AT_FDCWD, getname_kernel(name),
2411                               flags , path, &root);
2412}
2413EXPORT_SYMBOL(vfs_path_lookup);
2414
2415/**
2416 * lookup_one_len - filesystem helper to lookup single pathname component
2417 * @name:       pathname component to lookup
2418 * @base:       base directory to lookup from
2419 * @len:        maximum length @len should be interpreted to
2420 *
2421 * Note that this routine is purely a helper for filesystem usage and should
2422 * not be called by generic code.
2423 *
2424 * The caller must hold base->i_mutex.
2425 */
2426struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
2427{
2428        struct qstr this;
2429        unsigned int c;
2430        int err;
2431
2432        WARN_ON_ONCE(!inode_is_locked(base->d_inode));
2433
2434        this.name = name;
2435        this.len = len;
2436        this.hash = full_name_hash(base, name, len);
2437        if (!len)
2438                return ERR_PTR(-EACCES);
2439
2440        if (unlikely(name[0] == '.')) {
2441                if (len < 2 || (len == 2 && name[1] == '.'))
2442                        return ERR_PTR(-EACCES);
2443        }
2444
2445        while (len--) {
2446                c = *(const unsigned char *)name++;
2447                if (c == '/' || c == '\0')
2448                        return ERR_PTR(-EACCES);
2449        }
2450        /*
2451         * See if the low-level filesystem might want
2452         * to use its own hash..
2453         */
2454        if (base->d_flags & DCACHE_OP_HASH) {
2455                int err = base->d_op->d_hash(base, &this);
2456                if (err < 0)
2457                        return ERR_PTR(err);
2458        }
2459
2460        err = inode_permission(base->d_inode, MAY_EXEC);
2461        if (err)
2462                return ERR_PTR(err);
2463
2464        return __lookup_hash(&this, base, 0);
2465}
2466EXPORT_SYMBOL(lookup_one_len);
2467
2468/**
2469 * lookup_one_len_unlocked - filesystem helper to lookup single pathname component
2470 * @name:       pathname component to lookup
2471 * @base:       base directory to lookup from
2472 * @len:        maximum length @len should be interpreted to
2473 *
2474 * Note that this routine is purely a helper for filesystem usage and should
2475 * not be called by generic code.
2476 *
2477 * Unlike lookup_one_len, it should be called without the parent
2478 * i_mutex held, and will take the i_mutex itself if necessary.
2479 */
2480struct dentry *lookup_one_len_unlocked(const char *name,
2481                                       struct dentry *base, int len)
2482{
2483        struct qstr this;
2484        unsigned int c;
2485        int err;
2486        struct dentry *ret;
2487
2488        this.name = name;
2489        this.len = len;
2490        this.hash = full_name_hash(base, name, len);
2491        if (!len)
2492                return ERR_PTR(-EACCES);
2493
2494        if (unlikely(name[0] == '.')) {
2495                if (len < 2 || (len == 2 && name[1] == '.'))
2496                        return ERR_PTR(-EACCES);
2497        }
2498
2499        while (len--) {
2500                c = *(const unsigned char *)name++;
2501                if (c == '/' || c == '\0')
2502                        return ERR_PTR(-EACCES);
2503        }
2504        /*
2505         * See if the low-level filesystem might want
2506         * to use its own hash..
2507         */
2508        if (base->d_flags & DCACHE_OP_HASH) {
2509                int err = base->d_op->d_hash(base, &this);
2510                if (err < 0)
2511                        return ERR_PTR(err);
2512        }
2513
2514        err = inode_permission(base->d_inode, MAY_EXEC);
2515        if (err)
2516                return ERR_PTR(err);
2517
2518        ret = lookup_dcache(&this, base, 0);
2519        if (!ret)
2520                ret = lookup_slow(&this, base, 0);
2521        return ret;
2522}
2523EXPORT_SYMBOL(lookup_one_len_unlocked);
2524
#ifdef CONFIG_UNIX98_PTYS
/**
 * path_pts - find whatever is mounted on "pts" next to the given path
 * @path: in/out; on success it is replaced by the resolved "pts" location
 *
 * Find something mounted on "pts" in the same directory as the input path.
 * @path is first moved to its parent by path_parent_directory(); the dentry
 * named "pts" is then looked up in that parent and any mount on top of it is
 * followed.
 *
 * Returns 0 on success, the error from path_parent_directory() if stepping
 * to the parent fails, or -ENOENT if no usable "pts" dentry is found.
 */
int path_pts(struct path *path)
{
        struct dentry *child, *parent;
        struct qstr this;
        int ret;

        ret = path_parent_directory(path);
        if (ret)
                return ret;

        parent = path->dentry;
        this.name = "pts";
        this.len = 3;
        child = d_hash_and_lookup(parent, &this);
        /*
         * d_hash_and_lookup() yields NULL on a dcache miss but ERR_PTR()
         * when the parent's ->d_hash() rejects the name.  Neither is a
         * dentry we may install in @path, so treat both as "not found".
         */
        if (IS_ERR_OR_NULL(child))
                return -ENOENT;

        /* Swap the parent for the child; the child reference is kept. */
        path->dentry = child;
        dput(parent);
        follow_mount(path);
        return 0;
}
#endif
2552
2553int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
2554                 struct path *path, int *empty)
2555{
2556        return filename_lookup(dfd, getname_flags(name, flags, empty),
2557                               flags, path, NULL);
2558}
2559EXPORT_SYMBOL(user_path_at_empty);
2560
2561/*
2562 * NB: most callers don't do anything directly with the reference to the
2563 *     to struct filename, but the nd->last pointer points into the name string
2564 *     allocated by getname. So we must hold the reference to it until all
2565 *     path-walking is complete.
2566 */
2567static inline struct filename *
2568user_path_parent(int dfd, const char __user *path,
2569                 struct path *parent,
2570                 struct qstr *last,
2571                 int *type,
2572                 unsigned int flags)
2573{
2574        /* only LOOKUP_REVAL is allowed in extra flags */
2575        return filename_parentat(dfd, getname(path), flags & LOOKUP_REVAL,
2576                                 parent, last, type);
2577}
2578
/**
 * mountpoint_last - look up last component for umount
 * @nd:   pathwalk nameidata - currently pointing at parent directory of "last"
 * @path: pointer to container for result
 *
 * This is a special lookup_last function just for umount. In this case, we
 * need to resolve the path without doing any revalidation.
 *
 * The nameidata should be the result of doing a LOOKUP_PARENT pathwalk. Since
 * mountpoints are always pinned in the dcache, their ancestors are too. Thus,
 * in almost all cases, this lookup will be served out of the dcache. The only
 * cases where it won't are if nd->last refers to a symlink or the path is
 * bogus and it doesn't exist.
 *
 * Returns:
 * -error: if there was an error during lookup. This includes -ENOENT if the
 *         lookup found a negative dentry. The nd->path reference will also be
 *         put in this case.
 *
 * 0:      if we successfully resolved nd->path and found it not to be a
 *         symlink that needs to be followed. "path" will also be populated.
 *         The nd->path reference will also be put.
 *
 * 1:      if we successfully resolved nd->last and found it to be a symlink
 *         that needs to be followed. "path" will be populated with the path
 *         to the link, and nd->path will *not* be put.
 */
static int
mountpoint_last(struct nameidata *nd, struct path *path)
{
        int error = 0;
        struct dentry *dentry;
        struct dentry *dir = nd->path.dentry;

        /* If we're in rcuwalk, drop out of it to handle last component */
        if (nd->flags & LOOKUP_RCU) {
                if (unlazy_walk(nd, NULL, 0))
                        return -ECHILD;
        }

        /* The parent walk is finished; from here on we handle "last" itself */
        nd->flags &= ~LOOKUP_PARENT;

        if (unlikely(nd->last_type != LAST_NORM)) {
                /* Not a normal name: let handle_dots() move nd->path */
                error = handle_dots(nd, nd->last_type);
                if (error)
                        return error;
                /* The result is wherever nd->path ended up; take our own ref */
                dentry = dget(nd->path.dentry);
        } else {
                /* Plain dcache lookup first - no revalidation is done here */
                dentry = d_lookup(dir, &nd->last);
                if (!dentry) {
                        /*
                         * No cached dentry. Mounted dentries are pinned in the
                         * cache, so that means that this dentry is probably
                         * a symlink or the path doesn't actually point
                         * to a mounted dentry.
                         */
                        dentry = lookup_slow(&nd->last, dir,
                                             nd->flags | LOOKUP_NO_REVAL);
                        if (IS_ERR(dentry))
                                return PTR_ERR(dentry);
                }
        }
        /* A negative dentry means the name does not exist: drop it and fail */
        if (d_is_negative(dentry)) {
                dput(dentry);
                return -ENOENT;
        }
        if (nd->depth)
                put_link(nd);
        /* Note: path->mnt is only copied here, no mount reference taken yet */
        path->dentry = dentry;
        path->mnt = nd->path.mnt;
        error = should_follow_link(nd, path, nd->flags & LOOKUP_FOLLOW,
                                   d_backing_inode(dentry), 0);
        if (unlikely(error))
                return error;
        /* Not a link to follow: pin the mount we copied above, then follow_mount() */
        mntget(path->mnt);
        follow_mount(path);
        return 0;
}
2657
/**
 * path_mountpoint - look up a path to be umounted
 * @nd:         lookup context
 * @flags:      lookup flags
 * @path:       pointer to container for result
 *
 * Look up the given name, but don't attempt to revalidate the last component.
 * Each pass walks the leading components with link_path_walk() and resolves
 * the final one with mountpoint_last(); a positive return from the latter
 * means a trailing symlink has to be followed, so the loop goes round again
 * on the link body.
 *
 * Returns 0 and "path" will be valid on success; returns error otherwise.
 */
static int
path_mountpoint(struct nameidata *nd, unsigned flags, struct path *path)
{
        const char *name = path_init(nd, flags);
        int ret;

        if (IS_ERR(name))
                return PTR_ERR(name);

        for (;;) {
                ret = link_path_walk(name, nd);
                if (ret)
                        break;

                ret = mountpoint_last(nd, path);
                if (ret <= 0)
                        break;

                /* Last component was a symlink: continue with its target */
                name = trailing_symlink(nd);
                if (IS_ERR(name)) {
                        ret = PTR_ERR(name);
                        break;
                }
        }

        terminate_walk(nd);
        return ret;
}
2685
2686static int
2687filename_mountpoint(int dfd, struct filename *name, struct path *path,
2688                        unsigned int flags)
2689{
2690        struct nameidata nd;
2691        int error;
2692        if (IS_ERR(name))
2693                return PTR_ERR(name);
2694        set_nameidata(&nd, dfd, name);
2695        error = path_mountpoint(&nd, flags | LOOKUP_RCU, path);
2696        if (unlikely(error == -ECHILD))
2697                error = path_mountpoint(&nd, flags, path);
2698        if (unlikely(error == -ESTALE))
2699                error = path_mountpoint(&nd, flags | LOOKUP_REVAL, path);
2700        if (likely(!error))
2701                audit_inode(name, path->dentry, 0);
2702        restore_nameidata();
2703        putname(name);
2704        return error;
2705}
2706
2707/**
2708 * user_path_mountpoint_at - lookup a path from userland in order to umount it
2709 * @dfd:        directory file descriptor
2710 * @name:       pathname from userland
2711 * @flags:      lookup flags
2712 * @path:       pointer to container to hold result
2713 *
2714 * A umount is a special case for path walking. We're not actually interested
2715 * in the inode in this situation, and ESTALE errors can be a problem. We
2716 * simply want track down the dentry and vfsmount attached at the mountpoint
2717 * and avoid revalidating the last component.
2718 *
2719 * Returns 0 and populates "path" on success.
2720 */
2721int
2722user_path_mountpoint_at(int dfd, const char __user *name, unsigned int flags,
2723                        struct path *path)
2724{
2725        return filename_mountpoint(dfd, getname(name), path, flags);
2726}
2727
/*
 * Kernel-space counterpart of user_path_mountpoint_at(): @name is a kernel
 * string, wrapped with getname_kernel() and then resolved by
 * filename_mountpoint(), which also handles an ERR_PTR name and releases
 * the name afterwards.  Returns 0 on success or a negative error code.
 */
int
kern_path_mountpoint(int dfd, const char *name, struct path *path,
                        unsigned int flags)
{
        struct filename *kname = getname_kernel(name);

        return filename_mountpoint(dfd, kname, path, flags);
}
EXPORT_SYMBOL(kern_path_mountpoint);
2735
2736int __check_sticky(struct inode *dir, struct inode *inode)
2737{
2738        kuid_t fsuid = current_fsuid();
2739
2740        if (uid_eq(inode->i_uid, fsuid))
2741                return 0;
2742        if (uid_eq(dir->i_uid, fsuid))
2743                return 0;
2744        return !capable_wrt_inode_uidgid(inode, CAP_FOWNER);
2745}
2746EXPORT_SYMBOL(__check_sticky);
2747
2748/*
2749 *      Check whether we can remove a link victim from directory dir, check
2750 *  whether the type of victim is right.
2751 *  1. We can't do it if dir is read-only (done in permission())
2752 *  2. We should have write and exec permissions on dir
2753 *  3. We can't remove anything from append-only dir
2754 *  4. We can't do anything with immutable dir (done in permission())
2755 *  5. If the sticky bit on dir is set we should either
2756 *      a. be owner of dir, or
2757 *      b. be owner of victim, or
2758 *      c. have CAP_FOWNER capability
2759 *  6. If the victim is append-only or immutable we can't do antyhing with
2760 *     links pointing to it.
2761 *  7. If the victim has an unknown uid or gid we can't change the inode.
2762 *  8. If we were asked to remove a directory and victim isn't one - ENOTDIR.
2763 *  9. If we were asked to remove a non-directory and victim isn't one - EISDIR.
2764 * 10. We can't remove a root or mountpoint.
2765 * 11. We don't allow removal of NFS sillyrenamed files; it's handled by
2766 *     nfs_async_unlink().
2767 */
2768static int may_delete(struct inode *dir, struct dentry *victim, bool isdir)
2769{
2770        struct inode *inode = d_backing_inode(victim);
2771        int error;
2772
2773        if (d_is_negative(victim))
2774                return -ENOENT;
2775        BUG_ON(!inode);
2776
2777        BUG_ON(victim->d_parent->d_inode != dir);
2778        audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);
2779
2780        error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
2781        if (error)
2782                return error;
2783        if (IS_APPEND(dir))
2784                return -EPERM;
2785
2786        if (check_sticky(dir, inode) || IS_APPEND(inode) ||
2787            IS_IMMUTABLE(inode) || IS_SWAPFILE(inode) || HAS_UNMAPPED_ID(inode))
2788                return -EPERM;
2789        if (isdir) {
2790                if (!d_is_dir(victim))
2791                        return -ENOTDIR;
2792                if (IS_ROOT(victim))
2793                        return -EBUSY;
2794        } else if (d_is_dir(victim))
2795                return -EISDIR;
2796        if (IS_DEADDIR(dir))
2797                return -ENOENT;
2798        if (victim->d_flags & DCACHE_NFSFS_RENAMED)
2799                return -EBUSY;
2800        return 0;
2801}
2802
2803/*      Check whether we can create an object with dentry child in directory
2804 *  dir.
2805 *  1. We can't do it if child already exists (open has special treatment for
2806 *     this case, but since we are inlined it's OK)
2807 *  2. We can't do it if dir is read-only (done in permission())
2808 *  3. We can't do it if the fs can't represent the fsuid or fsgid.
2809 *  4. We should have write and exec permissions on dir
2810 *  5. We can't do it if dir is immutable (done in permission())
2811 */
2812static inline int may_create(struct inode *dir, struct dentry *child)
2813{
2814        struct user_namespace *s_user_ns;
2815        audit_inode_child(dir, child, AUDIT_TYPE_CHILD_CREATE);
2816        if (child->d_inode)
2817                return -EEXIST;
2818        if (IS_DEADDIR(dir))
2819                return -ENOENT;
2820        s_user_ns = dir->i_sb->s_user_ns;
2821        if (!kuid_has_mapping(s_user_ns, current_fsuid()) ||
2822            !kgid_has_mapping(s_user_ns, current_fsgid()))
2823                return -EOVERFLOW;
2824        return inode_permission(dir, MAY_WRITE | MAY_EXEC);
2825}
2826
2827/*
2828 * p1 and p2 should be directories on the same fs.
2829 */
2830struct dentry *lock_rename(struct dentry *p1, struct dentry *p2)
2831{
2832        struct dentry *p;
2833
2834        if (p1 == p2) {
2835                inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
2836                return NULL;
2837        }
2838
2839        mutex_lock(&p1->d_sb->s_vfs_rename_mutex);
2840
2841        p = d_ancestor(p2, p1);
2842        if (p) {
2843                inode_lock_nested(p2->d_inode, I_MUTEX_PARENT);
2844                inode_lock_nested(p1->d_inode, I_MUTEX_CHILD);
2845                return p;
2846        }
2847
2848        p = d_ancestor(p1, p2);
2849        if (p) {
2850                inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
2851                inode_lock_nested(p2->d_inode, I_MUTEX_CHILD);
2852                return p;
2853        }
2854
2855        inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
2856        inode_lock_nested(p2->d_inode, I_MUTEX_PARENT2);
2857        return NULL;
2858}
2859EXPORT_SYMBOL(lock_rename);
2860
2861void unlock_rename(struct dentry *p1, struct dentry *p2)
2862{
2863        inode_unlock(p1->d_inode);
2864        if (p1 != p2) {
2865                inode_unlock(p2->d_inode);
2866                mutex_unlock(&p1->d_sb->s_vfs_rename_mutex);
2867        }
2868}
2869EXPORT_SYMBOL(unlock_rename);
2870
2871int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
2872                bool want_excl)
2873{
2874        int error = may_create(dir, dentry);
2875        if (error)
2876                return error;
2877
2878        if (!dir->i_op->create)
2879                return -EACCES; /* shouldn't it be ENOSYS? */
2880        mode &= S_IALLUGO;
2881        mode |= S_IFREG;
2882        error = security_inode_create(dir, dentry, mode);
2883        if (error)
2884                return error;
2885        error = dir->i_op->create(dir, dentry, mode, want_excl);
2886        if (!error)
2887                fsnotify_create(dir, dentry);
2888        return error;
2889}
2890EXPORT_SYMBOL(vfs_create);
2891
2892bool may_open_dev(const struct path *path)
2893{
2894        return !(path->mnt->mnt_flags & MNT_NODEV) &&
2895                !(path->mnt->mnt_sb->s_iflags & SB_I_NODEV);
2896}
2897
2898static int may_open(struct path *path, int acc_mode, int flag)
2899{
2900        struct dentry *dentry = path->dentry;
2901        struct inode *inode = dentry->d_inode;
2902        int error;
2903
2904        if (!inode)
2905                return -ENOENT;
2906
2907        switch (inode->i_mode & S_IFMT) {
2908        case S_IFLNK:
2909                return -ELOOP;
2910        case S_IFDIR:
2911                if (acc_mode & MAY_WRITE)
2912                        return -EISDIR;
2913                break;
2914        case S_IFBLK:
2915        case S_IFCHR:
2916                if (!may_open_dev(path))
2917                        return -EACCES;
2918                /*FALLTHRU*/
2919        case S_IFIFO:
2920        case S_IFSOCK:
2921                flag &= ~O_TRUNC;
2922                break;
2923        }
2924
2925        error = inode_permission(inode, MAY_OPEN | acc_mode);
2926        if (error)
2927                return error;
2928
2929        /*
2930         * An append-only file must be opened in append mode for writing.
2931         */
2932        if (IS_APPEND(inode)) {
2933                if  ((flag & O_ACCMODE) != O_RDONLY && !(flag & O_APPEND))
2934                        return -EPERM;
2935                if (flag & O_TRUNC)
2936                        return -EPERM;
2937        }
2938
2939        /* O_NOATIME can only be set by the owner or superuser */
2940        if (flag & O_NOATIME && !inode_owner_or_capable(inode))
2941                return -EPERM;
2942
2943        return 0;
2944}
2945
2946static int handle_truncate(struct file *filp)
2947{
2948        struct path *path = &filp->f_path;
2949        struct inode *inode = path->dentry->d_inode;
2950        int error = get_write_access(inode);
2951        if (error)
2952                return error;
2953        /*
2954         * Refuse to truncate files with mandatory locks held on them.
2955         */
2956        error = locks_verify_locked(filp);
2957        if (!error)
2958                error = security_path_truncate(path);
2959        if (!error) {
2960                error = do_truncate(path->dentry, 0,
2961                                    ATTR_MTIME|ATTR_CTIME|ATTR_OPEN,
2962                                    filp);
2963        }
2964        put_write_access(inode);
2965        return error;
2966}
2967
/*
 * Adjust open flags before they are handed to ->atomic_open(): an access
 * mode of 3 (both low O_ACCMODE bits set) is lowered by one, every other
 * value is returned untouched.
 */
static inline int open_to_namei_flags(int flag)
{
        return ((flag & O_ACCMODE) == 3) ? flag - 1 : flag;
}
2974
2975static int may_o_create(const struct path *dir, struct dentry *dentry, umode_t mode)
2976{
2977        int error = security_path_mknod(dir, dentry, mode, 0);
2978        if (error)
2979                return error;
2980
2981        error = inode_permission(dir->dentry->d_inode, MAY_WRITE | MAY_EXEC);
2982        if (error)
2983                return error;
2984
2985        return security_inode_create(dir->dentry->d_inode, dentry, mode);
2986}
2987
/*
 * Attempt to atomically look up, create and open a file from a negative
 * dentry.
 *
 * Returns 0 if successful.  The file will have been created and attached to
 * @file by the filesystem calling finish_open().
 *
 * Returns 1 if the file was looked up only or didn't need creating.  The
 * caller will need to perform the open themselves.  @path will have been
 * updated to point to the new dentry.  This may be negative.
 *
 * Returns an error code otherwise.
 *
 * The reference to @dentry passed in is dropped here on every path except
 * the "return 1" one, where the (possibly replaced) dentry is handed to the
 * caller through @path.
 */
static int atomic_open(struct nameidata *nd, struct dentry *dentry,
                        struct path *path, struct file *file,
                        const struct open_flags *op,
                        int open_flag, umode_t mode,
                        int *opened)
{
        /* Sentinel: lets us detect whether ->atomic_open() set f_path.dentry */
        struct dentry *const DENTRY_NOT_SET = (void *) -1UL;
        struct inode *dir =  nd->path.dentry->d_inode;
        int error;

        if (!(~open_flag & (O_EXCL | O_CREAT))) /* both O_EXCL and O_CREAT */
                open_flag &= ~O_TRUNC;

        if (nd->flags & LOOKUP_DIRECTORY)
                open_flag |= O_DIRECTORY;

        file->f_path.dentry = DENTRY_NOT_SET;
        file->f_path.mnt = nd->path.mnt;
        error = dir->i_op->atomic_open(dir, dentry, file,
                                       open_to_namei_flags(open_flag),
                                       mode, opened);
        d_lookup_done(dentry);
        if (!error) {
                /*
                 * We didn't have the inode before the open, so check open
                 * permission here.
                 */
                int acc_mode = op->acc_mode;
                if (*opened & FILE_CREATED) {
                        WARN_ON(!(open_flag & O_CREAT));
                        fsnotify_create(dir, dentry);
                        /* Freshly created by us: no access-mode check */
                        acc_mode = 0;
                }
                error = may_open(&file->f_path, acc_mode, open_flag);
                /* A positive value here would be mistaken for "not opened" */
                if (WARN_ON(error > 0))
                        error = -EINVAL;
        } else if (error > 0) {
                /* Positive return: the file was not opened by ->atomic_open() */
                if (WARN_ON(file->f_path.dentry == DENTRY_NOT_SET)) {
                        error = -EIO;
                } else {
                        /* Prefer the dentry the filesystem left in @file, if any */
                        if (file->f_path.dentry) {
                                dput(dentry);
                                dentry = file->f_path.dentry;
                        }
                        if (*opened & FILE_CREATED)
                                fsnotify_create(dir, dentry);
                        if (unlikely(d_is_negative(dentry))) {
                                error = -ENOENT;
                        } else {
                                path->dentry = dentry;
                                path->mnt = nd->path.mnt;
                                return 1;
                        }
                }
        }
        dput(dentry);
        return error;
}
3059
/*
 * Look up and maybe create and open the last component.
 *
 * Must be called with i_mutex held on parent.
 *
 * Returns 0 if the file was successfully atomically created (if necessary) and
 * opened.  In this case the file will be returned attached to @file.
 *
 * Returns 1 if the file was not completely opened at this time, though lookups
 * and creations will have been performed and the dentry returned in @path will
 * be positive upon return if O_CREAT was specified.  If O_CREAT wasn't
 * specified then a negative dentry may be returned.
 *
 * An error code is returned otherwise.
 *
 * FILE_CREATED will be set in @*opened if the dentry was created and will be
 * cleared otherwise prior to returning.
 */
static int lookup_open(struct nameidata *nd, struct path *path,
                        struct file *file,
                        const struct open_flags *op,
                        bool got_write, int *opened)
{
        struct dentry *dir = nd->path.dentry;
        struct inode *dir_inode = dir->d_inode;
        int open_flag = op->open_flag;
        struct dentry *dentry;
        int error, create_error = 0;
        umode_t mode = op->mode;
        DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);

        if (unlikely(IS_DEADDIR(dir_inode)))
                return -ENOENT;

        *opened &= ~FILE_CREATED;
        dentry = d_lookup(dir, &nd->last);
        /*
         * Loop until we hold a dentry that is either still in lookup, needs
         * no revalidation, or revalidates successfully; an invalidated one
         * is dropped and a fresh one allocated on the next pass.
         */
        for (;;) {
                if (!dentry) {
                        dentry = d_alloc_parallel(dir, &nd->last, &wq);
                        if (IS_ERR(dentry))
                                return PTR_ERR(dentry);
                }
                if (d_in_lookup(dentry))
                        break;

                if (!(dentry->d_flags & DCACHE_OP_REVALIDATE))
                        break;

                error = d_revalidate(dentry, nd->flags);
                if (likely(error > 0))
                        break;
                if (error)
                        goto out_dput;
                d_invalidate(dentry);
                dput(dentry);
                dentry = NULL;
        }
        if (dentry->d_inode) {
                /* Cached positive dentry: will open in f_op->open */
                goto out_no_open;
        }

        /*
         * Checking write permission is tricky, because we don't know if we are
         * going to actually need it: O_CREAT opens should work as long as the
         * file exists.  But checking existence breaks atomicity.  The trick is
         * to check access and if not granted clear O_CREAT from the flags.
         *
         * Another problem is returning the "right" error value (e.g. for an
         * O_EXCL open we want to return EEXIST not EROFS).
         */
        if (open_flag & O_CREAT) {
                if (!IS_POSIXACL(dir->d_inode))
                        mode &= ~current_umask();
                if (unlikely(!got_write)) {
                        /* Remembered, and only reported if the file is absent */
                        create_error = -EROFS;
                        open_flag &= ~O_CREAT;
                        if (open_flag & (O_EXCL | O_TRUNC))
                                goto no_open;
                        /* No side effects, safe to clear O_CREAT */
                } else {
                        create_error = may_o_create(&nd->path, dentry, mode);
                        if (create_error) {
                                open_flag &= ~O_CREAT;
                                if (open_flag & O_EXCL)
                                        goto no_open;
                        }
                }
        } else if ((open_flag & (O_TRUNC|O_WRONLY|O_RDWR)) &&
                   unlikely(!got_write)) {
                /*
                 * No O_CREAT -> atomicity not a requirement -> fall
                 * back to lookup + open
                 */
                goto no_open;
        }

        if (dir_inode->i_op->atomic_open) {
                error = atomic_open(nd, dentry, path, file, op, open_flag,
                                    mode, opened);
                /* Prefer the deferred creation error over a bare -ENOENT */
                if (unlikely(error == -ENOENT) && create_error)
                        error = create_error;
                return error;
        }

no_open:
        /* Dentry still in lookup: ask the filesystem via ->lookup() */
        if (d_in_lookup(dentry)) {
                struct dentry *res = dir_inode->i_op->lookup(dir_inode, dentry,
                                                             nd->flags);
                d_lookup_done(dentry);
                if (unlikely(res)) {
                        if (IS_ERR(res)) {
                                error = PTR_ERR(res);
                                goto out_dput;
                        }
                        /* ->lookup() returned a different dentry: use it */
                        dput(dentry);
                        dentry = res;
                }
        }

        /* Negative dentry, just create the file */
        if (!dentry->d_inode && (open_flag & O_CREAT)) {
                *opened |= FILE_CREATED;
                audit_inode_child(dir_inode, dentry, AUDIT_TYPE_CHILD_CREATE);
                if (!dir_inode->i_op->create) {
                        error = -EACCES;
                        goto out_dput;
                }
                error = dir_inode->i_op->create(dir_inode, dentry, mode,
                                                open_flag & O_EXCL);
                if (error)
                        goto out_dput;
                fsnotify_create(dir_inode, dentry);
        }
        /* Creation was refused earlier and the file turned out not to exist */
        if (unlikely(create_error) && !dentry->d_inode) {
                error = create_error;
                goto out_dput;
        }
out_no_open:
        /* Hand the dentry to the caller, who still has to do the open */
        path->dentry = dentry;
        path->mnt = nd->path.mnt;
        return 1;

out_dput:
        dput(dentry);
        return error;
}
3207
/*
 * Handle the last step of open()
 *
 * @nd:     lookup state; on entry nd->path is the parent directory of the
 *          last component
 * @file:   the file being opened
 * @op:     open flags, access mode and lookup intent for this open
 * @opened: bitmask of FILE_CREATED / FILE_OPENED, updated as we go
 *
 * Returns 0 on success or a negative error code.  Positive values may come
 * straight from should_follow_link(); a positive value reaching the "out"
 * label is treated as a bug and turned into -EINVAL.
 */
static int do_last(struct nameidata *nd,
                   struct file *file, const struct open_flags *op,
                   int *opened)
{
        struct dentry *dir = nd->path.dentry;
        int open_flag = op->open_flag;
        bool will_truncate = (open_flag & O_TRUNC) != 0;
        bool got_write = false;
        int acc_mode = op->acc_mode;
        unsigned seq;
        struct inode *inode;
        struct path path;
        int error;

        nd->flags &= ~LOOKUP_PARENT;
        nd->flags |= op->intent;

        /* Last component is not a normal name: nothing to look up or create */
        if (nd->last_type != LAST_NORM) {
                error = handle_dots(nd, nd->last_type);
                if (unlikely(error))
                        return error;
                goto finish_open;
        }

        if (!(open_flag & O_CREAT)) {
                /* Something follows the last component: insist on a directory */
                if (nd->last.name[nd->last.len])
                        nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
                /* we _can_ be in RCU mode here */
                error = lookup_fast(nd, &path, &inode, &seq);
                if (likely(error > 0))
                        goto finish_lookup;

                if (error < 0)
                        return error;

                BUG_ON(nd->inode != dir->d_inode);
                BUG_ON(nd->flags & LOOKUP_RCU);
        } else {
                /* create side of things */
                /*
                 * This will *only* deal with leaving RCU mode - LOOKUP_JUMPED
                 * has been cleared when we got to the last component we are
                 * about to look up
                 */
                error = complete_walk(nd);
                if (error)
                        return error;

                audit_inode(nd->name, dir, LOOKUP_PARENT);
                /* trailing slashes? */
                if (unlikely(nd->last.name[nd->last.len]))
                        return -EISDIR;
        }

        if (open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) {
                error = mnt_want_write(nd->path.mnt);
                if (!error)
                        got_write = true;
                /*
                 * do _not_ fail yet - we might not need that or fail with
                 * a different error; let lookup_open() decide; we'll be
                 * dropping this one anyway.
                 */
        }
        /* Exclusive lock on the parent if we may create, shared otherwise */
        if (open_flag & O_CREAT)
                inode_lock(dir->d_inode);
        else
                inode_lock_shared(dir->d_inode);
        error = lookup_open(nd, &path, file, op, got_write, opened);
        if (open_flag & O_CREAT)
                inode_unlock(dir->d_inode);
        else
                inode_unlock_shared(dir->d_inode);

        /* <0: failure; 0: lookup_open() already opened the file for us */
        if (error <= 0) {
                if (error)
                        goto out;

                if ((*opened & FILE_CREATED) ||
                    !S_ISREG(file_inode(file)->i_mode))
                        will_truncate = false;

                audit_inode(nd->name, file->f_path.dentry, 0);
                goto opened;
        }

        /* error > 0: looked up (and maybe created) but not opened yet */
        if (*opened & FILE_CREATED) {
                /* Don't check for write permission, don't truncate */
                open_flag &= ~O_TRUNC;
                will_truncate = false;
                acc_mode = 0;
                path_to_nameidata(&path, nd);
                goto finish_open_created;
        }

        /*
         * If atomic_open() acquired write access it is dropped now due to
         * possible mount and symlink following (this might be optimized away if
         * necessary...)
         */
        if (got_write) {
                mnt_drop_write(nd->path.mnt);
                got_write = false;
        }

        error = follow_managed(&path, nd);
        if (unlikely(error < 0))
                return error;

        if (unlikely(d_is_negative(path.dentry))) {
                path_to_nameidata(&path, nd);
                return -ENOENT;
        }

        /*
         * create/update audit record if it already exists.
         */
        audit_inode(nd->name, path.dentry, 0);

        /* The file already exists, so an exclusive create must fail */
        if (unlikely((open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT))) {
                path_to_nameidata(&path, nd);
                return -EEXIST;
        }

        seq = 0;        /* out of RCU mode, so the value doesn't matter */
        inode = d_backing_inode(path.dentry);
finish_lookup:
        if (nd->depth)
                put_link(nd);
        error = should_follow_link(nd, &path, nd->flags & LOOKUP_FOLLOW,
                                   inode, seq);
        if (unlikely(error))
                return error;

        path_to_nameidata(&path, nd);
        nd->inode = inode;
        nd->seq = seq;
        /* Why this, you ask?  _Now_ we might have grown LOOKUP_JUMPED... */
finish_open:
        error = complete_walk(nd);
        if (error)
                return error;
        audit_inode(nd->name, nd->path.dentry, 0);
        /* error is preset before each check so "goto out" reports it */
        error = -EISDIR;
        if ((open_flag & O_CREAT) && d_is_dir(nd->path.dentry))
                goto out;
        error = -ENOTDIR;
        if ((nd->flags & LOOKUP_DIRECTORY) && !d_can_lookup(nd->path.dentry))
                goto out;
        /* Only regular files are ever truncated */
        if (!d_is_reg(nd->path.dentry))
                will_truncate = false;

        if (will_truncate) {
                error = mnt_want_write(nd->path.mnt);
                if (error)
                        goto out;
                got_write = true;
        }
finish_open_created:
        error = may_open(&nd->path, acc_mode, open_flag);
        if (error)
                goto out;
        BUG_ON(*opened & FILE_OPENED); /* once it's opened, it's opened */
        error = vfs_open(&nd->path, file, current_cred());
        if (error)
                goto out;
        *opened |= FILE_OPENED;
opened:
        error = open_check_o_direct(file);
        if (!error)
                error = ima_file_check(file, op->acc_mode, *opened);
        if (!error && will_truncate)
                error = handle_truncate(file);
out:
        /* Failed after the file was opened: release it again */
        if (unlikely(error) && (*opened & FILE_OPENED))
                fput(file);
        if (unlikely(error > 0)) {
                WARN_ON(1);
                error = -EINVAL;
        }
        if (got_write)
                mnt_drop_write(nd->path.mnt);
        return error;
}
3395
3396static int do_tmpfile(struct nameidata *nd, unsigned flags,
3397                const struct open_flags *op,
3398                struct file *file, int *opened)
3399{
3400        static const struct qstr name = QSTR_INIT("/", 1);
3401        struct dentry *child;
3402        struct inode *dir;
3403        struct path path;
3404        int error = path_lookupat(nd, flags | LOOKUP_DIRECTORY, &path);
3405        if (unlikely(error))
3406                return error;
3407        error = mnt_want_write(path.mnt);
3408        if (unlikely(error))
3409                goto out;
3410        dir = path.dentry->d_inode;
3411        /* we want directory to be writable */
3412        error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
3413        if (error)
3414                goto out2;
3415        if (!dir->i_op->tmpfile) {
3416                error = -EOPNOTSUPP;
3417                goto out2;
3418        }
3419        child = d_alloc(path.dentry, &name);
3420        if (unlikely(!child)) {
3421                error = -ENOMEM;
3422                goto out2;
3423        }
3424        dput(path.dentry);
3425        path.dentry = child;
3426        error = dir->i_op->tmpfile(dir, child, op->mode);
3427        if (error)
3428                goto out2;
3429        audit_inode(nd->name, child, 0);
3430        /* Don't check for other permissions, the inode was just created */
3431        error = may_open(&path, 0, op->open_flag);
3432        if (error)
3433                goto out2;
3434        file->f_path.mnt = path.mnt;
3435        error = finish_open(file, child, NULL, opened);
3436        if (error)
3437                goto out2;
3438        error = open_check_o_direct(file);
3439        if (error) {
3440                fput(file);
3441        } else if (!(op->open_flag & O_EXCL)) {
3442                struct inode *inode = file_inode(file);
3443                spin_lock(&inode->i_lock);
3444                inode->i_state |= I_LINKABLE;
3445                spin_unlock(&inode->i_lock);
3446        }
3447out2:
3448        mnt_drop_write(path.mnt);
3449out:
3450        path_put(&path);
3451        return error;
3452}
3453
3454static int do_o_path(struct nameidata *nd, unsigned flags, struct file *file)
3455{
3456        struct path path;
3457        int error = path_lookupat(nd, flags, &path);
3458        if (!error) {
3459                audit_inode(nd->name, path.dentry, 0);
3460                error = vfs_open(&path, file, current_cred());
3461                path_put(&path);
3462        }
3463        return error;
3464}
3465
3466static struct file *path_openat(struct nameidata *nd,
3467                        const struct open_flags *op, unsigned flags)
3468{
3469        const char *s;
3470        struct file *file;
3471        int opened = 0;
3472        int error;
3473
3474        file = get_empty_filp();
3475        if (IS_ERR(file))
3476                return file;
3477
3478        file->f_flags = op->open_flag;
3479
3480        if (unlikely(file->f_flags & __O_TMPFILE)) {
3481                error = do_tmpfile(nd, flags, op, file, &opened);
3482                goto out2;
3483        }
3484
3485        if (unlikely(file->f_flags & O_PATH)) {
3486                error = do_o_path(nd, flags, file);
3487                if (!error)
3488                        opened |= FILE_OPENED;
3489                goto out2;
3490        }
3491
3492        s = path_init(nd, flags);
3493        if (IS_ERR(s)) {
3494                put_filp(file);
3495                return ERR_CAST(s);
3496        }
3497        while (!(error = link_path_walk(s, nd)) &&
3498                (error = do_last(nd, file, op, &opened)) > 0) {
3499                nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL);
3500                s = trailing_symlink(nd);
3501                if (IS_ERR(s)) {
3502                        error = PTR_ERR(s);
3503                        break;
3504                }
3505        }
3506        terminate_walk(nd);
3507out2:
3508        if (!(opened & FILE_OPENED)) {
3509                BUG_ON(!error);
3510                put_filp(file);
3511        }
3512        if (unlikely(error)) {
3513                if (error == -EOPENSTALE) {
3514                        if (flags & LOOKUP_RCU)
3515                                error = -ECHILD;
3516                        else
3517                                error = -ESTALE;
3518                }
3519                file = ERR_PTR(error);
3520        }
3521        return file;
3522}
3523
3524struct file *do_filp_open(int dfd, struct filename *pathname,
3525                const struct open_flags *op)
3526{
3527        struct nameidata nd;
3528        int flags = op->lookup_flags;
3529        struct file *filp;
3530
3531        set_nameidata(&nd, dfd, pathname);
3532        filp = path_openat(&nd, op, flags | LOOKUP_RCU);
3533        if (unlikely(filp == ERR_PTR(-ECHILD)))
3534                filp = path_openat(&nd, op, flags);
3535        if (unlikely(filp == ERR_PTR(-ESTALE)))
3536                filp = path_openat(&nd, op, flags | LOOKUP_REVAL);
3537        restore_nameidata();
3538        return filp;
3539}
3540
3541struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt,
3542                const char *name, const struct open_flags *op)
3543{
3544        struct nameidata nd;
3545        struct file *file;
3546        struct filename *filename;
3547        int flags = op->lookup_flags | LOOKUP_ROOT;
3548
3549        nd.root.mnt = mnt;
3550        nd.root.dentry = dentry;
3551
3552        if (d_is_symlink(dentry) && op->intent & LOOKUP_OPEN)
3553                return ERR_PTR(-ELOOP);
3554
3555        filename = getname_kernel(name);
3556        if (IS_ERR(filename))
3557                return ERR_CAST(filename);
3558
3559        set_nameidata(&nd, -1, filename);
3560        file = path_openat(&nd, op, flags | LOOKUP_RCU);
3561        if (unlikely(file == ERR_PTR(-ECHILD)))
3562                file = path_openat(&nd, op, flags);
3563        if (unlikely(file == ERR_PTR(-ESTALE)))
3564                file = path_openat(&nd, op, flags | LOOKUP_REVAL);
3565        restore_nameidata();
3566        putname(filename);
3567        return file;
3568}
3569
3570static struct dentry *filename_create(int dfd, struct filename *name,
3571                                struct path *path, unsigned int lookup_flags)
3572{
3573        struct dentry *dentry = ERR_PTR(-EEXIST);
3574        struct qstr last;
3575        int type;
3576        int err2;
3577        int error;
3578        bool is_dir = (lookup_flags & LOOKUP_DIRECTORY);
3579
3580        /*
3581         * Note that only LOOKUP_REVAL and LOOKUP_DIRECTORY matter here. Any
3582         * other flags passed in are ignored!
3583         */
3584        lookup_flags &= LOOKUP_REVAL;
3585
3586        name = filename_parentat(dfd, name, lookup_flags, path, &last, &type);
3587        if (IS_ERR(name))
3588                return ERR_CAST(name);
3589
3590        /*
3591         * Yucky last component or no last component at all?
3592         * (foo/., foo/.., /////)
3593         */
3594        if (unlikely(type != LAST_NORM))
3595                goto out;
3596
3597        /* don't fail immediately if it's r/o, at least try to report other errors */
3598        err2 = mnt_want_write(path->mnt);
3599        /*
3600         * Do the final lookup.
3601         */
3602        lookup_flags |= LOOKUP_CREATE | LOOKUP_EXCL;
3603        inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT);
3604        dentry = __lookup_hash(&last, path->dentry, lookup_flags);
3605        if (IS_ERR(dentry))
3606                goto unlock;
3607
3608        error = -EEXIST;
3609        if (d_is_positive(dentry))
3610                goto fail;
3611
3612        /*
3613         * Special case - lookup gave negative, but... we had foo/bar/
3614         * From the vfs_mknod() POV we just have a negative dentry -
3615         * all is fine. Let's be bastards - you had / on the end, you've
3616         * been asking for (non-existent) directory. -ENOENT for you.
3617         */
3618        if (unlikely(!is_dir && last.name[last.len])) {
3619                error = -ENOENT;
3620                goto fail;
3621        }
3622        if (unlikely(err2)) {
3623                error = err2;
3624                goto fail;
3625        }
3626        putname(name);
3627        return dentry;
3628fail:
3629        dput(dentry);
3630        dentry = ERR_PTR(error);
3631unlock:
3632        inode_unlock(path->dentry->d_inode);
3633        if (!err2)
3634                mnt_drop_write(path->mnt);
3635out:
3636        path_put(path);
3637        putname(name);
3638        return dentry;
3639}
3640
/*
 * kern_path_create - filename_create() for a kernel-space pathname.
 *
 * The result of getname_kernel() is handed to filename_create() as is.
 * See filename_create() for what is held on return.
 */
struct dentry *kern_path_create(int dfd, const char *pathname,
				struct path *path, unsigned int lookup_flags)
{
	struct filename *name = getname_kernel(pathname);

	return filename_create(dfd, name, path, lookup_flags);
}
EXPORT_SYMBOL(kern_path_create);
3648
3649void done_path_create(struct path *path, struct dentry *dentry)
3650{
3651        dput(dentry);
3652        inode_unlock(path->dentry->d_inode);
3653        mnt_drop_write(path->mnt);
3654        path_put(path);
3655}
3656EXPORT_SYMBOL(done_path_create);
3657
3658inline struct dentry *user_path_create(int dfd, const char __user *pathname,
3659                                struct path *path, unsigned int lookup_flags)
3660{
3661        return filename_create(dfd, getname(pathname), path, lookup_flags);
3662}
3663EXPORT_SYMBOL(user_path_create);
3664
3665int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
3666{
3667        int error = may_create(dir, dentry);
3668
3669        if (error)
3670                return error;
3671
3672        if ((S_ISCHR(mode) || S_ISBLK(mode)) && !capable(CAP_MKNOD))
3673                return -EPERM;
3674
3675        if (!dir->i_op->mknod)
3676                return -EPERM;
3677
3678        error = devcgroup_inode_mknod(mode, dev);
3679        if (error)
3680                return error;
3681
3682        error = security_inode_mknod(dir, dentry, mode, dev);
3683        if (error)
3684                return error;
3685
3686        error = dir->i_op->mknod(dir, dentry, mode, dev);
3687        if (!error)
3688                fsnotify_create(dir, dentry);
3689        return error;
3690}
3691EXPORT_SYMBOL(vfs_mknod);
3692
3693static int may_mknod(umode_t mode)
3694{
3695        switch (mode & S_IFMT) {
3696        case S_IFREG:
3697        case S_IFCHR:
3698        case S_IFBLK:
3699        case S_IFIFO:
3700        case S_IFSOCK:
3701        case 0: /* zero mode translates to S_IFREG */
3702                return 0;
3703        case S_IFDIR:
3704                return -EPERM;
3705        default:
3706                return -EINVAL;
3707        }
3708}
3709
3710SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode,
3711                unsigned, dev)
3712{
3713        struct dentry *dentry;
3714        struct path path;
3715        int error;
3716        unsigned int lookup_flags = 0;
3717
3718        error = may_mknod(mode);
3719        if (error)
3720                return error;
3721retry:
3722        dentry = user_path_create(dfd, filename, &path, lookup_flags);
3723        if (IS_ERR(dentry))
3724                return PTR_ERR(dentry);
3725
3726        if (!IS_POSIXACL(path.dentry->d_inode))
3727                mode &= ~current_umask();
3728        error = security_path_mknod(&path, dentry, mode, dev);
3729        if (error)
3730                goto out;
3731        switch (mode & S_IFMT) {
3732                case 0: case S_IFREG:
3733                        error = vfs_create(path.dentry->d_inode,dentry,mode,true);
3734                        if (!error)
3735                                ima_post_path_mknod(dentry);
3736                        break;
3737                case S_IFCHR: case S_IFBLK:
3738                        error = vfs_mknod(path.dentry->d_inode,dentry,mode,
3739                                        new_decode_dev(dev));
3740                        break;
3741                case S_IFIFO: case S_IFSOCK:
3742                        error = vfs_mknod(path.dentry->d_inode,dentry,mode,0);
3743                        break;
3744        }
3745out:
3746        done_path_create(&path, dentry);
3747        if (retry_estale(error, lookup_flags)) {
3748                lookup_flags |= LOOKUP_REVAL;
3749                goto retry;
3750        }
3751        return error;
3752}
3753
3754SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, dev)
3755{
3756        return sys_mknodat(AT_FDCWD, filename, mode, dev);
3757}
3758
3759int vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
3760{
3761        int error = may_create(dir, dentry);
3762        unsigned max_links = dir->i_sb->s_max_links;
3763
3764        if (error)
3765                return error;
3766
3767        if (!dir->i_op->mkdir)
3768                return -EPERM;
3769
3770        mode &= (S_IRWXUGO|S_ISVTX);
3771        error = security_inode_mkdir(dir, dentry, mode);
3772        if (error)
3773                return error;
3774
3775        if (max_links && dir->i_nlink >= max_links)
3776                return -EMLINK;
3777
3778        error = dir->i_op->mkdir(dir, dentry, mode);
3779        if (!error)
3780                fsnotify_mkdir(dir, dentry);
3781        return error;
3782}
3783EXPORT_SYMBOL(vfs_mkdir);
3784
3785SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode)
3786{
3787        struct dentry *dentry;
3788        struct path path;
3789        int error;
3790        unsigned int lookup_flags = LOOKUP_DIRECTORY;
3791
3792retry:
3793        dentry = user_path_create(dfd, pathname, &path, lookup_flags);
3794        if (IS_ERR(dentry))
3795                return PTR_ERR(dentry);
3796
3797        if (!IS_POSIXACL(path.dentry->d_inode))
3798                mode &= ~current_umask();
3799        error = security_path_mkdir(&path, dentry, mode);
3800        if (!error)
3801                error = vfs_mkdir(path.dentry->d_inode, dentry, mode);
3802        done_path_create(&path, dentry);
3803        if (retry_estale(error, lookup_flags)) {
3804                lookup_flags |= LOOKUP_REVAL;
3805                goto retry;
3806        }
3807        return error;
3808}
3809
3810SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode)
3811{
3812        return sys_mkdirat(AT_FDCWD, pathname, mode);
3813}
3814
3815int vfs_rmdir(struct inode *dir, struct dentry *dentry)
3816{
3817        int error = may_delete(dir, dentry, 1);
3818
3819        if (error)
3820                return error;
3821
3822        if (!dir->i_op->rmdir)
3823                return -EPERM;
3824
3825        dget(dentry);
3826        inode_lock(dentry->d_inode);
3827
3828        error = -EBUSY;
3829        if (is_local_mountpoint(dentry))
3830                goto out;
3831
3832        error = security_inode_rmdir(dir, dentry);
3833        if (error)
3834                goto out;
3835
3836        shrink_dcache_parent(dentry);
3837        error = dir->i_op->rmdir(dir, dentry);
3838        if (error)
3839                goto out;
3840
3841        dentry->d_inode->i_flags |= S_DEAD;
3842        dont_mount(dentry);
3843        detach_mounts(dentry);
3844
3845out:
3846        inode_unlock(dentry->d_inode);
3847        dput(dentry);
3848        if (!error)
3849                d_delete(dentry);
3850        return error;
3851}
3852EXPORT_SYMBOL(vfs_rmdir);
3853
3854static long do_rmdir(int dfd, const char __user *pathname)
3855{
3856        int error = 0;
3857        struct filename *name;
3858        struct dentry *dentry;
3859        struct path path;
3860        struct qstr last;
3861        int type;
3862        unsigned int lookup_flags = 0;
3863retry:
3864        name = user_path_parent(dfd, pathname,
3865                                &path, &last, &type, lookup_flags);
3866        if (IS_ERR(name))
3867                return PTR_ERR(name);
3868
3869        switch (type) {
3870        case LAST_DOTDOT:
3871                error = -ENOTEMPTY;
3872                goto exit1;
3873        case LAST_DOT:
3874                error = -EINVAL;
3875                goto exit1;
3876        case LAST_ROOT:
3877                error = -EBUSY;
3878                goto exit1;
3879        }
3880
3881        error = mnt_want_write(path.mnt);
3882        if (error)
3883                goto exit1;
3884
3885        inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT);
3886        dentry = __lookup_hash(&last, path.dentry, lookup_flags);
3887        error = PTR_ERR(dentry);
3888        if (IS_ERR(dentry))
3889                goto exit2;
3890        if (!dentry->d_inode) {
3891                error = -ENOENT;
3892                goto exit3;
3893        }
3894        error = security_path_rmdir(&path, dentry);
3895        if (error)
3896                goto exit3;
3897        error = vfs_rmdir(path.dentry->d_inode, dentry);
3898exit3:
3899        dput(dentry);
3900exit2:
3901        inode_unlock(path.dentry->d_inode);
3902        mnt_drop_write(path.mnt);
3903exit1:
3904        path_put(&path);
3905        putname(name);
3906        if (retry_estale(error, lookup_flags)) {
3907                lookup_flags |= LOOKUP_REVAL;
3908                goto retry;
3909        }
3910        return error;
3911}
3912
3913SYSCALL_DEFINE1(rmdir, const char __user *, pathname)
3914{
3915        return do_rmdir(AT_FDCWD, pathname);
3916}
3917
3918/**
3919 * vfs_unlink - unlink a filesystem object
3920 * @dir:        parent directory
3921 * @dentry:     victim
3922 * @delegated_inode: returns victim inode, if the inode is delegated.
3923 *
3924 * The caller must hold dir->i_mutex.
3925 *
3926 * If vfs_unlink discovers a delegation, it will return -EWOULDBLOCK and
3927 * return a reference to the inode in delegated_inode.  The caller
3928 * should then break the delegation on that inode and retry.  Because
3929 * breaking a delegation may take a long time, the caller should drop
3930 * dir->i_mutex before doing so.
3931 *
3932 * Alternatively, a caller may pass NULL for delegated_inode.  This may
3933 * be appropriate for callers that expect the underlying filesystem not
3934 * to be NFS exported.
3935 */
3936int vfs_unlink(struct inode *dir, struct dentry *dentry, struct inode **delegated_inode)
3937{
3938        struct inode *target = dentry->d_inode;
3939        int error = may_delete(dir, dentry, 0);
3940
3941        if (error)
3942                return error;
3943
3944        if (!dir->i_op->unlink)
3945                return -EPERM;
3946
3947        inode_lock(target);
3948        if (is_local_mountpoint(dentry))
3949                error = -EBUSY;
3950        else {
3951                error = security_inode_unlink(dir, dentry);
3952                if (!error) {
3953                        error = try_break_deleg(target, delegated_inode);
3954                        if (error)
3955                                goto out;
3956                        error = dir->i_op->unlink(dir, dentry);
3957                        if (!error) {
3958                                dont_mount(dentry);
3959                                detach_mounts(dentry);
3960                        }
3961                }
3962        }
3963out:
3964        inode_unlock(target);
3965
3966        /* We don't d_delete() NFS sillyrenamed files--they still exist. */
3967        if (!error && !(dentry->d_flags & DCACHE_NFSFS_RENAMED)) {
3968                fsnotify_link_count(target);
3969                d_delete(dentry);
3970        }
3971
3972        return error;
3973}
3974EXPORT_SYMBOL(vfs_unlink);
3975
3976/*
3977 * Make sure that the actual truncation of the file will occur outside its
3978 * directory's i_mutex.  Truncate can take a long time if there is a lot of
3979 * writeout happening, and we don't want to prevent access to the directory
3980 * while waiting on the I/O.
3981 */
3982static long do_unlinkat(int dfd, const char __user *pathname)
3983{
3984        int error;
3985        struct filename *name;
3986        struct dentry *dentry;
3987        struct path path;
3988        struct qstr last;
3989        int type;
3990        struct inode *inode = NULL;
3991        struct inode *delegated_inode = NULL;
3992        unsigned int lookup_flags = 0;
3993retry:
3994        name = user_path_parent(dfd, pathname,
3995                                &path, &last, &type, lookup_flags);
3996        if (IS_ERR(name))
3997                return PTR_ERR(name);
3998
3999        error = -EISDIR;
4000        if (type != LAST_NORM)
4001                goto exit1;
4002
4003        error = mnt_want_write(path.mnt);
4004        if (error)
4005                goto exit1;
4006retry_deleg:
4007        inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT);
4008        dentry = __lookup_hash(&last, path.dentry, lookup_flags);
4009        error = PTR_ERR(dentry);
4010        if (!IS_ERR(dentry)) {
4011                /* Why not before? Because we want correct error value */
4012                if (last.name[last.len])
4013                        goto slashes;
4014                inode = dentry->d_inode;
4015                if (d_is_negative(dentry))
4016                        goto slashes;
4017                ihold(inode);
4018                error = security_path_unlink(&path, dentry);
4019                if (error)
4020                        goto exit2;
4021                error = vfs_unlink(path.dentry->d_inode, dentry, &delegated_inode);
4022exit2:
4023                dput(dentry);
4024        }
4025        inode_unlock(path.dentry->d_inode);
4026        if (inode)
4027                iput(inode);    /* truncate the inode here */
4028        inode = NULL;
4029        if (delegated_inode) {
4030                error = break_deleg_wait(&delegated_inode);
4031                if (!error)
4032                        goto retry_deleg;
4033        }
4034        mnt_drop_write(path.mnt);
4035exit1:
4036        path_put(&path);
4037        putname(name);
4038        if (retry_estale(error, lookup_flags)) {
4039                lookup_flags |= LOOKUP_REVAL;
4040                inode = NULL;
4041                goto retry;
4042        }
4043        return error;
4044
4045slashes:
4046        if (d_is_negative(dentry))
4047                error = -ENOENT;
4048        else if (d_is_dir(dentry))
4049                error = -EISDIR;
4050        else
4051                error = -ENOTDIR;
4052        goto exit2;
4053}
4054
4055SYSCALL_DEFINE3(unlinkat, int, dfd, const char __user *, pathname, int, flag)
4056{
4057        if ((flag & ~AT_REMOVEDIR) != 0)
4058                return -EINVAL;
4059
4060        if (flag & AT_REMOVEDIR)
4061                return do_rmdir(dfd, pathname);
4062
4063        return do_unlinkat(dfd, pathname);
4064}
4065
4066SYSCALL_DEFINE1(unlink, const char __user *, pathname)
4067{
4068        return do_unlinkat(AT_FDCWD, pathname);
4069}
4070
4071int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname)
4072{
4073        int error = may_create(dir, dentry);
4074
4075        if (error)
4076                return error;
4077
4078        if (!dir->i_op->symlink)
4079                return -EPERM;
4080
4081        error = security_inode_symlink(dir, dentry, oldname);
4082        if (error)
4083                return error;
4084
4085        error = dir->i_op->symlink(dir, dentry, oldname);
4086        if (!error)
4087                fsnotify_create(dir, dentry);
4088        return error;
4089}
4090EXPORT_SYMBOL(vfs_symlink);
4091
4092SYSCALL_DEFINE3(symlinkat, const char __user *, oldname,
4093                int, newdfd, const char __user *, newname)
4094{
4095        int error;
4096        struct filename *from;
4097        struct dentry *dentry;
4098        struct path path;
4099        unsigned int lookup_flags = 0;
4100
4101        from = getname(oldname);
4102        if (IS_ERR(from))
4103                return PTR_ERR(from);
4104retry:
4105        dentry = user_path_create(newdfd, newname, &path, lookup_flags);
4106        error = PTR_ERR(dentry);
4107        if (IS_ERR(dentry))
4108                goto out_putname;
4109
4110        error = security_path_symlink(&path, dentry, from->name);
4111        if (!error)
4112                error = vfs_symlink(path.dentry->d_inode, dentry, from->name);
4113        done_path_create(&path, dentry);
4114        if (retry_estale(error, lookup_flags)) {
4115                lookup_flags |= LOOKUP_REVAL;
4116                goto retry;
4117        }
4118out_putname:
4119        putname(from);
4120        return error;
4121}
4122
4123SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newname)
4124{
4125        return sys_symlinkat(oldname, AT_FDCWD, newname);
4126}
4127
4128/**
4129 * vfs_link - create a new link
4130 * @old_dentry: object to be linked
4131 * @dir:        new parent
4132 * @new_dentry: where to create the new link
4133 * @delegated_inode: returns inode needing a delegation break
4134 *
4135 * The caller must hold dir->i_mutex
4136 *
4137 * If vfs_link discovers a delegation on the to-be-linked file in need
4138 * of breaking, it will return -EWOULDBLOCK and return a reference to the
4139 * inode in delegated_inode.  The caller should then break the delegation
4140 * and retry.  Because breaking a delegation may take a long time, the
4141 * caller should drop the i_mutex before doing so.
4142 *
4143 * Alternatively, a caller may pass NULL for delegated_inode.  This may
4144 * be appropriate for callers that expect the underlying filesystem not
4145 * to be NFS exported.
4146 */
4147int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry, struct inode **delegated_inode)
4148{
4149        struct inode *inode = old_dentry->d_inode;
4150        unsigned max_links = dir->i_sb->s_max_links;
4151        int error;
4152
4153        if (!inode)
4154                return -ENOENT;
4155
4156        error = may_create(dir, new_dentry);
4157        if (error)
4158                return error;
4159
4160        if (dir->i_sb != inode->i_sb)
4161                return -EXDEV;
4162
4163        /*
4164         * A link to an append-only or immutable file cannot be created.
4165         */
4166        if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
4167                return -EPERM;
4168        /*
4169         * Updating the link count will likely cause i_uid and i_gid to
4170         * be writen back improperly if their true value is unknown to
4171         * the vfs.
4172         */
4173        if (HAS_UNMAPPED_ID(inode))
4174                return -EPERM;
4175        if (!dir->i_op->link)
4176                return -EPERM;
4177        if (S_ISDIR(inode->i_mode))
4178                return -EPERM;
4179
4180        error = security_inode_link(old_dentry, dir, new_dentry);
4181        if (error)
4182                return error;
4183
4184        inode_lock(inode);
4185        /* Make sure we don't allow creating hardlink to an unlinked file */
4186        if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE))
4187                error =  -ENOENT;
4188        else if (max_links && inode->i_nlink >= max_links)
4189                error = -EMLINK;
4190        else {
4191                error = try_break_deleg(inode, delegated_inode);
4192                if (!error)
4193                        error = dir->i_op->link(old_dentry, dir, new_dentry);
4194        }
4195
4196        if (!error && (inode->i_state & I_LINKABLE)) {
4197                spin_lock(&inode->i_lock);
4198                inode->i_state &= ~I_LINKABLE;
4199                spin_unlock(&inode->i_lock);
4200        }
4201        inode_unlock(inode);
4202        if (!error)
4203                fsnotify_link(dir, inode, new_dentry);
4204        return error;
4205}
4206EXPORT_SYMBOL(vfs_link);
4207
4208/*
4209 * Hardlinks are often used in delicate situations.  We avoid
4210 * security-related surprises by not following symlinks on the
4211 * newname.  --KAB
4212 *
4213 * We don't follow them on the oldname either to be compatible
4214 * with linux 2.0, and to avoid hard-linking to directories
4215 * and other special files.  --ADM
4216 */
4217SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
4218                int, newdfd, const char __user *, newname, int, flags)
4219{
4220        struct dentry *new_dentry;
4221        struct path old_path, new_path;
4222        struct inode *delegated_inode = NULL;
4223        int how = 0;
4224        int error;
4225
4226        if ((flags & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0)
4227                return -EINVAL;
4228        /*
4229         * To use null names we require CAP_DAC_READ_SEARCH
4230         * This ensures that not everyone will be able to create
4231         * handlink using the passed filedescriptor.
4232         */
4233        if (flags & AT_EMPTY_PATH) {
4234                if (!capable(CAP_DAC_READ_SEARCH))
4235                        return -ENOENT;
4236                how = LOOKUP_EMPTY;
4237        }
4238
4239        if (flags & AT_SYMLINK_FOLLOW)
4240                how |= LOOKUP_FOLLOW;
4241retry:
4242        error = user_path_at(olddfd, oldname, how, &old_path);
4243        if (error)
4244                return error;
4245
4246        new_dentry = user_path_create(newdfd, newname, &new_path,
4247                                        (how & LOOKUP_REVAL));
4248        error = PTR_ERR(new_dentry);
4249        if (IS_ERR(new_dentry))
4250                goto out;
4251
4252        error = -EXDEV;
4253        if (old_path.mnt != new_path.mnt)
4254                goto out_dput;
4255        error = may_linkat(&old_path);
4256        if (unlikely(error))
4257                goto out_dput;
4258        error = security_path_link(old_path.dentry, &new_path, new_dentry);
4259        if (error)
4260                goto out_dput;
4261        error = vfs_link(old_path.dentry, new_path.dentry->d_inode, new_dentry, &delegated_inode);
4262out_dput:
4263        done_path_create(&new_path, new_dentry);
4264        if (delegated_inode) {
4265                error = break_deleg_wait(&delegated_inode);
4266                if (!error) {
4267                        path_put(&old_path);
4268                        goto retry;
4269                }
4270        }
4271        if (retry_estale(error, how)) {
4272                path_put(&old_path);
4273                how |= LOOKUP_REVAL;
4274                goto retry;
4275        }
4276out:
4277        path_put(&old_path);
4278
4279        return error;
4280}
4281
4282SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname)
4283{
4284        return sys_linkat(AT_FDCWD, oldname, AT_FDCWD, newname, 0);
4285}
4286
4287/**
4288 * vfs_rename - rename a filesystem object
4289 * @old_dir:    parent of source
4290 * @old_dentry: source
4291 * @new_dir:    parent of destination
4292 * @new_dentry: destination
4293 * @delegated_inode: returns an inode needing a delegation break
4294 * @flags:      rename flags
4295 *
4296 * The caller must hold multiple mutexes--see lock_rename()).
4297 *
4298 * If vfs_rename discovers a delegation in need of breaking at either
4299 * the source or destination, it will return -EWOULDBLOCK and return a
4300 * reference to the inode in delegated_inode.  The caller should then
4301 * break the delegation and retry.  Because breaking a delegation may
4302 * take a long time, the caller should drop all locks before doing
4303 * so.
4304 *
4305 * Alternatively, a caller may pass NULL for delegated_inode.  This may
4306 * be appropriate for callers that expect the underlying filesystem not
4307 * to be NFS exported.
4308 *
4309 * The worst of all namespace operations - renaming directory. "Perverted"
4310 * doesn't even start to describe it. Somebody in UCB had a heck of a trip...
4311 * Problems:
4312 *      a) we can get into loop creation.
4313 *      b) race potential - two innocent renames can create a loop together.
4314 *         That's where 4.4 screws up. Current fix: serialization on
4315 *         sb->s_vfs_rename_mutex. We might be more accurate, but that's another
4316 *         story.
4317 *      c) we have to lock _four_ objects - parents and victim (if it exists),
4318 *         and source (if it is not a directory).
4319 *         And that - after we got ->i_mutex on parents (until then we don't know
4320 *         whether the target exists).  Solution: try to be smart with locking
4321 *         order for inodes.  We rely on the fact that tree topology may change
4322 *         only under ->s_vfs_rename_mutex _and_ that parent of the object we
4323 *         move will be locked.  Thus we can rank directories by the tree
4324 *         (ancestors first) and rank all non-directories after them.
4325 *         That works since everybody except rename does "lock parent, lookup,
4326 *         lock child" and rename is under ->s_vfs_rename_mutex.
4327 *         HOWEVER, it relies on the assumption that any object with ->lookup()
4328 *         has no more than 1 dentry.  If "hybrid" objects will ever appear,
4329 *         we'd better make sure that there's no link(2) for them.
4330 *      d) conversion from fhandle to dentry may come in the wrong moment - when
4331 *         we are removing the target. Solution: we will have to grab ->i_mutex
4332 *         in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on
4333 *         ->i_mutex on parents, which works but leads to some truly excessive
4334 *         locking].
4335 */
4336int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
4337               struct inode *new_dir, struct dentry *new_dentry,
4338               struct inode **delegated_inode, unsigned int flags)
4339{
4340        int error;
4341        bool is_dir = d_is_dir(old_dentry);
4342        const unsigned char *old_name;
4343        struct inode *source = old_dentry->d_inode;
4344        struct inode *target = new_dentry->d_inode;
4345        bool new_is_dir = false;
4346        unsigned max_links = new_dir->i_sb->s_max_links;
4347
4348        /*
4349         * Check source == target.
4350         * On overlayfs need to look at underlying inodes.
4351         */
4352        if (d_real_inode(old_dentry) == d_real_inode(new_dentry))
4353                return 0;
4354
4355        error = may_delete(old_dir, old_dentry, is_dir);
4356        if (error)
4357                return error;
4358
4359        if (!target) {
4360                error = may_create(new_dir, new_dentry);
4361        } else {
4362                new_is_dir = d_is_dir(new_dentry);
4363
4364                if (!(flags & RENAME_EXCHANGE))
4365                        error = may_delete(new_dir, new_dentry, is_dir);
4366                else
4367                        error = may_delete(new_dir, new_dentry, new_is_dir);
4368        }
4369        if (error)
4370                return error;
4371
4372        if (!old_dir->i_op->rename)
4373                return -EPERM;
4374
4375        /*
4376         * If we are going to change the parent - check write permissions,
4377         * we'll need to flip '..'.
4378         */
4379        if (new_dir != old_dir) {
4380                if (is_dir) {
4381                        error = inode_permission(source, MAY_WRITE);
4382                        if (error)
4383                                return error;
4384                }
4385                if ((flags & RENAME_EXCHANGE) && new_is_dir) {
4386                        error = inode_permission(target, MAY_WRITE);
4387                        if (error)
4388                                return error;
4389                }
4390        }
4391
4392        error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry,
4393                                      flags);
4394        if (error)
4395                return error;
4396
4397        old_name = fsnotify_oldname_init(old_dentry->d_name.name);
4398        dget(new_dentry);
4399        if (!is_dir || (flags & RENAME_EXCHANGE))
4400                lock_two_nondirectories(source, target);
4401        else if (target)
4402                inode_lock(target);
4403
4404        error = -EBUSY;
4405        if (is_local_mountpoint(old_dentry) || is_local_mountpoint(new_dentry))
4406                goto out;
4407
4408        if (max_links && new_dir != old_dir) {
4409                error = -EMLINK;
4410                if (is_dir && !new_is_dir && new_dir->i_nlink >= max_links)
4411                        goto out;
4412                if ((flags & RENAME_EXCHANGE) && !is_dir && new_is_dir &&
4413                    old_dir->i_nlink >= max_links)
4414                        goto out;
4415        }
4416        if (is_dir && !(flags & RENAME_EXCHANGE) && target)
4417                shrink_dcache_parent(new_dentry);
4418        if (!is_dir) {
4419                error = try_break_deleg(source, delegated_inode);
4420                if (error)
4421                        goto out;
4422        }
4423        if (target && !new_is_dir) {
4424                error = try_break_deleg(target, delegated_inode);
4425                if (error)
4426                        goto out;
4427        }
4428        error = old_dir->i_op->rename(old_dir, old_dentry,
4429                                       new_dir, new_dentry, flags);
4430        if (error)
4431                goto out;
4432
4433        if (!(flags & RENAME_EXCHANGE) && target) {
4434                if (is_dir)
4435                        target->i_flags |= S_DEAD;
4436                dont_mount(new_dentry);
4437                detach_mounts(new_dentry);
4438        }
4439        if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) {
4440                if (!(flags & RENAME_EXCHANGE))
4441                        d_move(old_dentry, new_dentry);
4442                else
4443                        d_exchange(old_dentry, new_dentry);
4444        }
4445out:
4446        if (!is_dir || (flags & RENAME_EXCHANGE))
4447                unlock_two_nondirectories(source, target);
4448        else if (target)
4449                inode_unlock(target);
4450        dput(new_dentry);
4451        if (!error) {
4452                fsnotify_move(old_dir, new_dir, old_name, is_dir,
4453                              !(flags & RENAME_EXCHANGE) ? target : NULL, old_dentry);
4454                if (flags & RENAME_EXCHANGE) {
4455                        fsnotify_move(new_dir, old_dir, old_dentry->d_name.name,
4456                                      new_is_dir, NULL, new_dentry);
4457                }
4458        }
4459        fsnotify_oldname_free(old_name);
4460
4461        return error;
4462}
4463EXPORT_SYMBOL(vfs_rename);
4464
4465SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname,
4466                int, newdfd, const char __user *, newname, unsigned int, flags)
4467{
4468        struct dentry *old_dentry, *new_dentry;
4469        struct dentry *trap;
4470        struct path old_path, new_path;
4471        struct qstr old_last, new_last;
4472        int old_type, new_type;
4473        struct inode *delegated_inode = NULL;
4474        struct filename *from;
4475        struct filename *to;
4476        unsigned int lookup_flags = 0, target_flags = LOOKUP_RENAME_TARGET;
4477        bool should_retry = false;
4478        int error;
4479
4480        if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
4481                return -EINVAL;
4482
4483        if ((flags & (RENAME_NOREPLACE | RENAME_WHITEOUT)) &&
4484            (flags & RENAME_EXCHANGE))
4485                return -EINVAL;
4486
4487        if ((flags & RENAME_WHITEOUT) && !capable(CAP_MKNOD))
4488                return -EPERM;
4489
4490        if (flags & RENAME_EXCHANGE)
4491                target_flags = 0;
4492
4493retry:
4494        from = user_path_parent(olddfd, oldname,
4495                                &old_path, &old_last, &old_type, lookup_flags);
4496        if (IS_ERR(from)) {
4497                error = PTR_ERR(from);
4498                goto exit;
4499        }
4500
4501        to = user_path_parent(newdfd, newname,
4502                                &new_path, &new_last, &new_type, lookup_flags);
4503        if (IS_ERR(to)) {
4504                error = PTR_ERR(to);
4505                goto exit1;
4506        }
4507
4508        error = -EXDEV;
4509        if (old_path.mnt != new_path.mnt)
4510                goto exit2;
4511
4512        error = -EBUSY;
4513        if (old_type != LAST_NORM)
4514                goto exit2;
4515
4516        if (flags & RENAME_NOREPLACE)
4517                error = -EEXIST;
4518        if (new_type != LAST_NORM)
4519                goto exit2;
4520
4521        error = mnt_want_write(old_path.mnt);
4522        if (error)
4523                goto exit2;
4524
4525retry_deleg:
4526        trap = lock_rename(new_path.dentry, old_path.dentry);
4527
4528        old_dentry = __lookup_hash(&old_last, old_path.dentry, lookup_flags);
4529        error = PTR_ERR(old_dentry);
4530        if (IS_ERR(old_dentry))
4531                goto exit3;
4532        /* source must exist */
4533        error = -ENOENT;
4534        if (d_is_negative(old_dentry))
4535                goto exit4;
4536        new_dentry = __lookup_hash(&new_last, new_path.dentry, lookup_flags | target_flags);
4537        error = PTR_ERR(new_dentry);
4538        if (IS_ERR(new_dentry))
4539                goto exit4;
4540        error = -EEXIST;
4541        if ((flags & RENAME_NOREPLACE) && d_is_positive(new_dentry))
4542                goto exit5;
4543        if (flags & RENAME_EXCHANGE) {
4544                error = -ENOENT;
4545                if (d_is_negative(new_dentry))
4546                        goto exit5;
4547
4548                if (!d_is_dir(new_dentry)) {
4549                        error = -ENOTDIR;
4550                        if (new_last.name[new_last.len])
4551                                goto exit5;
4552                }
4553        }
4554        /* unless the source is a directory trailing slashes give -ENOTDIR */
4555        if (!d_is_dir(old_dentry)) {
4556                error = -ENOTDIR;
4557                if (old_last.name[old_last.len])
4558                        goto exit5;
4559                if (!(flags & RENAME_EXCHANGE) && new_last.name[new_last.len])
4560                        goto exit5;
4561        }
4562        /* source should not be ancestor of target */
4563        error = -EINVAL;
4564        if (old_dentry == trap)
4565                goto exit5;
4566        /* target should not be an ancestor of source */
4567        if (!(flags & RENAME_EXCHANGE))
4568                error = -ENOTEMPTY;
4569        if (new_dentry == trap)
4570                goto exit5;
4571
4572        error = security_path_rename(&old_path, old_dentry,
4573                                     &new_path, new_dentry, flags);
4574        if (error)
4575                goto exit5;
4576        error = vfs_rename(old_path.dentry->d_inode, old_dentry,
4577                           new_path.dentry->d_inode, new_dentry,
4578                           &delegated_inode, flags);
4579exit5:
4580        dput(new_dentry);
4581exit4:
4582        dput(old_dentry);
4583exit3:
4584        unlock_rename(new_path.dentry, old_path.dentry);
4585        if (delegated_inode) {
4586                error = break_deleg_wait(&delegated_inode);
4587                if (!error)
4588                        goto retry_deleg;
4589        }
4590        mnt_drop_write(old_path.mnt);
4591exit2:
4592        if (retry_estale(error, lookup_flags))
4593                should_retry = true;
4594        path_put(&new_path);
4595        putname(to);
4596exit1:
4597        path_put(&old_path);
4598        putname(from);
4599        if (should_retry) {
4600                should_retry = false;
4601                lookup_flags |= LOOKUP_REVAL;
4602                goto retry;
4603        }
4604exit:
4605        return error;
4606}
4607
4608SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,
4609                int, newdfd, const char __user *, newname)
4610{
4611        return sys_renameat2(olddfd, oldname, newdfd, newname, 0);
4612}
4613
4614SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newname)
4615{
4616        return sys_renameat2(AT_FDCWD, oldname, AT_FDCWD, newname, 0);
4617}
4618
4619int vfs_whiteout(struct inode *dir, struct dentry *dentry)
4620{
4621        int error = may_create(dir, dentry);
4622        if (error)
4623                return error;
4624
4625        if (!dir->i_op->mknod)
4626                return -EPERM;
4627
4628        return dir->i_op->mknod(dir, dentry,
4629                                S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV);
4630}
4631EXPORT_SYMBOL(vfs_whiteout);
4632
4633int readlink_copy(char __user *buffer, int buflen, const char *link)
4634{
4635        int len = PTR_ERR(link);
4636        if (IS_ERR(link))
4637                goto out;
4638
4639        len = strlen(link);
4640        if (len > (unsigned) buflen)
4641                len = buflen;
4642        if (copy_to_user(buffer, link, len))
4643                len = -EFAULT;
4644out:
4645        return len;
4646}
4647
4648/*
4649 * A helper for ->readlink().  This should be used *ONLY* for symlinks that
4650 * have ->get_link() not calling nd_jump_link().  Using (or not using) it
4651 * for any given inode is up to filesystem.
4652 */
4653int generic_readlink(struct dentry *dentry, char __user *buffer, int buflen)
4654{
4655        DEFINE_DELAYED_CALL(done);
4656        struct inode *inode = d_inode(dentry);
4657        const char *link = inode->i_link;
4658        int res;
4659
4660        if (!link) {
4661                link = inode->i_op->get_link(dentry, inode, &done);
4662                if (IS_ERR(link))
4663                        return PTR_ERR(link);
4664        }
4665        res = readlink_copy(buffer, buflen, link);
4666        do_delayed_call(&done);
4667        return res;
4668}
4669EXPORT_SYMBOL(generic_readlink);
4670
4671/**
4672 * vfs_get_link - get symlink body
4673 * @dentry: dentry on which to get symbolic link
4674 * @done: caller needs to free returned data with this
4675 *
4676 * Calls security hook and i_op->get_link() on the supplied inode.
4677 *
4678 * It does not touch atime.  That's up to the caller if necessary.
4679 *
4680 * Does not work on "special" symlinks like /proc/$$/fd/N
4681 */
4682const char *vfs_get_link(struct dentry *dentry, struct delayed_call *done)
4683{
4684        const char *res = ERR_PTR(-EINVAL);
4685        struct inode *inode = d_inode(dentry);
4686
4687        if (d_is_symlink(dentry)) {
4688                res = ERR_PTR(security_inode_readlink(dentry));
4689                if (!res)
4690                        res = inode->i_op->get_link(dentry, inode, done);
4691        }
4692        return res;
4693}
4694EXPORT_SYMBOL(vfs_get_link);
4695
4696/* get the link contents into pagecache */
4697const char *page_get_link(struct dentry *dentry, struct inode *inode,
4698                          struct delayed_call *callback)
4699{
4700        char *kaddr;
4701        struct page *page;
4702        struct address_space *mapping = inode->i_mapping;
4703
4704        if (!dentry) {
4705                page = find_get_page(mapping, 0);
4706                if (!page)
4707                        return ERR_PTR(-ECHILD);
4708                if (!PageUptodate(page)) {
4709                        put_page(page);
4710                        return ERR_PTR(-ECHILD);
4711                }
4712        } else {
4713                page = read_mapping_page(mapping, 0, NULL);
4714                if (IS_ERR(page))
4715                        return (char*)page;
4716        }
4717        set_delayed_call(callback, page_put_link, page);
4718        BUG_ON(mapping_gfp_mask(mapping) & __GFP_HIGHMEM);
4719        kaddr = page_address(page);
4720        nd_terminate_link(kaddr, inode->i_size, PAGE_SIZE - 1);
4721        return kaddr;
4722}
4723
4724EXPORT_SYMBOL(page_get_link);
4725
/* Delayed-call cleanup for page_get_link(): drop the page reference. */
void page_put_link(void *arg)
{
	struct page *page = arg;

	put_page(page);
}
EXPORT_SYMBOL(page_put_link);
4731
4732int page_readlink(struct dentry *dentry, char __user *buffer, int buflen)
4733{
4734        DEFINE_DELAYED_CALL(done);
4735        int res = readlink_copy(buffer, buflen,
4736                                page_get_link(dentry, d_inode(dentry),
4737                                              &done));
4738        do_delayed_call(&done);
4739        return res;
4740}
4741EXPORT_SYMBOL(page_readlink);
4742
4743/*
4744 * The nofs argument instructs pagecache_write_begin to pass AOP_FLAG_NOFS
4745 */
4746int __page_symlink(struct inode *inode, const char *symname, int len, int nofs)
4747{
4748        struct address_space *mapping = inode->i_mapping;
4749        struct page *page;
4750        void *fsdata;
4751        int err;
4752        unsigned int flags = AOP_FLAG_UNINTERRUPTIBLE;
4753        if (nofs)
4754                flags |= AOP_FLAG_NOFS;
4755
4756retry:
4757        err = pagecache_write_begin(NULL, mapping, 0, len-1,
4758                                flags, &page, &fsdata);
4759        if (err)
4760                goto fail;
4761
4762        memcpy(page_address(page), symname, len-1);
4763
4764        err = pagecache_write_end(NULL, mapping, 0, len-1, len-1,
4765                                                        page, fsdata);
4766        if (err < 0)
4767                goto fail;
4768        if (err < len-1)
4769                goto retry;
4770
4771        mark_inode_dirty(inode);
4772        return 0;
4773fail:
4774        return err;
4775}
4776EXPORT_SYMBOL(__page_symlink);
4777
4778int page_symlink(struct inode *inode, const char *symname, int len)
4779{
4780        return __page_symlink(inode, symname, len,
4781                        !mapping_gfp_constraint(inode->i_mapping, __GFP_FS));
4782}
4783EXPORT_SYMBOL(page_symlink);
4784
4785const struct inode_operations page_symlink_inode_operations = {
4786        .readlink       = generic_readlink,
4787        .get_link       = page_get_link,
4788};
4789EXPORT_SYMBOL(page_symlink_inode_operations);
4790