linux/fs/namei.c
<<
>>
Prefs
   1/*
   2 *  linux/fs/namei.c
   3 *
   4 *  Copyright (C) 1991, 1992  Linus Torvalds
   5 */
   6
   7/*
   8 * Some corrections by tytso.
   9 */
  10
  11/* [Feb 1997 T. Schoebel-Theuer] Complete rewrite of the pathname
  12 * lookup logic.
  13 */
  14/* [Feb-Apr 2000, AV] Rewrite to the new namespace architecture.
  15 */
  16
  17#include <linux/init.h>
  18#include <linux/export.h>
  19#include <linux/kernel.h>
  20#include <linux/slab.h>
  21#include <linux/fs.h>
  22#include <linux/namei.h>
  23#include <linux/pagemap.h>
  24#include <linux/fsnotify.h>
  25#include <linux/personality.h>
  26#include <linux/security.h>
  27#include <linux/ima.h>
  28#include <linux/syscalls.h>
  29#include <linux/mount.h>
  30#include <linux/audit.h>
  31#include <linux/capability.h>
  32#include <linux/file.h>
  33#include <linux/fcntl.h>
  34#include <linux/device_cgroup.h>
  35#include <linux/fs_struct.h>
  36#include <linux/posix_acl.h>
  37#include <linux/hash.h>
  38#include <linux/bitops.h>
  39#include <linux/init_task.h>
  40#include <linux/uaccess.h>
  41
  42#include "internal.h"
  43#include "mount.h"
  44
  45/* [Feb-1997 T. Schoebel-Theuer]
  46 * Fundamental changes in the pathname lookup mechanisms (namei)
  47 * were necessary because of omirr.  The reason is that omirr needs
  48 * to know the _real_ pathname, not the user-supplied one, in case
  49 * of symlinks (and also when transname replacements occur).
  50 *
  51 * The new code replaces the old recursive symlink resolution with
  52 * an iterative one (in case of non-nested symlink chains).  It does
  53 * this with calls to <fs>_follow_link().
  54 * As a side effect, dir_namei(), _namei() and follow_link() are now 
  55 * replaced with a single function lookup_dentry() that can handle all 
  56 * the special cases of the former code.
  57 *
  58 * With the new dcache, the pathname is stored at each inode, at least as
  59 * long as the refcount of the inode is positive.  As a side effect, the
  60 * size of the dcache depends on the inode cache and thus is dynamic.
  61 *
  62 * [29-Apr-1998 C. Scott Ananian] Updated above description of symlink
  63 * resolution to correspond with current state of the code.
  64 *
  65 * Note that the symlink resolution is not *completely* iterative.
  66 * There is still a significant amount of tail- and mid- recursion in
  67 * the algorithm.  Also, note that <fs>_readlink() is not used in
  68 * lookup_dentry(): lookup_dentry() on the result of <fs>_readlink()
  69 * may return different results than <fs>_follow_link().  Many virtual
  70 * filesystems (including /proc) exhibit this behavior.
  71 */
  72
  73/* [24-Feb-97 T. Schoebel-Theuer] Side effects caused by new implementation:
  74 * New symlink semantics: when open() is called with flags O_CREAT | O_EXCL
  75 * and the name already exists in form of a symlink, try to create the new
  76 * name indicated by the symlink. The old code always complained that the
  77 * name already exists, due to not following the symlink even if its target
  78 * is nonexistent.  The new semantics affects also mknod() and link() when
  79 * the name is a symlink pointing to a non-existent name.
  80 *
  81 * I don't know which semantics is the right one, since I have no access
  82 * to standards. But I found by trial that HP-UX 9.0 has the full "new"
  83 * semantics implemented, while SunOS 4.1.1 and Solaris (SunOS 5.4) have the
  84 * "old" one. Personally, I think the new semantics is much more logical.
  85 * Note that "ln old new" where "new" is a symlink pointing to a non-existing
   86 * file does succeed in both HP-UX and SunOS, but not in Solaris
  87 * and in the old Linux semantics.
  88 */
  89
  90/* [16-Dec-97 Kevin Buhr] For security reasons, we change some symlink
  91 * semantics.  See the comments in "open_namei" and "do_link" below.
  92 *
  93 * [10-Sep-98 Alan Modra] Another symlink change.
  94 */
  95
  96/* [Feb-Apr 2000 AV] Complete rewrite. Rules for symlinks:
  97 *      inside the path - always follow.
  98 *      in the last component in creation/removal/renaming - never follow.
  99 *      if LOOKUP_FOLLOW passed - follow.
 100 *      if the pathname has trailing slashes - follow.
 101 *      otherwise - don't follow.
 102 * (applied in that order).
 103 *
 104 * [Jun 2000 AV] Inconsistent behaviour of open() in case if flags==O_CREAT
 105 * restored for 2.4. This is the last surviving part of old 4.2BSD bug.
 106 * During the 2.4 we need to fix the userland stuff depending on it -
 107 * hopefully we will be able to get rid of that wart in 2.5. So far only
 108 * XEmacs seems to be relying on it...
 109 */
 110/*
 111 * [Sep 2001 AV] Single-semaphore locking scheme (kudos to David Holland)
 112 * implemented.  Let's see if raised priority of ->s_vfs_rename_mutex gives
 113 * any extra contention...
 114 */
 115
 116/* In order to reduce some races, while at the same time doing additional
 117 * checking and hopefully speeding things up, we copy filenames to the
  118 * kernel data space before using them.
 119 *
 120 * POSIX.1 2.4: an empty pathname is invalid (ENOENT).
 121 * PATH_MAX includes the nul terminator --RR.
 122 */
 123
/*
 * Number of pathname bytes available when the name is embedded in the same
 * buffer as its struct filename: PATH_MAX less the offset of ->iname.
 */
#define EMBEDDED_NAME_MAX       (PATH_MAX - offsetof(struct filename, iname))
 125
 126struct filename *
 127getname_flags(const char __user *filename, int flags, int *empty)
 128{
 129        struct filename *result;
 130        char *kname;
 131        int len;
 132
 133        result = audit_reusename(filename);
 134        if (result)
 135                return result;
 136
 137        result = __getname();
 138        if (unlikely(!result))
 139                return ERR_PTR(-ENOMEM);
 140
 141        /*
 142         * First, try to embed the struct filename inside the names_cache
 143         * allocation
 144         */
 145        kname = (char *)result->iname;
 146        result->name = kname;
 147
 148        len = strncpy_from_user(kname, filename, EMBEDDED_NAME_MAX);
 149        if (unlikely(len < 0)) {
 150                __putname(result);
 151                return ERR_PTR(len);
 152        }
 153
 154        /*
 155         * Uh-oh. We have a name that's approaching PATH_MAX. Allocate a
 156         * separate struct filename so we can dedicate the entire
 157         * names_cache allocation for the pathname, and re-do the copy from
 158         * userland.
 159         */
 160        if (unlikely(len == EMBEDDED_NAME_MAX)) {
 161                const size_t size = offsetof(struct filename, iname[1]);
 162                kname = (char *)result;
 163
 164                /*
 165                 * size is chosen that way we to guarantee that
 166                 * result->iname[0] is within the same object and that
 167                 * kname can't be equal to result->iname, no matter what.
 168                 */
 169                result = kzalloc(size, GFP_KERNEL);
 170                if (unlikely(!result)) {
 171                        __putname(kname);
 172                        return ERR_PTR(-ENOMEM);
 173                }
 174                result->name = kname;
 175                len = strncpy_from_user(kname, filename, PATH_MAX);
 176                if (unlikely(len < 0)) {
 177                        __putname(kname);
 178                        kfree(result);
 179                        return ERR_PTR(len);
 180                }
 181                if (unlikely(len == PATH_MAX)) {
 182                        __putname(kname);
 183                        kfree(result);
 184                        return ERR_PTR(-ENAMETOOLONG);
 185                }
 186        }
 187
 188        result->refcnt = 1;
 189        /* The empty path is special. */
 190        if (unlikely(!len)) {
 191                if (empty)
 192                        *empty = 1;
 193                if (!(flags & LOOKUP_EMPTY)) {
 194                        putname(result);
 195                        return ERR_PTR(-ENOENT);
 196                }
 197        }
 198
 199        result->uptr = filename;
 200        result->aname = NULL;
 201        audit_getname(result);
 202        return result;
 203}
 204
 205struct filename *
 206getname(const char __user * filename)
 207{
 208        return getname_flags(filename, 0, NULL);
 209}
 210
 211struct filename *
 212getname_kernel(const char * filename)
 213{
 214        struct filename *result;
 215        int len = strlen(filename) + 1;
 216
 217        result = __getname();
 218        if (unlikely(!result))
 219                return ERR_PTR(-ENOMEM);
 220
 221        if (len <= EMBEDDED_NAME_MAX) {
 222                result->name = (char *)result->iname;
 223        } else if (len <= PATH_MAX) {
 224                struct filename *tmp;
 225
 226                tmp = kmalloc(sizeof(*tmp), GFP_KERNEL);
 227                if (unlikely(!tmp)) {
 228                        __putname(result);
 229                        return ERR_PTR(-ENOMEM);
 230                }
 231                tmp->name = (char *)result;
 232                result = tmp;
 233        } else {
 234                __putname(result);
 235                return ERR_PTR(-ENAMETOOLONG);
 236        }
 237        memcpy((char *)result->name, filename, len);
 238        result->uptr = NULL;
 239        result->aname = NULL;
 240        result->refcnt = 1;
 241        audit_getname(result);
 242
 243        return result;
 244}
 245
 246void putname(struct filename *name)
 247{
 248        BUG_ON(name->refcnt <= 0);
 249
 250        if (--name->refcnt > 0)
 251                return;
 252
 253        if (name->name != name->iname) {
 254                __putname(name->name);
 255                kfree(name);
 256        } else
 257                __putname(name);
 258}
 259
 260static int check_acl(struct inode *inode, int mask)
 261{
 262#ifdef CONFIG_FS_POSIX_ACL
 263        struct posix_acl *acl;
 264
 265        if (mask & MAY_NOT_BLOCK) {
 266                acl = get_cached_acl_rcu(inode, ACL_TYPE_ACCESS);
 267                if (!acl)
 268                        return -EAGAIN;
 269                /* no ->get_acl() calls in RCU mode... */
 270                if (is_uncached_acl(acl))
 271                        return -ECHILD;
 272                return posix_acl_permission(inode, acl, mask & ~MAY_NOT_BLOCK);
 273        }
 274
 275        acl = get_acl(inode, ACL_TYPE_ACCESS);
 276        if (IS_ERR(acl))
 277                return PTR_ERR(acl);
 278        if (acl) {
 279                int error = posix_acl_permission(inode, acl, mask);
 280                posix_acl_release(acl);
 281                return error;
 282        }
 283#endif
 284
 285        return -EAGAIN;
 286}
 287
 288/*
 289 * This does the basic permission checking
 290 */
 291static int acl_permission_check(struct inode *inode, int mask)
 292{
 293        unsigned int mode = inode->i_mode;
 294
 295        if (likely(uid_eq(current_fsuid(), inode->i_uid)))
 296                mode >>= 6;
 297        else {
 298                if (IS_POSIXACL(inode) && (mode & S_IRWXG)) {
 299                        int error = check_acl(inode, mask);
 300                        if (error != -EAGAIN)
 301                                return error;
 302                }
 303
 304                if (in_group_p(inode->i_gid))
 305                        mode >>= 3;
 306        }
 307
 308        /*
 309         * If the DACs are ok we don't need any capability check.
 310         */
 311        if ((mask & ~mode & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0)
 312                return 0;
 313        return -EACCES;
 314}
 315
 316/**
 317 * generic_permission -  check for access rights on a Posix-like filesystem
 318 * @inode:      inode to check access rights for
 319 * @mask:       right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC, ...)
 320 *
 321 * Used to check for read/write/execute permissions on a file.
 322 * We use "fsuid" for this, letting us set arbitrary permissions
 323 * for filesystem access without changing the "normal" uids which
 324 * are used for other things.
 325 *
 326 * generic_permission is rcu-walk aware. It returns -ECHILD in case an rcu-walk
 327 * request cannot be satisfied (eg. requires blocking or too much complexity).
 328 * It would then be called again in ref-walk mode.
 329 */
 330int generic_permission(struct inode *inode, int mask)
 331{
 332        int ret;
 333
 334        /*
 335         * Do the basic permission checks.
 336         */
 337        ret = acl_permission_check(inode, mask);
 338        if (ret != -EACCES)
 339                return ret;
 340
 341        if (S_ISDIR(inode->i_mode)) {
 342                /* DACs are overridable for directories */
 343                if (!(mask & MAY_WRITE))
 344                        if (capable_wrt_inode_uidgid(inode,
 345                                                     CAP_DAC_READ_SEARCH))
 346                                return 0;
 347                if (capable_wrt_inode_uidgid(inode, CAP_DAC_OVERRIDE))
 348                        return 0;
 349                return -EACCES;
 350        }
 351
 352        /*
 353         * Searching includes executable on directories, else just read.
 354         */
 355        mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
 356        if (mask == MAY_READ)
 357                if (capable_wrt_inode_uidgid(inode, CAP_DAC_READ_SEARCH))
 358                        return 0;
 359        /*
 360         * Read/write DACs are always overridable.
 361         * Executable DACs are overridable when there is
 362         * at least one exec bit set.
 363         */
 364        if (!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO))
 365                if (capable_wrt_inode_uidgid(inode, CAP_DAC_OVERRIDE))
 366                        return 0;
 367
 368        return -EACCES;
 369}
 370EXPORT_SYMBOL(generic_permission);
 371
 372/*
 373 * We _really_ want to just do "generic_permission()" without
 374 * even looking at the inode->i_op values. So we keep a cache
 375 * flag in inode->i_opflags, that says "this has not special
 376 * permission function, use the fast case".
 377 */
 378static inline int do_inode_permission(struct inode *inode, int mask)
 379{
 380        if (unlikely(!(inode->i_opflags & IOP_FASTPERM))) {
 381                if (likely(inode->i_op->permission))
 382                        return inode->i_op->permission(inode, mask);
 383
 384                /* This gets set once for the inode lifetime */
 385                spin_lock(&inode->i_lock);
 386                inode->i_opflags |= IOP_FASTPERM;
 387                spin_unlock(&inode->i_lock);
 388        }
 389        return generic_permission(inode, mask);
 390}
 391
 392/**
 393 * __inode_permission - Check for access rights to a given inode
 394 * @inode: Inode to check permission on
 395 * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
 396 *
 397 * Check for read/write/execute permissions on an inode.
 398 *
 399 * When checking for MAY_APPEND, MAY_WRITE must also be set in @mask.
 400 *
 401 * This does not check for a read-only file system.  You probably want
 402 * inode_permission().
 403 */
 404int __inode_permission(struct inode *inode, int mask)
 405{
 406        int retval;
 407
 408        if (unlikely(mask & MAY_WRITE)) {
 409                /*
 410                 * Nobody gets write access to an immutable file.
 411                 */
 412                if (IS_IMMUTABLE(inode))
 413                        return -EPERM;
 414
 415                /*
 416                 * Updating mtime will likely cause i_uid and i_gid to be
 417                 * written back improperly if their true value is unknown
 418                 * to the vfs.
 419                 */
 420                if (HAS_UNMAPPED_ID(inode))
 421                        return -EACCES;
 422        }
 423
 424        retval = do_inode_permission(inode, mask);
 425        if (retval)
 426                return retval;
 427
 428        retval = devcgroup_inode_permission(inode, mask);
 429        if (retval)
 430                return retval;
 431
 432        return security_inode_permission(inode, mask);
 433}
 434EXPORT_SYMBOL(__inode_permission);
 435
 436/**
 437 * sb_permission - Check superblock-level permissions
 438 * @sb: Superblock of inode to check permission on
 439 * @inode: Inode to check permission on
 440 * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
 441 *
 442 * Separate out file-system wide checks from inode-specific permission checks.
 443 */
 444static int sb_permission(struct super_block *sb, struct inode *inode, int mask)
 445{
 446        if (unlikely(mask & MAY_WRITE)) {
 447                umode_t mode = inode->i_mode;
 448
 449                /* Nobody gets write access to a read-only fs. */
 450                if ((sb->s_flags & MS_RDONLY) &&
 451                    (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
 452                        return -EROFS;
 453        }
 454        return 0;
 455}
 456
 457/**
 458 * inode_permission - Check for access rights to a given inode
 459 * @inode: Inode to check permission on
 460 * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
 461 *
 462 * Check for read/write/execute permissions on an inode.  We use fs[ug]id for
 463 * this, letting us set arbitrary permissions for filesystem access without
 464 * changing the "normal" UIDs which are used for other things.
 465 *
 466 * When checking for MAY_APPEND, MAY_WRITE must also be set in @mask.
 467 */
 468int inode_permission(struct inode *inode, int mask)
 469{
 470        int retval;
 471
 472        retval = sb_permission(inode->i_sb, inode, mask);
 473        if (retval)
 474                return retval;
 475        return __inode_permission(inode, mask);
 476}
 477EXPORT_SYMBOL(inode_permission);
 478
 479/**
 480 * path_get - get a reference to a path
 481 * @path: path to get the reference to
 482 *
 483 * Given a path increment the reference count to the dentry and the vfsmount.
 484 */
 485void path_get(const struct path *path)
 486{
 487        mntget(path->mnt);
 488        dget(path->dentry);
 489}
 490EXPORT_SYMBOL(path_get);
 491
 492/**
 493 * path_put - put a reference to a path
 494 * @path: path to put the reference to
 495 *
 496 * Given a path decrement the reference count to the dentry and the vfsmount.
 497 */
 498void path_put(const struct path *path)
 499{
 500        dput(path->dentry);
 501        mntput(path->mnt);
 502}
 503EXPORT_SYMBOL(path_put);
 504
/* Number of symlink stack entries stored inside struct nameidata itself. */
#define EMBEDDED_LEVELS 2
/*
 * Per-walk pathname lookup state.  The instance in use is published through
 * current->nameidata by set_nameidata() and unpublished again by
 * restore_nameidata().
 */
struct nameidata {
        struct path     path;           /* mount + dentry the walk is at */
        struct qstr     last;
        struct path     root;           /* root of the walk; see set_root() */
        struct inode    *inode; /* path.dentry.d_inode */
        unsigned int    flags;          /* LOOKUP_* flags */
        unsigned        seq, m_seq;     /* seq: checked against path.dentry->d_seq
                                         * m_seq: passed to the mount legitimizers */
        int             last_type;
        unsigned        depth;          /* number of stack[] entries in use */
        int             total_link_count; /* carried over from/to the outer nameidata */
        struct saved {
                struct path link;
                struct delayed_call done;       /* run when the entry is dropped */
                const char *name;
                unsigned seq;                   /* checked against link.dentry->d_seq */
        } *stack, internal[EMBEDDED_LEVELS];    /* stack points at internal[] until
                                                 * __nd_alloc_stack() replaces it */
        struct filename *name;          /* as passed to set_nameidata() */
        struct nameidata *saved;        /* previous current->nameidata */
        struct inode    *link_inode;    /* inode examined by may_follow_link() */
        unsigned        root_seq;       /* root.dentry->d_seq sampled by set_root() */
        int             dfd;            /* as passed to set_nameidata() */
} __randomize_layout;
 528
 529static void set_nameidata(struct nameidata *p, int dfd, struct filename *name)
 530{
 531        struct nameidata *old = current->nameidata;
 532        p->stack = p->internal;
 533        p->dfd = dfd;
 534        p->name = name;
 535        p->total_link_count = old ? old->total_link_count : 0;
 536        p->saved = old;
 537        current->nameidata = p;
 538}
 539
 540static void restore_nameidata(void)
 541{
 542        struct nameidata *now = current->nameidata, *old = now->saved;
 543
 544        current->nameidata = old;
 545        if (old)
 546                old->total_link_count = now->total_link_count;
 547        if (now->stack != now->internal)
 548                kfree(now->stack);
 549}
 550
 551static int __nd_alloc_stack(struct nameidata *nd)
 552{
 553        struct saved *p;
 554
 555        if (nd->flags & LOOKUP_RCU) {
 556                p= kmalloc(MAXSYMLINKS * sizeof(struct saved),
 557                                  GFP_ATOMIC);
 558                if (unlikely(!p))
 559                        return -ECHILD;
 560        } else {
 561                p= kmalloc(MAXSYMLINKS * sizeof(struct saved),
 562                                  GFP_KERNEL);
 563                if (unlikely(!p))
 564                        return -ENOMEM;
 565        }
 566        memcpy(p, nd->internal, sizeof(nd->internal));
 567        nd->stack = p;
 568        return 0;
 569}
 570
 571/**
 572 * path_connected - Verify that a path->dentry is below path->mnt.mnt_root
 573 * @path: nameidate to verify
 574 *
 575 * Rename can sometimes move a file or directory outside of a bind
 576 * mount, path_connected allows those cases to be detected.
 577 */
 578static bool path_connected(const struct path *path)
 579{
 580        struct vfsmount *mnt = path->mnt;
 581
 582        /* Only bind mounts can have disconnected paths */
 583        if (mnt->mnt_root == mnt->mnt_sb->s_root)
 584                return true;
 585
 586        return is_subdir(path->dentry, mnt->mnt_root);
 587}
 588
 589static inline int nd_alloc_stack(struct nameidata *nd)
 590{
 591        if (likely(nd->depth != EMBEDDED_LEVELS))
 592                return 0;
 593        if (likely(nd->stack != nd->internal))
 594                return 0;
 595        return __nd_alloc_stack(nd);
 596}
 597
 598static void drop_links(struct nameidata *nd)
 599{
 600        int i = nd->depth;
 601        while (i--) {
 602                struct saved *last = nd->stack + i;
 603                do_delayed_call(&last->done);
 604                clear_delayed_call(&last->done);
 605        }
 606}
 607
/*
 * Tear down the state of a path walk.
 *
 * The delayed calls of all saved symlinks are run first.  In ref-walk mode
 * the references on nd->path, on every saved link and (unless LOOKUP_ROOT
 * is set) on nd->root are dropped.  In rcu-walk mode LOOKUP_RCU is cleared,
 * nd->root.mnt is forgotten (again unless LOOKUP_ROOT is set) and the RCU
 * read-side critical section is left.  nd->depth ends up 0 either way.
 */
static void terminate_walk(struct nameidata *nd)
{
        drop_links(nd);
        if (!(nd->flags & LOOKUP_RCU)) {
                int i;
                path_put(&nd->path);
                for (i = 0; i < nd->depth; i++)
                        path_put(&nd->stack[i].link);
                if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
                        path_put(&nd->root);
                        /* mark the root as no longer held */
                        nd->root.mnt = NULL;
                }
        } else {
                /* no path_put() on this side: only pointers are forgotten */
                nd->flags &= ~LOOKUP_RCU;
                if (!(nd->flags & LOOKUP_ROOT))
                        nd->root.mnt = NULL;
                rcu_read_unlock();
        }
        nd->depth = 0;
}
 628
/*
 * Try to turn the RCU-protected @path into a properly referenced one.
 *
 * Returns true only if __legitimize_mnt() succeeded for path->mnt against
 * nd->m_seq, a reference on path->dentry could be taken, and the dentry's
 * d_seq still matches @seq.
 *
 * path_put is needed afterwards regardless of success or failure.  On the
 * failure paths the members that are set to NULL are presumably the ones
 * for which no reference ended up being held, so that the later path_put()
 * skips them - NOTE(review): confirm against __legitimize_mnt().
 */
static bool legitimize_path(struct nameidata *nd,
                            struct path *path, unsigned seq)
{
        int res = __legitimize_mnt(path->mnt, nd->m_seq);
        if (unlikely(res)) {
                /* a positive result additionally clears ->mnt */
                if (res > 0)
                        path->mnt = NULL;
                path->dentry = NULL;
                return false;
        }
        if (unlikely(!lockref_get_not_dead(&path->dentry->d_lockref))) {
                /* dentry reference not obtained; ->mnt is left in place */
                path->dentry = NULL;
                return false;
        }
        /* both pointers stay set here, even when the seq check fails */
        return !read_seqcount_retry(&path->dentry->d_seq, seq);
}
 646
/*
 * Legitimize every saved symlink on the stack, in stack order.
 *
 * Returns true if legitimize_path() succeeded for all nd->depth entries.
 * On the first failure the delayed calls of all entries are run via
 * drop_links(), nd->depth is cut down to the entries processed so far
 * (including the failing one), and false is returned.
 */
static bool legitimize_links(struct nameidata *nd)
{
        int i;
        for (i = 0; i < nd->depth; i++) {
                struct saved *last = nd->stack + i;
                if (unlikely(!legitimize_path(nd, &last->link, last->seq))) {
                        drop_links(nd);
                        /*
                         * Entries past i were never passed to
                         * legitimize_path(); stop counting them.
                         */
                        nd->depth = i + 1;
                        return false;
                }
        }
        return true;
}
 660
 661/*
 662 * Path walking has 2 modes, rcu-walk and ref-walk (see
 663 * Documentation/filesystems/path-lookup.txt).  In situations when we can't
 664 * continue in RCU mode, we attempt to drop out of rcu-walk mode and grab
 665 * normal reference counts on dentries and vfsmounts to transition to ref-walk
 666 * mode.  Refcounts are grabbed at the last known good point before rcu-walk
 667 * got stuck, so ref-walk may continue from there. If this is not successful
 668 * (eg. a seqcount has changed), then failure is returned and it's up to caller
 669 * to restart the path walk from the beginning in ref-walk mode.
 670 */
 671
/**
 * unlazy_walk - try to switch to ref-walk mode.
 * @nd: nameidata pathwalk data
 * Returns: 0 on success, -ECHILD on failure
 *
 * unlazy_walk attempts to legitimize the current nd->path and nd->root
 * for ref-walk mode.  The saved symlinks are legitimized first, then
 * nd->path, then nd->root (the latter only if it is set and LOOKUP_ROOT is
 * not).  LOOKUP_RCU is cleared and the RCU read lock is dropped on every
 * return path.
 * Must be called from rcu-walk context.
 * Nothing should touch nameidata between unlazy_walk() failure and
 * terminate_walk().
 */
static int unlazy_walk(struct nameidata *nd)
{
        struct dentry *parent = nd->path.dentry;

        BUG_ON(!(nd->flags & LOOKUP_RCU));

        nd->flags &= ~LOOKUP_RCU;
        if (unlikely(!legitimize_links(nd)))
                goto out2;
        if (unlikely(!legitimize_path(nd, &nd->path, nd->seq)))
                goto out1;
        if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
                if (unlikely(!legitimize_path(nd, &nd->root, nd->root_seq)))
                        goto out;
        }
        rcu_read_unlock();
        /* nd->inode must still be the inode of the dentry we started with */
        BUG_ON(nd->inode != parent->d_inode);
        return 0;

out2:
        /* links failed: nd->path was never attempted, forget it entirely */
        nd->path.mnt = NULL;
        nd->path.dentry = NULL;
out1:
        /* nd->root was never attempted, forget it unless the caller owns it */
        if (!(nd->flags & LOOKUP_ROOT))
                nd->root.mnt = NULL;
out:
        rcu_read_unlock();
        return -ECHILD;
}
 712
/**
 * unlazy_child - try to switch to ref-walk mode.
 * @nd: nameidata pathwalk data
 * @dentry: child of nd->path.dentry
 * @seq: seq number to check dentry against
 * Returns: 0 on success, -ECHILD on failure
 *
 * unlazy_child attempts to legitimize the current nd->path, nd->root and dentry
 * for ref-walk mode.  @dentry must be a path found by a do_lookup call on
 * @nd.  Must be called from rcu-walk context.
 * On success a reference on @dentry is held in addition to those on
 * nd->path (and nd->root, where applicable).  LOOKUP_RCU is cleared and the
 * RCU read lock is dropped on every return path.
 * Nothing should touch nameidata between unlazy_child() failure and
 * terminate_walk().
 */
static int unlazy_child(struct nameidata *nd, struct dentry *dentry, unsigned seq)
{
        BUG_ON(!(nd->flags & LOOKUP_RCU));

        nd->flags &= ~LOOKUP_RCU;
        if (unlikely(!legitimize_links(nd)))
                goto out2;
        if (unlikely(!legitimize_mnt(nd->path.mnt, nd->m_seq)))
                goto out2;
        if (unlikely(!lockref_get_not_dead(&nd->path.dentry->d_lockref)))
                goto out1;

        /*
         * We need to move both the parent and the dentry from the RCU domain
         * to be properly refcounted. And the sequence number in the dentry
         * validates *both* dentry counters, since we checked the sequence
         * number of the parent after we got the child sequence number. So we
         * know the parent must still be valid if the child sequence number is
         * still valid.
         */
        if (unlikely(!lockref_get_not_dead(&dentry->d_lockref)))
                goto out;
        if (unlikely(read_seqcount_retry(&dentry->d_seq, seq))) {
                /* we do hold a reference on the child here: drop it */
                rcu_read_unlock();
                dput(dentry);
                goto drop_root_mnt;
        }
        /*
         * Sequence counts matched. Now make sure that the root is
         * still valid and get it if required.
         */
        if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
                if (unlikely(!legitimize_path(nd, &nd->root, nd->root_seq))) {
                        /* nd->root is left as legitimize_path() set it */
                        rcu_read_unlock();
                        dput(dentry);
                        return -ECHILD;
                }
        }

        rcu_read_unlock();
        return 0;

out2:
        /* mount not legitimized: forget it */
        nd->path.mnt = NULL;
out1:
        /* no reference on the parent dentry: forget it */
        nd->path.dentry = NULL;
out:
        rcu_read_unlock();
drop_root_mnt:
        /* nd->root was never legitimized on these paths */
        if (!(nd->flags & LOOKUP_ROOT))
                nd->root.mnt = NULL;
        return -ECHILD;
}
 778
 779static inline int d_revalidate(struct dentry *dentry, unsigned int flags)
 780{
 781        if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE))
 782                return dentry->d_op->d_revalidate(dentry, flags);
 783        else
 784                return 1;
 785}
 786
 787/**
 788 * complete_walk - successful completion of path walk
 789 * @nd:  pointer nameidata
 790 *
 791 * If we had been in RCU mode, drop out of it and legitimize nd->path.
 792 * Revalidate the final result, unless we'd already done that during
 793 * the path walk or the filesystem doesn't ask for it.  Return 0 on
 794 * success, -error on failure.  In case of failure caller does not
 795 * need to drop nd->path.
 796 */
 797static int complete_walk(struct nameidata *nd)
 798{
 799        struct dentry *dentry = nd->path.dentry;
 800        int status;
 801
 802        if (nd->flags & LOOKUP_RCU) {
 803                if (!(nd->flags & LOOKUP_ROOT))
 804                        nd->root.mnt = NULL;
 805                if (unlikely(unlazy_walk(nd)))
 806                        return -ECHILD;
 807        }
 808
 809        if (likely(!(nd->flags & LOOKUP_JUMPED)))
 810                return 0;
 811
 812        if (likely(!(dentry->d_flags & DCACHE_OP_WEAK_REVALIDATE)))
 813                return 0;
 814
 815        status = dentry->d_op->d_weak_revalidate(dentry, nd->flags);
 816        if (status > 0)
 817                return 0;
 818
 819        if (!status)
 820                status = -ESTALE;
 821
 822        return status;
 823}
 824
/*
 * Load nd->root from the current task's fs_struct.
 *
 * In rcu-walk mode the root is copied without taking a reference, under
 * the fs->seq sequence counter, and the d_seq of the root dentry is
 * sampled into nd->root_seq.  In ref-walk mode get_fs_root() is used
 * instead.
 */
static void set_root(struct nameidata *nd)
{
        struct fs_struct *fs = current->fs;

        if (nd->flags & LOOKUP_RCU) {
                unsigned seq;

                /* retry until fs->root was read without a concurrent update */
                do {
                        seq = read_seqcount_begin(&fs->seq);
                        nd->root = fs->root;
                        nd->root_seq = __read_seqcount_begin(&nd->root.dentry->d_seq);
                } while (read_seqcount_retry(&fs->seq, seq));
        } else {
                get_fs_root(fs, &nd->root);
        }
}
 841
 842static void path_put_conditional(struct path *path, struct nameidata *nd)
 843{
 844        dput(path->dentry);
 845        if (path->mnt != nd->path.mnt)
 846                mntput(path->mnt);
 847}
 848
 849static inline void path_to_nameidata(const struct path *path,
 850                                        struct nameidata *nd)
 851{
 852        if (!(nd->flags & LOOKUP_RCU)) {
 853                dput(nd->path.dentry);
 854                if (nd->path.mnt != path->mnt)
 855                        mntput(nd->path.mnt);
 856        }
 857        nd->path.mnt = path->mnt;
 858        nd->path.dentry = path->dentry;
 859}
 860
/*
 * Move the walk to nd->root and flag it with LOOKUP_JUMPED.
 *
 * In rcu-walk mode no references are touched; nd->seq is taken from
 * nd->root_seq and -ECHILD is returned if the root dentry's d_seq has
 * changed since that value was sampled.  In ref-walk mode the reference on
 * the old nd->path is dropped and a new one on the root is taken.
 * Returns 0 on success.
 */
static int nd_jump_root(struct nameidata *nd)
{
        if (nd->flags & LOOKUP_RCU) {
                struct dentry *d;
                nd->path = nd->root;
                d = nd->path.dentry;
                nd->inode = d->d_inode;
                nd->seq = nd->root_seq;
                /* d_inode was read above; make sure it was a stable read */
                if (unlikely(read_seqcount_retry(&d->d_seq, nd->seq)))
                        return -ECHILD;
        } else {
                path_put(&nd->path);
                nd->path = nd->root;
                path_get(&nd->path);
                nd->inode = nd->path.dentry->d_inode;
        }
        nd->flags |= LOOKUP_JUMPED;
        return 0;
}
 880
 881/*
 882 * Helper to directly jump to a known parsed path from ->get_link,
 883 * caller must have taken a reference to path beforehand.
 884 */
 885void nd_jump_link(struct path *path)
 886{
 887        struct nameidata *nd = current->nameidata;
 888        path_put(&nd->path);
 889
 890        nd->path = *path;
 891        nd->inode = nd->path.dentry->d_inode;
 892        nd->flags |= LOOKUP_JUMPED;
 893}
 894
 895static inline void put_link(struct nameidata *nd)
 896{
 897        struct saved *last = nd->stack + --nd->depth;
 898        do_delayed_call(&last->done);
 899        if (!(nd->flags & LOOKUP_RCU))
 900                path_put(&last->link);
 901}
 902
 /*
  * Link-protection switches, both initialised to 0 (off) here.
  * sysctl_protected_symlinks is tested by may_follow_link() and
  * sysctl_protected_hardlinks by may_linkat(); when the respective
  * value is zero those checks return 0 immediately.
  */
 903int sysctl_protected_symlinks __read_mostly = 0;
 904int sysctl_protected_hardlinks __read_mostly = 0;
 905
 906/**
 907 * may_follow_link - Check symlink following for unsafe situations
 908 * @nd: nameidata pathwalk data
 909 *
 910 * In the case of the sysctl_protected_symlinks sysctl being enabled,
 911 * CAP_DAC_OVERRIDE needs to be specifically ignored if the symlink is
 912 * in a sticky world-writable directory. This is to protect privileged
 913 * processes from failing races against path names that may change out
 914 * from under them by way of other users creating malicious symlinks.
 915 * It will permit symlinks to be followed only when outside a sticky
 916 * world-writable directory, or when the uid of the symlink and follower
 917 * match, or when the directory owner matches the symlink's owner.
 918 *
 919 * Returns 0 if following the symlink is allowed, -ve on error.
 920 */
 921static inline int may_follow_link(struct nameidata *nd)
 922{
 923        const struct inode *inode;
 924        const struct inode *parent;
 925        kuid_t puid;
 926
 927        if (!sysctl_protected_symlinks)
 928                return 0;
 929
 930        /* Allowed if owner and follower match. */
 931        inode = nd->link_inode;
 932        if (uid_eq(current_cred()->fsuid, inode->i_uid))
 933                return 0;
 934
 935        /* Allowed if parent directory not sticky and world-writable. */
 936        parent = nd->inode;
 937        if ((parent->i_mode & (S_ISVTX|S_IWOTH)) != (S_ISVTX|S_IWOTH))
 938                return 0;
 939
 940        /* Allowed if parent directory and link owner match. */
 941        puid = parent->i_uid;
 942        if (uid_valid(puid) && uid_eq(puid, inode->i_uid))
 943                return 0;
 944
 945        if (nd->flags & LOOKUP_RCU)
 946                return -ECHILD;
 947
 948        audit_log_link_denied("follow_link", &nd->stack[0].link);
 949        return -EACCES;
 950}
 951
 952/**
 953 * safe_hardlink_source - Check for safe hardlink conditions
 954 * @inode: the source inode to hardlink from
 955 *
 956 * Return false if at least one of the following conditions:
 957 *    - inode is not a regular file
 958 *    - inode is setuid
 959 *    - inode is setgid and group-exec
 960 *    - access failure for read and write
 961 *
 962 * Otherwise returns true.
 963 */
 964static bool safe_hardlink_source(struct inode *inode)
 965{
 966        umode_t mode = inode->i_mode;
 967
 968        /* Special files should not get pinned to the filesystem. */
 969        if (!S_ISREG(mode))
 970                return false;
 971
 972        /* Setuid files should not get pinned to the filesystem. */
 973        if (mode & S_ISUID)
 974                return false;
 975
 976        /* Executable setgid files should not get pinned to the filesystem. */
 977        if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))
 978                return false;
 979
 980        /* Hardlinking to unreadable or unwritable sources is dangerous. */
 981        if (inode_permission(inode, MAY_READ | MAY_WRITE))
 982                return false;
 983
 984        return true;
 985}
 986
 987/**
 988 * may_linkat - Check permissions for creating a hardlink
 989 * @link: the source to hardlink from
 990 *
 991 * Block hardlink when all of:
 992 *  - sysctl_protected_hardlinks enabled
 993 *  - fsuid does not match inode
 994 *  - hardlink source is unsafe (see safe_hardlink_source() above)
 995 *  - not CAP_FOWNER in a namespace with the inode owner uid mapped
 996 *
 997 * Returns 0 if successful, -ve on error.
 998 */
 999static int may_linkat(struct path *link)
1000{
1001        struct inode *inode;
1002
1003        if (!sysctl_protected_hardlinks)
1004                return 0;
1005
1006        inode = link->dentry->d_inode;
1007
1008        /* Source inode owner (or CAP_FOWNER) can hardlink all they like,
1009         * otherwise, it must be a safe source.
1010         */
1011        if (safe_hardlink_source(inode) || inode_owner_or_capable(inode))
1012                return 0;
1013
1014        audit_log_link_denied("linkat", link);
1015        return -EPERM;
1016}
1017
1018static __always_inline
1019const char *get_link(struct nameidata *nd)
1020{
1021        struct saved *last = nd->stack + nd->depth - 1;
1022        struct dentry *dentry = last->link.dentry;
1023        struct inode *inode = nd->link_inode;
1024        int error;
1025        const char *res;
1026
1027        if (!(nd->flags & LOOKUP_RCU)) {
1028                touch_atime(&last->link);
1029                cond_resched();
1030        } else if (atime_needs_update_rcu(&last->link, inode)) {
1031                if (unlikely(unlazy_walk(nd)))
1032                        return ERR_PTR(-ECHILD);
1033                touch_atime(&last->link);
1034        }
1035
1036        error = security_inode_follow_link(dentry, inode,
1037                                           nd->flags & LOOKUP_RCU);
1038        if (unlikely(error))
1039                return ERR_PTR(error);
1040
1041        nd->last_type = LAST_BIND;
1042        res = inode->i_link;
1043        if (!res) {
1044                const char * (*get)(struct dentry *, struct inode *,
1045                                struct delayed_call *);
1046                get = inode->i_op->get_link;
1047                if (nd->flags & LOOKUP_RCU) {
1048                        res = get(NULL, inode, &last->done);
1049                        if (res == ERR_PTR(-ECHILD)) {
1050                                if (unlikely(unlazy_walk(nd)))
1051                                        return ERR_PTR(-ECHILD);
1052                                res = get(dentry, inode, &last->done);
1053                        }
1054                } else {
1055                        res = get(dentry, inode, &last->done);
1056                }
1057                if (IS_ERR_OR_NULL(res))
1058                        return res;
1059        }
1060        if (*res == '/') {
1061                if (!nd->root.mnt)
1062                        set_root(nd);
1063                if (unlikely(nd_jump_root(nd)))
1064                        return ERR_PTR(-ECHILD);
1065                while (unlikely(*++res == '/'))
1066                        ;
1067        }
1068        if (!*res)
1069                res = NULL;
1070        return res;
1071}
1072
1073/*
1074 * follow_up - Find the mountpoint of path's vfsmount
1075 *
1076 * Given a path, find the mountpoint of its source file system.
1077 * Replace @path with the path of the mountpoint in the parent mount.
1078 * Up is towards /.
1079 *
1080 * Return 1 if we went up a level and 0 if we were already at the
1081 * root.
1082 */
1083int follow_up(struct path *path)
1084{
1085        struct mount *mnt = real_mount(path->mnt);
1086        struct mount *parent;
1087        struct dentry *mountpoint;
1088
1089        read_seqlock_excl(&mount_lock);
1090        parent = mnt->mnt_parent;
1091        if (parent == mnt) {
1092                read_sequnlock_excl(&mount_lock);
1093                return 0;
1094        }
1095        mntget(&parent->mnt);
1096        mountpoint = dget(mnt->mnt_mountpoint);
1097        read_sequnlock_excl(&mount_lock);
1098        dput(path->dentry);
1099        path->dentry = mountpoint;
1100        mntput(path->mnt);
1101        path->mnt = &parent->mnt;
1102        return 1;
1103}
1104EXPORT_SYMBOL(follow_up);
1105
1106/*
1107 * Perform an automount
1108 * - return -EISDIR to tell follow_managed() to stop and return the path we
1109 *   were called with.
1110 */
1111static int follow_automount(struct path *path, struct nameidata *nd,
1112                            bool *need_mntput)
1113{
1114        struct vfsmount *mnt;
1115        int err;
1116
1117        if (!path->dentry->d_op || !path->dentry->d_op->d_automount)
1118                return -EREMOTE;
1119
1120        /* We don't want to mount if someone's just doing a stat -
1121         * unless they're stat'ing a directory and appended a '/' to
1122         * the name.
1123         *
1124         * We do, however, want to mount if someone wants to open or
1125         * create a file of any type under the mountpoint, wants to
1126         * traverse through the mountpoint or wants to open the
1127         * mounted directory.  Also, autofs may mark negative dentries
1128         * as being automount points.  These will need the attentions
1129         * of the daemon to instantiate them before they can be used.
1130         */
1131        if (!(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY |
1132                           LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) &&
1133            path->dentry->d_inode)
1134                return -EISDIR;
1135
1136        if (path->dentry->d_sb->s_user_ns != &init_user_ns)
1137                return -EACCES;
1138
1139        nd->total_link_count++;
1140        if (nd->total_link_count >= 40)
1141                return -ELOOP;
1142
1143        mnt = path->dentry->d_op->d_automount(path);
1144        if (IS_ERR(mnt)) {
1145                /*
1146                 * The filesystem is allowed to return -EISDIR here to indicate
1147                 * it doesn't want to automount.  For instance, autofs would do
1148                 * this so that its userspace daemon can mount on this dentry.
1149                 *
1150                 * However, we can only permit this if it's a terminal point in
1151                 * the path being looked up; if it wasn't then the remainder of
1152                 * the path is inaccessible and we should say so.
1153                 */
1154                if (PTR_ERR(mnt) == -EISDIR && (nd->flags & LOOKUP_PARENT))
1155                        return -EREMOTE;
1156                return PTR_ERR(mnt);
1157        }
1158
1159        if (!mnt) /* mount collision */
1160                return 0;
1161
1162        if (!*need_mntput) {
1163                /* lock_mount() may release path->mnt on error */
1164                mntget(path->mnt);
1165                *need_mntput = true;
1166        }
1167        err = finish_automount(mnt, path);
1168
1169        switch (err) {
1170        case -EBUSY:
1171                /* Someone else made a mount here whilst we were busy */
1172                return 0;
1173        case 0:
1174                path_put(path);
1175                path->mnt = mnt;
1176                path->dentry = dget(mnt->mnt_root);
1177                return 0;
1178        default:
1179                return err;
1180        }
1181
1182}
1183
1184/*
1185 * Handle a dentry that is managed in some way.
1186 * - Flagged for transit management (autofs)
1187 * - Flagged as mountpoint
1188 * - Flagged as automount point
1189 *
1190 * This may only be called in refwalk mode.
1191 *
1192 * Serialization is taken care of in namespace.c
1193 */
1194static int follow_managed(struct path *path, struct nameidata *nd)
1195{
1196        struct vfsmount *mnt = path->mnt; /* held by caller, must be left alone */
1197        unsigned managed;
1198        bool need_mntput = false;
1199        int ret = 0;
1200
1201        /* Given that we're not holding a lock here, we retain the value in a
1202         * local variable for each dentry as we look at it so that we don't see
1203         * the components of that value change under us */
1204        while (managed = ACCESS_ONCE(path->dentry->d_flags),
1205               managed &= DCACHE_MANAGED_DENTRY,
1206               unlikely(managed != 0)) {
1207                /* Allow the filesystem to manage the transit without i_mutex
1208                 * being held. */
1209                if (managed & DCACHE_MANAGE_TRANSIT) {
1210                        BUG_ON(!path->dentry->d_op);
1211                        BUG_ON(!path->dentry->d_op->d_manage);
1212                        ret = path->dentry->d_op->d_manage(path, false);
1213                        if (ret < 0)
1214                                break;
1215                }
1216
1217                /* Transit to a mounted filesystem. */
1218                if (managed & DCACHE_MOUNTED) {
1219                        struct vfsmount *mounted = lookup_mnt(path);
1220                        if (mounted) {
1221                                dput(path->dentry);
1222                                if (need_mntput)
1223                                        mntput(path->mnt);
1224                                path->mnt = mounted;
1225                                path->dentry = dget(mounted->mnt_root);
1226                                need_mntput = true;
1227                                continue;
1228                        }
1229
1230                        /* Something is mounted on this dentry in another
1231                         * namespace and/or whatever was mounted there in this
1232                         * namespace got unmounted before lookup_mnt() could
1233                         * get it */
1234                }
1235
1236                /* Handle an automount point */
1237                if (managed & DCACHE_NEED_AUTOMOUNT) {
1238                        ret = follow_automount(path, nd, &need_mntput);
1239                        if (ret < 0)
1240                                break;
1241                        continue;
1242                }
1243
1244                /* We didn't change the current path point */
1245                break;
1246        }
1247
1248        if (need_mntput && path->mnt == mnt)
1249                mntput(path->mnt);
1250        if (ret == -EISDIR || !ret)
1251                ret = 1;
1252        if (need_mntput)
1253                nd->flags |= LOOKUP_JUMPED;
1254        if (unlikely(ret < 0))
1255                path_put_conditional(path, nd);
1256        return ret;
1257}
1258
1259int follow_down_one(struct path *path)
1260{
1261        struct vfsmount *mounted;
1262
1263        mounted = lookup_mnt(path);
1264        if (mounted) {
1265                dput(path->dentry);
1266                mntput(path->mnt);
1267                path->mnt = mounted;
1268                path->dentry = dget(mounted->mnt_root);
1269                return 1;
1270        }
1271        return 0;
1272}
1273EXPORT_SYMBOL(follow_down_one);
1274
1275static inline int managed_dentry_rcu(const struct path *path)
1276{
1277        return (path->dentry->d_flags & DCACHE_MANAGE_TRANSIT) ?
1278                path->dentry->d_op->d_manage(path, true) : 0;
1279}
1280
1281/*
1282 * Try to skip to top of mountpoint pile in rcuwalk mode.  Fail if
1283 * we meet a managed dentry that would need blocking.
1284 */
1285static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
1286                               struct inode **inode, unsigned *seqp)
1287{
1288        for (;;) {
1289                struct mount *mounted;
1290                /*
1291                 * Don't forget we might have a non-mountpoint managed dentry
1292                 * that wants to block transit.
1293                 */
1294                switch (managed_dentry_rcu(path)) {
1295                case -ECHILD:
1296                default:
1297                        return false;
1298                case -EISDIR:
1299                        return true;
1300                case 0:
1301                        break;
1302                }
1303
1304                if (!d_mountpoint(path->dentry))
1305                        return !(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT);
1306
1307                mounted = __lookup_mnt(path->mnt, path->dentry);
1308                if (!mounted)
1309                        break;
1310                path->mnt = &mounted->mnt;
1311                path->dentry = mounted->mnt.mnt_root;
1312                nd->flags |= LOOKUP_JUMPED;
1313                *seqp = read_seqcount_begin(&path->dentry->d_seq);
1314                /*
1315                 * Update the inode too. We don't need to re-check the
1316                 * dentry sequence number here after this d_inode read,
1317                 * because a mount-point is always pinned.
1318                 */
1319                *inode = path->dentry->d_inode;
1320        }
1321        return !read_seqretry(&mount_lock, nd->m_seq) &&
1322                !(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT);
1323}
1324
/*
 * Handle ".." in RCU-walk mode.
 *
 * Moves nd->path one step towards nd->root using only sequence-count
 * checks (no dget/mntget calls are made here), then descends through
 * whatever is mounted on the resulting dentry.  On success nd->path,
 * nd->seq and nd->inode describe the new position.
 *
 * Returns 0 on success, -ECHILD when a d_seq or mount_lock check fails,
 * or -ENOENT when path_connected() rejects the parent.
 */
1325static int follow_dotdot_rcu(struct nameidata *nd)
1326{
1327        struct inode *inode = nd->inode;
1328
1329        while (1) {
            /* Never step above the walk's root. */
1330                if (path_equal(&nd->path, &nd->root))
1331                        break;
1332                if (nd->path.dentry != nd->path.mnt->mnt_root) {
                    /* Not at the root of this mount: go to d_parent. */
1333                        struct dentry *old = nd->path.dentry;
1334                        struct dentry *parent = old->d_parent;
1335                        unsigned seq;
1336
1337                        inode = parent->d_inode;
1338                        seq = read_seqcount_begin(&parent->d_seq);
                        /* The child must be unchanged since nd->seq was sampled. */
1339                        if (unlikely(read_seqcount_retry(&old->d_seq, nd->seq)))
1340                                return -ECHILD;
1341                        nd->path.dentry = parent;
1342                        nd->seq = seq;
1343                        if (unlikely(!path_connected(&nd->path)))
1344                                return -ENOENT;
1345                        break;
1346                } else {
                    /*
                     * At the root of this mount: cross to its mountpoint
                     * in the parent mount and go round the loop again.
                     */
1347                        struct mount *mnt = real_mount(nd->path.mnt);
1348                        struct mount *mparent = mnt->mnt_parent;
1349                        struct dentry *mountpoint = mnt->mnt_mountpoint;
1350                        struct inode *inode2 = mountpoint->d_inode;
1351                        unsigned seq = read_seqcount_begin(&mountpoint->d_seq);
1352                        if (unlikely(read_seqretry(&mount_lock, nd->m_seq)))
1353                                return -ECHILD;
                        /* The mount is its own parent: nothing above it. */
1354                        if (&mparent->mnt == nd->path.mnt)
1355                                break;
1356                        /* we know that mountpoint was pinned */
1357                        nd->path.dentry = mountpoint;
1358                        nd->path.mnt = &mparent->mnt;
1359                        inode = inode2;
1360                        nd->seq = seq;
1361                }
1362        }
        /* Descend through anything mounted on the dentry we ended up on. */
1363        while (unlikely(d_mountpoint(nd->path.dentry))) {
1364                struct mount *mounted;
1365                mounted = __lookup_mnt(nd->path.mnt, nd->path.dentry);
1366                if (unlikely(read_seqretry(&mount_lock, nd->m_seq)))
1367                        return -ECHILD;
1368                if (!mounted)
1369                        break;
1370                nd->path.mnt = &mounted->mnt;
1371                nd->path.dentry = mounted->mnt.mnt_root;
1372                inode = nd->path.dentry->d_inode;
1373                nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
1374        }
1375        nd->inode = inode;
1376        return 0;
1377}
1378
1379/*
1380 * Follow down to the covering mount currently visible to userspace.  At each
1381 * point, the filesystem owning that dentry may be queried as to whether the
1382 * caller is permitted to proceed or not.
1383 */
1384int follow_down(struct path *path)
1385{
1386        unsigned managed;
1387        int ret;
1388
1389        while (managed = ACCESS_ONCE(path->dentry->d_flags),
1390               unlikely(managed & DCACHE_MANAGED_DENTRY)) {
1391                /* Allow the filesystem to manage the transit without i_mutex
1392                 * being held.
1393                 *
1394                 * We indicate to the filesystem if someone is trying to mount
1395                 * something here.  This gives autofs the chance to deny anyone
1396                 * other than its daemon the right to mount on its
1397                 * superstructure.
1398                 *
1399                 * The filesystem may sleep at this point.
1400                 */
1401                if (managed & DCACHE_MANAGE_TRANSIT) {
1402                        BUG_ON(!path->dentry->d_op);
1403                        BUG_ON(!path->dentry->d_op->d_manage);
1404                        ret = path->dentry->d_op->d_manage(path, false);
1405                        if (ret < 0)
1406                                return ret == -EISDIR ? 0 : ret;
1407                }
1408
1409                /* Transit to a mounted filesystem. */
1410                if (managed & DCACHE_MOUNTED) {
1411                        struct vfsmount *mounted = lookup_mnt(path);
1412                        if (!mounted)
1413                                break;
1414                        dput(path->dentry);
1415                        mntput(path->mnt);
1416                        path->mnt = mounted;
1417                        path->dentry = dget(mounted->mnt_root);
1418                        continue;
1419                }
1420
1421                /* Don't handle automount points here */
1422                break;
1423        }
1424        return 0;
1425}
1426EXPORT_SYMBOL(follow_down);
1427
1428/*
1429 * Skip to top of mountpoint pile in refwalk mode for follow_dotdot()
1430 */
1431static void follow_mount(struct path *path)
1432{
1433        while (d_mountpoint(path->dentry)) {
1434                struct vfsmount *mounted = lookup_mnt(path);
1435                if (!mounted)
1436                        break;
1437                dput(path->dentry);
1438                mntput(path->mnt);
1439                path->mnt = mounted;
1440                path->dentry = dget(mounted->mnt_root);
1441        }
1442}
1443
1444static int path_parent_directory(struct path *path)
1445{
1446        struct dentry *old = path->dentry;
1447        /* rare case of legitimate dget_parent()... */
1448        path->dentry = dget_parent(path->dentry);
1449        dput(old);
1450        if (unlikely(!path_connected(path)))
1451                return -ENOENT;
1452        return 0;
1453}
1454
1455static int follow_dotdot(struct nameidata *nd)
1456{
1457        while(1) {
1458                if (nd->path.dentry == nd->root.dentry &&
1459                    nd->path.mnt == nd->root.mnt) {
1460                        break;
1461                }
1462                if (nd->path.dentry != nd->path.mnt->mnt_root) {
1463                        int ret = path_parent_directory(&nd->path);
1464                        if (ret)
1465                                return ret;
1466                        break;
1467                }
1468                if (!follow_up(&nd->path))
1469                        break;
1470        }
1471        follow_mount(&nd->path);
1472        nd->inode = nd->path.dentry->d_inode;
1473        return 0;
1474}
1475
1476/*
1477 * This looks up the name in dcache and possibly revalidates the found dentry.
1478 * NULL is returned if the dentry does not exist in the cache.
1479 */
1480static struct dentry *lookup_dcache(const struct qstr *name,
1481                                    struct dentry *dir,
1482                                    unsigned int flags)
1483{
1484        struct dentry *dentry = d_lookup(dir, name);
1485        if (dentry) {
1486                int error = d_revalidate(dentry, flags);
1487                if (unlikely(error <= 0)) {
1488                        if (!error)
1489                                d_invalidate(dentry);
1490                        dput(dentry);
1491                        return ERR_PTR(error);
1492                }
1493        }
1494        return dentry;
1495}
1496
1497/*
1498 * Call i_op->lookup on the dentry.  The dentry must be negative and
1499 * unhashed.
1500 *
1501 * dir->d_inode->i_mutex must be held
1502 */
1503static struct dentry *lookup_real(struct inode *dir, struct dentry *dentry,
1504                                  unsigned int flags)
1505{
1506        struct dentry *old;
1507
1508        /* Don't create child dentry for a dead directory. */
1509        if (unlikely(IS_DEADDIR(dir))) {
1510                dput(dentry);
1511                return ERR_PTR(-ENOENT);
1512        }
1513
1514        old = dir->i_op->lookup(dir, dentry, flags);
1515        if (unlikely(old)) {
1516                dput(dentry);
1517                dentry = old;
1518        }
1519        return dentry;
1520}
1521
1522static struct dentry *__lookup_hash(const struct qstr *name,
1523                struct dentry *base, unsigned int flags)
1524{
1525        struct dentry *dentry = lookup_dcache(name, base, flags);
1526
1527        if (dentry)
1528                return dentry;
1529
1530        dentry = d_alloc(base, name);
1531        if (unlikely(!dentry))
1532                return ERR_PTR(-ENOMEM);
1533
1534        return lookup_real(base->d_inode, dentry, flags);
1535}
1536
/*
 * Dcache-only lookup of nd->last under nd->path.dentry.
 *
 * Returns:
 *   > 0     found: @path is filled in and *@inode set (in RCU-walk
 *           *@seqp is set too)
 *   0       nothing usable in the dcache (a miss, or d_revalidate()
 *           returned 0 and the dentry was invalidated)
 *   -ECHILD an RCU-walk sequence check failed or RCU-walk could not be
 *           left cleanly
 *   -ENOENT the dentry is negative
 *   other   negative result of d_revalidate() or follow_managed()
 *
 * In RCU-walk the function may switch @nd to ref-walk (unlazy_walk() /
 * unlazy_child()) and then finish along the ref-walk tail below.
 */
1537static int lookup_fast(struct nameidata *nd,
1538                       struct path *path, struct inode **inode,
1539                       unsigned *seqp)
1540{
1541        struct vfsmount *mnt = nd->path.mnt;
1542        struct dentry *dentry, *parent = nd->path.dentry;
1543        int status = 1;
1544        int err;
1545
1546        /*
1547         * Rename seqlock is not required here because in the off chance
1548         * of a false negative due to a concurrent rename, the caller is
1549         * going to fall back to non-racy lookup.
1550         */
1551        if (nd->flags & LOOKUP_RCU) {
1552                unsigned seq;
1553                bool negative;
1554                dentry = __d_lookup_rcu(parent, &nd->last, &seq);
1555                if (unlikely(!dentry)) {
                    /* Miss: leave RCU-walk, then report "not found here". */
1556                        if (unlazy_walk(nd))
1557                                return -ECHILD;
1558                        return 0;
1559                }
1560
1561                /*
1562                 * This sequence count validates that the inode matches
1563                 * the dentry name information from lookup.
1564                 */
1565                *inode = d_backing_inode(dentry);
1566                negative = d_is_negative(dentry);
1567                if (unlikely(read_seqcount_retry(&dentry->d_seq, seq)))
1568                        return -ECHILD;
1569
1570                /*
1571                 * This sequence count validates that the parent had no
1572                 * changes while we did the lookup of the dentry above.
1573                 *
1574                 * The memory barrier in read_seqcount_begin of child is
1575                 *  enough, we can use __read_seqcount_retry here.
1576                 */
1577                if (unlikely(__read_seqcount_retry(&parent->d_seq, nd->seq)))
1578                        return -ECHILD;
1579
1580                *seqp = seq;
1581                status = d_revalidate(dentry, nd->flags);
1582                if (likely(status > 0)) {
1583                        /*
1584                         * Note: do negative dentry check after revalidation in
1585                         * case that drops it.
1586                         */
1587                        if (unlikely(negative))
1588                                return -ENOENT;
1589                        path->mnt = mnt;
1590                        path->dentry = dentry;
1591                        if (likely(__follow_mount_rcu(nd, path, inode, seqp)))
1592                                return 1;
1593                }
                /*
                 * Revalidation did not succeed, or the mount crossing
                 * could not be done in RCU-walk: switch to ref-walk on
                 * this dentry and continue with the common tail.
                 */
1594                if (unlazy_child(nd, dentry, seq))
1595                        return -ECHILD;
1596                if (unlikely(status == -ECHILD))
1597                        /* we'd been told to redo it in non-rcu mode */
1598                        status = d_revalidate(dentry, nd->flags);
1599        } else {
1600                dentry = __d_lookup(parent, &nd->last);
1601                if (unlikely(!dentry))
1602                        return 0;
1603                status = d_revalidate(dentry, nd->flags);
1604        }
        /* Ref-walk tail: the dentry is put again on every failure path. */
1605        if (unlikely(status <= 0)) {
1606                if (!status)
1607                        d_invalidate(dentry);
1608                dput(dentry);
1609                return status;
1610        }
1611        if (unlikely(d_is_negative(dentry))) {
1612                dput(dentry);
1613                return -ENOENT;
1614        }
1615
1616        path->mnt = mnt;
1617        path->dentry = dentry;
1618        err = follow_managed(path, nd);
1619        if (likely(err > 0))
1620                *inode = d_backing_inode(path->dentry);
1621        return err;
1622}
1623
1624/* Fast lookup failed, do it the slow way */
1625static struct dentry *lookup_slow(const struct qstr *name,
1626                                  struct dentry *dir,
1627                                  unsigned int flags)
1628{
1629        struct dentry *dentry = ERR_PTR(-ENOENT), *old;
1630        struct inode *inode = dir->d_inode;
1631        DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
1632
1633        inode_lock_shared(inode);
1634        /* Don't go there if it's already dead */
1635        if (unlikely(IS_DEADDIR(inode)))
1636                goto out;
1637again:
1638        dentry = d_alloc_parallel(dir, name, &wq);
1639        if (IS_ERR(dentry))
1640                goto out;
1641        if (unlikely(!d_in_lookup(dentry))) {
1642                if (!(flags & LOOKUP_NO_REVAL)) {
1643                        int error = d_revalidate(dentry, flags);
1644                        if (unlikely(error <= 0)) {
1645                                if (!error) {
1646                                        d_invalidate(dentry);
1647                                        dput(dentry);
1648                                        goto again;
1649                                }
1650                                dput(dentry);
1651                                dentry = ERR_PTR(error);
1652                        }
1653                }
1654        } else {
1655                old = inode->i_op->lookup(inode, dentry, flags);
1656                d_lookup_done(dentry);
1657                if (unlikely(old)) {
1658                        dput(dentry);
1659                        dentry = old;
1660                }
1661        }
1662out:
1663        inode_unlock_shared(inode);
1664        return dentry;
1665}
1666
1667static inline int may_lookup(struct nameidata *nd)
1668{
1669        if (nd->flags & LOOKUP_RCU) {
1670                int err = inode_permission(nd->inode, MAY_EXEC|MAY_NOT_BLOCK);
1671                if (err != -ECHILD)
1672                        return err;
1673                if (unlazy_walk(nd))
1674                        return -ECHILD;
1675        }
1676        return inode_permission(nd->inode, MAY_EXEC);
1677}
1678
1679static inline int handle_dots(struct nameidata *nd, int type)
1680{
1681        if (type == LAST_DOTDOT) {
1682                if (!nd->root.mnt)
1683                        set_root(nd);
1684                if (nd->flags & LOOKUP_RCU) {
1685                        return follow_dotdot_rcu(nd);
1686                } else
1687                        return follow_dotdot(nd);
1688        }
1689        return 0;
1690}
1691
1692static int pick_link(struct nameidata *nd, struct path *link,
1693                     struct inode *inode, unsigned seq)
1694{
1695        int error;
1696        struct saved *last;
1697        if (unlikely(nd->total_link_count++ >= MAXSYMLINKS)) {
1698                path_to_nameidata(link, nd);
1699                return -ELOOP;
1700        }
1701        if (!(nd->flags & LOOKUP_RCU)) {
1702                if (link->mnt == nd->path.mnt)
1703                        mntget(link->mnt);
1704        }
1705        error = nd_alloc_stack(nd);
1706        if (unlikely(error)) {
1707                if (error == -ECHILD) {
1708                        if (unlikely(!legitimize_path(nd, link, seq))) {
1709                                drop_links(nd);
1710                                nd->depth = 0;
1711                                nd->flags &= ~LOOKUP_RCU;
1712                                nd->path.mnt = NULL;
1713                                nd->path.dentry = NULL;
1714                                if (!(nd->flags & LOOKUP_ROOT))
1715                                        nd->root.mnt = NULL;
1716                                rcu_read_unlock();
1717                        } else if (likely(unlazy_walk(nd)) == 0)
1718                                error = nd_alloc_stack(nd);
1719                }
1720                if (error) {
1721                        path_put(link);
1722                        return error;
1723                }
1724        }
1725
1726        last = nd->stack + nd->depth++;
1727        last->link = *link;
1728        clear_delayed_call(&last->done);
1729        nd->link_inode = inode;
1730        last->seq = seq;
1731        return 1;
1732}
1733
/* Flag bits taken by walk_component() and step_into(). */
enum {
	WALK_FOLLOW = 1,	/* follow a symlink even without LOOKUP_FOLLOW */
	WALK_MORE = 2,		/* do not put_link() the link being walked */
};
1735
1736/*
1737 * Do we need to follow links? We _really_ want to be able
1738 * to do this check without having to look at inode->i_op,
1739 * so we keep a cache of "no, this doesn't need follow_link"
1740 * for the common case.
1741 */
1742static inline int step_into(struct nameidata *nd, struct path *path,
1743                            int flags, struct inode *inode, unsigned seq)
1744{
1745        if (!(flags & WALK_MORE) && nd->depth)
1746                put_link(nd);
1747        if (likely(!d_is_symlink(path->dentry)) ||
1748           !(flags & WALK_FOLLOW || nd->flags & LOOKUP_FOLLOW)) {
1749                /* not a symlink or should not follow */
1750                path_to_nameidata(path, nd);
1751                nd->inode = inode;
1752                nd->seq = seq;
1753                return 0;
1754        }
1755        /* make sure that d_is_symlink above matches inode */
1756        if (nd->flags & LOOKUP_RCU) {
1757                if (read_seqcount_retry(&path->dentry->d_seq, seq))
1758                        return -ECHILD;
1759        }
1760        return pick_link(nd, path, inode, seq);
1761}
1762
1763static int walk_component(struct nameidata *nd, int flags)
1764{
1765        struct path path;
1766        struct inode *inode;
1767        unsigned seq;
1768        int err;
1769        /*
1770         * "." and ".." are special - ".." especially so because it has
1771         * to be able to know about the current root directory and
1772         * parent relationships.
1773         */
1774        if (unlikely(nd->last_type != LAST_NORM)) {
1775                err = handle_dots(nd, nd->last_type);
1776                if (!(flags & WALK_MORE) && nd->depth)
1777                        put_link(nd);
1778                return err;
1779        }
1780        err = lookup_fast(nd, &path, &inode, &seq);
1781        if (unlikely(err <= 0)) {
1782                if (err < 0)
1783                        return err;
1784                path.dentry = lookup_slow(&nd->last, nd->path.dentry,
1785                                          nd->flags);
1786                if (IS_ERR(path.dentry))
1787                        return PTR_ERR(path.dentry);
1788
1789                path.mnt = nd->path.mnt;
1790                err = follow_managed(&path, nd);
1791                if (unlikely(err < 0))
1792                        return err;
1793
1794                if (unlikely(d_is_negative(path.dentry))) {
1795                        path_to_nameidata(&path, nd);
1796                        return -ENOENT;
1797                }
1798
1799                seq = 0;        /* we are already out of RCU mode */
1800                inode = d_backing_inode(path.dentry);
1801        }
1802
1803        return step_into(nd, &path, flags, inode, seq);
1804}
1805
1806/*
1807 * We can do the critical dentry name comparison and hashing
1808 * operations one word at a time, but we are limited to:
1809 *
1810 * - Architectures with fast unaligned word accesses. We could
1811 *   do a "get_unaligned()" if this helps and is sufficiently
1812 *   fast.
1813 *
 * - non-CONFIG_DEBUG_PAGEALLOC configurations (so that we
 *   do not trap on the (extremely unlikely) case of a page
 *   crossing operation).
1817 *
 * - Furthermore, we need an efficient 64-bit compile for the
 *   64-bit case in order to generate the "number of bytes in
 *   the final mask". Again, that could be replaced with an
 *   efficient population count instruction or similar.
1822 */
1823#ifdef CONFIG_DCACHE_WORD_ACCESS
1824
1825#include <asm/word-at-a-time.h>
1826
1827#ifdef HASH_MIX
1828
1829/* Architecture provides HASH_MIX and fold_hash() in <asm/hash.h> */
1830
1831#elif defined(CONFIG_64BIT)
1832/*
1833 * Register pressure in the mixing function is an issue, particularly
1834 * on 32-bit x86, but almost any function requires one state value and
1835 * one temporary.  Instead, use a function designed for two state values
1836 * and no temporaries.
1837 *
1838 * This function cannot create a collision in only two iterations, so
1839 * we have two iterations to achieve avalanche.  In those two iterations,
1840 * we have six layers of mixing, which is enough to spread one bit's
1841 * influence out to 2^6 = 64 state bits.
1842 *
1843 * Rotate constants are scored by considering either 64 one-bit input
1844 * deltas or 64*63/2 = 2016 two-bit input deltas, and finding the
1845 * probability of that delta causing a change to each of the 128 output
1846 * bits, using a sample of random initial states.
1847 *
1848 * The Shannon entropy of the computed probabilities is then summed
1849 * to produce a score.  Ideally, any input change has a 50% chance of
1850 * toggling any given output bit.
1851 *
1852 * Mixing scores (in bits) for (12,45):
1853 * Input delta: 1-bit      2-bit
1854 * 1 round:     713.3    42542.6
1855 * 2 rounds:   2753.7   140389.8
1856 * 3 rounds:   5954.1   233458.2
1857 * 4 rounds:   7862.6   256672.2
1858 * Perfect:    8192     258048
1859 *            (64*128) (64*63/2 * 128)
1860 */
/* One mixing round: fold word @a into the two state words @x and @y. */
#define HASH_MIX(x, y, a)		\
	(x ^= (a),			\
	 y ^= x,			\
	 x = rol64(x,12),		\
	 x += y,			\
	 y = rol64(y,45),		\
	 y *= 9)
1866
1867/*
1868 * Fold two longs into one 32-bit hash value.  This must be fast, but
1869 * latency isn't quite as critical, as there is a fair bit of additional
1870 * work done before the hash value is used.
1871 */
1872static inline unsigned int fold_hash(unsigned long x, unsigned long y)
1873{
1874        y ^= x * GOLDEN_RATIO_64;
1875        y *= GOLDEN_RATIO_64;
1876        return y >> 32;
1877}
1878
1879#else   /* 32-bit case */
1880
1881/*
1882 * Mixing scores (in bits) for (7,20):
1883 * Input delta: 1-bit      2-bit
1884 * 1 round:     330.3     9201.6
1885 * 2 rounds:   1246.4    25475.4
1886 * 3 rounds:   1907.1    31295.1
1887 * 4 rounds:   2042.3    31718.6
1888 * Perfect:    2048      31744
1889 *            (32*64)   (32*31/2 * 64)
1890 */
/* One mixing round: fold word @a into the two state words @x and @y. */
#define HASH_MIX(x, y, a)		\
	(x ^= (a),			\
	 y ^= x,			\
	 x = rol32(x, 7),		\
	 x += y,			\
	 y = rol32(y,20),		\
	 y *= 9)
1896
/* Collapse the two state words into a single 32-bit hash value. */
static inline unsigned int fold_hash(unsigned long x, unsigned long y)
{
	/* __hash_32() lets the architecture supply an optimized multiply */
	unsigned long mixed = y ^ __hash_32(x);

	return __hash_32(mixed);
}
1902
1903#endif
1904
/*
 * Return the hash of a string of known length.  This is carefully
 * designed to match hash_name(), which is the more critical function.
 * In particular, full words are mixed in with HASH_MIX() and the final,
 * partial word (fewer than sizeof(long) payload bytes) is only XORed into
 * the state, to match the way that hash_name() iterates until it finds
 * the delimiter after the name.
 *
 * @salt is used (as an integer) to seed the state.
 */
unsigned int full_name_hash(const void *salt, const char *name, unsigned int len)
{
	unsigned long x = 0, y = (unsigned long)salt;
	unsigned long word;

	/* all complete words */
	while (len >= sizeof(unsigned long)) {
		word = load_unaligned_zeropad(name);
		HASH_MIX(x, y, word);
		name += sizeof(unsigned long);
		len -= sizeof(unsigned long);
	}

	/* the trailing partial word, masked down to the bytes that count */
	if (len) {
		word = load_unaligned_zeropad(name);
		x ^= word & bytemask_from_count(len);
	}

	return fold_hash(x, y);
}
EXPORT_SYMBOL(full_name_hash);
1931
1932/* Return the "hash_len" (hash and length) of a null-terminated string */
1933u64 hashlen_string(const void *salt, const char *name)
1934{
1935        unsigned long a = 0, x = 0, y = (unsigned long)salt;
1936        unsigned long adata, mask, len;
1937        const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
1938
1939        len = 0;
1940        goto inside;
1941
1942        do {
1943                HASH_MIX(x, y, a);
1944                len += sizeof(unsigned long);
1945inside:
1946                a = load_unaligned_zeropad(name+len);
1947        } while (!has_zero(a, &adata, &constants));
1948
1949        adata = prep_zero_mask(a, adata, &constants);
1950        mask = create_zero_mask(adata);
1951        x ^= a & zero_bytemask(mask);
1952
1953        return hashlen_create(fold_hash(x, y), len + find_zero(mask));
1954}
1955EXPORT_SYMBOL(hashlen_string);
1956
1957/*
1958 * Calculate the length and hash of the path component, and
1959 * return the "hash_len" as the result.
1960 */
1961static inline u64 hash_name(const void *salt, const char *name)
1962{
1963        unsigned long a = 0, b, x = 0, y = (unsigned long)salt;
1964        unsigned long adata, bdata, mask, len;
1965        const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
1966
1967        len = 0;
1968        goto inside;
1969
1970        do {
1971                HASH_MIX(x, y, a);
1972                len += sizeof(unsigned long);
1973inside:
1974                a = load_unaligned_zeropad(name+len);
1975                b = a ^ REPEAT_BYTE('/');
1976        } while (!(has_zero(a, &adata, &constants) | has_zero(b, &bdata, &constants)));
1977
1978        adata = prep_zero_mask(a, adata, &constants);
1979        bdata = prep_zero_mask(b, bdata, &constants);
1980        mask = create_zero_mask(adata | bdata);
1981        x ^= a & zero_bytemask(mask);
1982
1983        return hashlen_create(fold_hash(x, y), len + find_zero(mask));
1984}
1985
1986#else   /* !CONFIG_DCACHE_WORD_ACCESS: Slow, byte-at-a-time version */
1987
/*
 * Return the hash of the @len bytes at @name, seeded with @salt
 * (byte-at-a-time version).
 */
unsigned int full_name_hash(const void *salt, const char *name, unsigned int len)
{
	unsigned long hash = init_name_hash(salt);
	unsigned int i;

	for (i = 0; i < len; i++)
		hash = partial_name_hash((unsigned char)name[i], hash);

	return end_name_hash(hash);
}
EXPORT_SYMBOL(full_name_hash);
1997
1998/* Return the "hash_len" (hash and length) of a null-terminated string */
1999u64 hashlen_string(const void *salt, const char *name)
2000{
2001        unsigned long hash = init_name_hash(salt);
2002        unsigned long len = 0, c;
2003
2004        c = (unsigned char)*name;
2005        while (c) {
2006                len++;
2007                hash = partial_name_hash(c, hash);
2008                c = (unsigned char)name[len];
2009        }
2010        return hashlen_create(end_name_hash(hash), len);
2011}
2012EXPORT_SYMBOL(hashlen_string);
2013
2014/*
2015 * We know there's a real path component here of at least
2016 * one character.
2017 */
2018static inline u64 hash_name(const void *salt, const char *name)
2019{
2020        unsigned long hash = init_name_hash(salt);
2021        unsigned long len = 0, c;
2022
2023        c = (unsigned char)*name;
2024        do {
2025                len++;
2026                hash = partial_name_hash(c, hash);
2027                c = (unsigned char)name[len];
2028        } while (c && c != '/');
2029        return hashlen_create(end_name_hash(hash), len);
2030}
2031
2032#endif
2033
2034/*
2035 * Name resolution.
2036 * This is the basic name resolution function, turning a pathname into
2037 * the final dentry. We expect 'base' to be positive and a directory.
2038 *
2039 * Returns 0 and nd will have valid dentry and mnt on success.
2040 * Returns error and drops reference to input namei data on failure.
2041 */
2042static int link_path_walk(const char *name, struct nameidata *nd)
2043{
2044        int err;
2045
2046        while (*name=='/')
2047                name++;
2048        if (!*name)
2049                return 0;
2050
2051        /* At this point we know we have a real path component. */
2052        for(;;) {
2053                u64 hash_len;
2054                int type;
2055
2056                err = may_lookup(nd);
2057                if (err)
2058                        return err;
2059
2060                hash_len = hash_name(nd->path.dentry, name);
2061
2062                type = LAST_NORM;
2063                if (name[0] == '.') switch (hashlen_len(hash_len)) {
2064                        case 2:
2065                                if (name[1] == '.') {
2066                                        type = LAST_DOTDOT;
2067                                        nd->flags |= LOOKUP_JUMPED;
2068                                }
2069                                break;
2070                        case 1:
2071                                type = LAST_DOT;
2072                }
2073                if (likely(type == LAST_NORM)) {
2074                        struct dentry *parent = nd->path.dentry;
2075                        nd->flags &= ~LOOKUP_JUMPED;
2076                        if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
2077                                struct qstr this = { { .hash_len = hash_len }, .name = name };
2078                                err = parent->d_op->d_hash(parent, &this);
2079                                if (err < 0)
2080                                        return err;
2081                                hash_len = this.hash_len;
2082                                name = this.name;
2083                        }
2084                }
2085
2086                nd->last.hash_len = hash_len;
2087                nd->last.name = name;
2088                nd->last_type = type;
2089
2090                name += hashlen_len(hash_len);
2091                if (!*name)
2092                        goto OK;
2093                /*
2094                 * If it wasn't NUL, we know it was '/'. Skip that
2095                 * slash, and continue until no more slashes.
2096                 */
2097                do {
2098                        name++;
2099                } while (unlikely(*name == '/'));
2100                if (unlikely(!*name)) {
2101OK:
2102                        /* pathname body, done */
2103                        if (!nd->depth)
2104                                return 0;
2105                        name = nd->stack[nd->depth - 1].name;
2106                        /* trailing symlink, done */
2107                        if (!name)
2108                                return 0;
2109                        /* last component of nested symlink */
2110                        err = walk_component(nd, WALK_FOLLOW);
2111                } else {
2112                        /* not the last component */
2113                        err = walk_component(nd, WALK_FOLLOW | WALK_MORE);
2114                }
2115                if (err < 0)
2116                        return err;
2117
2118                if (err) {
2119                        const char *s = get_link(nd);
2120
2121                        if (IS_ERR(s))
2122                                return PTR_ERR(s);
2123                        err = 0;
2124                        if (unlikely(!s)) {
2125                                /* jumped */
2126                                put_link(nd);
2127                        } else {
2128                                nd->stack[nd->depth - 1].name = name;
2129                                name = s;
2130                                continue;
2131                        }
2132                }
2133                if (unlikely(!d_can_lookup(nd->path.dentry))) {
2134                        if (nd->flags & LOOKUP_RCU) {
2135                                if (unlazy_walk(nd))
2136                                        return -ECHILD;
2137                        }
2138                        return -ENOTDIR;
2139                }
2140        }
2141}
2142
2143static const char *path_init(struct nameidata *nd, unsigned flags)
2144{
2145        const char *s = nd->name->name;
2146
2147        if (!*s)
2148                flags &= ~LOOKUP_RCU;
2149
2150        nd->last_type = LAST_ROOT; /* if there are only slashes... */
2151        nd->flags = flags | LOOKUP_JUMPED | LOOKUP_PARENT;
2152        nd->depth = 0;
2153        if (flags & LOOKUP_ROOT) {
2154                struct dentry *root = nd->root.dentry;
2155                struct inode *inode = root->d_inode;
2156                if (*s && unlikely(!d_can_lookup(root)))
2157                        return ERR_PTR(-ENOTDIR);
2158                nd->path = nd->root;
2159                nd->inode = inode;
2160                if (flags & LOOKUP_RCU) {
2161                        rcu_read_lock();
2162                        nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
2163                        nd->root_seq = nd->seq;
2164                        nd->m_seq = read_seqbegin(&mount_lock);
2165                } else {
2166                        path_get(&nd->path);
2167                }
2168                return s;
2169        }
2170
2171        nd->root.mnt = NULL;
2172        nd->path.mnt = NULL;
2173        nd->path.dentry = NULL;
2174
2175        nd->m_seq = read_seqbegin(&mount_lock);
2176        if (*s == '/') {
2177                if (flags & LOOKUP_RCU)
2178                        rcu_read_lock();
2179                set_root(nd);
2180                if (likely(!nd_jump_root(nd)))
2181                        return s;
2182                nd->root.mnt = NULL;
2183                rcu_read_unlock();
2184                return ERR_PTR(-ECHILD);
2185        } else if (nd->dfd == AT_FDCWD) {
2186                if (flags & LOOKUP_RCU) {
2187                        struct fs_struct *fs = current->fs;
2188                        unsigned seq;
2189
2190                        rcu_read_lock();
2191
2192                        do {
2193                                seq = read_seqcount_begin(&fs->seq);
2194                                nd->path = fs->pwd;
2195                                nd->inode = nd->path.dentry->d_inode;
2196                                nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
2197                        } while (read_seqcount_retry(&fs->seq, seq));
2198                } else {
2199                        get_fs_pwd(current->fs, &nd->path);
2200                        nd->inode = nd->path.dentry->d_inode;
2201                }
2202                return s;
2203        } else {
2204                /* Caller must check execute permissions on the starting path component */
2205                struct fd f = fdget_raw(nd->dfd);
2206                struct dentry *dentry;
2207
2208                if (!f.file)
2209                        return ERR_PTR(-EBADF);
2210
2211                dentry = f.file->f_path.dentry;
2212
2213                if (*s) {
2214                        if (!d_can_lookup(dentry)) {
2215                                fdput(f);
2216                                return ERR_PTR(-ENOTDIR);
2217                        }
2218                }
2219
2220                nd->path = f.file->f_path;
2221                if (flags & LOOKUP_RCU) {
2222                        rcu_read_lock();
2223                        nd->inode = nd->path.dentry->d_inode;
2224                        nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
2225                } else {
2226                        path_get(&nd->path);
2227                        nd->inode = nd->path.dentry->d_inode;
2228                }
2229                fdput(f);
2230                return s;
2231        }
2232}
2233
2234static const char *trailing_symlink(struct nameidata *nd)
2235{
2236        const char *s;
2237        int error = may_follow_link(nd);
2238        if (unlikely(error))
2239                return ERR_PTR(error);
2240        nd->flags |= LOOKUP_PARENT;
2241        nd->stack[0].name = NULL;
2242        s = get_link(nd);
2243        return s ? s : "";
2244}
2245
2246static inline int lookup_last(struct nameidata *nd)
2247{
2248        if (nd->last_type == LAST_NORM && nd->last.name[nd->last.len])
2249                nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
2250
2251        nd->flags &= ~LOOKUP_PARENT;
2252        return walk_component(nd, 0);
2253}
2254
2255static int handle_lookup_down(struct nameidata *nd)
2256{
2257        struct path path = nd->path;
2258        struct inode *inode = nd->inode;
2259        unsigned seq = nd->seq;
2260        int err;
2261
2262        if (nd->flags & LOOKUP_RCU) {
2263                /*
2264                 * don't bother with unlazy_walk on failure - we are
2265                 * at the very beginning of walk, so we lose nothing
2266                 * if we simply redo everything in non-RCU mode
2267                 */
2268                if (unlikely(!__follow_mount_rcu(nd, &path, &inode, &seq)))
2269                        return -ECHILD;
2270        } else {
2271                dget(path.dentry);
2272                err = follow_managed(&path, nd);
2273                if (unlikely(err < 0))
2274                        return err;
2275                inode = d_backing_inode(path.dentry);
2276                seq = 0;
2277        }
2278        path_to_nameidata(&path, nd);
2279        nd->inode = inode;
2280        nd->seq = seq;
2281        return 0;
2282}
2283
2284/* Returns 0 and nd will be valid on success; Retuns error, otherwise. */
2285static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path)
2286{
2287        const char *s = path_init(nd, flags);
2288        int err;
2289
2290        if (IS_ERR(s))
2291                return PTR_ERR(s);
2292
2293        if (unlikely(flags & LOOKUP_DOWN)) {
2294                err = handle_lookup_down(nd);
2295                if (unlikely(err < 0)) {
2296                        terminate_walk(nd);
2297                        return err;
2298                }
2299        }
2300
2301        while (!(err = link_path_walk(s, nd))
2302                && ((err = lookup_last(nd)) > 0)) {
2303                s = trailing_symlink(nd);
2304                if (IS_ERR(s)) {
2305                        err = PTR_ERR(s);
2306                        break;
2307                }
2308        }
2309        if (!err)
2310                err = complete_walk(nd);
2311
2312        if (!err && nd->flags & LOOKUP_DIRECTORY)
2313                if (!d_can_lookup(nd->path.dentry))
2314                        err = -ENOTDIR;
2315        if (!err) {
2316                *path = nd->path;
2317                nd->path.mnt = NULL;
2318                nd->path.dentry = NULL;
2319        }
2320        terminate_walk(nd);
2321        return err;
2322}
2323
2324static int filename_lookup(int dfd, struct filename *name, unsigned flags,
2325                           struct path *path, struct path *root)
2326{
2327        int retval;
2328        struct nameidata nd;
2329        if (IS_ERR(name))
2330                return PTR_ERR(name);
2331        if (unlikely(root)) {
2332                nd.root = *root;
2333                flags |= LOOKUP_ROOT;
2334        }
2335        set_nameidata(&nd, dfd, name);
2336        retval = path_lookupat(&nd, flags | LOOKUP_RCU, path);
2337        if (unlikely(retval == -ECHILD))
2338                retval = path_lookupat(&nd, flags, path);
2339        if (unlikely(retval == -ESTALE))
2340                retval = path_lookupat(&nd, flags | LOOKUP_REVAL, path);
2341
2342        if (likely(!retval))
2343                audit_inode(name, path->dentry, flags & LOOKUP_PARENT);
2344        restore_nameidata();
2345        putname(name);
2346        return retval;
2347}
2348
2349/* Returns 0 and nd will be valid on success; Retuns error, otherwise. */
2350static int path_parentat(struct nameidata *nd, unsigned flags,
2351                                struct path *parent)
2352{
2353        const char *s = path_init(nd, flags);
2354        int err;
2355        if (IS_ERR(s))
2356                return PTR_ERR(s);
2357        err = link_path_walk(s, nd);
2358        if (!err)
2359                err = complete_walk(nd);
2360        if (!err) {
2361                *parent = nd->path;
2362                nd->path.mnt = NULL;
2363                nd->path.dentry = NULL;
2364        }
2365        terminate_walk(nd);
2366        return err;
2367}
2368
2369static struct filename *filename_parentat(int dfd, struct filename *name,
2370                                unsigned int flags, struct path *parent,
2371                                struct qstr *last, int *type)
2372{
2373        int retval;
2374        struct nameidata nd;
2375
2376        if (IS_ERR(name))
2377                return name;
2378        set_nameidata(&nd, dfd, name);
2379        retval = path_parentat(&nd, flags | LOOKUP_RCU, parent);
2380        if (unlikely(retval == -ECHILD))
2381                retval = path_parentat(&nd, flags, parent);
2382        if (unlikely(retval == -ESTALE))
2383                retval = path_parentat(&nd, flags | LOOKUP_REVAL, parent);
2384        if (likely(!retval)) {
2385                *last = nd.last;
2386                *type = nd.last_type;
2387                audit_inode(name, parent->dentry, LOOKUP_PARENT);
2388        } else {
2389                putname(name);
2390                name = ERR_PTR(retval);
2391        }
2392        restore_nameidata();
2393        return name;
2394}
2395
2396/* does lookup, returns the object with parent locked */
2397struct dentry *kern_path_locked(const char *name, struct path *path)
2398{
2399        struct filename *filename;
2400        struct dentry *d;
2401        struct qstr last;
2402        int type;
2403
2404        filename = filename_parentat(AT_FDCWD, getname_kernel(name), 0, path,
2405                                    &last, &type);
2406        if (IS_ERR(filename))
2407                return ERR_CAST(filename);
2408        if (unlikely(type != LAST_NORM)) {
2409                path_put(path);
2410                putname(filename);
2411                return ERR_PTR(-EINVAL);
2412        }
2413        inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT);
2414        d = __lookup_hash(&last, path->dentry, 0);
2415        if (IS_ERR(d)) {
2416                inode_unlock(path->dentry->d_inode);
2417                path_put(path);
2418        }
2419        putname(filename);
2420        return d;
2421}
2422
2423int kern_path(const char *name, unsigned int flags, struct path *path)
2424{
2425        return filename_lookup(AT_FDCWD, getname_kernel(name),
2426                               flags, path, NULL);
2427}
2428EXPORT_SYMBOL(kern_path);
2429
2430/**
2431 * vfs_path_lookup - lookup a file path relative to a dentry-vfsmount pair
2432 * @dentry:  pointer to dentry of the base directory
2433 * @mnt: pointer to vfs mount of the base directory
2434 * @name: pointer to file name
2435 * @flags: lookup flags
2436 * @path: pointer to struct path to fill
2437 */
2438int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
2439                    const char *name, unsigned int flags,
2440                    struct path *path)
2441{
2442        struct path root = {.mnt = mnt, .dentry = dentry};
2443        /* the first argument of filename_lookup() is ignored with root */
2444        return filename_lookup(AT_FDCWD, getname_kernel(name),
2445                               flags , path, &root);
2446}
2447EXPORT_SYMBOL(vfs_path_lookup);
2448
2449/**
2450 * lookup_one_len - filesystem helper to lookup single pathname component
2451 * @name:       pathname component to lookup
2452 * @base:       base directory to lookup from
2453 * @len:        maximum length @len should be interpreted to
2454 *
2455 * Note that this routine is purely a helper for filesystem usage and should
2456 * not be called by generic code.
2457 *
2458 * The caller must hold base->i_mutex.
2459 */
2460struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
2461{
2462        struct qstr this;
2463        unsigned int c;
2464        int err;
2465
2466        WARN_ON_ONCE(!inode_is_locked(base->d_inode));
2467
2468        this.name = name;
2469        this.len = len;
2470        this.hash = full_name_hash(base, name, len);
2471        if (!len)
2472                return ERR_PTR(-EACCES);
2473
2474        if (unlikely(name[0] == '.')) {
2475                if (len < 2 || (len == 2 && name[1] == '.'))
2476                        return ERR_PTR(-EACCES);
2477        }
2478
2479        while (len--) {
2480                c = *(const unsigned char *)name++;
2481                if (c == '/' || c == '\0')
2482                        return ERR_PTR(-EACCES);
2483        }
2484        /*
2485         * See if the low-level filesystem might want
2486         * to use its own hash..
2487         */
2488        if (base->d_flags & DCACHE_OP_HASH) {
2489                int err = base->d_op->d_hash(base, &this);
2490                if (err < 0)
2491                        return ERR_PTR(err);
2492        }
2493
2494        err = inode_permission(base->d_inode, MAY_EXEC);
2495        if (err)
2496                return ERR_PTR(err);
2497
2498        return __lookup_hash(&this, base, 0);
2499}
2500EXPORT_SYMBOL(lookup_one_len);
2501
2502/**
2503 * lookup_one_len_unlocked - filesystem helper to lookup single pathname component
2504 * @name:       pathname component to lookup
2505 * @base:       base directory to lookup from
2506 * @len:        maximum length @len should be interpreted to
2507 *
2508 * Note that this routine is purely a helper for filesystem usage and should
2509 * not be called by generic code.
2510 *
2511 * Unlike lookup_one_len, it should be called without the parent
2512 * i_mutex held, and will take the i_mutex itself if necessary.
2513 */
2514struct dentry *lookup_one_len_unlocked(const char *name,
2515                                       struct dentry *base, int len)
2516{
2517        struct qstr this;
2518        unsigned int c;
2519        int err;
2520        struct dentry *ret;
2521
2522        this.name = name;
2523        this.len = len;
2524        this.hash = full_name_hash(base, name, len);
2525        if (!len)
2526                return ERR_PTR(-EACCES);
2527
2528        if (unlikely(name[0] == '.')) {
2529                if (len < 2 || (len == 2 && name[1] == '.'))
2530                        return ERR_PTR(-EACCES);
2531        }
2532
2533        while (len--) {
2534                c = *(const unsigned char *)name++;
2535                if (c == '/' || c == '\0')
2536                        return ERR_PTR(-EACCES);
2537        }
2538        /*
2539         * See if the low-level filesystem might want
2540         * to use its own hash..
2541         */
2542        if (base->d_flags & DCACHE_OP_HASH) {
2543                int err = base->d_op->d_hash(base, &this);
2544                if (err < 0)
2545                        return ERR_PTR(err);
2546        }
2547
2548        err = inode_permission(base->d_inode, MAY_EXEC);
2549        if (err)
2550                return ERR_PTR(err);
2551
2552        ret = lookup_dcache(&this, base, 0);
2553        if (!ret)
2554                ret = lookup_slow(&this, base, 0);
2555        return ret;
2556}
2557EXPORT_SYMBOL(lookup_one_len_unlocked);
2558
#ifdef CONFIG_UNIX98_PTYS
/*
 * path_pts - redirect @path to what is mounted on the sibling "pts" entry
 * @path:	path to start from; updated in place
 *
 * Moves @path to its parent directory, looks up a child named "pts" there
 * and, if one is found, makes @path point at it and follows any mounts
 * stacked on top.
 *
 * Returns 0 on success, the error from path_parent_directory() if moving
 * to the parent failed, or -ENOENT if no usable "pts" dentry exists.  On
 * -ENOENT @path is left pointing at the parent directory.
 */
int path_pts(struct path *path)
{
	struct dentry *child, *parent;
	struct qstr this;
	int ret;

	ret = path_parent_directory(path);
	if (ret)
		return ret;

	parent = path->dentry;
	this.name = "pts";
	this.len = 3;
	child = d_hash_and_lookup(parent, &this);
	/*
	 * d_hash_and_lookup() returns NULL on a plain miss but ERR_PTR() when
	 * the filesystem's ->d_hash() rejects the name.  Neither may be stored
	 * in path->dentry: follow_mount() below would dereference it.
	 */
	if (IS_ERR_OR_NULL(child))
		return -ENOENT;

	/* Swap the parent reference held by @path for the child's. */
	path->dentry = child;
	dput(parent);
	follow_mount(path);
	return 0;
}
#endif
2586
2587int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
2588                 struct path *path, int *empty)
2589{
2590        return filename_lookup(dfd, getname_flags(name, flags, empty),
2591                               flags, path, NULL);
2592}
2593EXPORT_SYMBOL(user_path_at_empty);
2594
2595/**
2596 * mountpoint_last - look up last component for umount
2597 * @nd:   pathwalk nameidata - currently pointing at parent directory of "last"
2598 *
2599 * This is a special lookup_last function just for umount. In this case, we
2600 * need to resolve the path without doing any revalidation.
2601 *
2602 * The nameidata should be the result of doing a LOOKUP_PARENT pathwalk. Since
2603 * mountpoints are always pinned in the dcache, their ancestors are too. Thus,
2604 * in almost all cases, this lookup will be served out of the dcache. The only
2605 * cases where it won't are if nd->last refers to a symlink or the path is
2606 * bogus and it doesn't exist.
2607 *
2608 * Returns:
2609 * -error: if there was an error during lookup. This includes -ENOENT if the
2610 *         lookup found a negative dentry.
2611 *
2612 * 0:      if we successfully resolved nd->last and found it to not to be a
2613 *         symlink that needs to be followed.
2614 *
2615 * 1:      if we successfully resolved nd->last and found it to be a symlink
2616 *         that needs to be followed.
2617 */
2618static int
2619mountpoint_last(struct nameidata *nd)
2620{
2621        int error = 0;
2622        struct dentry *dir = nd->path.dentry;
2623        struct path path;
2624
2625        /* If we're in rcuwalk, drop out of it to handle last component */
2626        if (nd->flags & LOOKUP_RCU) {
2627                if (unlazy_walk(nd))
2628                        return -ECHILD;
2629        }
2630
2631        nd->flags &= ~LOOKUP_PARENT;
2632
2633        if (unlikely(nd->last_type != LAST_NORM)) {
2634                error = handle_dots(nd, nd->last_type);
2635                if (error)
2636                        return error;
2637                path.dentry = dget(nd->path.dentry);
2638        } else {
2639                path.dentry = d_lookup(dir, &nd->last);
2640                if (!path.dentry) {
2641                        /*
2642                         * No cached dentry. Mounted dentries are pinned in the
2643                         * cache, so that means that this dentry is probably
2644                         * a symlink or the path doesn't actually point
2645                         * to a mounted dentry.
2646                         */
2647                        path.dentry = lookup_slow(&nd->last, dir,
2648                                             nd->flags | LOOKUP_NO_REVAL);
2649                        if (IS_ERR(path.dentry))
2650                                return PTR_ERR(path.dentry);
2651                }
2652        }
2653        if (d_is_negative(path.dentry)) {
2654                dput(path.dentry);
2655                return -ENOENT;
2656        }
2657        path.mnt = nd->path.mnt;
2658        return step_into(nd, &path, 0, d_backing_inode(path.dentry), 0);
2659}
2660
2661/**
2662 * path_mountpoint - look up a path to be umounted
2663 * @nd:         lookup context
2664 * @flags:      lookup flags
2665 * @path:       pointer to container for result
2666 *
2667 * Look up the given name, but don't attempt to revalidate the last component.
2668 * Returns 0 and "path" will be valid on success; Returns error otherwise.
2669 */
2670static int
2671path_mountpoint(struct nameidata *nd, unsigned flags, struct path *path)
2672{
2673        const char *s = path_init(nd, flags);
2674        int err;
2675        if (IS_ERR(s))
2676                return PTR_ERR(s);
2677        while (!(err = link_path_walk(s, nd)) &&
2678                (err = mountpoint_last(nd)) > 0) {
2679                s = trailing_symlink(nd);
2680                if (IS_ERR(s)) {
2681                        err = PTR_ERR(s);
2682                        break;
2683                }
2684        }
2685        if (!err) {
2686                *path = nd->path;
2687                nd->path.mnt = NULL;
2688                nd->path.dentry = NULL;
2689                follow_mount(path);
2690        }
2691        terminate_walk(nd);
2692        return err;
2693}
2694
2695static int
2696filename_mountpoint(int dfd, struct filename *name, struct path *path,
2697                        unsigned int flags)
2698{
2699        struct nameidata nd;
2700        int error;
2701        if (IS_ERR(name))
2702                return PTR_ERR(name);
2703        set_nameidata(&nd, dfd, name);
2704        error = path_mountpoint(&nd, flags | LOOKUP_RCU, path);
2705        if (unlikely(error == -ECHILD))
2706                error = path_mountpoint(&nd, flags, path);
2707        if (unlikely(error == -ESTALE))
2708                error = path_mountpoint(&nd, flags | LOOKUP_REVAL, path);
2709        if (likely(!error))
2710                audit_inode(name, path->dentry, 0);
2711        restore_nameidata();
2712        putname(name);
2713        return error;
2714}
2715
2716/**
2717 * user_path_mountpoint_at - lookup a path from userland in order to umount it
2718 * @dfd:        directory file descriptor
2719 * @name:       pathname from userland
2720 * @flags:      lookup flags
2721 * @path:       pointer to container to hold result
2722 *
2723 * A umount is a special case for path walking. We're not actually interested
2724 * in the inode in this situation, and ESTALE errors can be a problem. We
2725 * simply want track down the dentry and vfsmount attached at the mountpoint
2726 * and avoid revalidating the last component.
2727 *
2728 * Returns 0 and populates "path" on success.
2729 */
2730int
2731user_path_mountpoint_at(int dfd, const char __user *name, unsigned int flags,
2732                        struct path *path)
2733{
2734        return filename_mountpoint(dfd, getname(name), path, flags);
2735}
2736
/*
 * Kernel-space counterpart of user_path_mountpoint_at(): @name is a kernel
 * string, wrapped with getname_kernel() and resolved through
 * filename_mountpoint().  Returns whatever filename_mountpoint() returns.
 */
int
kern_path_mountpoint(int dfd, const char *name, struct path *path,
			unsigned int flags)
{
	struct filename *fname = getname_kernel(name);

	return filename_mountpoint(dfd, fname, path, flags);
}
EXPORT_SYMBOL(kern_path_mountpoint);
2744
2745int __check_sticky(struct inode *dir, struct inode *inode)
2746{
2747        kuid_t fsuid = current_fsuid();
2748
2749        if (uid_eq(inode->i_uid, fsuid))
2750                return 0;
2751        if (uid_eq(dir->i_uid, fsuid))
2752                return 0;
2753        return !capable_wrt_inode_uidgid(inode, CAP_FOWNER);
2754}
2755EXPORT_SYMBOL(__check_sticky);
2756
2757/*
2758 *      Check whether we can remove a link victim from directory dir, check
2759 *  whether the type of victim is right.
2760 *  1. We can't do it if dir is read-only (done in permission())
2761 *  2. We should have write and exec permissions on dir
2762 *  3. We can't remove anything from append-only dir
2763 *  4. We can't do anything with immutable dir (done in permission())
2764 *  5. If the sticky bit on dir is set we should either
2765 *      a. be owner of dir, or
2766 *      b. be owner of victim, or
2767 *      c. have CAP_FOWNER capability
2768 *  6. If the victim is append-only or immutable we can't do antyhing with
2769 *     links pointing to it.
2770 *  7. If the victim has an unknown uid or gid we can't change the inode.
2771 *  8. If we were asked to remove a directory and victim isn't one - ENOTDIR.
2772 *  9. If we were asked to remove a non-directory and victim isn't one - EISDIR.
2773 * 10. We can't remove a root or mountpoint.
2774 * 11. We don't allow removal of NFS sillyrenamed files; it's handled by
2775 *     nfs_async_unlink().
2776 */
2777static int may_delete(struct inode *dir, struct dentry *victim, bool isdir)
2778{
2779        struct inode *inode = d_backing_inode(victim);
2780        int error;
2781
2782        if (d_is_negative(victim))
2783                return -ENOENT;
2784        BUG_ON(!inode);
2785
2786        BUG_ON(victim->d_parent->d_inode != dir);
2787        audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);
2788
2789        error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
2790        if (error)
2791                return error;
2792        if (IS_APPEND(dir))
2793                return -EPERM;
2794
2795        if (check_sticky(dir, inode) || IS_APPEND(inode) ||
2796            IS_IMMUTABLE(inode) || IS_SWAPFILE(inode) || HAS_UNMAPPED_ID(inode))
2797                return -EPERM;
2798        if (isdir) {
2799                if (!d_is_dir(victim))
2800                        return -ENOTDIR;
2801                if (IS_ROOT(victim))
2802                        return -EBUSY;
2803        } else if (d_is_dir(victim))
2804                return -EISDIR;
2805        if (IS_DEADDIR(dir))
2806                return -ENOENT;
2807        if (victim->d_flags & DCACHE_NFSFS_RENAMED)
2808                return -EBUSY;
2809        return 0;
2810}
2811
2812/*      Check whether we can create an object with dentry child in directory
2813 *  dir.
2814 *  1. We can't do it if child already exists (open has special treatment for
2815 *     this case, but since we are inlined it's OK)
2816 *  2. We can't do it if dir is read-only (done in permission())
2817 *  3. We can't do it if the fs can't represent the fsuid or fsgid.
2818 *  4. We should have write and exec permissions on dir
2819 *  5. We can't do it if dir is immutable (done in permission())
2820 */
2821static inline int may_create(struct inode *dir, struct dentry *child)
2822{
2823        struct user_namespace *s_user_ns;
2824        audit_inode_child(dir, child, AUDIT_TYPE_CHILD_CREATE);
2825        if (child->d_inode)
2826                return -EEXIST;
2827        if (IS_DEADDIR(dir))
2828                return -ENOENT;
2829        s_user_ns = dir->i_sb->s_user_ns;
2830        if (!kuid_has_mapping(s_user_ns, current_fsuid()) ||
2831            !kgid_has_mapping(s_user_ns, current_fsgid()))
2832                return -EOVERFLOW;
2833        return inode_permission(dir, MAY_WRITE | MAY_EXEC);
2834}
2835
2836/*
2837 * p1 and p2 should be directories on the same fs.
2838 */
2839struct dentry *lock_rename(struct dentry *p1, struct dentry *p2)
2840{
2841        struct dentry *p;
2842
2843        if (p1 == p2) {
2844                inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
2845                return NULL;
2846        }
2847
2848        mutex_lock(&p1->d_sb->s_vfs_rename_mutex);
2849
2850        p = d_ancestor(p2, p1);
2851        if (p) {
2852                inode_lock_nested(p2->d_inode, I_MUTEX_PARENT);
2853                inode_lock_nested(p1->d_inode, I_MUTEX_CHILD);
2854                return p;
2855        }
2856
2857        p = d_ancestor(p1, p2);
2858        if (p) {
2859                inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
2860                inode_lock_nested(p2->d_inode, I_MUTEX_CHILD);
2861                return p;
2862        }
2863
2864        inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
2865        inode_lock_nested(p2->d_inode, I_MUTEX_PARENT2);
2866        return NULL;
2867}
2868EXPORT_SYMBOL(lock_rename);
2869
2870void unlock_rename(struct dentry *p1, struct dentry *p2)
2871{
2872        inode_unlock(p1->d_inode);
2873        if (p1 != p2) {
2874                inode_unlock(p2->d_inode);
2875                mutex_unlock(&p1->d_sb->s_vfs_rename_mutex);
2876        }
2877}
2878EXPORT_SYMBOL(unlock_rename);
2879
2880int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
2881                bool want_excl)
2882{
2883        int error = may_create(dir, dentry);
2884        if (error)
2885                return error;
2886
2887        if (!dir->i_op->create)
2888                return -EACCES; /* shouldn't it be ENOSYS? */
2889        mode &= S_IALLUGO;
2890        mode |= S_IFREG;
2891        error = security_inode_create(dir, dentry, mode);
2892        if (error)
2893                return error;
2894        error = dir->i_op->create(dir, dentry, mode, want_excl);
2895        if (!error)
2896                fsnotify_create(dir, dentry);
2897        return error;
2898}
2899EXPORT_SYMBOL(vfs_create);
2900
2901bool may_open_dev(const struct path *path)
2902{
2903        return !(path->mnt->mnt_flags & MNT_NODEV) &&
2904                !(path->mnt->mnt_sb->s_iflags & SB_I_NODEV);
2905}
2906
2907static int may_open(const struct path *path, int acc_mode, int flag)
2908{
2909        struct dentry *dentry = path->dentry;
2910        struct inode *inode = dentry->d_inode;
2911        int error;
2912
2913        if (!inode)
2914                return -ENOENT;
2915
2916        switch (inode->i_mode & S_IFMT) {
2917        case S_IFLNK:
2918                return -ELOOP;
2919        case S_IFDIR:
2920                if (acc_mode & MAY_WRITE)
2921                        return -EISDIR;
2922                break;
2923        case S_IFBLK:
2924        case S_IFCHR:
2925                if (!may_open_dev(path))
2926                        return -EACCES;
2927                /*FALLTHRU*/
2928        case S_IFIFO:
2929        case S_IFSOCK:
2930                flag &= ~O_TRUNC;
2931                break;
2932        }
2933
2934        error = inode_permission(inode, MAY_OPEN | acc_mode);
2935        if (error)
2936                return error;
2937
2938        /*
2939         * An append-only file must be opened in append mode for writing.
2940         */
2941        if (IS_APPEND(inode)) {
2942                if  ((flag & O_ACCMODE) != O_RDONLY && !(flag & O_APPEND))
2943                        return -EPERM;
2944                if (flag & O_TRUNC)
2945                        return -EPERM;
2946        }
2947
2948        /* O_NOATIME can only be set by the owner or superuser */
2949        if (flag & O_NOATIME && !inode_owner_or_capable(inode))
2950                return -EPERM;
2951
2952        return 0;
2953}
2954
2955static int handle_truncate(struct file *filp)
2956{
2957        const struct path *path = &filp->f_path;
2958        struct inode *inode = path->dentry->d_inode;
2959        int error = get_write_access(inode);
2960        if (error)
2961                return error;
2962        /*
2963         * Refuse to truncate files with mandatory locks held on them.
2964         */
2965        error = locks_verify_locked(filp);
2966        if (!error)
2967                error = security_path_truncate(path);
2968        if (!error) {
2969                error = do_truncate(path->dentry, 0,
2970                                    ATTR_MTIME|ATTR_CTIME|ATTR_OPEN,
2971                                    filp);
2972        }
2973        put_write_access(inode);
2974        return error;
2975}
2976
/*
 * Translate open(2) flags for the namei layer: an access mode with both
 * low bits set (value 3) is decremented to 2; every other value is
 * returned unchanged.
 */
static inline int open_to_namei_flags(int flag)
{
	return ((flag & O_ACCMODE) == 3) ? flag - 1 : flag;
}
2983
2984static int may_o_create(const struct path *dir, struct dentry *dentry, umode_t mode)
2985{
2986        struct user_namespace *s_user_ns;
2987        int error = security_path_mknod(dir, dentry, mode, 0);
2988        if (error)
2989                return error;
2990
2991        s_user_ns = dir->dentry->d_sb->s_user_ns;
2992        if (!kuid_has_mapping(s_user_ns, current_fsuid()) ||
2993            !kgid_has_mapping(s_user_ns, current_fsgid()))
2994                return -EOVERFLOW;
2995
2996        error = inode_permission(dir->dentry->d_inode, MAY_WRITE | MAY_EXEC);
2997        if (error)
2998                return error;
2999
3000        return security_inode_create(dir->dentry->d_inode, dentry, mode);
3001}
3002
/*
 * Attempt to atomically look up, create and open a file from a negative
 * dentry, by calling the directory's ->atomic_open() method.
 *
 * Returns 0 if successful.  The file will have been created and attached to
 * @file by the filesystem calling finish_open().  may_open() is applied to
 * the resulting file path before 0 is returned.
 *
 * Returns 1 if the file was looked up only or didn't need creating.  The
 * caller will need to perform the open themselves.  @path will have been
 * updated to point to the resulting dentry.  A negative dentry is turned
 * into -ENOENT here, so @path is only filled in with a positive one.
 *
 * Returns an error code otherwise.
 *
 * The reference on @dentry passed in is dropped on every return except the
 * "1" case, where the (possibly replaced) dentry is handed over in @path.
 */
static int atomic_open(struct nameidata *nd, struct dentry *dentry,
			struct path *path, struct file *file,
			const struct open_flags *op,
			int open_flag, umode_t mode,
			int *opened)
{
	/* Sentinel used to detect whether the filesystem set f_path.dentry. */
	struct dentry *const DENTRY_NOT_SET = (void *) -1UL;
	struct inode *dir =  nd->path.dentry->d_inode;
	int error;

	if (!(~open_flag & (O_EXCL | O_CREAT)))	/* both O_EXCL and O_CREAT */
		open_flag &= ~O_TRUNC;

	if (nd->flags & LOOKUP_DIRECTORY)
		open_flag |= O_DIRECTORY;

	file->f_path.dentry = DENTRY_NOT_SET;
	file->f_path.mnt = nd->path.mnt;
	error = dir->i_op->atomic_open(dir, dentry, file,
				       open_to_namei_flags(open_flag),
				       mode, opened);
	d_lookup_done(dentry);
	if (!error) {
		/*
		 * We didn't have the inode before the open, so check open
		 * permission here.
		 */
		int acc_mode = op->acc_mode;
		if (*opened & FILE_CREATED) {
			WARN_ON(!(open_flag & O_CREAT));
			fsnotify_create(dir, dentry);
			/* A file we just created is not access-checked. */
			acc_mode = 0;
		}
		error = may_open(&file->f_path, acc_mode, open_flag);
		if (WARN_ON(error > 0))
			error = -EINVAL;
	} else if (error > 0) {
		/* Positive return: the filesystem did not open the file. */
		if (WARN_ON(file->f_path.dentry == DENTRY_NOT_SET)) {
			error = -EIO;
		} else {
			/*
			 * If the filesystem supplied a dentry of its own,
			 * continue with that one instead of ours.
			 */
			if (file->f_path.dentry) {
				dput(dentry);
				dentry = file->f_path.dentry;
			}
			if (*opened & FILE_CREATED)
				fsnotify_create(dir, dentry);
			if (unlikely(d_is_negative(dentry))) {
				error = -ENOENT;
			} else {
				path->dentry = dentry;
				path->mnt = nd->path.mnt;
				return 1;
			}
		}
	}
	dput(dentry);
	return error;
}
3074
/*
 * Look up and maybe create and open the last component.
 *
 * Must be called with i_mutex held on parent.
 *
 * Returns 0 if the file was successfully atomically created (if necessary) and
 * opened.  In this case the file will be returned attached to @file.
 *
 * Returns 1 if the file was not completely opened at this time, though lookups
 * and creations will have been performed and the dentry returned in @path will
 * be positive upon return if O_CREAT was specified.  If O_CREAT wasn't
 * specified then a negative dentry may be returned.
 *
 * An error code is returned otherwise.
 *
 * FILE_CREATED will be set in @*opened if the dentry was created and will be
 * cleared otherwise prior to returning.
 *
 * @got_write tells whether the caller managed to get write access to the
 * mount; without it, creation is not attempted and -EROFS is remembered as
 * the error to report if the file turns out not to exist.
 */
static int lookup_open(struct nameidata *nd, struct path *path,
			struct file *file,
			const struct open_flags *op,
			bool got_write, int *opened)
{
	struct dentry *dir = nd->path.dentry;
	struct inode *dir_inode = dir->d_inode;
	int open_flag = op->open_flag;
	struct dentry *dentry;
	int error, create_error = 0;
	umode_t mode = op->mode;
	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);

	if (unlikely(IS_DEADDIR(dir_inode)))
		return -ENOENT;

	*opened &= ~FILE_CREATED;
	dentry = d_lookup(dir, &nd->last);
	/*
	 * Loop until we hold either a dentry that is still in lookup or a
	 * cached one that passed revalidation; dentries that fail
	 * revalidation are invalidated and the lookup is retried.
	 */
	for (;;) {
		if (!dentry) {
			dentry = d_alloc_parallel(dir, &nd->last, &wq);
			if (IS_ERR(dentry))
				return PTR_ERR(dentry);
		}
		if (d_in_lookup(dentry))
			break;

		error = d_revalidate(dentry, nd->flags);
		if (likely(error > 0))
			break;
		if (error)
			goto out_dput;
		d_invalidate(dentry);
		dput(dentry);
		dentry = NULL;
	}
	if (dentry->d_inode) {
		/* Cached positive dentry: will open in f_op->open */
		goto out_no_open;
	}

	/*
	 * Checking write permission is tricky, because we don't know if we are
	 * going to actually need it: O_CREAT opens should work as long as the
	 * file exists.  But checking existence breaks atomicity.  The trick is
	 * to check access and if not granted clear O_CREAT from the flags.
	 *
	 * Another problem is returning the "right" error value (e.g. for an
	 * O_EXCL open we want to return EEXIST not EROFS).
	 */
	if (open_flag & O_CREAT) {
		if (!IS_POSIXACL(dir->d_inode))
			mode &= ~current_umask();
		if (unlikely(!got_write)) {
			create_error = -EROFS;
			open_flag &= ~O_CREAT;
			if (open_flag & (O_EXCL | O_TRUNC))
				goto no_open;
			/* No side effects, safe to clear O_CREAT */
		} else {
			create_error = may_o_create(&nd->path, dentry, mode);
			if (create_error) {
				open_flag &= ~O_CREAT;
				if (open_flag & O_EXCL)
					goto no_open;
			}
		}
	} else if ((open_flag & (O_TRUNC|O_WRONLY|O_RDWR)) &&
		   unlikely(!got_write)) {
		/*
		 * No O_CREAT -> atomicity not a requirement -> fall
		 * back to lookup + open
		 */
		goto no_open;
	}

	if (dir_inode->i_op->atomic_open) {
		error = atomic_open(nd, dentry, path, file, op, open_flag,
				    mode, opened);
		/* Prefer the remembered creation error over a bare -ENOENT. */
		if (unlikely(error == -ENOENT) && create_error)
			error = create_error;
		return error;
	}

no_open:
	/* A dentry still in lookup has to be resolved by ->lookup() first. */
	if (d_in_lookup(dentry)) {
		struct dentry *res = dir_inode->i_op->lookup(dir_inode, dentry,
							     nd->flags);
		d_lookup_done(dentry);
		if (unlikely(res)) {
			if (IS_ERR(res)) {
				error = PTR_ERR(res);
				goto out_dput;
			}
			/* ->lookup() returned a different dentry; use it. */
			dput(dentry);
			dentry = res;
		}
	}

	/* Negative dentry, just create the file */
	if (!dentry->d_inode && (open_flag & O_CREAT)) {
		*opened |= FILE_CREATED;
		audit_inode_child(dir_inode, dentry, AUDIT_TYPE_CHILD_CREATE);
		if (!dir_inode->i_op->create) {
			error = -EACCES;
			goto out_dput;
		}
		error = dir_inode->i_op->create(dir_inode, dentry, mode,
						open_flag & O_EXCL);
		if (error)
			goto out_dput;
		fsnotify_create(dir_inode, dentry);
	}
	/* Creation was wanted but not allowed, and the file does not exist. */
	if (unlikely(create_error) && !dentry->d_inode) {
		error = create_error;
		goto out_dput;
	}
out_no_open:
	/* Hand the dentry to the caller, which performs the open itself. */
	path->dentry = dentry;
	path->mnt = nd->path.mnt;
	return 1;

out_dput:
	dput(dentry);
	return error;
}
3219
/*
 * Handle the last step of open()
 *
 * Resolves the final component held in @nd, creating the file when O_CREAT
 * asks for it, and opens it into @file.  FILE_OPENED is set in @*opened once
 * the file has been opened here via vfs_open().
 *
 * Returns 0 on success or a negative errno.  A nonzero return can also come
 * straight from step_into() at finish_lookup; presumably a positive value
 * there asks the caller to follow a trailing symlink - confirm against the
 * caller.
 *
 * Write access to the mount, when obtained (got_write), is dropped again
 * before returning through the "out" label or before follow_managed().
 */
static int do_last(struct nameidata *nd,
		   struct file *file, const struct open_flags *op,
		   int *opened)
{
	struct dentry *dir = nd->path.dentry;
	int open_flag = op->open_flag;
	bool will_truncate = (open_flag & O_TRUNC) != 0;
	bool got_write = false;
	int acc_mode = op->acc_mode;
	unsigned seq;
	struct inode *inode;
	struct path path;
	int error;

	nd->flags &= ~LOOKUP_PARENT;
	nd->flags |= op->intent;

	/* Special last component: no lookup or creation needed. */
	if (nd->last_type != LAST_NORM) {
		error = handle_dots(nd, nd->last_type);
		if (unlikely(error))
			return error;
		goto finish_open;
	}

	if (!(open_flag & O_CREAT)) {
		/* Trailing slashes: the name must resolve to a directory. */
		if (nd->last.name[nd->last.len])
			nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
		/* we _can_ be in RCU mode here */
		error = lookup_fast(nd, &path, &inode, &seq);
		if (likely(error > 0))
			goto finish_lookup;

		if (error < 0)
			return error;

		BUG_ON(nd->inode != dir->d_inode);
		BUG_ON(nd->flags & LOOKUP_RCU);
	} else {
		/* create side of things */
		/*
		 * This will *only* deal with leaving RCU mode - LOOKUP_JUMPED
		 * has been cleared when we got to the last component we are
		 * about to look up
		 */
		error = complete_walk(nd);
		if (error)
			return error;

		audit_inode(nd->name, dir, LOOKUP_PARENT);
		/* trailing slashes? */
		if (unlikely(nd->last.name[nd->last.len]))
			return -EISDIR;
	}

	if (open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) {
		error = mnt_want_write(nd->path.mnt);
		if (!error)
			got_write = true;
		/*
		 * do _not_ fail yet - we might not need that or fail with
		 * a different error; let lookup_open() decide; we'll be
		 * dropping this one anyway.
		 */
	}
	/* Exclusive directory lock for creation, shared lock otherwise. */
	if (open_flag & O_CREAT)
		inode_lock(dir->d_inode);
	else
		inode_lock_shared(dir->d_inode);
	error = lookup_open(nd, &path, file, op, got_write, opened);
	if (open_flag & O_CREAT)
		inode_unlock(dir->d_inode);
	else
		inode_unlock_shared(dir->d_inode);

	if (error <= 0) {
		if (error)
			goto out;

		/* lookup_open() returned 0: the file is already open. */
		if ((*opened & FILE_CREATED) ||
		    !S_ISREG(file_inode(file)->i_mode))
			will_truncate = false;

		audit_inode(nd->name, file->f_path.dentry, 0);
		goto opened;
	}

	if (*opened & FILE_CREATED) {
		/* Don't check for write permission, don't truncate */
		open_flag &= ~O_TRUNC;
		will_truncate = false;
		acc_mode = 0;
		path_to_nameidata(&path, nd);
		goto finish_open_created;
	}

	/*
	 * If atomic_open() acquired write access it is dropped now due to
	 * possible mount and symlink following (this might be optimized away if
	 * necessary...)
	 */
	if (got_write) {
		mnt_drop_write(nd->path.mnt);
		got_write = false;
	}

	error = follow_managed(&path, nd);
	if (unlikely(error < 0))
		return error;

	if (unlikely(d_is_negative(path.dentry))) {
		path_to_nameidata(&path, nd);
		return -ENOENT;
	}

	/*
	 * create/update audit record if it already exists.
	 */
	audit_inode(nd->name, path.dentry, 0);

	/* The file exists, so an O_CREAT|O_EXCL open has to fail. */
	if (unlikely((open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT))) {
		path_to_nameidata(&path, nd);
		return -EEXIST;
	}

	seq = 0;	/* out of RCU mode, so the value doesn't matter */
	inode = d_backing_inode(path.dentry);
finish_lookup:
	error = step_into(nd, &path, 0, inode, seq);
	if (unlikely(error))
		return error;
finish_open:
	/* Why this, you ask?  _Now_ we might have grown LOOKUP_JUMPED... */
	error = complete_walk(nd);
	if (error)
		return error;
	audit_inode(nd->name, nd->path.dentry, 0);
	error = -EISDIR;
	if ((open_flag & O_CREAT) && d_is_dir(nd->path.dentry))
		goto out;
	error = -ENOTDIR;
	if ((nd->flags & LOOKUP_DIRECTORY) && !d_can_lookup(nd->path.dentry))
		goto out;
	/* Only regular files are truncated. */
	if (!d_is_reg(nd->path.dentry))
		will_truncate = false;

	if (will_truncate) {
		error = mnt_want_write(nd->path.mnt);
		if (error)
			goto out;
		got_write = true;
	}
finish_open_created:
	error = may_open(&nd->path, acc_mode, open_flag);
	if (error)
		goto out;
	BUG_ON(*opened & FILE_OPENED); /* once it's opened, it's opened */
	error = vfs_open(&nd->path, file, current_cred());
	if (error)
		goto out;
	*opened |= FILE_OPENED;
opened:
	error = open_check_o_direct(file);
	if (!error)
		error = ima_file_check(file, op->acc_mode, *opened);
	if (!error && will_truncate)
		error = handle_truncate(file);
out:
	/* Drop the file again if it was opened here but a later step failed. */
	if (unlikely(error) && (*opened & FILE_OPENED))
		fput(file);
	/* A positive value must not escape through this exit path. */
	if (unlikely(error > 0)) {
		WARN_ON(1);
		error = -EINVAL;
	}
	if (got_write)
		mnt_drop_write(nd->path.mnt);
	return error;
}
3400
3401struct dentry *vfs_tmpfile(struct dentry *dentry, umode_t mode, int open_flag)
3402{
3403        struct dentry *child = NULL;
3404        struct inode *dir = dentry->d_inode;
3405        struct inode *inode;
3406        int error;
3407
3408        /* we want directory to be writable */
3409        error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
3410        if (error)
3411                goto out_err;
3412        error = -EOPNOTSUPP;
3413        if (!dir->i_op->tmpfile)
3414                goto out_err;
3415        error = -ENOMEM;
3416        child = d_alloc(dentry, &slash_name);
3417        if (unlikely(!child))
3418                goto out_err;
3419        error = dir->i_op->tmpfile(dir, child, mode);
3420        if (error)
3421                goto out_err;
3422        error = -ENOENT;
3423        inode = child->d_inode;
3424        if (unlikely(!inode))
3425                goto out_err;
3426        if (!(open_flag & O_EXCL)) {
3427                spin_lock(&inode->i_lock);
3428                inode->i_state |= I_LINKABLE;
3429                spin_unlock(&inode->i_lock);
3430        }
3431        return child;
3432
3433out_err:
3434        dput(child);
3435        return ERR_PTR(error);
3436}
3437EXPORT_SYMBOL(vfs_tmpfile);
3438
3439static int do_tmpfile(struct nameidata *nd, unsigned flags,
3440                const struct open_flags *op,
3441                struct file *file, int *opened)
3442{
3443        struct dentry *child;
3444        struct path path;
3445        int error = path_lookupat(nd, flags | LOOKUP_DIRECTORY, &path);
3446        if (unlikely(error))
3447                return error;
3448        error = mnt_want_write(path.mnt);
3449        if (unlikely(error))
3450                goto out;
3451        child = vfs_tmpfile(path.dentry, op->mode, op->open_flag);
3452        error = PTR_ERR(child);
3453        if (unlikely(IS_ERR(child)))
3454                goto out2;
3455        dput(path.dentry);
3456        path.dentry = child;
3457        audit_inode(nd->name, child, 0);
3458        /* Don't check for other permissions, the inode was just created */
3459        error = may_open(&path, 0, op->open_flag);
3460        if (error)
3461                goto out2;
3462        file->f_path.mnt = path.mnt;
3463        error = finish_open(file, child, NULL, opened);
3464        if (error)
3465                goto out2;
3466        error = open_check_o_direct(file);
3467        if (error)
3468                fput(file);
3469out2:
3470        mnt_drop_write(path.mnt);
3471out:
3472        path_put(&path);
3473        return error;
3474}
3475
3476static int do_o_path(struct nameidata *nd, unsigned flags, struct file *file)
3477{
3478        struct path path;
3479        int error = path_lookupat(nd, flags, &path);
3480        if (!error) {
3481                audit_inode(nd->name, path.dentry, 0);
3482                error = vfs_open(&path, file, current_cred());
3483                path_put(&path);
3484        }
3485        return error;
3486}
3487
3488static struct file *path_openat(struct nameidata *nd,
3489                        const struct open_flags *op, unsigned flags)
3490{
3491        const char *s;
3492        struct file *file;
3493        int opened = 0;
3494        int error;
3495
3496        file = get_empty_filp();
3497        if (IS_ERR(file))
3498                return file;
3499
3500        file->f_flags = op->open_flag;
3501
3502        if (unlikely(file->f_flags & __O_TMPFILE)) {
3503                error = do_tmpfile(nd, flags, op, file, &opened);
3504                goto out2;
3505        }
3506
3507        if (unlikely(file->f_flags & O_PATH)) {
3508                error = do_o_path(nd, flags, file);
3509                if (!error)
3510                        opened |= FILE_OPENED;
3511                goto out2;
3512        }
3513
3514        s = path_init(nd, flags);
3515        if (IS_ERR(s)) {
3516                put_filp(file);
3517                return ERR_CAST(s);
3518        }
3519        while (!(error = link_path_walk(s, nd)) &&
3520                (error = do_last(nd, file, op, &opened)) > 0) {
3521                nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL);
3522                s = trailing_symlink(nd);
3523                if (IS_ERR(s)) {
3524                        error = PTR_ERR(s);
3525                        break;
3526                }
3527        }
3528        terminate_walk(nd);
3529out2:
3530        if (!(opened & FILE_OPENED)) {
3531                BUG_ON(!error);
3532                put_filp(file);
3533        }
3534        if (unlikely(error)) {
3535                if (error == -EOPENSTALE) {
3536                        if (flags & LOOKUP_RCU)
3537                                error = -ECHILD;
3538                        else
3539                                error = -ESTALE;
3540                }
3541                file = ERR_PTR(error);
3542        }
3543        return file;
3544}
3545
3546struct file *do_filp_open(int dfd, struct filename *pathname,
3547                const struct open_flags *op)
3548{
3549        struct nameidata nd;
3550        int flags = op->lookup_flags;
3551        struct file *filp;
3552
3553        set_nameidata(&nd, dfd, pathname);
3554        filp = path_openat(&nd, op, flags | LOOKUP_RCU);
3555        if (unlikely(filp == ERR_PTR(-ECHILD)))
3556                filp = path_openat(&nd, op, flags);
3557        if (unlikely(filp == ERR_PTR(-ESTALE)))
3558                filp = path_openat(&nd, op, flags | LOOKUP_REVAL);
3559        restore_nameidata();
3560        return filp;
3561}
3562
3563struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt,
3564                const char *name, const struct open_flags *op)
3565{
3566        struct nameidata nd;
3567        struct file *file;
3568        struct filename *filename;
3569        int flags = op->lookup_flags | LOOKUP_ROOT;
3570
3571        nd.root.mnt = mnt;
3572        nd.root.dentry = dentry;
3573
3574        if (d_is_symlink(dentry) && op->intent & LOOKUP_OPEN)
3575                return ERR_PTR(-ELOOP);
3576
3577        filename = getname_kernel(name);
3578        if (IS_ERR(filename))
3579                return ERR_CAST(filename);
3580
3581        set_nameidata(&nd, -1, filename);
3582        file = path_openat(&nd, op, flags | LOOKUP_RCU);
3583        if (unlikely(file == ERR_PTR(-ECHILD)))
3584                file = path_openat(&nd, op, flags);
3585        if (unlikely(file == ERR_PTR(-ESTALE)))
3586                file = path_openat(&nd, op, flags | LOOKUP_REVAL);
3587        restore_nameidata();
3588        putname(filename);
3589        return file;
3590}
3591
3592static struct dentry *filename_create(int dfd, struct filename *name,
3593                                struct path *path, unsigned int lookup_flags)
3594{
3595        struct dentry *dentry = ERR_PTR(-EEXIST);
3596        struct qstr last;
3597        int type;
3598        int err2;
3599        int error;
3600        bool is_dir = (lookup_flags & LOOKUP_DIRECTORY);
3601
3602        /*
3603         * Note that only LOOKUP_REVAL and LOOKUP_DIRECTORY matter here. Any
3604         * other flags passed in are ignored!
3605         */
3606        lookup_flags &= LOOKUP_REVAL;
3607
3608        name = filename_parentat(dfd, name, lookup_flags, path, &last, &type);
3609        if (IS_ERR(name))
3610                return ERR_CAST(name);
3611
3612        /*
3613         * Yucky last component or no last component at all?
3614         * (foo/., foo/.., /////)
3615         */
3616        if (unlikely(type != LAST_NORM))
3617                goto out;
3618
3619        /* don't fail immediately if it's r/o, at least try to report other errors */
3620        err2 = mnt_want_write(path->mnt);
3621        /*
3622         * Do the final lookup.
3623         */
3624        lookup_flags |= LOOKUP_CREATE | LOOKUP_EXCL;
3625        inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT);
3626        dentry = __lookup_hash(&last, path->dentry, lookup_flags);
3627        if (IS_ERR(dentry))
3628                goto unlock;
3629
3630        error = -EEXIST;
3631        if (d_is_positive(dentry))
3632                goto fail;
3633
3634        /*
3635         * Special case - lookup gave negative, but... we had foo/bar/
3636         * From the vfs_mknod() POV we just have a negative dentry -
3637         * all is fine. Let's be bastards - you had / on the end, you've
3638         * been asking for (non-existent) directory. -ENOENT for you.
3639         */
3640        if (unlikely(!is_dir && last.name[last.len])) {
3641                error = -ENOENT;
3642                goto fail;
3643        }
3644        if (unlikely(err2)) {
3645                error = err2;
3646                goto fail;
3647        }
3648        putname(name);
3649        return dentry;
3650fail:
3651        dput(dentry);
3652        dentry = ERR_PTR(error);
3653unlock:
3654        inode_unlock(path->dentry->d_inode);
3655        if (!err2)
3656                mnt_drop_write(path->mnt);
3657out:
3658        path_put(path);
3659        putname(name);
3660        return dentry;
3661}
3662
/*
 * kern_path_create - filename_create() for a pathname held in kernel memory
 *
 * The string is wrapped with getname_kernel() and passed straight on; the
 * result of filename_create() is returned unchanged.
 */
struct dentry *kern_path_create(int dfd, const char *pathname,
				struct path *path, unsigned int lookup_flags)
{
	struct filename *kname = getname_kernel(pathname);

	return filename_create(dfd, kname, path, lookup_flags);
}
EXPORT_SYMBOL(kern_path_create);
3670
3671void done_path_create(struct path *path, struct dentry *dentry)
3672{
3673        dput(dentry);
3674        inode_unlock(path->dentry->d_inode);
3675        mnt_drop_write(path->mnt);
3676        path_put(path);
3677}
3678EXPORT_SYMBOL(done_path_create);
3679
3680inline struct dentry *user_path_create(int dfd, const char __user *pathname,
3681                                struct path *path, unsigned int lookup_flags)
3682{
3683        return filename_create(dfd, getname(pathname), path, lookup_flags);
3684}
3685EXPORT_SYMBOL(user_path_create);
3686
3687int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
3688{
3689        int error = may_create(dir, dentry);
3690
3691        if (error)
3692                return error;
3693
3694        if ((S_ISCHR(mode) || S_ISBLK(mode)) && !capable(CAP_MKNOD))
3695                return -EPERM;
3696
3697        if (!dir->i_op->mknod)
3698                return -EPERM;
3699
3700        error = devcgroup_inode_mknod(mode, dev);
3701        if (error)
3702                return error;
3703
3704        error = security_inode_mknod(dir, dentry, mode, dev);
3705        if (error)
3706                return error;
3707
3708        error = dir->i_op->mknod(dir, dentry, mode, dev);
3709        if (!error)
3710                fsnotify_create(dir, dentry);
3711        return error;
3712}
3713EXPORT_SYMBOL(vfs_mknod);
3714
3715static int may_mknod(umode_t mode)
3716{
3717        switch (mode & S_IFMT) {
3718        case S_IFREG:
3719        case S_IFCHR:
3720        case S_IFBLK:
3721        case S_IFIFO:
3722        case S_IFSOCK:
3723        case 0: /* zero mode translates to S_IFREG */
3724                return 0;
3725        case S_IFDIR:
3726                return -EPERM;
3727        default:
3728                return -EINVAL;
3729        }
3730}
3731
3732SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode,
3733                unsigned, dev)
3734{
3735        struct dentry *dentry;
3736        struct path path;
3737        int error;
3738        unsigned int lookup_flags = 0;
3739
3740        error = may_mknod(mode);
3741        if (error)
3742                return error;
3743retry:
3744        dentry = user_path_create(dfd, filename, &path, lookup_flags);
3745        if (IS_ERR(dentry))
3746                return PTR_ERR(dentry);
3747
3748        if (!IS_POSIXACL(path.dentry->d_inode))
3749                mode &= ~current_umask();
3750        error = security_path_mknod(&path, dentry, mode, dev);
3751        if (error)
3752                goto out;
3753        switch (mode & S_IFMT) {
3754                case 0: case S_IFREG:
3755                        error = vfs_create(path.dentry->d_inode,dentry,mode,true);
3756                        if (!error)
3757                                ima_post_path_mknod(dentry);
3758                        break;
3759                case S_IFCHR: case S_IFBLK:
3760                        error = vfs_mknod(path.dentry->d_inode,dentry,mode,
3761                                        new_decode_dev(dev));
3762                        break;
3763                case S_IFIFO: case S_IFSOCK:
3764                        error = vfs_mknod(path.dentry->d_inode,dentry,mode,0);
3765                        break;
3766        }
3767out:
3768        done_path_create(&path, dentry);
3769        if (retry_estale(error, lookup_flags)) {
3770                lookup_flags |= LOOKUP_REVAL;
3771                goto retry;
3772        }
3773        return error;
3774}
3775
3776SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, dev)
3777{
3778        return sys_mknodat(AT_FDCWD, filename, mode, dev);
3779}
3780
3781int vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
3782{
3783        int error = may_create(dir, dentry);
3784        unsigned max_links = dir->i_sb->s_max_links;
3785
3786        if (error)
3787                return error;
3788
3789        if (!dir->i_op->mkdir)
3790                return -EPERM;
3791
3792        mode &= (S_IRWXUGO|S_ISVTX);
3793        error = security_inode_mkdir(dir, dentry, mode);
3794        if (error)
3795                return error;
3796
3797        if (max_links && dir->i_nlink >= max_links)
3798                return -EMLINK;
3799
3800        error = dir->i_op->mkdir(dir, dentry, mode);
3801        if (!error)
3802                fsnotify_mkdir(dir, dentry);
3803        return error;
3804}
3805EXPORT_SYMBOL(vfs_mkdir);
3806
3807SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode)
3808{
3809        struct dentry *dentry;
3810        struct path path;
3811        int error;
3812        unsigned int lookup_flags = LOOKUP_DIRECTORY;
3813
3814retry:
3815        dentry = user_path_create(dfd, pathname, &path, lookup_flags);
3816        if (IS_ERR(dentry))
3817                return PTR_ERR(dentry);
3818
3819        if (!IS_POSIXACL(path.dentry->d_inode))
3820                mode &= ~current_umask();
3821        error = security_path_mkdir(&path, dentry, mode);
3822        if (!error)
3823                error = vfs_mkdir(path.dentry->d_inode, dentry, mode);
3824        done_path_create(&path, dentry);
3825        if (retry_estale(error, lookup_flags)) {
3826                lookup_flags |= LOOKUP_REVAL;
3827                goto retry;
3828        }
3829        return error;
3830}
3831
3832SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode)
3833{
3834        return sys_mkdirat(AT_FDCWD, pathname, mode);
3835}
3836
3837int vfs_rmdir(struct inode *dir, struct dentry *dentry)
3838{
3839        int error = may_delete(dir, dentry, 1);
3840
3841        if (error)
3842                return error;
3843
3844        if (!dir->i_op->rmdir)
3845                return -EPERM;
3846
3847        dget(dentry);
3848        inode_lock(dentry->d_inode);
3849
3850        error = -EBUSY;
3851        if (is_local_mountpoint(dentry))
3852                goto out;
3853
3854        error = security_inode_rmdir(dir, dentry);
3855        if (error)
3856                goto out;
3857
3858        shrink_dcache_parent(dentry);
3859        error = dir->i_op->rmdir(dir, dentry);
3860        if (error)
3861                goto out;
3862
3863        dentry->d_inode->i_flags |= S_DEAD;
3864        dont_mount(dentry);
3865        detach_mounts(dentry);
3866
3867out:
3868        inode_unlock(dentry->d_inode);
3869        dput(dentry);
3870        if (!error)
3871                d_delete(dentry);
3872        return error;
3873}
3874EXPORT_SYMBOL(vfs_rmdir);
3875
3876static long do_rmdir(int dfd, const char __user *pathname)
3877{
3878        int error = 0;
3879        struct filename *name;
3880        struct dentry *dentry;
3881        struct path path;
3882        struct qstr last;
3883        int type;
3884        unsigned int lookup_flags = 0;
3885retry:
3886        name = filename_parentat(dfd, getname(pathname), lookup_flags,
3887                                &path, &last, &type);
3888        if (IS_ERR(name))
3889                return PTR_ERR(name);
3890
3891        switch (type) {
3892        case LAST_DOTDOT:
3893                error = -ENOTEMPTY;
3894                goto exit1;
3895        case LAST_DOT:
3896                error = -EINVAL;
3897                goto exit1;
3898        case LAST_ROOT:
3899                error = -EBUSY;
3900                goto exit1;
3901        }
3902
3903        error = mnt_want_write(path.mnt);
3904        if (error)
3905                goto exit1;
3906
3907        inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT);
3908        dentry = __lookup_hash(&last, path.dentry, lookup_flags);
3909        error = PTR_ERR(dentry);
3910        if (IS_ERR(dentry))
3911                goto exit2;
3912        if (!dentry->d_inode) {
3913                error = -ENOENT;
3914                goto exit3;
3915        }
3916        error = security_path_rmdir(&path, dentry);
3917        if (error)
3918                goto exit3;
3919        error = vfs_rmdir(path.dentry->d_inode, dentry);
3920exit3:
3921        dput(dentry);
3922exit2:
3923        inode_unlock(path.dentry->d_inode);
3924        mnt_drop_write(path.mnt);
3925exit1:
3926        path_put(&path);
3927        putname(name);
3928        if (retry_estale(error, lookup_flags)) {
3929                lookup_flags |= LOOKUP_REVAL;
3930                goto retry;
3931        }
3932        return error;
3933}
3934
3935SYSCALL_DEFINE1(rmdir, const char __user *, pathname)
3936{
3937        return do_rmdir(AT_FDCWD, pathname);
3938}
3939
3940/**
3941 * vfs_unlink - unlink a filesystem object
3942 * @dir:        parent directory
3943 * @dentry:     victim
3944 * @delegated_inode: returns victim inode, if the inode is delegated.
3945 *
3946 * The caller must hold dir->i_mutex.
3947 *
3948 * If vfs_unlink discovers a delegation, it will return -EWOULDBLOCK and
3949 * return a reference to the inode in delegated_inode.  The caller
3950 * should then break the delegation on that inode and retry.  Because
3951 * breaking a delegation may take a long time, the caller should drop
3952 * dir->i_mutex before doing so.
3953 *
3954 * Alternatively, a caller may pass NULL for delegated_inode.  This may
3955 * be appropriate for callers that expect the underlying filesystem not
3956 * to be NFS exported.
3957 */
3958int vfs_unlink(struct inode *dir, struct dentry *dentry, struct inode **delegated_inode)
3959{
3960        struct inode *target = dentry->d_inode;
3961        int error = may_delete(dir, dentry, 0);
3962
3963        if (error)
3964                return error;
3965
3966        if (!dir->i_op->unlink)
3967                return -EPERM;
3968
3969        inode_lock(target);
3970        if (is_local_mountpoint(dentry))
3971                error = -EBUSY;
3972        else {
3973                error = security_inode_unlink(dir, dentry);
3974                if (!error) {
3975                        error = try_break_deleg(target, delegated_inode);
3976                        if (error)
3977                                goto out;
3978                        error = dir->i_op->unlink(dir, dentry);
3979                        if (!error) {
3980                                dont_mount(dentry);
3981                                detach_mounts(dentry);
3982                        }
3983                }
3984        }
3985out:
3986        inode_unlock(target);
3987
3988        /* We don't d_delete() NFS sillyrenamed files--they still exist. */
3989        if (!error && !(dentry->d_flags & DCACHE_NFSFS_RENAMED)) {
3990                fsnotify_link_count(target);
3991                d_delete(dentry);
3992        }
3993
3994        return error;
3995}
3996EXPORT_SYMBOL(vfs_unlink);
3997
3998/*
3999 * Make sure that the actual truncation of the file will occur outside its
4000 * directory's i_mutex.  Truncate can take a long time if there is a lot of
4001 * writeout happening, and we don't want to prevent access to the directory
4002 * while waiting on the I/O.
4003 */
4004static long do_unlinkat(int dfd, const char __user *pathname)
4005{
4006        int error;
4007        struct filename *name;
4008        struct dentry *dentry;
4009        struct path path;
4010        struct qstr last;
4011        int type;
4012        struct inode *inode = NULL;
4013        struct inode *delegated_inode = NULL;
4014        unsigned int lookup_flags = 0;
4015retry:
4016        name = filename_parentat(dfd, getname(pathname), lookup_flags,
4017                                &path, &last, &type);
4018        if (IS_ERR(name))
4019                return PTR_ERR(name);
4020
4021        error = -EISDIR;
4022        if (type != LAST_NORM)
4023                goto exit1;
4024
4025        error = mnt_want_write(path.mnt);
4026        if (error)
4027                goto exit1;
4028retry_deleg:
4029        inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT);
4030        dentry = __lookup_hash(&last, path.dentry, lookup_flags);
4031        error = PTR_ERR(dentry);
4032        if (!IS_ERR(dentry)) {
4033                /* Why not before? Because we want correct error value */
4034                if (last.name[last.len])
4035                        goto slashes;
4036                inode = dentry->d_inode;
4037                if (d_is_negative(dentry))
4038                        goto slashes;
4039                ihold(inode);
4040                error = security_path_unlink(&path, dentry);
4041                if (error)
4042                        goto exit2;
4043                error = vfs_unlink(path.dentry->d_inode, dentry, &delegated_inode);
4044exit2:
4045                dput(dentry);
4046        }
4047        inode_unlock(path.dentry->d_inode);
4048        if (inode)
4049                iput(inode);    /* truncate the inode here */
4050        inode = NULL;
4051        if (delegated_inode) {
4052                error = break_deleg_wait(&delegated_inode);
4053                if (!error)
4054                        goto retry_deleg;
4055        }
4056        mnt_drop_write(path.mnt);
4057exit1:
4058        path_put(&path);
4059        putname(name);
4060        if (retry_estale(error, lookup_flags)) {
4061                lookup_flags |= LOOKUP_REVAL;
4062                inode = NULL;
4063                goto retry;
4064        }
4065        return error;
4066
4067slashes:
4068        if (d_is_negative(dentry))
4069                error = -ENOENT;
4070        else if (d_is_dir(dentry))
4071                error = -EISDIR;
4072        else
4073                error = -ENOTDIR;
4074        goto exit2;
4075}
4076
4077SYSCALL_DEFINE3(unlinkat, int, dfd, const char __user *, pathname, int, flag)
4078{
4079        if ((flag & ~AT_REMOVEDIR) != 0)
4080                return -EINVAL;
4081
4082        if (flag & AT_REMOVEDIR)
4083                return do_rmdir(dfd, pathname);
4084
4085        return do_unlinkat(dfd, pathname);
4086}
4087
4088SYSCALL_DEFINE1(unlink, const char __user *, pathname)
4089{
4090        return do_unlinkat(AT_FDCWD, pathname);
4091}
4092
4093int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname)
4094{
4095        int error = may_create(dir, dentry);
4096
4097        if (error)
4098                return error;
4099
4100        if (!dir->i_op->symlink)
4101                return -EPERM;
4102
4103        error = security_inode_symlink(dir, dentry, oldname);
4104        if (error)
4105                return error;
4106
4107        error = dir->i_op->symlink(dir, dentry, oldname);
4108        if (!error)
4109                fsnotify_create(dir, dentry);
4110        return error;
4111}
4112EXPORT_SYMBOL(vfs_symlink);
4113
4114SYSCALL_DEFINE3(symlinkat, const char __user *, oldname,
4115                int, newdfd, const char __user *, newname)
4116{
4117        int error;
4118        struct filename *from;
4119        struct dentry *dentry;
4120        struct path path;
4121        unsigned int lookup_flags = 0;
4122
4123        from = getname(oldname);
4124        if (IS_ERR(from))
4125                return PTR_ERR(from);
4126retry:
4127        dentry = user_path_create(newdfd, newname, &path, lookup_flags);
4128        error = PTR_ERR(dentry);
4129        if (IS_ERR(dentry))
4130                goto out_putname;
4131
4132        error = security_path_symlink(&path, dentry, from->name);
4133        if (!error)
4134                error = vfs_symlink(path.dentry->d_inode, dentry, from->name);
4135        done_path_create(&path, dentry);
4136        if (retry_estale(error, lookup_flags)) {
4137                lookup_flags |= LOOKUP_REVAL;
4138                goto retry;
4139        }
4140out_putname:
4141        putname(from);
4142        return error;
4143}
4144
4145SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newname)
4146{
4147        return sys_symlinkat(oldname, AT_FDCWD, newname);
4148}
4149
4150/**
4151 * vfs_link - create a new link
4152 * @old_dentry: object to be linked
4153 * @dir:        new parent
4154 * @new_dentry: where to create the new link
4155 * @delegated_inode: returns inode needing a delegation break
4156 *
4157 * The caller must hold dir->i_mutex
4158 *
4159 * If vfs_link discovers a delegation on the to-be-linked file in need
4160 * of breaking, it will return -EWOULDBLOCK and return a reference to the
4161 * inode in delegated_inode.  The caller should then break the delegation
4162 * and retry.  Because breaking a delegation may take a long time, the
4163 * caller should drop the i_mutex before doing so.
4164 *
4165 * Alternatively, a caller may pass NULL for delegated_inode.  This may
4166 * be appropriate for callers that expect the underlying filesystem not
4167 * to be NFS exported.
4168 */
4169int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry, struct inode **delegated_inode)
4170{
4171        struct inode *inode = old_dentry->d_inode;
4172        unsigned max_links = dir->i_sb->s_max_links;
4173        int error;
4174
4175        if (!inode)
4176                return -ENOENT;
4177
4178        error = may_create(dir, new_dentry);
4179        if (error)
4180                return error;
4181
4182        if (dir->i_sb != inode->i_sb)
4183                return -EXDEV;
4184
4185        /*
4186         * A link to an append-only or immutable file cannot be created.
4187         */
4188        if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
4189                return -EPERM;
4190        /*
4191         * Updating the link count will likely cause i_uid and i_gid to
4192         * be writen back improperly if their true value is unknown to
4193         * the vfs.
4194         */
4195        if (HAS_UNMAPPED_ID(inode))
4196                return -EPERM;
4197        if (!dir->i_op->link)
4198                return -EPERM;
4199        if (S_ISDIR(inode->i_mode))
4200                return -EPERM;
4201
4202        error = security_inode_link(old_dentry, dir, new_dentry);
4203        if (error)
4204                return error;
4205
4206        inode_lock(inode);
4207        /* Make sure we don't allow creating hardlink to an unlinked file */
4208        if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE))
4209                error =  -ENOENT;
4210        else if (max_links && inode->i_nlink >= max_links)
4211                error = -EMLINK;
4212        else {
4213                error = try_break_deleg(inode, delegated_inode);
4214                if (!error)
4215                        error = dir->i_op->link(old_dentry, dir, new_dentry);
4216        }
4217
4218        if (!error && (inode->i_state & I_LINKABLE)) {
4219                spin_lock(&inode->i_lock);
4220                inode->i_state &= ~I_LINKABLE;
4221                spin_unlock(&inode->i_lock);
4222        }
4223        inode_unlock(inode);
4224        if (!error)
4225                fsnotify_link(dir, inode, new_dentry);
4226        return error;
4227}
4228EXPORT_SYMBOL(vfs_link);
4229
4230/*
4231 * Hardlinks are often used in delicate situations.  We avoid
4232 * security-related surprises by not following symlinks on the
4233 * newname.  --KAB
4234 *
4235 * We don't follow them on the oldname either to be compatible
4236 * with linux 2.0, and to avoid hard-linking to directories
4237 * and other special files.  --ADM
4238 */
4239SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
4240                int, newdfd, const char __user *, newname, int, flags)
4241{
4242        struct dentry *new_dentry;
4243        struct path old_path, new_path;
4244        struct inode *delegated_inode = NULL;
4245        int how = 0;
4246        int error;
4247
4248        if ((flags & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0)
4249                return -EINVAL;
4250        /*
4251         * To use null names we require CAP_DAC_READ_SEARCH
4252         * This ensures that not everyone will be able to create
4253         * handlink using the passed filedescriptor.
4254         */
4255        if (flags & AT_EMPTY_PATH) {
4256                if (!capable(CAP_DAC_READ_SEARCH))
4257                        return -ENOENT;
4258                how = LOOKUP_EMPTY;
4259        }
4260
4261        if (flags & AT_SYMLINK_FOLLOW)
4262                how |= LOOKUP_FOLLOW;
4263retry:
4264        error = user_path_at(olddfd, oldname, how, &old_path);
4265        if (error)
4266                return error;
4267
4268        new_dentry = user_path_create(newdfd, newname, &new_path,
4269                                        (how & LOOKUP_REVAL));
4270        error = PTR_ERR(new_dentry);
4271        if (IS_ERR(new_dentry))
4272                goto out;
4273
4274        error = -EXDEV;
4275        if (old_path.mnt != new_path.mnt)
4276                goto out_dput;
4277        error = may_linkat(&old_path);
4278        if (unlikely(error))
4279                goto out_dput;
4280        error = security_path_link(old_path.dentry, &new_path, new_dentry);
4281        if (error)
4282                goto out_dput;
4283        error = vfs_link(old_path.dentry, new_path.dentry->d_inode, new_dentry, &delegated_inode);
4284out_dput:
4285        done_path_create(&new_path, new_dentry);
4286        if (delegated_inode) {
4287                error = break_deleg_wait(&delegated_inode);
4288                if (!error) {
4289                        path_put(&old_path);
4290                        goto retry;
4291                }
4292        }
4293        if (retry_estale(error, how)) {
4294                path_put(&old_path);
4295                how |= LOOKUP_REVAL;
4296                goto retry;
4297        }
4298out:
4299        path_put(&old_path);
4300
4301        return error;
4302}
4303
4304SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname)
4305{
4306        return sys_linkat(AT_FDCWD, oldname, AT_FDCWD, newname, 0);
4307}
4308
4309/**
4310 * vfs_rename - rename a filesystem object
4311 * @old_dir:    parent of source
4312 * @old_dentry: source
4313 * @new_dir:    parent of destination
4314 * @new_dentry: destination
4315 * @delegated_inode: returns an inode needing a delegation break
4316 * @flags:      rename flags
4317 *
4318 * The caller must hold multiple mutexes--see lock_rename().
4319 *
4320 * If vfs_rename discovers a delegation in need of breaking at either
4321 * the source or destination, it will return -EWOULDBLOCK and return a
4322 * reference to the inode in delegated_inode.  The caller should then
4323 * break the delegation and retry.  Because breaking a delegation may
4324 * take a long time, the caller should drop all locks before doing
4325 * so.
4326 *
4327 * Alternatively, a caller may pass NULL for delegated_inode.  This may
4328 * be appropriate for callers that expect the underlying filesystem not
4329 * to be NFS exported.
4330 *
 * Returns 0 on success, and also when source and target are the same
 * inode (nothing is done in that case).  Otherwise a negative errno:
 * whatever the permission checks, the security hook, try_break_deleg()
 * or the filesystem's ->rename() reported, -EPERM if @old_dir has no
 * ->rename() method, -EBUSY if either dentry is a local mountpoint, or
 * -EMLINK if the superblock's s_max_links limit would be exceeded.
 *
4331 * The worst of all namespace operations - renaming directory. "Perverted"
4332 * doesn't even start to describe it. Somebody in UCB had a heck of a trip...
4333 * Problems:
4334 *
4335 *      a) we can get into loop creation.
4336 *      b) race potential - two innocent renames can create a loop together.
4337 *         That's where 4.4 screws up. Current fix: serialization on
4338 *         sb->s_vfs_rename_mutex. We might be more accurate, but that's another
4339 *         story.
4340 *      c) we have to lock _four_ objects - parents and victim (if it exists),
4341 *         and source (if it is not a directory).
4342 *         And that - after we got ->i_mutex on parents (until then we don't know
4343 *         whether the target exists).  Solution: try to be smart with locking
4344 *         order for inodes.  We rely on the fact that tree topology may change
4345 *         only under ->s_vfs_rename_mutex _and_ that parent of the object we
4346 *         move will be locked.  Thus we can rank directories by the tree
4347 *         (ancestors first) and rank all non-directories after them.
4348 *         That works since everybody except rename does "lock parent, lookup,
4349 *         lock child" and rename is under ->s_vfs_rename_mutex.
4350 *         HOWEVER, it relies on the assumption that any object with ->lookup()
4351 *         has no more than 1 dentry.  If "hybrid" objects will ever appear,
4352 *         we'd better make sure that there's no link(2) for them.
4353 *      d) conversion from fhandle to dentry may come in the wrong moment - when
4354 *         we are removing the target. Solution: we will have to grab ->i_mutex
4355 *         in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on
4356 *         ->i_mutex on parents, which works but leads to some truly excessive
4357 *         locking].
4358 */
4359int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
4360               struct inode *new_dir, struct dentry *new_dentry,
4361               struct inode **delegated_inode, unsigned int flags)
4362{
4363        int error;
4364        bool is_dir = d_is_dir(old_dentry);
4365        struct inode *source = old_dentry->d_inode;
4366        struct inode *target = new_dentry->d_inode;
4367        bool new_is_dir = false;
4368        unsigned max_links = new_dir->i_sb->s_max_links;
4369        struct name_snapshot old_name;
4370
            /* Both dentries refer to the same inode: nothing to do. */
4371        if (source == target)
4372                return 0;
4373
            /* The source is always checked with may_delete() against its own type. */
4374        error = may_delete(old_dir, old_dentry, is_dir);
4375        if (error)
4376                return error;
4377
            /*
             * Target checks: may_create() when nothing exists at the
             * destination; otherwise may_delete().  For a plain rename the
             * victim is checked against the source's type (is_dir), for an
             * exchange against its own type (new_is_dir).
             */
4378        if (!target) {
4379                error = may_create(new_dir, new_dentry);
4380        } else {
4381                new_is_dir = d_is_dir(new_dentry);
4382
4383                if (!(flags & RENAME_EXCHANGE))
4384                        error = may_delete(new_dir, new_dentry, is_dir);
4385                else
4386                        error = may_delete(new_dir, new_dentry, new_is_dir);
4387        }
4388        if (error)
4389                return error;
4390
            /* The directory's inode operations provide no rename method. */
4391        if (!old_dir->i_op->rename)
4392                return -EPERM;
4393
4394        /*
4395         * If we are going to change the parent - check write permissions,
4396         * we'll need to flip '..'.
4397         */
4398        if (new_dir != old_dir) {
4399                if (is_dir) {
4400                        error = inode_permission(source, MAY_WRITE);
4401                        if (error)
4402                                return error;
4403                }
                    /* In an exchange the target directory changes parent as well. */
4404                if ((flags & RENAME_EXCHANGE) && new_is_dir) {
4405                        error = inode_permission(target, MAY_WRITE);
4406                        if (error)
4407                                return error;
4408                }
4409        }
4410
4411        error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry,
4412                                      flags);
4413        if (error)
4414                return error;
4415
            /*
             * From here on every exit goes through "out:".  The snapshot of
             * the old name is what fsnotify_move() is given below and is
             * released at the end; the extra reference on new_dentry is
             * dropped by the dput() after the unlock.
             */
4416        take_dentry_name_snapshot(&old_name, old_dentry);
4417        dget(new_dentry);
            /*
             * Non-directory source, or an exchange: lock both inodes.
             * Directory source in a plain rename: lock only the victim,
             * if there is one.
             */
4418        if (!is_dir || (flags & RENAME_EXCHANGE))
4419                lock_two_nondirectories(source, target);
4420        else if (target)
4421                inode_lock(target);
4422
4423        error = -EBUSY;
4424        if (is_local_mountpoint(old_dentry) || is_local_mountpoint(new_dentry))
4425                goto out;
4426
            /*
             * Link-count limit, only for cross-directory renames on
             * superblocks that set s_max_links: a directory arriving in a
             * parent without replacing a directory there is refused once
             * that parent's i_nlink has reached the limit.  The second test
             * is the same rule for old_dir when an exchange brings a
             * directory into it in place of a non-directory.
             */
4427        if (max_links && new_dir != old_dir) {
4428                error = -EMLINK;
4429                if (is_dir && !new_is_dir && new_dir->i_nlink >= max_links)
4430                        goto out;
4431                if ((flags & RENAME_EXCHANGE) && !is_dir && new_is_dir &&
4432                    old_dir->i_nlink >= max_links)
4433                        goto out;
4434        }
            /* A directory is about to replace an existing target: prune its dcache subtree. */
4435        if (is_dir && !(flags & RENAME_EXCHANGE) && target)
4436                shrink_dcache_parent(new_dentry);
            /*
             * Delegations are only tried on non-directories: the source,
             * and the target if it exists.  A failure aborts the rename;
             * see the header comment for the delegated_inode protocol.
             */
4437        if (!is_dir) {
4438                error = try_break_deleg(source, delegated_inode);
4439                if (error)
4440                        goto out;
4441        }
4442        if (target && !new_is_dir) {
4443                error = try_break_deleg(target, delegated_inode);
4444                if (error)
4445                        goto out;
4446        }
            /* Hand the actual rename to the filesystem. */
4447        error = old_dir->i_op->rename(old_dir, old_dentry,
4448                                       new_dir, new_dentry, flags);
4449        if (error)
4450                goto out;
4451
            /*
             * Plain rename over an existing target: a replaced directory is
             * flagged S_DEAD, and dont_mount()/detach_mounts() are applied
             * to the replaced dentry.
             */
4452        if (!(flags & RENAME_EXCHANGE) && target) {
4453                if (is_dir)
4454                        target->i_flags |= S_DEAD;
4455                dont_mount(new_dentry);
4456                detach_mounts(new_dentry);
4457        }
            /*
             * Update the dcache here unless the filesystem type sets
             * FS_RENAME_DOES_D_MOVE (presumably meaning its ->rename()
             * already did so -- confirm against the flag's definition).
             */
4458        if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) {
4459                if (!(flags & RENAME_EXCHANGE))
4460                        d_move(old_dentry, new_dentry);
4461                else
4462                        d_exchange(old_dentry, new_dentry);
4463        }
4464out:
            /* Undo the locking with the same condition that took it. */
4465        if (!is_dir || (flags & RENAME_EXCHANGE))
4466                unlock_two_nondirectories(source, target);
4467        else if (target)
4468                inode_unlock(target);
4469        dput(new_dentry);
            /*
             * Notifications are sent on success only.  The replaced inode is
             * passed for a plain rename, NULL for an exchange.  An exchange
             * emits a second event for the opposite direction.
             * NOTE(review): that second call passes old_dentry->d_name.name
             * as the old name of new_dentry, presumably because the names
             * have been swapped by then -- confirm.
             */
4470        if (!error) {
4471                fsnotify_move(old_dir, new_dir, old_name.name, is_dir,
4472                              !(flags & RENAME_EXCHANGE) ? target : NULL, old_dentry);
4473                if (flags & RENAME_EXCHANGE) {
4474                        fsnotify_move(new_dir, old_dir, old_dentry->d_name.name,
4475                                      new_is_dir, NULL, new_dentry);
4476                }
4477        }
4478        release_dentry_name_snapshot(&old_name);
4479
4480        return error;
4481}
4482EXPORT_SYMBOL(vfs_rename);
4483
/*
 * renameat2(2): rename @oldname (resolved relative to @olddfd) to @newname
 * (resolved relative to @newdfd).
 *
 * @flags may contain RENAME_NOREPLACE, RENAME_EXCHANGE and RENAME_WHITEOUT.
 * Any other bit, or RENAME_EXCHANGE combined with either of the other two,
 * gives -EINVAL.  RENAME_WHITEOUT without CAP_MKNOD gives -EPERM.
 *
 * Both parents are resolved with filename_parentat() and must be on the
 * same mount (-EXDEV).  The two children are looked up under lock_rename()
 * and, once all checks pass, the work is handed to vfs_rename().
 *
 * There are two retry loops: "retry_deleg" repeats the locked section after
 * break_deleg_wait() succeeded, and "retry" repeats the whole lookup with
 * LOOKUP_REVAL added when retry_estale() asks for it.
 */
4484SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname,
4485                int, newdfd, const char __user *, newname, unsigned int, flags)
4486{
4487        struct dentry *old_dentry, *new_dentry;
4488        struct dentry *trap;
4489        struct path old_path, new_path;
4490        struct qstr old_last, new_last;
4491        int old_type, new_type;
4492        struct inode *delegated_inode = NULL;
4493        struct filename *from;
4494        struct filename *to;
4495        unsigned int lookup_flags = 0, target_flags = LOOKUP_RENAME_TARGET;
4496        bool should_retry = false;
4497        int error;
4498
            /* Reject unknown flag bits. */
4499        if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
4500                return -EINVAL;
4501
            /* RENAME_EXCHANGE excludes both RENAME_NOREPLACE and RENAME_WHITEOUT. */
4502        if ((flags & (RENAME_NOREPLACE | RENAME_WHITEOUT)) &&
4503            (flags & RENAME_EXCHANGE))
4504                return -EINVAL;
4505
4506        if ((flags & RENAME_WHITEOUT) && !capable(CAP_MKNOD))
4507                return -EPERM;
4508
            /* For an exchange the target is looked up without LOOKUP_RENAME_TARGET. */
4509        if (flags & RENAME_EXCHANGE)
4510                target_flags = 0;
4511
4512retry:
            /* Resolve both parent directories; the last components are returned separately. */
4513        from = filename_parentat(olddfd, getname(oldname), lookup_flags,
4514                                &old_path, &old_last, &old_type);
4515        if (IS_ERR(from)) {
4516                error = PTR_ERR(from);
4517                goto exit;
4518        }
4519
4520        to = filename_parentat(newdfd, getname(newname), lookup_flags,
4521                                &new_path, &new_last, &new_type);
4522        if (IS_ERR(to)) {
4523                error = PTR_ERR(to);
4524                goto exit1;
4525        }
4526
4527        error = -EXDEV;
4528        if (old_path.mnt != new_path.mnt)
4529                goto exit2;
4530
            /* The source's last component must be of type LAST_NORM. */
4531        error = -EBUSY;
4532        if (old_type != LAST_NORM)
4533                goto exit2;
4534
            /*
             * Same requirement for the target; with RENAME_NOREPLACE the
             * failure is reported as -EEXIST instead of -EBUSY.
             */
4535        if (flags & RENAME_NOREPLACE)
4536                error = -EEXIST;
4537        if (new_type != LAST_NORM)
4538                goto exit2;
4539
4540        error = mnt_want_write(old_path.mnt);
4541        if (error)
4542                goto exit2;
4543
4544retry_deleg:
            /*
             * Lock both parents.  The returned "trap" dentry is compared
             * against both children below to reject ancestor renames.
             */
4545        trap = lock_rename(new_path.dentry, old_path.dentry);
4546
4547        old_dentry = __lookup_hash(&old_last, old_path.dentry, lookup_flags);
4548        error = PTR_ERR(old_dentry);
4549        if (IS_ERR(old_dentry))
4550                goto exit3;
4551        /* source must exist */
4552        error = -ENOENT;
4553        if (d_is_negative(old_dentry))
4554                goto exit4;
4555        new_dentry = __lookup_hash(&new_last, new_path.dentry, lookup_flags | target_flags);
4556        error = PTR_ERR(new_dentry);
4557        if (IS_ERR(new_dentry))
4558                goto exit4;
4559        error = -EEXIST;
4560        if ((flags & RENAME_NOREPLACE) && d_is_positive(new_dentry))
4561                goto exit5;
            /*
             * An exchange needs an existing target, and a non-directory
             * target must not be followed by anything after its name (the
             * byte at name[len] is then non-NUL) -- same rule as for the
             * source below.
             */
4562        if (flags & RENAME_EXCHANGE) {
4563                error = -ENOENT;
4564                if (d_is_negative(new_dentry))
4565                        goto exit5;
4566
4567                if (!d_is_dir(new_dentry)) {
4568                        error = -ENOTDIR;
4569                        if (new_last.name[new_last.len])
4570                                goto exit5;
4571                }
4572        }
4573        /* unless the source is a directory trailing slashes give -ENOTDIR */
4574        if (!d_is_dir(old_dentry)) {
4575                error = -ENOTDIR;
4576                if (old_last.name[old_last.len])
4577                        goto exit5;
4578                if (!(flags & RENAME_EXCHANGE) && new_last.name[new_last.len])
4579                        goto exit5;
4580        }
4581        /* source should not be ancestor of target */
4582        error = -EINVAL;
4583        if (old_dentry == trap)
4584                goto exit5;
4585        /* target should not be an ancestor of source */
            /* (-ENOTEMPTY for a plain rename, -EINVAL is kept for an exchange) */
4586        if (!(flags & RENAME_EXCHANGE))
4587                error = -ENOTEMPTY;
4588        if (new_dentry == trap)
4589                goto exit5;
4590
4591        error = security_path_rename(&old_path, old_dentry,
4592                                     &new_path, new_dentry, flags);
4593        if (error)
4594                goto exit5;
4595        error = vfs_rename(old_path.dentry->d_inode, old_dentry,
4596                           new_path.dentry->d_inode, new_dentry,
4597                           &delegated_inode, flags);
            /* Unwind in reverse order of acquisition. */
4598exit5:
4599        dput(new_dentry);
4600exit4:
4601        dput(old_dentry);
4602exit3:
4603        unlock_rename(new_path.dentry, old_path.dentry);
            /*
             * vfs_rename() handed back an inode whose delegation must be
             * broken.  The rename locks are already dropped here; wait for
             * the break and, if that worked, redo the locked section.
             */
4604        if (delegated_inode) {
4605                error = break_deleg_wait(&delegated_inode);
4606                if (!error)
4607                        goto retry_deleg;
4608        }
4609        mnt_drop_write(old_path.mnt);
4610exit2:
            /*
             * Only record the ESTALE-retry decision here; the jump happens
             * after both paths and both names have been released.
             */
4611        if (retry_estale(error, lookup_flags))
4612                should_retry = true;
4613        path_put(&new_path);
4614        putname(to);
4615exit1:
4616        path_put(&old_path);
4617        putname(from);
4618        if (should_retry) {
4619                should_retry = false;
4620                lookup_flags |= LOOKUP_REVAL;
4621                goto retry;
4622        }
4623exit:
4624        return error;
4625}
4626
4627SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,
4628                int, newdfd, const char __user *, newname)
4629{
4630        return sys_renameat2(olddfd, oldname, newdfd, newname, 0);
4631}
4632
4633SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newname)
4634{
4635        return sys_renameat2(AT_FDCWD, oldname, AT_FDCWD, newname, 0);
4636}
4637
4638int vfs_whiteout(struct inode *dir, struct dentry *dentry)
4639{
4640        int error = may_create(dir, dentry);
4641        if (error)
4642                return error;
4643
4644        if (!dir->i_op->mknod)
4645                return -EPERM;
4646
4647        return dir->i_op->mknod(dir, dentry,
4648                                S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV);
4649}
4650EXPORT_SYMBOL(vfs_whiteout);
4651
4652int readlink_copy(char __user *buffer, int buflen, const char *link)
4653{
4654        int len = PTR_ERR(link);
4655        if (IS_ERR(link))
4656                goto out;
4657
4658        len = strlen(link);
4659        if (len > (unsigned) buflen)
4660                len = buflen;
4661        if (copy_to_user(buffer, link, len))
4662                len = -EFAULT;
4663out:
4664        return len;
4665}
4666
4667/*
4668 * A helper for ->readlink().  This should be used *ONLY* for symlinks that
4669 * have ->get_link() not calling nd_jump_link().  Using (or not using) it
4670 * for any given inode is up to filesystem.
4671 */
4672static int generic_readlink(struct dentry *dentry, char __user *buffer,
4673                            int buflen)
4674{
4675        DEFINE_DELAYED_CALL(done);
4676        struct inode *inode = d_inode(dentry);
4677        const char *link = inode->i_link;
4678        int res;
4679
4680        if (!link) {
4681                link = inode->i_op->get_link(dentry, inode, &done);
4682                if (IS_ERR(link))
4683                        return PTR_ERR(link);
4684        }
4685        res = readlink_copy(buffer, buflen, link);
4686        do_delayed_call(&done);
4687        return res;
4688}
4689
4690/**
4691 * vfs_readlink - copy symlink body into userspace buffer
4692 * @dentry: dentry on which to get symbolic link
4693 * @buffer: user memory pointer
4694 * @buflen: size of buffer
4695 *
4696 * Does not touch atime.  That's up to the caller if necessary
4697 *
4698 * Does not call security hook.
4699 */
4700int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen)
4701{
4702        struct inode *inode = d_inode(dentry);
4703
4704        if (unlikely(!(inode->i_opflags & IOP_DEFAULT_READLINK))) {
4705                if (unlikely(inode->i_op->readlink))
4706                        return inode->i_op->readlink(dentry, buffer, buflen);
4707
4708                if (!d_is_symlink(dentry))
4709                        return -EINVAL;
4710
4711                spin_lock(&inode->i_lock);
4712                inode->i_opflags |= IOP_DEFAULT_READLINK;
4713                spin_unlock(&inode->i_lock);
4714        }
4715
4716        return generic_readlink(dentry, buffer, buflen);
4717}
4718EXPORT_SYMBOL(vfs_readlink);
4719
4720/**
4721 * vfs_get_link - get symlink body
4722 * @dentry: dentry on which to get symbolic link
4723 * @done: caller needs to free returned data with this
4724 *
4725 * Calls security hook and i_op->get_link() on the supplied inode.
4726 *
4727 * It does not touch atime.  That's up to the caller if necessary.
4728 *
4729 * Does not work on "special" symlinks like /proc/$$/fd/N
4730 */
4731const char *vfs_get_link(struct dentry *dentry, struct delayed_call *done)
4732{
4733        const char *res = ERR_PTR(-EINVAL);
4734        struct inode *inode = d_inode(dentry);
4735
4736        if (d_is_symlink(dentry)) {
4737                res = ERR_PTR(security_inode_readlink(dentry));
4738                if (!res)
4739                        res = inode->i_op->get_link(dentry, inode, done);
4740        }
4741        return res;
4742}
4743EXPORT_SYMBOL(vfs_get_link);
4744
4745/* get the link contents into pagecache */
4746const char *page_get_link(struct dentry *dentry, struct inode *inode,
4747                          struct delayed_call *callback)
4748{
4749        char *kaddr;
4750        struct page *page;
4751        struct address_space *mapping = inode->i_mapping;
4752
4753        if (!dentry) {
4754                page = find_get_page(mapping, 0);
4755                if (!page)
4756                        return ERR_PTR(-ECHILD);
4757                if (!PageUptodate(page)) {
4758                        put_page(page);
4759                        return ERR_PTR(-ECHILD);
4760                }
4761        } else {
4762                page = read_mapping_page(mapping, 0, NULL);
4763                if (IS_ERR(page))
4764                        return (char*)page;
4765        }
4766        set_delayed_call(callback, page_put_link, page);
4767        BUG_ON(mapping_gfp_mask(mapping) & __GFP_HIGHMEM);
4768        kaddr = page_address(page);
4769        nd_terminate_link(kaddr, inode->i_size, PAGE_SIZE - 1);
4770        return kaddr;
4771}
4772
4773EXPORT_SYMBOL(page_get_link);
4774
/*
 * Delayed-call callback installed by page_get_link(): drops the reference
 * on the page passed as @arg.
 */
void page_put_link(void *arg)
{
	struct page *pg = arg;

	put_page(pg);
}
EXPORT_SYMBOL(page_put_link);
4780
4781int page_readlink(struct dentry *dentry, char __user *buffer, int buflen)
4782{
4783        DEFINE_DELAYED_CALL(done);
4784        int res = readlink_copy(buffer, buflen,
4785                                page_get_link(dentry, d_inode(dentry),
4786                                              &done));
4787        do_delayed_call(&done);
4788        return res;
4789}
4790EXPORT_SYMBOL(page_readlink);
4791
4792/*
4793 * The nofs argument instructs pagecache_write_begin to pass AOP_FLAG_NOFS
4794 */
4795int __page_symlink(struct inode *inode, const char *symname, int len, int nofs)
4796{
4797        struct address_space *mapping = inode->i_mapping;
4798        struct page *page;
4799        void *fsdata;
4800        int err;
4801        unsigned int flags = 0;
4802        if (nofs)
4803                flags |= AOP_FLAG_NOFS;
4804
4805retry:
4806        err = pagecache_write_begin(NULL, mapping, 0, len-1,
4807                                flags, &page, &fsdata);
4808        if (err)
4809                goto fail;
4810
4811        memcpy(page_address(page), symname, len-1);
4812
4813        err = pagecache_write_end(NULL, mapping, 0, len-1, len-1,
4814                                                        page, fsdata);
4815        if (err < 0)
4816                goto fail;
4817        if (err < len-1)
4818                goto retry;
4819
4820        mark_inode_dirty(inode);
4821        return 0;
4822fail:
4823        return err;
4824}
4825EXPORT_SYMBOL(__page_symlink);
4826
4827int page_symlink(struct inode *inode, const char *symname, int len)
4828{
4829        return __page_symlink(inode, symname, len,
4830                        !mapping_gfp_constraint(inode->i_mapping, __GFP_FS));
4831}
4832EXPORT_SYMBOL(page_symlink);
4833
/*
 * Inode operations for symlinks whose body is read through the page cache:
 * only ->get_link is provided, and it is page_get_link().
 */
4834const struct inode_operations page_symlink_inode_operations = {
4835        .get_link       = page_get_link,
4836};
4837EXPORT_SYMBOL(page_symlink_inode_operations);
4838