linux/fs/namei.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 *  linux/fs/namei.c
   4 *
   5 *  Copyright (C) 1991, 1992  Linus Torvalds
   6 */
   7
   8/*
   9 * Some corrections by tytso.
  10 */
  11
  12/* [Feb 1997 T. Schoebel-Theuer] Complete rewrite of the pathname
  13 * lookup logic.
  14 */
  15/* [Feb-Apr 2000, AV] Rewrite to the new namespace architecture.
  16 */
  17
  18#include <linux/init.h>
  19#include <linux/export.h>
  20#include <linux/kernel.h>
  21#include <linux/slab.h>
  22#include <linux/fs.h>
  23#include <linux/namei.h>
  24#include <linux/pagemap.h>
  25#include <linux/fsnotify.h>
  26#include <linux/personality.h>
  27#include <linux/security.h>
  28#include <linux/ima.h>
  29#include <linux/syscalls.h>
  30#include <linux/mount.h>
  31#include <linux/audit.h>
  32#include <linux/capability.h>
  33#include <linux/file.h>
  34#include <linux/fcntl.h>
  35#include <linux/device_cgroup.h>
  36#include <linux/fs_struct.h>
  37#include <linux/posix_acl.h>
  38#include <linux/hash.h>
  39#include <linux/bitops.h>
  40#include <linux/init_task.h>
  41#include <linux/uaccess.h>
  42
  43#include "internal.h"
  44#include "mount.h"
  45
  46/* [Feb-1997 T. Schoebel-Theuer]
  47 * Fundamental changes in the pathname lookup mechanisms (namei)
  48 * were necessary because of omirr.  The reason is that omirr needs
  49 * to know the _real_ pathname, not the user-supplied one, in case
  50 * of symlinks (and also when transname replacements occur).
  51 *
  52 * The new code replaces the old recursive symlink resolution with
  53 * an iterative one (in case of non-nested symlink chains).  It does
  54 * this with calls to <fs>_follow_link().
  55 * As a side effect, dir_namei(), _namei() and follow_link() are now 
  56 * replaced with a single function lookup_dentry() that can handle all 
  57 * the special cases of the former code.
  58 *
  59 * With the new dcache, the pathname is stored at each inode, at least as
  60 * long as the refcount of the inode is positive.  As a side effect, the
  61 * size of the dcache depends on the inode cache and thus is dynamic.
  62 *
  63 * [29-Apr-1998 C. Scott Ananian] Updated above description of symlink
  64 * resolution to correspond with current state of the code.
  65 *
  66 * Note that the symlink resolution is not *completely* iterative.
  67 * There is still a significant amount of tail- and mid- recursion in
  68 * the algorithm.  Also, note that <fs>_readlink() is not used in
  69 * lookup_dentry(): lookup_dentry() on the result of <fs>_readlink()
  70 * may return different results than <fs>_follow_link().  Many virtual
  71 * filesystems (including /proc) exhibit this behavior.
  72 */
  73
/* [24-Feb-97 T. Schoebel-Theuer] Side effects caused by new implementation:
 * New symlink semantics: when open() is called with flags O_CREAT | O_EXCL
 * and the name already exists in form of a symlink, try to create the new
 * name indicated by the symlink. The old code always complained that the
 * name already exists, due to not following the symlink even if its target
 * is nonexistent.  The new semantics also affect mknod() and link() when
 * the name is a symlink pointing to a non-existent name.
 *
 * I don't know which semantics are the right ones, since I have no access
 * to standards. But I found by trial that HP-UX 9.0 has the full "new"
 * semantics implemented, while SunOS 4.1.1 and Solaris (SunOS 5.4) have the
 * "old" ones. Personally, I think the new semantics are much more logical.
 * Note that "ln old new" where "new" is a symlink pointing to a non-existing
 * file does succeed in both HP-UX and SunOS, but not in Solaris
 * and in the old Linux semantics.
 */
  90
  91/* [16-Dec-97 Kevin Buhr] For security reasons, we change some symlink
  92 * semantics.  See the comments in "open_namei" and "do_link" below.
  93 *
  94 * [10-Sep-98 Alan Modra] Another symlink change.
  95 */
  96
  97/* [Feb-Apr 2000 AV] Complete rewrite. Rules for symlinks:
  98 *      inside the path - always follow.
  99 *      in the last component in creation/removal/renaming - never follow.
 100 *      if LOOKUP_FOLLOW passed - follow.
 101 *      if the pathname has trailing slashes - follow.
 102 *      otherwise - don't follow.
 103 * (applied in that order).
 104 *
 105 * [Jun 2000 AV] Inconsistent behaviour of open() in case if flags==O_CREAT
 106 * restored for 2.4. This is the last surviving part of old 4.2BSD bug.
 107 * During the 2.4 we need to fix the userland stuff depending on it -
 108 * hopefully we will be able to get rid of that wart in 2.5. So far only
 109 * XEmacs seems to be relying on it...
 110 */
 111/*
 112 * [Sep 2001 AV] Single-semaphore locking scheme (kudos to David Holland)
 113 * implemented.  Let's see if raised priority of ->s_vfs_rename_mutex gives
 114 * any extra contention...
 115 */
 116
/* In order to reduce some races, while at the same time doing additional
 * checking and hopefully speeding things up, we copy filenames to the
 * kernel data space before using them.
 *
 * POSIX.1 2.4: an empty pathname is invalid (ENOENT).
 * PATH_MAX includes the nul terminator --RR.
 */
 124
/*
 * Largest pathname, terminating NUL included, that still fits behind the
 * struct filename header when header and name share one PATH_MAX-sized
 * buffer.
 */
#define EMBEDDED_NAME_MAX       (PATH_MAX - offsetof(struct filename, iname))
 126
 127struct filename *
 128getname_flags(const char __user *filename, int flags, int *empty)
 129{
 130        struct filename *result;
 131        char *kname;
 132        int len;
 133
 134        result = audit_reusename(filename);
 135        if (result)
 136                return result;
 137
 138        result = __getname();
 139        if (unlikely(!result))
 140                return ERR_PTR(-ENOMEM);
 141
 142        /*
 143         * First, try to embed the struct filename inside the names_cache
 144         * allocation
 145         */
 146        kname = (char *)result->iname;
 147        result->name = kname;
 148
 149        len = strncpy_from_user(kname, filename, EMBEDDED_NAME_MAX);
 150        if (unlikely(len < 0)) {
 151                __putname(result);
 152                return ERR_PTR(len);
 153        }
 154
 155        /*
 156         * Uh-oh. We have a name that's approaching PATH_MAX. Allocate a
 157         * separate struct filename so we can dedicate the entire
 158         * names_cache allocation for the pathname, and re-do the copy from
 159         * userland.
 160         */
 161        if (unlikely(len == EMBEDDED_NAME_MAX)) {
 162                const size_t size = offsetof(struct filename, iname[1]);
 163                kname = (char *)result;
 164
 165                /*
 166                 * size is chosen that way we to guarantee that
 167                 * result->iname[0] is within the same object and that
 168                 * kname can't be equal to result->iname, no matter what.
 169                 */
 170                result = kzalloc(size, GFP_KERNEL);
 171                if (unlikely(!result)) {
 172                        __putname(kname);
 173                        return ERR_PTR(-ENOMEM);
 174                }
 175                result->name = kname;
 176                len = strncpy_from_user(kname, filename, PATH_MAX);
 177                if (unlikely(len < 0)) {
 178                        __putname(kname);
 179                        kfree(result);
 180                        return ERR_PTR(len);
 181                }
 182                if (unlikely(len == PATH_MAX)) {
 183                        __putname(kname);
 184                        kfree(result);
 185                        return ERR_PTR(-ENAMETOOLONG);
 186                }
 187        }
 188
 189        result->refcnt = 1;
 190        /* The empty path is special. */
 191        if (unlikely(!len)) {
 192                if (empty)
 193                        *empty = 1;
 194                if (!(flags & LOOKUP_EMPTY)) {
 195                        putname(result);
 196                        return ERR_PTR(-ENOENT);
 197                }
 198        }
 199
 200        result->uptr = filename;
 201        result->aname = NULL;
 202        audit_getname(result);
 203        return result;
 204}
 205
 206struct filename *
 207getname(const char __user * filename)
 208{
 209        return getname_flags(filename, 0, NULL);
 210}
 211
 212struct filename *
 213getname_kernel(const char * filename)
 214{
 215        struct filename *result;
 216        int len = strlen(filename) + 1;
 217
 218        result = __getname();
 219        if (unlikely(!result))
 220                return ERR_PTR(-ENOMEM);
 221
 222        if (len <= EMBEDDED_NAME_MAX) {
 223                result->name = (char *)result->iname;
 224        } else if (len <= PATH_MAX) {
 225                const size_t size = offsetof(struct filename, iname[1]);
 226                struct filename *tmp;
 227
 228                tmp = kmalloc(size, GFP_KERNEL);
 229                if (unlikely(!tmp)) {
 230                        __putname(result);
 231                        return ERR_PTR(-ENOMEM);
 232                }
 233                tmp->name = (char *)result;
 234                result = tmp;
 235        } else {
 236                __putname(result);
 237                return ERR_PTR(-ENAMETOOLONG);
 238        }
 239        memcpy((char *)result->name, filename, len);
 240        result->uptr = NULL;
 241        result->aname = NULL;
 242        result->refcnt = 1;
 243        audit_getname(result);
 244
 245        return result;
 246}
 247
 248void putname(struct filename *name)
 249{
 250        BUG_ON(name->refcnt <= 0);
 251
 252        if (--name->refcnt > 0)
 253                return;
 254
 255        if (name->name != name->iname) {
 256                __putname(name->name);
 257                kfree(name);
 258        } else
 259                __putname(name);
 260}
 261
 262static int check_acl(struct inode *inode, int mask)
 263{
 264#ifdef CONFIG_FS_POSIX_ACL
 265        struct posix_acl *acl;
 266
 267        if (mask & MAY_NOT_BLOCK) {
 268                acl = get_cached_acl_rcu(inode, ACL_TYPE_ACCESS);
 269                if (!acl)
 270                        return -EAGAIN;
 271                /* no ->get_acl() calls in RCU mode... */
 272                if (is_uncached_acl(acl))
 273                        return -ECHILD;
 274                return posix_acl_permission(inode, acl, mask & ~MAY_NOT_BLOCK);
 275        }
 276
 277        acl = get_acl(inode, ACL_TYPE_ACCESS);
 278        if (IS_ERR(acl))
 279                return PTR_ERR(acl);
 280        if (acl) {
 281                int error = posix_acl_permission(inode, acl, mask);
 282                posix_acl_release(acl);
 283                return error;
 284        }
 285#endif
 286
 287        return -EAGAIN;
 288}
 289
 290/*
 291 * This does the basic permission checking
 292 */
 293static int acl_permission_check(struct inode *inode, int mask)
 294{
 295        unsigned int mode = inode->i_mode;
 296
 297        if (likely(uid_eq(current_fsuid(), inode->i_uid)))
 298                mode >>= 6;
 299        else {
 300                if (IS_POSIXACL(inode) && (mode & S_IRWXG)) {
 301                        int error = check_acl(inode, mask);
 302                        if (error != -EAGAIN)
 303                                return error;
 304                }
 305
 306                if (in_group_p(inode->i_gid))
 307                        mode >>= 3;
 308        }
 309
 310        /*
 311         * If the DACs are ok we don't need any capability check.
 312         */
 313        if ((mask & ~mode & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0)
 314                return 0;
 315        return -EACCES;
 316}
 317
 318/**
 319 * generic_permission -  check for access rights on a Posix-like filesystem
 320 * @inode:      inode to check access rights for
 321 * @mask:       right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC, ...)
 322 *
 323 * Used to check for read/write/execute permissions on a file.
 324 * We use "fsuid" for this, letting us set arbitrary permissions
 325 * for filesystem access without changing the "normal" uids which
 326 * are used for other things.
 327 *
 328 * generic_permission is rcu-walk aware. It returns -ECHILD in case an rcu-walk
 329 * request cannot be satisfied (eg. requires blocking or too much complexity).
 330 * It would then be called again in ref-walk mode.
 331 */
 332int generic_permission(struct inode *inode, int mask)
 333{
 334        int ret;
 335
 336        /*
 337         * Do the basic permission checks.
 338         */
 339        ret = acl_permission_check(inode, mask);
 340        if (ret != -EACCES)
 341                return ret;
 342
 343        if (S_ISDIR(inode->i_mode)) {
 344                /* DACs are overridable for directories */
 345                if (!(mask & MAY_WRITE))
 346                        if (capable_wrt_inode_uidgid(inode,
 347                                                     CAP_DAC_READ_SEARCH))
 348                                return 0;
 349                if (capable_wrt_inode_uidgid(inode, CAP_DAC_OVERRIDE))
 350                        return 0;
 351                return -EACCES;
 352        }
 353
 354        /*
 355         * Searching includes executable on directories, else just read.
 356         */
 357        mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
 358        if (mask == MAY_READ)
 359                if (capable_wrt_inode_uidgid(inode, CAP_DAC_READ_SEARCH))
 360                        return 0;
 361        /*
 362         * Read/write DACs are always overridable.
 363         * Executable DACs are overridable when there is
 364         * at least one exec bit set.
 365         */
 366        if (!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO))
 367                if (capable_wrt_inode_uidgid(inode, CAP_DAC_OVERRIDE))
 368                        return 0;
 369
 370        return -EACCES;
 371}
 372EXPORT_SYMBOL(generic_permission);
 373
 374/*
 375 * We _really_ want to just do "generic_permission()" without
 376 * even looking at the inode->i_op values. So we keep a cache
 377 * flag in inode->i_opflags, that says "this has not special
 378 * permission function, use the fast case".
 379 */
 380static inline int do_inode_permission(struct inode *inode, int mask)
 381{
 382        if (unlikely(!(inode->i_opflags & IOP_FASTPERM))) {
 383                if (likely(inode->i_op->permission))
 384                        return inode->i_op->permission(inode, mask);
 385
 386                /* This gets set once for the inode lifetime */
 387                spin_lock(&inode->i_lock);
 388                inode->i_opflags |= IOP_FASTPERM;
 389                spin_unlock(&inode->i_lock);
 390        }
 391        return generic_permission(inode, mask);
 392}
 393
 394/**
 395 * sb_permission - Check superblock-level permissions
 396 * @sb: Superblock of inode to check permission on
 397 * @inode: Inode to check permission on
 398 * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
 399 *
 400 * Separate out file-system wide checks from inode-specific permission checks.
 401 */
 402static int sb_permission(struct super_block *sb, struct inode *inode, int mask)
 403{
 404        if (unlikely(mask & MAY_WRITE)) {
 405                umode_t mode = inode->i_mode;
 406
 407                /* Nobody gets write access to a read-only fs. */
 408                if (sb_rdonly(sb) && (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
 409                        return -EROFS;
 410        }
 411        return 0;
 412}
 413
 414/**
 415 * inode_permission - Check for access rights to a given inode
 416 * @inode: Inode to check permission on
 417 * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
 418 *
 419 * Check for read/write/execute permissions on an inode.  We use fs[ug]id for
 420 * this, letting us set arbitrary permissions for filesystem access without
 421 * changing the "normal" UIDs which are used for other things.
 422 *
 423 * When checking for MAY_APPEND, MAY_WRITE must also be set in @mask.
 424 */
 425int inode_permission(struct inode *inode, int mask)
 426{
 427        int retval;
 428
 429        retval = sb_permission(inode->i_sb, inode, mask);
 430        if (retval)
 431                return retval;
 432
 433        if (unlikely(mask & MAY_WRITE)) {
 434                /*
 435                 * Nobody gets write access to an immutable file.
 436                 */
 437                if (IS_IMMUTABLE(inode))
 438                        return -EPERM;
 439
 440                /*
 441                 * Updating mtime will likely cause i_uid and i_gid to be
 442                 * written back improperly if their true value is unknown
 443                 * to the vfs.
 444                 */
 445                if (HAS_UNMAPPED_ID(inode))
 446                        return -EACCES;
 447        }
 448
 449        retval = do_inode_permission(inode, mask);
 450        if (retval)
 451                return retval;
 452
 453        retval = devcgroup_inode_permission(inode, mask);
 454        if (retval)
 455                return retval;
 456
 457        return security_inode_permission(inode, mask);
 458}
 459EXPORT_SYMBOL(inode_permission);
 460
 461/**
 462 * path_get - get a reference to a path
 463 * @path: path to get the reference to
 464 *
 465 * Given a path increment the reference count to the dentry and the vfsmount.
 466 */
 467void path_get(const struct path *path)
 468{
 469        mntget(path->mnt);
 470        dget(path->dentry);
 471}
 472EXPORT_SYMBOL(path_get);
 473
 474/**
 475 * path_put - put a reference to a path
 476 * @path: path to put the reference to
 477 *
 478 * Given a path decrement the reference count to the dentry and the vfsmount.
 479 */
 480void path_put(const struct path *path)
 481{
 482        dput(path->dentry);
 483        mntput(path->mnt);
 484}
 485EXPORT_SYMBOL(path_put);
 486
/* Number of symlink stack slots kept inline in struct nameidata. */
#define EMBEDDED_LEVELS 2
/*
 * State of one pathname walk.  set_nameidata() links it into
 * current->nameidata and restore_nameidata() unlinks it again.
 */
struct nameidata {
        /* current position of the walk */
        struct path     path;
        struct qstr     last;
        /* root used by nd_jump_root(); filled in by set_root() */
        struct path     root;
        struct inode    *inode; /* path.dentry.d_inode */
        /* LOOKUP_* flags, e.g. LOOKUP_RCU, LOOKUP_ROOT, LOOKUP_JUMPED */
        unsigned int    flags;
        /*
         * seq:   d_seq value path.dentry is validated against when leaving
         *        rcu-walk (see unlazy_walk())
         * m_seq: value handed to the mount legitimizing helpers
         */
        unsigned        seq, m_seq;
        int             last_type;
        /* number of entries currently in use on stack[] */
        unsigned        depth;
        /* carried over from/to an enclosing nameidata, if there is one */
        int             total_link_count;
        /*
         * stack points at internal[] until __nd_alloc_stack() replaces it
         * with a heap array of MAXSYMLINKS entries.
         */
        struct saved {
                struct path link;
                struct delayed_call done;
                const char *name;
                unsigned seq;
        } *stack, internal[EMBEDDED_LEVELS];
        struct filename *name;
        /* value of current->nameidata before set_nameidata() */
        struct nameidata *saved;
        /* inode of the symlink examined by may_follow_link() */
        struct inode    *link_inode;
        /* d_seq value sampled for root.dentry by set_root() in RCU mode */
        unsigned        root_seq;
        int             dfd;
} __randomize_layout;
 510
 511static void set_nameidata(struct nameidata *p, int dfd, struct filename *name)
 512{
 513        struct nameidata *old = current->nameidata;
 514        p->stack = p->internal;
 515        p->dfd = dfd;
 516        p->name = name;
 517        p->total_link_count = old ? old->total_link_count : 0;
 518        p->saved = old;
 519        current->nameidata = p;
 520}
 521
 522static void restore_nameidata(void)
 523{
 524        struct nameidata *now = current->nameidata, *old = now->saved;
 525
 526        current->nameidata = old;
 527        if (old)
 528                old->total_link_count = now->total_link_count;
 529        if (now->stack != now->internal)
 530                kfree(now->stack);
 531}
 532
 533static int __nd_alloc_stack(struct nameidata *nd)
 534{
 535        struct saved *p;
 536
 537        if (nd->flags & LOOKUP_RCU) {
 538                p= kmalloc_array(MAXSYMLINKS, sizeof(struct saved),
 539                                  GFP_ATOMIC);
 540                if (unlikely(!p))
 541                        return -ECHILD;
 542        } else {
 543                p= kmalloc_array(MAXSYMLINKS, sizeof(struct saved),
 544                                  GFP_KERNEL);
 545                if (unlikely(!p))
 546                        return -ENOMEM;
 547        }
 548        memcpy(p, nd->internal, sizeof(nd->internal));
 549        nd->stack = p;
 550        return 0;
 551}
 552
 553/**
 554 * path_connected - Verify that a path->dentry is below path->mnt.mnt_root
 555 * @path: nameidate to verify
 556 *
 557 * Rename can sometimes move a file or directory outside of a bind
 558 * mount, path_connected allows those cases to be detected.
 559 */
 560static bool path_connected(const struct path *path)
 561{
 562        struct vfsmount *mnt = path->mnt;
 563        struct super_block *sb = mnt->mnt_sb;
 564
 565        /* Bind mounts and multi-root filesystems can have disconnected paths */
 566        if (!(sb->s_iflags & SB_I_MULTIROOT) && (mnt->mnt_root == sb->s_root))
 567                return true;
 568
 569        return is_subdir(path->dentry, mnt->mnt_root);
 570}
 571
 572static inline int nd_alloc_stack(struct nameidata *nd)
 573{
 574        if (likely(nd->depth != EMBEDDED_LEVELS))
 575                return 0;
 576        if (likely(nd->stack != nd->internal))
 577                return 0;
 578        return __nd_alloc_stack(nd);
 579}
 580
 581static void drop_links(struct nameidata *nd)
 582{
 583        int i = nd->depth;
 584        while (i--) {
 585                struct saved *last = nd->stack + i;
 586                do_delayed_call(&last->done);
 587                clear_delayed_call(&last->done);
 588        }
 589}
 590
 591static void terminate_walk(struct nameidata *nd)
 592{
 593        drop_links(nd);
 594        if (!(nd->flags & LOOKUP_RCU)) {
 595                int i;
 596                path_put(&nd->path);
 597                for (i = 0; i < nd->depth; i++)
 598                        path_put(&nd->stack[i].link);
 599                if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
 600                        path_put(&nd->root);
 601                        nd->root.mnt = NULL;
 602                }
 603        } else {
 604                nd->flags &= ~LOOKUP_RCU;
 605                if (!(nd->flags & LOOKUP_ROOT))
 606                        nd->root.mnt = NULL;
 607                rcu_read_unlock();
 608        }
 609        nd->depth = 0;
 610}
 611
/*
 * Try to turn @path into a properly referenced path: legitimize the mount,
 * grab the dentry through its lockref, then check the dentry's d_seq
 * against @seq.  Returns true only if all three steps succeed.
 *
 * What is left in @path on failure:
 *   - __legitimize_mnt() returned > 0: mnt and dentry are both NULL;
 *   - __legitimize_mnt() returned < 0: only dentry is NULL;
 *   - the dentry was already dead:     only dentry is NULL;
 *   - only the seqcount check failed:  both stay set.
 * NOTE(review): presumably the NULLed members are exactly those on which
 * no reference was obtained - confirm against __legitimize_mnt().
 *
 * path_put is needed afterwards regardless of success or failure
 */
static bool legitimize_path(struct nameidata *nd,
                            struct path *path, unsigned seq)
{
        int res = __legitimize_mnt(path->mnt, nd->m_seq);
        if (unlikely(res)) {
                if (res > 0)
                        path->mnt = NULL;
                path->dentry = NULL;
                return false;
        }
        if (unlikely(!lockref_get_not_dead(&path->dentry->d_lockref))) {
                path->dentry = NULL;
                return false;
        }
        return !read_seqcount_retry(&path->dentry->d_seq, seq);
}
 629
/*
 * Legitimize every symlink on the link stack, bottom entry first.
 * Returns true if all of them could be legitimized.
 *
 * On the first failure the delayed calls of all entries are run via
 * drop_links() and nd->depth is cut down to the entries processed so far,
 * the failed one included, so that a later terminate_walk() calls
 * path_put() on exactly those entries.
 */
static bool legitimize_links(struct nameidata *nd)
{
        int i;
        for (i = 0; i < nd->depth; i++) {
                struct saved *last = nd->stack + i;
                if (unlikely(!legitimize_path(nd, &last->link, last->seq))) {
                        drop_links(nd);
                        /* entries above i were never touched */
                        nd->depth = i + 1;
                        return false;
                }
        }
        return true;
}
 643
 644/*
 645 * Path walking has 2 modes, rcu-walk and ref-walk (see
 646 * Documentation/filesystems/path-lookup.txt).  In situations when we can't
 647 * continue in RCU mode, we attempt to drop out of rcu-walk mode and grab
 648 * normal reference counts on dentries and vfsmounts to transition to ref-walk
 649 * mode.  Refcounts are grabbed at the last known good point before rcu-walk
 650 * got stuck, so ref-walk may continue from there. If this is not successful
 651 * (eg. a seqcount has changed), then failure is returned and it's up to caller
 652 * to restart the path walk from the beginning in ref-walk mode.
 653 */
 654
/**
 * unlazy_walk - try to switch to ref-walk mode.
 * @nd: nameidata pathwalk data
 * Returns: 0 on success, -ECHILD on failure
 *
 * unlazy_walk attempts to legitimize the current nd->path and nd->root
 * for ref-walk mode.
 * Must be called from rcu-walk context.
 * Nothing should touch nameidata between unlazy_walk() failure and
 * terminate_walk().
 *
 * LOOKUP_RCU is cleared and the RCU read section is left on every return
 * path, success or failure.
 */
static int unlazy_walk(struct nameidata *nd)
{
        /* sampled up front for the sanity check after legitimizing */
        struct dentry *parent = nd->path.dentry;

        BUG_ON(!(nd->flags & LOOKUP_RCU));

        nd->flags &= ~LOOKUP_RCU;
        if (unlikely(!legitimize_links(nd)))
                goto out2;
        if (unlikely(!legitimize_path(nd, &nd->path, nd->seq)))
                goto out1;
        /* nd->root is only legitimized when set and not caller-supplied */
        if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
                if (unlikely(!legitimize_path(nd, &nd->root, nd->root_seq)))
                        goto out;
        }
        rcu_read_unlock();
        BUG_ON(nd->inode != parent->d_inode);
        return 0;

out2:
        /*
         * The links failed before nd->path was looked at; clear it.
         * NOTE(review): presumably so that the path_put() in
         * terminate_walk() has nothing to drop - confirm.
         */
        nd->path.mnt = NULL;
        nd->path.dentry = NULL;
out1:
        /* nd->root was not legitimized either; forget it unless LOOKUP_ROOT */
        if (!(nd->flags & LOOKUP_ROOT))
                nd->root.mnt = NULL;
out:
        rcu_read_unlock();
        return -ECHILD;
}
 695
/**
 * unlazy_child - try to switch to ref-walk mode.
 * @nd: nameidata pathwalk data
 * @dentry: child of nd->path.dentry
 * @seq: seq number to check dentry against
 * Returns: 0 on success, -ECHILD on failure
 *
 * unlazy_child attempts to legitimize the current nd->path, nd->root and dentry
 * for ref-walk mode.  @dentry must be a path found by a do_lookup call on
 * @nd.  Must be called from rcu-walk context.
 * Nothing should touch nameidata between unlazy_child() failure and
 * terminate_walk().
 *
 * LOOKUP_RCU is cleared and the RCU read section is left on every return
 * path.  On failure any reference obtained on @dentry is dropped again
 * here.
 */
static int unlazy_child(struct nameidata *nd, struct dentry *dentry, unsigned seq)
{
        BUG_ON(!(nd->flags & LOOKUP_RCU));

        nd->flags &= ~LOOKUP_RCU;
        if (unlikely(!legitimize_links(nd)))
                goto out2;
        if (unlikely(!legitimize_mnt(nd->path.mnt, nd->m_seq)))
                goto out2;
        if (unlikely(!lockref_get_not_dead(&nd->path.dentry->d_lockref)))
                goto out1;

        /*
         * We need to move both the parent and the dentry from the RCU domain
         * to be properly refcounted. And the sequence number in the dentry
         * validates *both* dentry counters, since we checked the sequence
         * number of the parent after we got the child sequence number. So we
         * know the parent must still be valid if the child sequence number is
         * still valid.
         */
        if (unlikely(!lockref_get_not_dead(&dentry->d_lockref)))
                goto out;
        if (unlikely(read_seqcount_retry(&dentry->d_seq, seq))) {
                /* the child reference was obtained above; give it back */
                rcu_read_unlock();
                dput(dentry);
                goto drop_root_mnt;
        }
        /*
         * Sequence counts matched. Now make sure that the root is
         * still valid and get it if required.
         */
        if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
                if (unlikely(!legitimize_path(nd, &nd->root, nd->root_seq))) {
                        /* nd->root is left as legitimize_path() set it */
                        rcu_read_unlock();
                        dput(dentry);
                        return -ECHILD;
                }
        }

        rcu_read_unlock();
        return 0;

out2:
        /* reached before the mount of nd->path was legitimized */
        nd->path.mnt = NULL;
out1:
        /* reached before the dentry of nd->path was grabbed */
        nd->path.dentry = NULL;
out:
        rcu_read_unlock();
drop_root_mnt:
        /* nd->root was never legitimized on these paths */
        if (!(nd->flags & LOOKUP_ROOT))
                nd->root.mnt = NULL;
        return -ECHILD;
}
 761
 762static inline int d_revalidate(struct dentry *dentry, unsigned int flags)
 763{
 764        if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE))
 765                return dentry->d_op->d_revalidate(dentry, flags);
 766        else
 767                return 1;
 768}
 769
 770/**
 771 * complete_walk - successful completion of path walk
 772 * @nd:  pointer nameidata
 773 *
 774 * If we had been in RCU mode, drop out of it and legitimize nd->path.
 775 * Revalidate the final result, unless we'd already done that during
 776 * the path walk or the filesystem doesn't ask for it.  Return 0 on
 777 * success, -error on failure.  In case of failure caller does not
 778 * need to drop nd->path.
 779 */
 780static int complete_walk(struct nameidata *nd)
 781{
 782        struct dentry *dentry = nd->path.dentry;
 783        int status;
 784
 785        if (nd->flags & LOOKUP_RCU) {
 786                if (!(nd->flags & LOOKUP_ROOT))
 787                        nd->root.mnt = NULL;
 788                if (unlikely(unlazy_walk(nd)))
 789                        return -ECHILD;
 790        }
 791
 792        if (likely(!(nd->flags & LOOKUP_JUMPED)))
 793                return 0;
 794
 795        if (likely(!(dentry->d_flags & DCACHE_OP_WEAK_REVALIDATE)))
 796                return 0;
 797
 798        status = dentry->d_op->d_weak_revalidate(dentry, nd->flags);
 799        if (status > 0)
 800                return 0;
 801
 802        if (!status)
 803                status = -ESTALE;
 804
 805        return status;
 806}
 807
/*
 * Fill nd->root from current->fs.
 *
 * In RCU mode the root is copied by plain assignment (no references are
 * taken here) inside a retry loop on fs->seq, and nd->root_seq is sampled
 * from the root dentry's d_seq.  Otherwise get_fs_root() is used.
 * NOTE(review): get_fs_root() presumably returns a referenced path - this
 * matches the path_put(&nd->root) in terminate_walk(); confirm.
 */
static void set_root(struct nameidata *nd)
{
        struct fs_struct *fs = current->fs;

        if (nd->flags & LOOKUP_RCU) {
                unsigned seq;

                /* repeat until fs->seq did not change during the copy */
                do {
                        seq = read_seqcount_begin(&fs->seq);
                        nd->root = fs->root;
                        nd->root_seq = __read_seqcount_begin(&nd->root.dentry->d_seq);
                } while (read_seqcount_retry(&fs->seq, seq));
        } else {
                get_fs_root(fs, &nd->root);
        }
}
 824
 825static void path_put_conditional(struct path *path, struct nameidata *nd)
 826{
 827        dput(path->dentry);
 828        if (path->mnt != nd->path.mnt)
 829                mntput(path->mnt);
 830}
 831
 832static inline void path_to_nameidata(const struct path *path,
 833                                        struct nameidata *nd)
 834{
 835        if (!(nd->flags & LOOKUP_RCU)) {
 836                dput(nd->path.dentry);
 837                if (nd->path.mnt != path->mnt)
 838                        mntput(nd->path.mnt);
 839        }
 840        nd->path.mnt = path->mnt;
 841        nd->path.dentry = path->dentry;
 842}
 843
 844static int nd_jump_root(struct nameidata *nd)
 845{
 846        if (nd->flags & LOOKUP_RCU) {
 847                struct dentry *d;
 848                nd->path = nd->root;
 849                d = nd->path.dentry;
 850                nd->inode = d->d_inode;
 851                nd->seq = nd->root_seq;
 852                if (unlikely(read_seqcount_retry(&d->d_seq, nd->seq)))
 853                        return -ECHILD;
 854        } else {
 855                path_put(&nd->path);
 856                nd->path = nd->root;
 857                path_get(&nd->path);
 858                nd->inode = nd->path.dentry->d_inode;
 859        }
 860        nd->flags |= LOOKUP_JUMPED;
 861        return 0;
 862}
 863
 864/*
 865 * Helper to directly jump to a known parsed path from ->get_link,
 866 * caller must have taken a reference to path beforehand.
 867 */
 868void nd_jump_link(struct path *path)
 869{
 870        struct nameidata *nd = current->nameidata;
 871        path_put(&nd->path);
 872
 873        nd->path = *path;
 874        nd->inode = nd->path.dentry->d_inode;
 875        nd->flags |= LOOKUP_JUMPED;
 876}
 877
 878static inline void put_link(struct nameidata *nd)
 879{
 880        struct saved *last = nd->stack + --nd->depth;
 881        do_delayed_call(&last->done);
 882        if (!(nd->flags & LOOKUP_RCU))
 883                path_put(&last->link);
 884}
 885
 886int sysctl_protected_symlinks __read_mostly = 0;
 887int sysctl_protected_hardlinks __read_mostly = 0;
 888int sysctl_protected_fifos __read_mostly;
 889int sysctl_protected_regular __read_mostly;
 890
 891/**
 892 * may_follow_link - Check symlink following for unsafe situations
 893 * @nd: nameidata pathwalk data
 894 *
 895 * In the case of the sysctl_protected_symlinks sysctl being enabled,
 896 * CAP_DAC_OVERRIDE needs to be specifically ignored if the symlink is
 897 * in a sticky world-writable directory. This is to protect privileged
 898 * processes from failing races against path names that may change out
 899 * from under them by way of other users creating malicious symlinks.
 900 * It will permit symlinks to be followed only when outside a sticky
 901 * world-writable directory, or when the uid of the symlink and follower
 902 * match, or when the directory owner matches the symlink's owner.
 903 *
 904 * Returns 0 if following the symlink is allowed, -ve on error.
 905 */
 906static inline int may_follow_link(struct nameidata *nd)
 907{
 908        const struct inode *inode;
 909        const struct inode *parent;
 910        kuid_t puid;
 911
 912        if (!sysctl_protected_symlinks)
 913                return 0;
 914
 915        /* Allowed if owner and follower match. */
 916        inode = nd->link_inode;
 917        if (uid_eq(current_cred()->fsuid, inode->i_uid))
 918                return 0;
 919
 920        /* Allowed if parent directory not sticky and world-writable. */
 921        parent = nd->inode;
 922        if ((parent->i_mode & (S_ISVTX|S_IWOTH)) != (S_ISVTX|S_IWOTH))
 923                return 0;
 924
 925        /* Allowed if parent directory and link owner match. */
 926        puid = parent->i_uid;
 927        if (uid_valid(puid) && uid_eq(puid, inode->i_uid))
 928                return 0;
 929
 930        if (nd->flags & LOOKUP_RCU)
 931                return -ECHILD;
 932
 933        audit_inode(nd->name, nd->stack[0].link.dentry, 0);
 934        audit_log_link_denied("follow_link");
 935        return -EACCES;
 936}
 937
 938/**
 939 * safe_hardlink_source - Check for safe hardlink conditions
 940 * @inode: the source inode to hardlink from
 941 *
 942 * Return false if at least one of the following conditions:
 943 *    - inode is not a regular file
 944 *    - inode is setuid
 945 *    - inode is setgid and group-exec
 946 *    - access failure for read and write
 947 *
 948 * Otherwise returns true.
 949 */
 950static bool safe_hardlink_source(struct inode *inode)
 951{
 952        umode_t mode = inode->i_mode;
 953
 954        /* Special files should not get pinned to the filesystem. */
 955        if (!S_ISREG(mode))
 956                return false;
 957
 958        /* Setuid files should not get pinned to the filesystem. */
 959        if (mode & S_ISUID)
 960                return false;
 961
 962        /* Executable setgid files should not get pinned to the filesystem. */
 963        if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))
 964                return false;
 965
 966        /* Hardlinking to unreadable or unwritable sources is dangerous. */
 967        if (inode_permission(inode, MAY_READ | MAY_WRITE))
 968                return false;
 969
 970        return true;
 971}
 972
 973/**
 974 * may_linkat - Check permissions for creating a hardlink
 975 * @link: the source to hardlink from
 976 *
 977 * Block hardlink when all of:
 978 *  - sysctl_protected_hardlinks enabled
 979 *  - fsuid does not match inode
 980 *  - hardlink source is unsafe (see safe_hardlink_source() above)
 981 *  - not CAP_FOWNER in a namespace with the inode owner uid mapped
 982 *
 983 * Returns 0 if successful, -ve on error.
 984 */
 985static int may_linkat(struct path *link)
 986{
 987        struct inode *inode = link->dentry->d_inode;
 988
 989        /* Inode writeback is not safe when the uid or gid are invalid. */
 990        if (!uid_valid(inode->i_uid) || !gid_valid(inode->i_gid))
 991                return -EOVERFLOW;
 992
 993        if (!sysctl_protected_hardlinks)
 994                return 0;
 995
 996        /* Source inode owner (or CAP_FOWNER) can hardlink all they like,
 997         * otherwise, it must be a safe source.
 998         */
 999        if (safe_hardlink_source(inode) || inode_owner_or_capable(inode))
1000                return 0;
1001
1002        audit_log_link_denied("linkat");
1003        return -EPERM;
1004}
1005
1006/**
1007 * may_create_in_sticky - Check whether an O_CREAT open in a sticky directory
1008 *                        should be allowed, or not, on files that already
1009 *                        exist.
1010 * @dir: the sticky parent directory
1011 * @inode: the inode of the file to open
1012 *
1013 * Block an O_CREAT open of a FIFO (or a regular file) when:
1014 *   - sysctl_protected_fifos (or sysctl_protected_regular) is enabled
1015 *   - the file already exists
1016 *   - we are in a sticky directory
1017 *   - we don't own the file
1018 *   - the owner of the directory doesn't own the file
1019 *   - the directory is world writable
1020 * If the sysctl_protected_fifos (or sysctl_protected_regular) is set to 2
1021 * the directory doesn't have to be world writable: being group writable will
1022 * be enough.
1023 *
1024 * Returns 0 if the open is allowed, -ve on error.
1025 */
1026static int may_create_in_sticky(struct dentry * const dir,
1027                                struct inode * const inode)
1028{
1029        if ((!sysctl_protected_fifos && S_ISFIFO(inode->i_mode)) ||
1030            (!sysctl_protected_regular && S_ISREG(inode->i_mode)) ||
1031            likely(!(dir->d_inode->i_mode & S_ISVTX)) ||
1032            uid_eq(inode->i_uid, dir->d_inode->i_uid) ||
1033            uid_eq(current_fsuid(), inode->i_uid))
1034                return 0;
1035
1036        if (likely(dir->d_inode->i_mode & 0002) ||
1037            (dir->d_inode->i_mode & 0020 &&
1038             ((sysctl_protected_fifos >= 2 && S_ISFIFO(inode->i_mode)) ||
1039              (sysctl_protected_regular >= 2 && S_ISREG(inode->i_mode))))) {
1040                return -EACCES;
1041        }
1042        return 0;
1043}
1044
1045static __always_inline
1046const char *get_link(struct nameidata *nd)
1047{
1048        struct saved *last = nd->stack + nd->depth - 1;
1049        struct dentry *dentry = last->link.dentry;
1050        struct inode *inode = nd->link_inode;
1051        int error;
1052        const char *res;
1053
1054        if (!(nd->flags & LOOKUP_RCU)) {
1055                touch_atime(&last->link);
1056                cond_resched();
1057        } else if (atime_needs_update(&last->link, inode)) {
1058                if (unlikely(unlazy_walk(nd)))
1059                        return ERR_PTR(-ECHILD);
1060                touch_atime(&last->link);
1061        }
1062
1063        error = security_inode_follow_link(dentry, inode,
1064                                           nd->flags & LOOKUP_RCU);
1065        if (unlikely(error))
1066                return ERR_PTR(error);
1067
1068        nd->last_type = LAST_BIND;
1069        res = READ_ONCE(inode->i_link);
1070        if (!res) {
1071                const char * (*get)(struct dentry *, struct inode *,
1072                                struct delayed_call *);
1073                get = inode->i_op->get_link;
1074                if (nd->flags & LOOKUP_RCU) {
1075                        res = get(NULL, inode, &last->done);
1076                        if (res == ERR_PTR(-ECHILD)) {
1077                                if (unlikely(unlazy_walk(nd)))
1078                                        return ERR_PTR(-ECHILD);
1079                                res = get(dentry, inode, &last->done);
1080                        }
1081                } else {
1082                        res = get(dentry, inode, &last->done);
1083                }
1084                if (IS_ERR_OR_NULL(res))
1085                        return res;
1086        }
1087        if (*res == '/') {
1088                if (!nd->root.mnt)
1089                        set_root(nd);
1090                if (unlikely(nd_jump_root(nd)))
1091                        return ERR_PTR(-ECHILD);
1092                while (unlikely(*++res == '/'))
1093                        ;
1094        }
1095        if (!*res)
1096                res = NULL;
1097        return res;
1098}
1099
1100/*
1101 * follow_up - Find the mountpoint of path's vfsmount
1102 *
1103 * Given a path, find the mountpoint of its source file system.
1104 * Replace @path with the path of the mountpoint in the parent mount.
1105 * Up is towards /.
1106 *
1107 * Return 1 if we went up a level and 0 if we were already at the
1108 * root.
1109 */
1110int follow_up(struct path *path)
1111{
1112        struct mount *mnt = real_mount(path->mnt);
1113        struct mount *parent;
1114        struct dentry *mountpoint;
1115
1116        read_seqlock_excl(&mount_lock);
1117        parent = mnt->mnt_parent;
1118        if (parent == mnt) {
1119                read_sequnlock_excl(&mount_lock);
1120                return 0;
1121        }
1122        mntget(&parent->mnt);
1123        mountpoint = dget(mnt->mnt_mountpoint);
1124        read_sequnlock_excl(&mount_lock);
1125        dput(path->dentry);
1126        path->dentry = mountpoint;
1127        mntput(path->mnt);
1128        path->mnt = &parent->mnt;
1129        return 1;
1130}
1131EXPORT_SYMBOL(follow_up);
1132
1133/*
1134 * Perform an automount
1135 * - return -EISDIR to tell follow_managed() to stop and return the path we
1136 *   were called with.
1137 */
1138static int follow_automount(struct path *path, struct nameidata *nd,
1139                            bool *need_mntput)
1140{
1141        struct vfsmount *mnt;
1142        int err;
1143
1144        if (!path->dentry->d_op || !path->dentry->d_op->d_automount)
1145                return -EREMOTE;
1146
1147        /* We don't want to mount if someone's just doing a stat -
1148         * unless they're stat'ing a directory and appended a '/' to
1149         * the name.
1150         *
1151         * We do, however, want to mount if someone wants to open or
1152         * create a file of any type under the mountpoint, wants to
1153         * traverse through the mountpoint or wants to open the
1154         * mounted directory.  Also, autofs may mark negative dentries
1155         * as being automount points.  These will need the attentions
1156         * of the daemon to instantiate them before they can be used.
1157         */
1158        if (!(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY |
1159                           LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) &&
1160            path->dentry->d_inode)
1161                return -EISDIR;
1162
1163        nd->total_link_count++;
1164        if (nd->total_link_count >= 40)
1165                return -ELOOP;
1166
1167        mnt = path->dentry->d_op->d_automount(path);
1168        if (IS_ERR(mnt)) {
1169                /*
1170                 * The filesystem is allowed to return -EISDIR here to indicate
1171                 * it doesn't want to automount.  For instance, autofs would do
1172                 * this so that its userspace daemon can mount on this dentry.
1173                 *
1174                 * However, we can only permit this if it's a terminal point in
1175                 * the path being looked up; if it wasn't then the remainder of
1176                 * the path is inaccessible and we should say so.
1177                 */
1178                if (PTR_ERR(mnt) == -EISDIR && (nd->flags & LOOKUP_PARENT))
1179                        return -EREMOTE;
1180                return PTR_ERR(mnt);
1181        }
1182
1183        if (!mnt) /* mount collision */
1184                return 0;
1185
1186        if (!*need_mntput) {
1187                /* lock_mount() may release path->mnt on error */
1188                mntget(path->mnt);
1189                *need_mntput = true;
1190        }
1191        err = finish_automount(mnt, path);
1192
1193        switch (err) {
1194        case -EBUSY:
1195                /* Someone else made a mount here whilst we were busy */
1196                return 0;
1197        case 0:
1198                path_put(path);
1199                path->mnt = mnt;
1200                path->dentry = dget(mnt->mnt_root);
1201                return 0;
1202        default:
1203                return err;
1204        }
1205
1206}
1207
1208/*
1209 * Handle a dentry that is managed in some way.
1210 * - Flagged for transit management (autofs)
1211 * - Flagged as mountpoint
1212 * - Flagged as automount point
1213 *
1214 * This may only be called in refwalk mode.
1215 *
1216 * Serialization is taken care of in namespace.c
1217 */
1218static int follow_managed(struct path *path, struct nameidata *nd)
1219{
1220        struct vfsmount *mnt = path->mnt; /* held by caller, must be left alone */
1221        unsigned managed;
1222        bool need_mntput = false;
1223        int ret = 0;
1224
1225        /* Given that we're not holding a lock here, we retain the value in a
1226         * local variable for each dentry as we look at it so that we don't see
1227         * the components of that value change under us */
1228        while (managed = READ_ONCE(path->dentry->d_flags),
1229               managed &= DCACHE_MANAGED_DENTRY,
1230               unlikely(managed != 0)) {
1231                /* Allow the filesystem to manage the transit without i_mutex
1232                 * being held. */
1233                if (managed & DCACHE_MANAGE_TRANSIT) {
1234                        BUG_ON(!path->dentry->d_op);
1235                        BUG_ON(!path->dentry->d_op->d_manage);
1236                        ret = path->dentry->d_op->d_manage(path, false);
1237                        if (ret < 0)
1238                                break;
1239                }
1240
1241                /* Transit to a mounted filesystem. */
1242                if (managed & DCACHE_MOUNTED) {
1243                        struct vfsmount *mounted = lookup_mnt(path);
1244                        if (mounted) {
1245                                dput(path->dentry);
1246                                if (need_mntput)
1247                                        mntput(path->mnt);
1248                                path->mnt = mounted;
1249                                path->dentry = dget(mounted->mnt_root);
1250                                need_mntput = true;
1251                                continue;
1252                        }
1253
1254                        /* Something is mounted on this dentry in another
1255                         * namespace and/or whatever was mounted there in this
1256                         * namespace got unmounted before lookup_mnt() could
1257                         * get it */
1258                }
1259
1260                /* Handle an automount point */
1261                if (managed & DCACHE_NEED_AUTOMOUNT) {
1262                        ret = follow_automount(path, nd, &need_mntput);
1263                        if (ret < 0)
1264                                break;
1265                        continue;
1266                }
1267
1268                /* We didn't change the current path point */
1269                break;
1270        }
1271
1272        if (need_mntput && path->mnt == mnt)
1273                mntput(path->mnt);
1274        if (ret == -EISDIR || !ret)
1275                ret = 1;
1276        if (need_mntput)
1277                nd->flags |= LOOKUP_JUMPED;
1278        if (unlikely(ret < 0))
1279                path_put_conditional(path, nd);
1280        return ret;
1281}
1282
1283int follow_down_one(struct path *path)
1284{
1285        struct vfsmount *mounted;
1286
1287        mounted = lookup_mnt(path);
1288        if (mounted) {
1289                dput(path->dentry);
1290                mntput(path->mnt);
1291                path->mnt = mounted;
1292                path->dentry = dget(mounted->mnt_root);
1293                return 1;
1294        }
1295        return 0;
1296}
1297EXPORT_SYMBOL(follow_down_one);
1298
1299static inline int managed_dentry_rcu(const struct path *path)
1300{
1301        return (path->dentry->d_flags & DCACHE_MANAGE_TRANSIT) ?
1302                path->dentry->d_op->d_manage(path, true) : 0;
1303}
1304
1305/*
1306 * Try to skip to top of mountpoint pile in rcuwalk mode.  Fail if
1307 * we meet a managed dentry that would need blocking.
1308 */
1309static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
1310                               struct inode **inode, unsigned *seqp)
1311{
1312        for (;;) {
1313                struct mount *mounted;
1314                /*
1315                 * Don't forget we might have a non-mountpoint managed dentry
1316                 * that wants to block transit.
1317                 */
1318                switch (managed_dentry_rcu(path)) {
1319                case -ECHILD:
1320                default:
1321                        return false;
1322                case -EISDIR:
1323                        return true;
1324                case 0:
1325                        break;
1326                }
1327
1328                if (!d_mountpoint(path->dentry))
1329                        return !(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT);
1330
1331                mounted = __lookup_mnt(path->mnt, path->dentry);
1332                if (!mounted)
1333                        break;
1334                path->mnt = &mounted->mnt;
1335                path->dentry = mounted->mnt.mnt_root;
1336                nd->flags |= LOOKUP_JUMPED;
1337                *seqp = read_seqcount_begin(&path->dentry->d_seq);
1338                /*
1339                 * Update the inode too. We don't need to re-check the
1340                 * dentry sequence number here after this d_inode read,
1341                 * because a mount-point is always pinned.
1342                 */
1343                *inode = path->dentry->d_inode;
1344        }
1345        return !read_seqretry(&mount_lock, nd->m_seq) &&
1346                !(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT);
1347}
1348
1349static int follow_dotdot_rcu(struct nameidata *nd)
1350{
1351        struct inode *inode = nd->inode;
1352
1353        while (1) {
1354                if (path_equal(&nd->path, &nd->root))
1355                        break;
1356                if (nd->path.dentry != nd->path.mnt->mnt_root) {
1357                        struct dentry *old = nd->path.dentry;
1358                        struct dentry *parent = old->d_parent;
1359                        unsigned seq;
1360
1361                        inode = parent->d_inode;
1362                        seq = read_seqcount_begin(&parent->d_seq);
1363                        if (unlikely(read_seqcount_retry(&old->d_seq, nd->seq)))
1364                                return -ECHILD;
1365                        nd->path.dentry = parent;
1366                        nd->seq = seq;
1367                        if (unlikely(!path_connected(&nd->path)))
1368                                return -ENOENT;
1369                        break;
1370                } else {
1371                        struct mount *mnt = real_mount(nd->path.mnt);
1372                        struct mount *mparent = mnt->mnt_parent;
1373                        struct dentry *mountpoint = mnt->mnt_mountpoint;
1374                        struct inode *inode2 = mountpoint->d_inode;
1375                        unsigned seq = read_seqcount_begin(&mountpoint->d_seq);
1376                        if (unlikely(read_seqretry(&mount_lock, nd->m_seq)))
1377                                return -ECHILD;
1378                        if (&mparent->mnt == nd->path.mnt)
1379                                break;
1380                        /* we know that mountpoint was pinned */
1381                        nd->path.dentry = mountpoint;
1382                        nd->path.mnt = &mparent->mnt;
1383                        inode = inode2;
1384                        nd->seq = seq;
1385                }
1386        }
1387        while (unlikely(d_mountpoint(nd->path.dentry))) {
1388                struct mount *mounted;
1389                mounted = __lookup_mnt(nd->path.mnt, nd->path.dentry);
1390                if (unlikely(read_seqretry(&mount_lock, nd->m_seq)))
1391                        return -ECHILD;
1392                if (!mounted)
1393                        break;
1394                nd->path.mnt = &mounted->mnt;
1395                nd->path.dentry = mounted->mnt.mnt_root;
1396                inode = nd->path.dentry->d_inode;
1397                nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
1398        }
1399        nd->inode = inode;
1400        return 0;
1401}
1402
1403/*
1404 * Follow down to the covering mount currently visible to userspace.  At each
1405 * point, the filesystem owning that dentry may be queried as to whether the
1406 * caller is permitted to proceed or not.
1407 */
1408int follow_down(struct path *path)
1409{
1410        unsigned managed;
1411        int ret;
1412
1413        while (managed = READ_ONCE(path->dentry->d_flags),
1414               unlikely(managed & DCACHE_MANAGED_DENTRY)) {
1415                /* Allow the filesystem to manage the transit without i_mutex
1416                 * being held.
1417                 *
1418                 * We indicate to the filesystem if someone is trying to mount
1419                 * something here.  This gives autofs the chance to deny anyone
1420                 * other than its daemon the right to mount on its
1421                 * superstructure.
1422                 *
1423                 * The filesystem may sleep at this point.
1424                 */
1425                if (managed & DCACHE_MANAGE_TRANSIT) {
1426                        BUG_ON(!path->dentry->d_op);
1427                        BUG_ON(!path->dentry->d_op->d_manage);
1428                        ret = path->dentry->d_op->d_manage(path, false);
1429                        if (ret < 0)
1430                                return ret == -EISDIR ? 0 : ret;
1431                }
1432
1433                /* Transit to a mounted filesystem. */
1434                if (managed & DCACHE_MOUNTED) {
1435                        struct vfsmount *mounted = lookup_mnt(path);
1436                        if (!mounted)
1437                                break;
1438                        dput(path->dentry);
1439                        mntput(path->mnt);
1440                        path->mnt = mounted;
1441                        path->dentry = dget(mounted->mnt_root);
1442                        continue;
1443                }
1444
1445                /* Don't handle automount points here */
1446                break;
1447        }
1448        return 0;
1449}
1450EXPORT_SYMBOL(follow_down);
1451
1452/*
1453 * Skip to top of mountpoint pile in refwalk mode for follow_dotdot()
1454 */
1455static void follow_mount(struct path *path)
1456{
1457        while (d_mountpoint(path->dentry)) {
1458                struct vfsmount *mounted = lookup_mnt(path);
1459                if (!mounted)
1460                        break;
1461                dput(path->dentry);
1462                mntput(path->mnt);
1463                path->mnt = mounted;
1464                path->dentry = dget(mounted->mnt_root);
1465        }
1466}
1467
1468static int path_parent_directory(struct path *path)
1469{
1470        struct dentry *old = path->dentry;
1471        /* rare case of legitimate dget_parent()... */
1472        path->dentry = dget_parent(path->dentry);
1473        dput(old);
1474        if (unlikely(!path_connected(path)))
1475                return -ENOENT;
1476        return 0;
1477}
1478
1479static int follow_dotdot(struct nameidata *nd)
1480{
1481        while(1) {
1482                if (path_equal(&nd->path, &nd->root))
1483                        break;
1484                if (nd->path.dentry != nd->path.mnt->mnt_root) {
1485                        int ret = path_parent_directory(&nd->path);
1486                        if (ret)
1487                                return ret;
1488                        break;
1489                }
1490                if (!follow_up(&nd->path))
1491                        break;
1492        }
1493        follow_mount(&nd->path);
1494        nd->inode = nd->path.dentry->d_inode;
1495        return 0;
1496}
1497
1498/*
1499 * This looks up the name in dcache and possibly revalidates the found dentry.
1500 * NULL is returned if the dentry does not exist in the cache.
1501 */
1502static struct dentry *lookup_dcache(const struct qstr *name,
1503                                    struct dentry *dir,
1504                                    unsigned int flags)
1505{
1506        struct dentry *dentry = d_lookup(dir, name);
1507        if (dentry) {
1508                int error = d_revalidate(dentry, flags);
1509                if (unlikely(error <= 0)) {
1510                        if (!error)
1511                                d_invalidate(dentry);
1512                        dput(dentry);
1513                        return ERR_PTR(error);
1514                }
1515        }
1516        return dentry;
1517}
1518
1519/*
1520 * Parent directory has inode locked exclusive.  This is one
1521 * and only case when ->lookup() gets called on non in-lookup
1522 * dentries - as the matter of fact, this only gets called
1523 * when directory is guaranteed to have no in-lookup children
1524 * at all.
1525 */
1526static struct dentry *__lookup_hash(const struct qstr *name,
1527                struct dentry *base, unsigned int flags)
1528{
1529        struct dentry *dentry = lookup_dcache(name, base, flags);
1530        struct dentry *old;
1531        struct inode *dir = base->d_inode;
1532
1533        if (dentry)
1534                return dentry;
1535
1536        /* Don't create child dentry for a dead directory. */
1537        if (unlikely(IS_DEADDIR(dir)))
1538                return ERR_PTR(-ENOENT);
1539
1540        dentry = d_alloc(base, name);
1541        if (unlikely(!dentry))
1542                return ERR_PTR(-ENOMEM);
1543
1544        old = dir->i_op->lookup(dir, dentry, flags);
1545        if (unlikely(old)) {
1546                dput(dentry);
1547                dentry = old;
1548        }
1549        return dentry;
1550}
1551
1552static int lookup_fast(struct nameidata *nd,
1553                       struct path *path, struct inode **inode,
1554                       unsigned *seqp)
1555{
1556        struct vfsmount *mnt = nd->path.mnt;
1557        struct dentry *dentry, *parent = nd->path.dentry;
1558        int status = 1;
1559        int err;
1560
1561        /*
1562         * Rename seqlock is not required here because in the off chance
1563         * of a false negative due to a concurrent rename, the caller is
1564         * going to fall back to non-racy lookup.
1565         */
1566        if (nd->flags & LOOKUP_RCU) {
1567                unsigned seq;
1568                bool negative;
1569                dentry = __d_lookup_rcu(parent, &nd->last, &seq);
1570                if (unlikely(!dentry)) {
1571                        if (unlazy_walk(nd))
1572                                return -ECHILD;
1573                        return 0;
1574                }
1575
1576                /*
1577                 * This sequence count validates that the inode matches
1578                 * the dentry name information from lookup.
1579                 */
1580                *inode = d_backing_inode(dentry);
1581                negative = d_is_negative(dentry);
1582                if (unlikely(read_seqcount_retry(&dentry->d_seq, seq)))
1583                        return -ECHILD;
1584
1585                /*
1586                 * This sequence count validates that the parent had no
1587                 * changes while we did the lookup of the dentry above.
1588                 *
1589                 * The memory barrier in read_seqcount_begin of child is
1590                 *  enough, we can use __read_seqcount_retry here.
1591                 */
1592                if (unlikely(__read_seqcount_retry(&parent->d_seq, nd->seq)))
1593                        return -ECHILD;
1594
1595                *seqp = seq;
1596                status = d_revalidate(dentry, nd->flags);
1597                if (likely(status > 0)) {
1598                        /*
1599                         * Note: do negative dentry check after revalidation in
1600                         * case that drops it.
1601                         */
1602                        if (unlikely(negative))
1603                                return -ENOENT;
1604                        path->mnt = mnt;
1605                        path->dentry = dentry;
1606                        if (likely(__follow_mount_rcu(nd, path, inode, seqp)))
1607                                return 1;
1608                }
1609                if (unlazy_child(nd, dentry, seq))
1610                        return -ECHILD;
1611                if (unlikely(status == -ECHILD))
1612                        /* we'd been told to redo it in non-rcu mode */
1613                        status = d_revalidate(dentry, nd->flags);
1614        } else {
1615                dentry = __d_lookup(parent, &nd->last);
1616                if (unlikely(!dentry))
1617                        return 0;
1618                status = d_revalidate(dentry, nd->flags);
1619        }
1620        if (unlikely(status <= 0)) {
1621                if (!status)
1622                        d_invalidate(dentry);
1623                dput(dentry);
1624                return status;
1625        }
1626        if (unlikely(d_is_negative(dentry))) {
1627                dput(dentry);
1628                return -ENOENT;
1629        }
1630
1631        path->mnt = mnt;
1632        path->dentry = dentry;
1633        err = follow_managed(path, nd);
1634        if (likely(err > 0))
1635                *inode = d_backing_inode(path->dentry);
1636        return err;
1637}
1638
1639/* Fast lookup failed, do it the slow way */
1640static struct dentry *__lookup_slow(const struct qstr *name,
1641                                    struct dentry *dir,
1642                                    unsigned int flags)
1643{
1644        struct dentry *dentry, *old;
1645        struct inode *inode = dir->d_inode;
1646        DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
1647
1648        /* Don't go there if it's already dead */
1649        if (unlikely(IS_DEADDIR(inode)))
1650                return ERR_PTR(-ENOENT);
1651again:
1652        dentry = d_alloc_parallel(dir, name, &wq);
1653        if (IS_ERR(dentry))
1654                return dentry;
1655        if (unlikely(!d_in_lookup(dentry))) {
1656                if (!(flags & LOOKUP_NO_REVAL)) {
1657                        int error = d_revalidate(dentry, flags);
1658                        if (unlikely(error <= 0)) {
1659                                if (!error) {
1660                                        d_invalidate(dentry);
1661                                        dput(dentry);
1662                                        goto again;
1663                                }
1664                                dput(dentry);
1665                                dentry = ERR_PTR(error);
1666                        }
1667                }
1668        } else {
1669                old = inode->i_op->lookup(inode, dentry, flags);
1670                d_lookup_done(dentry);
1671                if (unlikely(old)) {
1672                        dput(dentry);
1673                        dentry = old;
1674                }
1675        }
1676        return dentry;
1677}
1678
1679static struct dentry *lookup_slow(const struct qstr *name,
1680                                  struct dentry *dir,
1681                                  unsigned int flags)
1682{
1683        struct inode *inode = dir->d_inode;
1684        struct dentry *res;
1685        inode_lock_shared(inode);
1686        res = __lookup_slow(name, dir, flags);
1687        inode_unlock_shared(inode);
1688        return res;
1689}
1690
1691static inline int may_lookup(struct nameidata *nd)
1692{
1693        if (nd->flags & LOOKUP_RCU) {
1694                int err = inode_permission(nd->inode, MAY_EXEC|MAY_NOT_BLOCK);
1695                if (err != -ECHILD)
1696                        return err;
1697                if (unlazy_walk(nd))
1698                        return -ECHILD;
1699        }
1700        return inode_permission(nd->inode, MAY_EXEC);
1701}
1702
1703static inline int handle_dots(struct nameidata *nd, int type)
1704{
1705        if (type == LAST_DOTDOT) {
1706                if (!nd->root.mnt)
1707                        set_root(nd);
1708                if (nd->flags & LOOKUP_RCU) {
1709                        return follow_dotdot_rcu(nd);
1710                } else
1711                        return follow_dotdot(nd);
1712        }
1713        return 0;
1714}
1715
1716static int pick_link(struct nameidata *nd, struct path *link,
1717                     struct inode *inode, unsigned seq)
1718{
1719        int error;
1720        struct saved *last;
1721        if (unlikely(nd->total_link_count++ >= MAXSYMLINKS)) {
1722                path_to_nameidata(link, nd);
1723                return -ELOOP;
1724        }
1725        if (!(nd->flags & LOOKUP_RCU)) {
1726                if (link->mnt == nd->path.mnt)
1727                        mntget(link->mnt);
1728        }
1729        error = nd_alloc_stack(nd);
1730        if (unlikely(error)) {
1731                if (error == -ECHILD) {
1732                        if (unlikely(!legitimize_path(nd, link, seq))) {
1733                                drop_links(nd);
1734                                nd->depth = 0;
1735                                nd->flags &= ~LOOKUP_RCU;
1736                                nd->path.mnt = NULL;
1737                                nd->path.dentry = NULL;
1738                                if (!(nd->flags & LOOKUP_ROOT))
1739                                        nd->root.mnt = NULL;
1740                                rcu_read_unlock();
1741                        } else if (likely(unlazy_walk(nd)) == 0)
1742                                error = nd_alloc_stack(nd);
1743                }
1744                if (error) {
1745                        path_put(link);
1746                        return error;
1747                }
1748        }
1749
1750        last = nd->stack + nd->depth++;
1751        last->link = *link;
1752        clear_delayed_call(&last->done);
1753        nd->link_inode = inode;
1754        last->seq = seq;
1755        return 1;
1756}
1757
/* Flag bits for the 'flags' argument of walk_component() and step_into() */
enum {
        WALK_FOLLOW = 1,        /* follow a symlink found at this step */
        WALK_MORE = 2,          /* when clear, put_link() runs if nd->depth */
};
1759
1760/*
1761 * Do we need to follow links? We _really_ want to be able
1762 * to do this check without having to look at inode->i_op,
1763 * so we keep a cache of "no, this doesn't need follow_link"
1764 * for the common case.
1765 */
1766static inline int step_into(struct nameidata *nd, struct path *path,
1767                            int flags, struct inode *inode, unsigned seq)
1768{
1769        if (!(flags & WALK_MORE) && nd->depth)
1770                put_link(nd);
1771        if (likely(!d_is_symlink(path->dentry)) ||
1772           !(flags & WALK_FOLLOW || nd->flags & LOOKUP_FOLLOW)) {
1773                /* not a symlink or should not follow */
1774                path_to_nameidata(path, nd);
1775                nd->inode = inode;
1776                nd->seq = seq;
1777                return 0;
1778        }
1779        /* make sure that d_is_symlink above matches inode */
1780        if (nd->flags & LOOKUP_RCU) {
1781                if (read_seqcount_retry(&path->dentry->d_seq, seq))
1782                        return -ECHILD;
1783        }
1784        return pick_link(nd, path, inode, seq);
1785}
1786
1787static int walk_component(struct nameidata *nd, int flags)
1788{
1789        struct path path;
1790        struct inode *inode;
1791        unsigned seq;
1792        int err;
1793        /*
1794         * "." and ".." are special - ".." especially so because it has
1795         * to be able to know about the current root directory and
1796         * parent relationships.
1797         */
1798        if (unlikely(nd->last_type != LAST_NORM)) {
1799                err = handle_dots(nd, nd->last_type);
1800                if (!(flags & WALK_MORE) && nd->depth)
1801                        put_link(nd);
1802                return err;
1803        }
1804        err = lookup_fast(nd, &path, &inode, &seq);
1805        if (unlikely(err <= 0)) {
1806                if (err < 0)
1807                        return err;
1808                path.dentry = lookup_slow(&nd->last, nd->path.dentry,
1809                                          nd->flags);
1810                if (IS_ERR(path.dentry))
1811                        return PTR_ERR(path.dentry);
1812
1813                path.mnt = nd->path.mnt;
1814                err = follow_managed(&path, nd);
1815                if (unlikely(err < 0))
1816                        return err;
1817
1818                if (unlikely(d_is_negative(path.dentry))) {
1819                        path_to_nameidata(&path, nd);
1820                        return -ENOENT;
1821                }
1822
1823                seq = 0;        /* we are already out of RCU mode */
1824                inode = d_backing_inode(path.dentry);
1825        }
1826
1827        return step_into(nd, &path, flags, inode, seq);
1828}
1829
/*
 * We can do the critical dentry name comparison and hashing
 * operations one word at a time, but we are limited to:
 *
 * - Architectures with fast unaligned word accesses. We could
 *   do a "get_unaligned()" if this helps and is sufficiently
 *   fast.
 *
 * - non-CONFIG_DEBUG_PAGEALLOC configurations (so that we
 *   do not trap on the (extremely unlikely) case of a page
 *   crossing operation).
 *
 * - Furthermore, we need an efficient 64-bit compile for the
 *   64-bit case in order to generate the "number of bytes in
 *   the final mask". Again, that could be replaced with an
 *   efficient population count instruction or similar.
 */
1847#ifdef CONFIG_DCACHE_WORD_ACCESS
1848
1849#include <asm/word-at-a-time.h>
1850
1851#ifdef HASH_MIX
1852
1853/* Architecture provides HASH_MIX and fold_hash() in <asm/hash.h> */
1854
1855#elif defined(CONFIG_64BIT)
/*
 * Register pressure in the mixing function is an issue, particularly
 * on 32-bit x86, but almost any function requires one state value and
 * one temporary.  Instead, use a function designed for two state values
 * and no temporaries.
 *
 * This function cannot create a collision in only two iterations, so
 * we have two iterations to achieve avalanche.  In those two iterations,
 * we have six layers of mixing, which is enough to spread one bit's
 * influence out to 2^6 = 64 state bits.
 *
 * Rotate constants are scored by considering either 64 one-bit input
 * deltas or 64*63/2 = 2016 two-bit input deltas, and finding the
 * probability of that delta causing a change to each of the 128 output
 * bits, using a sample of random initial states.
 *
 * The Shannon entropy of the computed probabilities is then summed
 * to produce a score.  Ideally, any input change has a 50% chance of
 * toggling any given output bit.
 *
 * Mixing scores (in bits) for (12,45):
 * Input delta: 1-bit      2-bit
 * 1 round:     713.3    42542.6
 * 2 rounds:   2753.7   140389.8
 * 3 rounds:   5954.1   233458.2
 * 4 rounds:   7862.6   256672.2
 * Perfect:    8192     258048
 *            (64*128) (64*63/2 * 128)
 *
 * @x and @y are the two state words (modified in place, so they must be
 * lvalues); @a is the input word folded into the state.
 */
#define HASH_MIX(x, y, a)               \
        ((x) ^= (a),                    \
         (y) ^= (x),                    \
         (x) = rol64((x), 12),          \
         (x) += (y),                    \
         (y) = rol64((y), 45),          \
         (y) *= 9)
1890
1891/*
1892 * Fold two longs into one 32-bit hash value.  This must be fast, but
1893 * latency isn't quite as critical, as there is a fair bit of additional
1894 * work done before the hash value is used.
1895 */
1896static inline unsigned int fold_hash(unsigned long x, unsigned long y)
1897{
1898        y ^= x * GOLDEN_RATIO_64;
1899        y *= GOLDEN_RATIO_64;
1900        return y >> 32;
1901}
1902
1903#else   /* 32-bit case */
1904
/*
 * Mixing scores (in bits) for (7,20):
 * Input delta: 1-bit      2-bit
 * 1 round:     330.3     9201.6
 * 2 rounds:   1246.4    25475.4
 * 3 rounds:   1907.1    31295.1
 * 4 rounds:   2042.3    31718.6
 * Perfect:    2048      31744
 *            (32*64)   (32*31/2 * 64)
 *
 * @x and @y are the two state words (modified in place, so they must be
 * lvalues); @a is the input word folded into the state.
 */
#define HASH_MIX(x, y, a)               \
        ((x) ^= (a),                    \
         (y) ^= (x),                    \
         (x) = rol32((x), 7),           \
         (x) += (y),                    \
         (y) = rol32((y), 20),          \
         (y) *= 9)
1920
/*
 * Fold two longs into one 32-bit hash value (32-bit case): @x is hashed
 * with __hash_32(), XORed into @y, and the result hashed once more.
 */
static inline unsigned int fold_hash(unsigned long x, unsigned long y)
{
        /* Use arch-optimized multiply if one exists */
        return __hash_32(y ^ __hash_32(x));
}
1926
1927#endif
1928
/*
 * Return the hash of a string of known length.  This is carefully
 * designed to match hash_name(), which is the more critical function.
 * In particular, we must end by hashing a final word containing 0..7
 * payload bytes, to match the way that hash_name() iterates until it
 * finds the delimiter after the name.
 *
 * @salt seeds one of the two state words, @name points at @len bytes of
 * name.  Whole words are mixed with HASH_MIX(); a trailing partial word
 * is masked with bytemask_from_count() and XORed in without mixing.
 */
unsigned int full_name_hash(const void *salt, const char *name, unsigned int len)
{
        unsigned long a, x = 0, y = (unsigned long)salt;

        /* full words */
        while (len >= sizeof(unsigned long)) {
                a = load_unaligned_zeropad(name);
                HASH_MIX(x, y, a);
                name += sizeof(unsigned long);
                len -= sizeof(unsigned long);
        }

        /* remaining 1..sizeof(long)-1 bytes, if any */
        if (len) {
                a = load_unaligned_zeropad(name);
                x ^= a & bytemask_from_count(len);
        }

        return fold_hash(x, y);
}
EXPORT_SYMBOL(full_name_hash);
1955
1956/* Return the "hash_len" (hash and length) of a null-terminated string */
1957u64 hashlen_string(const void *salt, const char *name)
1958{
1959        unsigned long a = 0, x = 0, y = (unsigned long)salt;
1960        unsigned long adata, mask, len;
1961        const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
1962
1963        len = 0;
1964        goto inside;
1965
1966        do {
1967                HASH_MIX(x, y, a);
1968                len += sizeof(unsigned long);
1969inside:
1970                a = load_unaligned_zeropad(name+len);
1971        } while (!has_zero(a, &adata, &constants));
1972
1973        adata = prep_zero_mask(a, adata, &constants);
1974        mask = create_zero_mask(adata);
1975        x ^= a & zero_bytemask(mask);
1976
1977        return hashlen_create(fold_hash(x, y), len + find_zero(mask));
1978}
1979EXPORT_SYMBOL(hashlen_string);
1980
1981/*
1982 * Calculate the length and hash of the path component, and
1983 * return the "hash_len" as the result.
1984 */
1985static inline u64 hash_name(const void *salt, const char *name)
1986{
1987        unsigned long a = 0, b, x = 0, y = (unsigned long)salt;
1988        unsigned long adata, bdata, mask, len;
1989        const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
1990
1991        len = 0;
1992        goto inside;
1993
1994        do {
1995                HASH_MIX(x, y, a);
1996                len += sizeof(unsigned long);
1997inside:
1998                a = load_unaligned_zeropad(name+len);
1999                b = a ^ REPEAT_BYTE('/');
2000        } while (!(has_zero(a, &adata, &constants) | has_zero(b, &bdata, &constants)));
2001
2002        adata = prep_zero_mask(a, adata, &constants);
2003        bdata = prep_zero_mask(b, bdata, &constants);
2004        mask = create_zero_mask(adata | bdata);
2005        x ^= a & zero_bytemask(mask);
2006
2007        return hashlen_create(fold_hash(x, y), len + find_zero(mask));
2008}
2009
2010#else   /* !CONFIG_DCACHE_WORD_ACCESS: Slow, byte-at-a-time version */
2011
/*
 * Return the hash of a string of known length.
 *
 * Byte-at-a-time version: starts from init_name_hash(@salt), feeds each
 * of the @len bytes of @name (as unsigned char) through
 * partial_name_hash() and finishes with end_name_hash().
 */
unsigned int full_name_hash(const void *salt, const char *name, unsigned int len)
{
        unsigned long hash = init_name_hash(salt);
        const unsigned char *p = (const unsigned char *)name;
        const unsigned char *end = p + len;

        while (p < end)
                hash = partial_name_hash(*p++, hash);

        return end_name_hash(hash);
}
EXPORT_SYMBOL(full_name_hash);
2021
2022/* Return the "hash_len" (hash and length) of a null-terminated string */
2023u64 hashlen_string(const void *salt, const char *name)
2024{
2025        unsigned long hash = init_name_hash(salt);
2026        unsigned long len = 0, c;
2027
2028        c = (unsigned char)*name;
2029        while (c) {
2030                len++;
2031                hash = partial_name_hash(c, hash);
2032                c = (unsigned char)name[len];
2033        }
2034        return hashlen_create(end_name_hash(hash), len);
2035}
2036EXPORT_SYMBOL(hashlen_string);
2037
2038/*
2039 * We know there's a real path component here of at least
2040 * one character.
2041 */
2042static inline u64 hash_name(const void *salt, const char *name)
2043{
2044        unsigned long hash = init_name_hash(salt);
2045        unsigned long len = 0, c;
2046
2047        c = (unsigned char)*name;
2048        do {
2049                len++;
2050                hash = partial_name_hash(c, hash);
2051                c = (unsigned char)name[len];
2052        } while (c && c != '/');
2053        return hashlen_create(end_name_hash(hash), len);
2054}
2055
2056#endif
2057
2058/*
2059 * Name resolution.
2060 * This is the basic name resolution function, turning a pathname into
2061 * the final dentry. We expect 'base' to be positive and a directory.
2062 *
2063 * Returns 0 and nd will have valid dentry and mnt on success.
2064 * Returns error and drops reference to input namei data on failure.
2065 */
2066static int link_path_walk(const char *name, struct nameidata *nd)
2067{
2068        int err;
2069
2070        if (IS_ERR(name))
2071                return PTR_ERR(name);
2072        while (*name=='/')
2073                name++;
2074        if (!*name)
2075                return 0;
2076
2077        /* At this point we know we have a real path component. */
2078        for(;;) {
2079                u64 hash_len;
2080                int type;
2081
2082                err = may_lookup(nd);
2083                if (err)
2084                        return err;
2085
2086                hash_len = hash_name(nd->path.dentry, name);
2087
2088                type = LAST_NORM;
2089                if (name[0] == '.') switch (hashlen_len(hash_len)) {
2090                        case 2:
2091                                if (name[1] == '.') {
2092                                        type = LAST_DOTDOT;
2093                                        nd->flags |= LOOKUP_JUMPED;
2094                                }
2095                                break;
2096                        case 1:
2097                                type = LAST_DOT;
2098                }
2099                if (likely(type == LAST_NORM)) {
2100                        struct dentry *parent = nd->path.dentry;
2101                        nd->flags &= ~LOOKUP_JUMPED;
2102                        if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
2103                                struct qstr this = { { .hash_len = hash_len }, .name = name };
2104                                err = parent->d_op->d_hash(parent, &this);
2105                                if (err < 0)
2106                                        return err;
2107                                hash_len = this.hash_len;
2108                                name = this.name;
2109                        }
2110                }
2111
2112                nd->last.hash_len = hash_len;
2113                nd->last.name = name;
2114                nd->last_type = type;
2115
2116                name += hashlen_len(hash_len);
2117                if (!*name)
2118                        goto OK;
2119                /*
2120                 * If it wasn't NUL, we know it was '/'. Skip that
2121                 * slash, and continue until no more slashes.
2122                 */
2123                do {
2124                        name++;
2125                } while (unlikely(*name == '/'));
2126                if (unlikely(!*name)) {
2127OK:
2128                        /* pathname body, done */
2129                        if (!nd->depth)
2130                                return 0;
2131                        name = nd->stack[nd->depth - 1].name;
2132                        /* trailing symlink, done */
2133                        if (!name)
2134                                return 0;
2135                        /* last component of nested symlink */
2136                        err = walk_component(nd, WALK_FOLLOW);
2137                } else {
2138                        /* not the last component */
2139                        err = walk_component(nd, WALK_FOLLOW | WALK_MORE);
2140                }
2141                if (err < 0)
2142                        return err;
2143
2144                if (err) {
2145                        const char *s = get_link(nd);
2146
2147                        if (IS_ERR(s))
2148                                return PTR_ERR(s);
2149                        err = 0;
2150                        if (unlikely(!s)) {
2151                                /* jumped */
2152                                put_link(nd);
2153                        } else {
2154                                nd->stack[nd->depth - 1].name = name;
2155                                name = s;
2156                                continue;
2157                        }
2158                }
2159                if (unlikely(!d_can_lookup(nd->path.dentry))) {
2160                        if (nd->flags & LOOKUP_RCU) {
2161                                if (unlazy_walk(nd))
2162                                        return -ECHILD;
2163                        }
2164                        return -ENOTDIR;
2165                }
2166        }
2167}
2168
2169/* must be paired with terminate_walk() */
2170static const char *path_init(struct nameidata *nd, unsigned flags)
2171{
2172        const char *s = nd->name->name;
2173
2174        if (!*s)
2175                flags &= ~LOOKUP_RCU;
2176        if (flags & LOOKUP_RCU)
2177                rcu_read_lock();
2178
2179        nd->last_type = LAST_ROOT; /* if there are only slashes... */
2180        nd->flags = flags | LOOKUP_JUMPED | LOOKUP_PARENT;
2181        nd->depth = 0;
2182        if (flags & LOOKUP_ROOT) {
2183                struct dentry *root = nd->root.dentry;
2184                struct inode *inode = root->d_inode;
2185                if (*s && unlikely(!d_can_lookup(root)))
2186                        return ERR_PTR(-ENOTDIR);
2187                nd->path = nd->root;
2188                nd->inode = inode;
2189                if (flags & LOOKUP_RCU) {
2190                        nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
2191                        nd->root_seq = nd->seq;
2192                        nd->m_seq = read_seqbegin(&mount_lock);
2193                } else {
2194                        path_get(&nd->path);
2195                }
2196                return s;
2197        }
2198
2199        nd->root.mnt = NULL;
2200        nd->path.mnt = NULL;
2201        nd->path.dentry = NULL;
2202
2203        nd->m_seq = read_seqbegin(&mount_lock);
2204        if (*s == '/') {
2205                set_root(nd);
2206                if (likely(!nd_jump_root(nd)))
2207                        return s;
2208                return ERR_PTR(-ECHILD);
2209        } else if (nd->dfd == AT_FDCWD) {
2210                if (flags & LOOKUP_RCU) {
2211                        struct fs_struct *fs = current->fs;
2212                        unsigned seq;
2213
2214                        do {
2215                                seq = read_seqcount_begin(&fs->seq);
2216                                nd->path = fs->pwd;
2217                                nd->inode = nd->path.dentry->d_inode;
2218                                nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
2219                        } while (read_seqcount_retry(&fs->seq, seq));
2220                } else {
2221                        get_fs_pwd(current->fs, &nd->path);
2222                        nd->inode = nd->path.dentry->d_inode;
2223                }
2224                return s;
2225        } else {
2226                /* Caller must check execute permissions on the starting path component */
2227                struct fd f = fdget_raw(nd->dfd);
2228                struct dentry *dentry;
2229
2230                if (!f.file)
2231                        return ERR_PTR(-EBADF);
2232
2233                dentry = f.file->f_path.dentry;
2234
2235                if (*s && unlikely(!d_can_lookup(dentry))) {
2236                        fdput(f);
2237                        return ERR_PTR(-ENOTDIR);
2238                }
2239
2240                nd->path = f.file->f_path;
2241                if (flags & LOOKUP_RCU) {
2242                        nd->inode = nd->path.dentry->d_inode;
2243                        nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
2244                } else {
2245                        path_get(&nd->path);
2246                        nd->inode = nd->path.dentry->d_inode;
2247                }
2248                fdput(f);
2249                return s;
2250        }
2251}
2252
2253static const char *trailing_symlink(struct nameidata *nd)
2254{
2255        const char *s;
2256        int error = may_follow_link(nd);
2257        if (unlikely(error))
2258                return ERR_PTR(error);
2259        nd->flags |= LOOKUP_PARENT;
2260        nd->stack[0].name = NULL;
2261        s = get_link(nd);
2262        return s ? s : "";
2263}
2264
2265static inline int lookup_last(struct nameidata *nd)
2266{
2267        if (nd->last_type == LAST_NORM && nd->last.name[nd->last.len])
2268                nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
2269
2270        nd->flags &= ~LOOKUP_PARENT;
2271        return walk_component(nd, 0);
2272}
2273
2274static int handle_lookup_down(struct nameidata *nd)
2275{
2276        struct path path = nd->path;
2277        struct inode *inode = nd->inode;
2278        unsigned seq = nd->seq;
2279        int err;
2280
2281        if (nd->flags & LOOKUP_RCU) {
2282                /*
2283                 * don't bother with unlazy_walk on failure - we are
2284                 * at the very beginning of walk, so we lose nothing
2285                 * if we simply redo everything in non-RCU mode
2286                 */
2287                if (unlikely(!__follow_mount_rcu(nd, &path, &inode, &seq)))
2288                        return -ECHILD;
2289        } else {
2290                dget(path.dentry);
2291                err = follow_managed(&path, nd);
2292                if (unlikely(err < 0))
2293                        return err;
2294                inode = d_backing_inode(path.dentry);
2295                seq = 0;
2296        }
2297        path_to_nameidata(&path, nd);
2298        nd->inode = inode;
2299        nd->seq = seq;
2300        return 0;
2301}
2302
2303/* Returns 0 and nd will be valid on success; Retuns error, otherwise. */
2304static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path)
2305{
2306        const char *s = path_init(nd, flags);
2307        int err;
2308
2309        if (unlikely(flags & LOOKUP_DOWN) && !IS_ERR(s)) {
2310                err = handle_lookup_down(nd);
2311                if (unlikely(err < 0))
2312                        s = ERR_PTR(err);
2313        }
2314
2315        while (!(err = link_path_walk(s, nd))
2316                && ((err = lookup_last(nd)) > 0)) {
2317                s = trailing_symlink(nd);
2318        }
2319        if (!err)
2320                err = complete_walk(nd);
2321
2322        if (!err && nd->flags & LOOKUP_DIRECTORY)
2323                if (!d_can_lookup(nd->path.dentry))
2324                        err = -ENOTDIR;
2325        if (!err) {
2326                *path = nd->path;
2327                nd->path.mnt = NULL;
2328                nd->path.dentry = NULL;
2329        }
2330        terminate_walk(nd);
2331        return err;
2332}
2333
2334int filename_lookup(int dfd, struct filename *name, unsigned flags,
2335                    struct path *path, struct path *root)
2336{
2337        int retval;
2338        struct nameidata nd;
2339        if (IS_ERR(name))
2340                return PTR_ERR(name);
2341        if (unlikely(root)) {
2342                nd.root = *root;
2343                flags |= LOOKUP_ROOT;
2344        }
2345        set_nameidata(&nd, dfd, name);
2346        retval = path_lookupat(&nd, flags | LOOKUP_RCU, path);
2347        if (unlikely(retval == -ECHILD))
2348                retval = path_lookupat(&nd, flags, path);
2349        if (unlikely(retval == -ESTALE))
2350                retval = path_lookupat(&nd, flags | LOOKUP_REVAL, path);
2351
2352        if (likely(!retval))
2353                audit_inode(name, path->dentry, flags & LOOKUP_PARENT);
2354        restore_nameidata();
2355        putname(name);
2356        return retval;
2357}
2358
2359/* Returns 0 and nd will be valid on success; Retuns error, otherwise. */
2360static int path_parentat(struct nameidata *nd, unsigned flags,
2361                                struct path *parent)
2362{
2363        const char *s = path_init(nd, flags);
2364        int err = link_path_walk(s, nd);
2365        if (!err)
2366                err = complete_walk(nd);
2367        if (!err) {
2368                *parent = nd->path;
2369                nd->path.mnt = NULL;
2370                nd->path.dentry = NULL;
2371        }
2372        terminate_walk(nd);
2373        return err;
2374}
2375
2376static struct filename *filename_parentat(int dfd, struct filename *name,
2377                                unsigned int flags, struct path *parent,
2378                                struct qstr *last, int *type)
2379{
2380        int retval;
2381        struct nameidata nd;
2382
2383        if (IS_ERR(name))
2384                return name;
2385        set_nameidata(&nd, dfd, name);
2386        retval = path_parentat(&nd, flags | LOOKUP_RCU, parent);
2387        if (unlikely(retval == -ECHILD))
2388                retval = path_parentat(&nd, flags, parent);
2389        if (unlikely(retval == -ESTALE))
2390                retval = path_parentat(&nd, flags | LOOKUP_REVAL, parent);
2391        if (likely(!retval)) {
2392                *last = nd.last;
2393                *type = nd.last_type;
2394                audit_inode(name, parent->dentry, LOOKUP_PARENT);
2395        } else {
2396                putname(name);
2397                name = ERR_PTR(retval);
2398        }
2399        restore_nameidata();
2400        return name;
2401}
2402
2403/* does lookup, returns the object with parent locked */
2404struct dentry *kern_path_locked(const char *name, struct path *path)
2405{
2406        struct filename *filename;
2407        struct dentry *d;
2408        struct qstr last;
2409        int type;
2410
2411        filename = filename_parentat(AT_FDCWD, getname_kernel(name), 0, path,
2412                                    &last, &type);
2413        if (IS_ERR(filename))
2414                return ERR_CAST(filename);
2415        if (unlikely(type != LAST_NORM)) {
2416                path_put(path);
2417                putname(filename);
2418                return ERR_PTR(-EINVAL);
2419        }
2420        inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT);
2421        d = __lookup_hash(&last, path->dentry, 0);
2422        if (IS_ERR(d)) {
2423                inode_unlock(path->dentry->d_inode);
2424                path_put(path);
2425        }
2426        putname(filename);
2427        return d;
2428}
2429
2430int kern_path(const char *name, unsigned int flags, struct path *path)
2431{
2432        return filename_lookup(AT_FDCWD, getname_kernel(name),
2433                               flags, path, NULL);
2434}
2435EXPORT_SYMBOL(kern_path);
2436
2437/**
2438 * vfs_path_lookup - lookup a file path relative to a dentry-vfsmount pair
2439 * @dentry:  pointer to dentry of the base directory
2440 * @mnt: pointer to vfs mount of the base directory
2441 * @name: pointer to file name
2442 * @flags: lookup flags
2443 * @path: pointer to struct path to fill
2444 */
2445int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
2446                    const char *name, unsigned int flags,
2447                    struct path *path)
2448{
2449        struct path root = {.mnt = mnt, .dentry = dentry};
2450        /* the first argument of filename_lookup() is ignored with root */
2451        return filename_lookup(AT_FDCWD, getname_kernel(name),
2452                               flags , path, &root);
2453}
2454EXPORT_SYMBOL(vfs_path_lookup);
2455
2456static int lookup_one_len_common(const char *name, struct dentry *base,
2457                                 int len, struct qstr *this)
2458{
2459        this->name = name;
2460        this->len = len;
2461        this->hash = full_name_hash(base, name, len);
2462        if (!len)
2463                return -EACCES;
2464
2465        if (unlikely(name[0] == '.')) {
2466                if (len < 2 || (len == 2 && name[1] == '.'))
2467                        return -EACCES;
2468        }
2469
2470        while (len--) {
2471                unsigned int c = *(const unsigned char *)name++;
2472                if (c == '/' || c == '\0')
2473                        return -EACCES;
2474        }
2475        /*
2476         * See if the low-level filesystem might want
2477         * to use its own hash..
2478         */
2479        if (base->d_flags & DCACHE_OP_HASH) {
2480                int err = base->d_op->d_hash(base, this);
2481                if (err < 0)
2482                        return err;
2483        }
2484
2485        return inode_permission(base->d_inode, MAY_EXEC);
2486}
2487
2488/**
2489 * try_lookup_one_len - filesystem helper to lookup single pathname component
2490 * @name:       pathname component to lookup
2491 * @base:       base directory to lookup from
2492 * @len:        maximum length @len should be interpreted to
2493 *
2494 * Look up a dentry by name in the dcache, returning NULL if it does not
2495 * currently exist.  The function does not try to create a dentry.
2496 *
2497 * Note that this routine is purely a helper for filesystem usage and should
2498 * not be called by generic code.
2499 *
2500 * The caller must hold base->i_mutex.
2501 */
2502struct dentry *try_lookup_one_len(const char *name, struct dentry *base, int len)
2503{
2504        struct qstr this;
2505        int err;
2506
2507        WARN_ON_ONCE(!inode_is_locked(base->d_inode));
2508
2509        err = lookup_one_len_common(name, base, len, &this);
2510        if (err)
2511                return ERR_PTR(err);
2512
2513        return lookup_dcache(&this, base, 0);
2514}
2515EXPORT_SYMBOL(try_lookup_one_len);
2516
2517/**
2518 * lookup_one_len - filesystem helper to lookup single pathname component
2519 * @name:       pathname component to lookup
2520 * @base:       base directory to lookup from
2521 * @len:        maximum length @len should be interpreted to
2522 *
2523 * Note that this routine is purely a helper for filesystem usage and should
2524 * not be called by generic code.
2525 *
2526 * The caller must hold base->i_mutex.
2527 */
2528struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
2529{
2530        struct dentry *dentry;
2531        struct qstr this;
2532        int err;
2533
2534        WARN_ON_ONCE(!inode_is_locked(base->d_inode));
2535
2536        err = lookup_one_len_common(name, base, len, &this);
2537        if (err)
2538                return ERR_PTR(err);
2539
2540        dentry = lookup_dcache(&this, base, 0);
2541        return dentry ? dentry : __lookup_slow(&this, base, 0);
2542}
2543EXPORT_SYMBOL(lookup_one_len);
2544
2545/**
2546 * lookup_one_len_unlocked - filesystem helper to lookup single pathname component
2547 * @name:       pathname component to lookup
2548 * @base:       base directory to lookup from
2549 * @len:        maximum length @len should be interpreted to
2550 *
2551 * Note that this routine is purely a helper for filesystem usage and should
2552 * not be called by generic code.
2553 *
2554 * Unlike lookup_one_len, it should be called without the parent
2555 * i_mutex held, and will take the i_mutex itself if necessary.
2556 */
2557struct dentry *lookup_one_len_unlocked(const char *name,
2558                                       struct dentry *base, int len)
2559{
2560        struct qstr this;
2561        int err;
2562        struct dentry *ret;
2563
2564        err = lookup_one_len_common(name, base, len, &this);
2565        if (err)
2566                return ERR_PTR(err);
2567
2568        ret = lookup_dcache(&this, base, 0);
2569        if (!ret)
2570                ret = lookup_slow(&this, base, 0);
2571        return ret;
2572}
2573EXPORT_SYMBOL(lookup_one_len_unlocked);
2574
2575#ifdef CONFIG_UNIX98_PTYS
2576int path_pts(struct path *path)
2577{
2578        /* Find something mounted on "pts" in the same directory as
2579         * the input path.
2580         */
2581        struct dentry *child, *parent;
2582        struct qstr this;
2583        int ret;
2584
2585        ret = path_parent_directory(path);
2586        if (ret)
2587                return ret;
2588
2589        parent = path->dentry;
2590        this.name = "pts";
2591        this.len = 3;
2592        child = d_hash_and_lookup(parent, &this);
2593        if (!child)
2594                return -ENOENT;
2595
2596        path->dentry = child;
2597        dput(parent);
2598        follow_mount(path);
2599        return 0;
2600}
2601#endif
2602
2603int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
2604                 struct path *path, int *empty)
2605{
2606        return filename_lookup(dfd, getname_flags(name, flags, empty),
2607                               flags, path, NULL);
2608}
2609EXPORT_SYMBOL(user_path_at_empty);
2610
2611/**
2612 * mountpoint_last - look up last component for umount
2613 * @nd:   pathwalk nameidata - currently pointing at parent directory of "last"
2614 *
2615 * This is a special lookup_last function just for umount. In this case, we
2616 * need to resolve the path without doing any revalidation.
2617 *
2618 * The nameidata should be the result of doing a LOOKUP_PARENT pathwalk. Since
2619 * mountpoints are always pinned in the dcache, their ancestors are too. Thus,
2620 * in almost all cases, this lookup will be served out of the dcache. The only
2621 * cases where it won't are if nd->last refers to a symlink or the path is
2622 * bogus and it doesn't exist.
2623 *
2624 * Returns:
2625 * -error: if there was an error during lookup. This includes -ENOENT if the
2626 *         lookup found a negative dentry.
2627 *
2628 * 0:      if we successfully resolved nd->last and found it to not to be a
2629 *         symlink that needs to be followed.
2630 *
2631 * 1:      if we successfully resolved nd->last and found it to be a symlink
2632 *         that needs to be followed.
2633 */
2634static int
2635mountpoint_last(struct nameidata *nd)
2636{
2637        int error = 0;
2638        struct dentry *dir = nd->path.dentry;
2639        struct path path;
2640
2641        /* If we're in rcuwalk, drop out of it to handle last component */
2642        if (nd->flags & LOOKUP_RCU) {
2643                if (unlazy_walk(nd))
2644                        return -ECHILD;
2645        }
2646
2647        nd->flags &= ~LOOKUP_PARENT;
2648
2649        if (unlikely(nd->last_type != LAST_NORM)) {
2650                error = handle_dots(nd, nd->last_type);
2651                if (error)
2652                        return error;
2653                path.dentry = dget(nd->path.dentry);
2654        } else {
2655                path.dentry = d_lookup(dir, &nd->last);
2656                if (!path.dentry) {
2657                        /*
2658                         * No cached dentry. Mounted dentries are pinned in the
2659                         * cache, so that means that this dentry is probably
2660                         * a symlink or the path doesn't actually point
2661                         * to a mounted dentry.
2662                         */
2663                        path.dentry = lookup_slow(&nd->last, dir,
2664                                             nd->flags | LOOKUP_NO_REVAL);
2665                        if (IS_ERR(path.dentry))
2666                                return PTR_ERR(path.dentry);
2667                }
2668        }
2669        if (d_is_negative(path.dentry)) {
2670                dput(path.dentry);
2671                return -ENOENT;
2672        }
2673        path.mnt = nd->path.mnt;
2674        return step_into(nd, &path, 0, d_backing_inode(path.dentry), 0);
2675}
2676
2677/**
2678 * path_mountpoint - look up a path to be umounted
2679 * @nd:         lookup context
2680 * @flags:      lookup flags
2681 * @path:       pointer to container for result
2682 *
2683 * Look up the given name, but don't attempt to revalidate the last component.
2684 * Returns 0 and "path" will be valid on success; Returns error otherwise.
2685 */
2686static int
2687path_mountpoint(struct nameidata *nd, unsigned flags, struct path *path)
2688{
2689        const char *s = path_init(nd, flags);
2690        int err;
2691
2692        while (!(err = link_path_walk(s, nd)) &&
2693                (err = mountpoint_last(nd)) > 0) {
2694                s = trailing_symlink(nd);
2695        }
2696        if (!err) {
2697                *path = nd->path;
2698                nd->path.mnt = NULL;
2699                nd->path.dentry = NULL;
2700                follow_mount(path);
2701        }
2702        terminate_walk(nd);
2703        return err;
2704}
2705
2706static int
2707filename_mountpoint(int dfd, struct filename *name, struct path *path,
2708                        unsigned int flags)
2709{
2710        struct nameidata nd;
2711        int error;
2712        if (IS_ERR(name))
2713                return PTR_ERR(name);
2714        set_nameidata(&nd, dfd, name);
2715        error = path_mountpoint(&nd, flags | LOOKUP_RCU, path);
2716        if (unlikely(error == -ECHILD))
2717                error = path_mountpoint(&nd, flags, path);
2718        if (unlikely(error == -ESTALE))
2719                error = path_mountpoint(&nd, flags | LOOKUP_REVAL, path);
2720        if (likely(!error))
2721                audit_inode(name, path->dentry, flags & LOOKUP_NO_EVAL);
2722        restore_nameidata();
2723        putname(name);
2724        return error;
2725}
2726
2727/**
2728 * user_path_mountpoint_at - lookup a path from userland in order to umount it
2729 * @dfd:        directory file descriptor
2730 * @name:       pathname from userland
2731 * @flags:      lookup flags
2732 * @path:       pointer to container to hold result
2733 *
2734 * A umount is a special case for path walking. We're not actually interested
2735 * in the inode in this situation, and ESTALE errors can be a problem. We
2736 * simply want track down the dentry and vfsmount attached at the mountpoint
2737 * and avoid revalidating the last component.
2738 *
2739 * Returns 0 and populates "path" on success.
2740 */
2741int
2742user_path_mountpoint_at(int dfd, const char __user *name, unsigned int flags,
2743                        struct path *path)
2744{
2745        return filename_mountpoint(dfd, getname(name), path, flags);
2746}
2747
/*
 * Kernel-space counterpart of user_path_mountpoint_at(): the name is a
 * kernel string, wrapped with getname_kernel() before the mountpoint walk.
 */
int
kern_path_mountpoint(int dfd, const char *name, struct path *path,
			unsigned int flags)
{
	struct filename *fname = getname_kernel(name);

	return filename_mountpoint(dfd, fname, path, flags);
}
EXPORT_SYMBOL(kern_path_mountpoint);
2755
2756int __check_sticky(struct inode *dir, struct inode *inode)
2757{
2758        kuid_t fsuid = current_fsuid();
2759
2760        if (uid_eq(inode->i_uid, fsuid))
2761                return 0;
2762        if (uid_eq(dir->i_uid, fsuid))
2763                return 0;
2764        return !capable_wrt_inode_uidgid(inode, CAP_FOWNER);
2765}
2766EXPORT_SYMBOL(__check_sticky);
2767
2768/*
2769 *      Check whether we can remove a link victim from directory dir, check
2770 *  whether the type of victim is right.
2771 *  1. We can't do it if dir is read-only (done in permission())
2772 *  2. We should have write and exec permissions on dir
2773 *  3. We can't remove anything from append-only dir
2774 *  4. We can't do anything with immutable dir (done in permission())
2775 *  5. If the sticky bit on dir is set we should either
2776 *      a. be owner of dir, or
2777 *      b. be owner of victim, or
2778 *      c. have CAP_FOWNER capability
2779 *  6. If the victim is append-only or immutable we can't do antyhing with
2780 *     links pointing to it.
2781 *  7. If the victim has an unknown uid or gid we can't change the inode.
2782 *  8. If we were asked to remove a directory and victim isn't one - ENOTDIR.
2783 *  9. If we were asked to remove a non-directory and victim isn't one - EISDIR.
2784 * 10. We can't remove a root or mountpoint.
2785 * 11. We don't allow removal of NFS sillyrenamed files; it's handled by
2786 *     nfs_async_unlink().
2787 */
2788static int may_delete(struct inode *dir, struct dentry *victim, bool isdir)
2789{
2790        struct inode *inode = d_backing_inode(victim);
2791        int error;
2792
2793        if (d_is_negative(victim))
2794                return -ENOENT;
2795        BUG_ON(!inode);
2796
2797        BUG_ON(victim->d_parent->d_inode != dir);
2798
2799        /* Inode writeback is not safe when the uid or gid are invalid. */
2800        if (!uid_valid(inode->i_uid) || !gid_valid(inode->i_gid))
2801                return -EOVERFLOW;
2802
2803        audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);
2804
2805        error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
2806        if (error)
2807                return error;
2808        if (IS_APPEND(dir))
2809                return -EPERM;
2810
2811        if (check_sticky(dir, inode) || IS_APPEND(inode) ||
2812            IS_IMMUTABLE(inode) || IS_SWAPFILE(inode) || HAS_UNMAPPED_ID(inode))
2813                return -EPERM;
2814        if (isdir) {
2815                if (!d_is_dir(victim))
2816                        return -ENOTDIR;
2817                if (IS_ROOT(victim))
2818                        return -EBUSY;
2819        } else if (d_is_dir(victim))
2820                return -EISDIR;
2821        if (IS_DEADDIR(dir))
2822                return -ENOENT;
2823        if (victim->d_flags & DCACHE_NFSFS_RENAMED)
2824                return -EBUSY;
2825        return 0;
2826}
2827
2828/*      Check whether we can create an object with dentry child in directory
2829 *  dir.
2830 *  1. We can't do it if child already exists (open has special treatment for
2831 *     this case, but since we are inlined it's OK)
2832 *  2. We can't do it if dir is read-only (done in permission())
2833 *  3. We can't do it if the fs can't represent the fsuid or fsgid.
2834 *  4. We should have write and exec permissions on dir
2835 *  5. We can't do it if dir is immutable (done in permission())
2836 */
2837static inline int may_create(struct inode *dir, struct dentry *child)
2838{
2839        struct user_namespace *s_user_ns;
2840        audit_inode_child(dir, child, AUDIT_TYPE_CHILD_CREATE);
2841        if (child->d_inode)
2842                return -EEXIST;
2843        if (IS_DEADDIR(dir))
2844                return -ENOENT;
2845        s_user_ns = dir->i_sb->s_user_ns;
2846        if (!kuid_has_mapping(s_user_ns, current_fsuid()) ||
2847            !kgid_has_mapping(s_user_ns, current_fsgid()))
2848                return -EOVERFLOW;
2849        return inode_permission(dir, MAY_WRITE | MAY_EXEC);
2850}
2851
2852/*
2853 * p1 and p2 should be directories on the same fs.
2854 */
2855struct dentry *lock_rename(struct dentry *p1, struct dentry *p2)
2856{
2857        struct dentry *p;
2858
2859        if (p1 == p2) {
2860                inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
2861                return NULL;
2862        }
2863
2864        mutex_lock(&p1->d_sb->s_vfs_rename_mutex);
2865
2866        p = d_ancestor(p2, p1);
2867        if (p) {
2868                inode_lock_nested(p2->d_inode, I_MUTEX_PARENT);
2869                inode_lock_nested(p1->d_inode, I_MUTEX_CHILD);
2870                return p;
2871        }
2872
2873        p = d_ancestor(p1, p2);
2874        if (p) {
2875                inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
2876                inode_lock_nested(p2->d_inode, I_MUTEX_CHILD);
2877                return p;
2878        }
2879
2880        inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
2881        inode_lock_nested(p2->d_inode, I_MUTEX_PARENT2);
2882        return NULL;
2883}
2884EXPORT_SYMBOL(lock_rename);
2885
2886void unlock_rename(struct dentry *p1, struct dentry *p2)
2887{
2888        inode_unlock(p1->d_inode);
2889        if (p1 != p2) {
2890                inode_unlock(p2->d_inode);
2891                mutex_unlock(&p1->d_sb->s_vfs_rename_mutex);
2892        }
2893}
2894EXPORT_SYMBOL(unlock_rename);
2895
2896int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
2897                bool want_excl)
2898{
2899        int error = may_create(dir, dentry);
2900        if (error)
2901                return error;
2902
2903        if (!dir->i_op->create)
2904                return -EACCES; /* shouldn't it be ENOSYS? */
2905        mode &= S_IALLUGO;
2906        mode |= S_IFREG;
2907        error = security_inode_create(dir, dentry, mode);
2908        if (error)
2909                return error;
2910        error = dir->i_op->create(dir, dentry, mode, want_excl);
2911        if (!error)
2912                fsnotify_create(dir, dentry);
2913        return error;
2914}
2915EXPORT_SYMBOL(vfs_create);
2916
2917int vfs_mkobj(struct dentry *dentry, umode_t mode,
2918                int (*f)(struct dentry *, umode_t, void *),
2919                void *arg)
2920{
2921        struct inode *dir = dentry->d_parent->d_inode;
2922        int error = may_create(dir, dentry);
2923        if (error)
2924                return error;
2925
2926        mode &= S_IALLUGO;
2927        mode |= S_IFREG;
2928        error = security_inode_create(dir, dentry, mode);
2929        if (error)
2930                return error;
2931        error = f(dentry, mode, arg);
2932        if (!error)
2933                fsnotify_create(dir, dentry);
2934        return error;
2935}
2936EXPORT_SYMBOL(vfs_mkobj);
2937
2938bool may_open_dev(const struct path *path)
2939{
2940        return !(path->mnt->mnt_flags & MNT_NODEV) &&
2941                !(path->mnt->mnt_sb->s_iflags & SB_I_NODEV);
2942}
2943
2944static int may_open(const struct path *path, int acc_mode, int flag)
2945{
2946        struct dentry *dentry = path->dentry;
2947        struct inode *inode = dentry->d_inode;
2948        int error;
2949
2950        if (!inode)
2951                return -ENOENT;
2952
2953        switch (inode->i_mode & S_IFMT) {
2954        case S_IFLNK:
2955                return -ELOOP;
2956        case S_IFDIR:
2957                if (acc_mode & MAY_WRITE)
2958                        return -EISDIR;
2959                break;
2960        case S_IFBLK:
2961        case S_IFCHR:
2962                if (!may_open_dev(path))
2963                        return -EACCES;
2964                /*FALLTHRU*/
2965        case S_IFIFO:
2966        case S_IFSOCK:
2967                flag &= ~O_TRUNC;
2968                break;
2969        }
2970
2971        error = inode_permission(inode, MAY_OPEN | acc_mode);
2972        if (error)
2973                return error;
2974
2975        /*
2976         * An append-only file must be opened in append mode for writing.
2977         */
2978        if (IS_APPEND(inode)) {
2979                if  ((flag & O_ACCMODE) != O_RDONLY && !(flag & O_APPEND))
2980                        return -EPERM;
2981                if (flag & O_TRUNC)
2982                        return -EPERM;
2983        }
2984
2985        /* O_NOATIME can only be set by the owner or superuser */
2986        if (flag & O_NOATIME && !inode_owner_or_capable(inode))
2987                return -EPERM;
2988
2989        return 0;
2990}
2991
2992static int handle_truncate(struct file *filp)
2993{
2994        const struct path *path = &filp->f_path;
2995        struct inode *inode = path->dentry->d_inode;
2996        int error = get_write_access(inode);
2997        if (error)
2998                return error;
2999        /*
3000         * Refuse to truncate files with mandatory locks held on them.
3001         */
3002        error = locks_verify_locked(filp);
3003        if (!error)
3004                error = security_path_truncate(path);
3005        if (!error) {
3006                error = do_truncate(path->dentry, 0,
3007                                    ATTR_MTIME|ATTR_CTIME|ATTR_OPEN,
3008                                    filp);
3009        }
3010        put_write_access(inode);
3011        return error;
3012}
3013
/*
 * Translate open(2) flags for the namei layer: an access mode of 3 (both
 * low O_ACCMODE bits set) is decremented by one; every other value passes
 * through untouched.
 */
static inline int open_to_namei_flags(int flag)
{
	return ((flag & O_ACCMODE) == 3) ? flag - 1 : flag;
}
3020
3021static int may_o_create(const struct path *dir, struct dentry *dentry, umode_t mode)
3022{
3023        struct user_namespace *s_user_ns;
3024        int error = security_path_mknod(dir, dentry, mode, 0);
3025        if (error)
3026                return error;
3027
3028        s_user_ns = dir->dentry->d_sb->s_user_ns;
3029        if (!kuid_has_mapping(s_user_ns, current_fsuid()) ||
3030            !kgid_has_mapping(s_user_ns, current_fsgid()))
3031                return -EOVERFLOW;
3032
3033        error = inode_permission(dir->dentry->d_inode, MAY_WRITE | MAY_EXEC);
3034        if (error)
3035                return error;
3036
3037        return security_inode_create(dir->dentry->d_inode, dentry, mode);
3038}
3039
3040/*
3041 * Attempt to atomically look up, create and open a file from a negative
3042 * dentry.
3043 *
3044 * Returns 0 if successful.  The file will have been created and attached to
3045 * @file by the filesystem calling finish_open().
3046 *
3047 * If the file was looked up only or didn't need creating, FMODE_OPENED won't
3048 * be set.  The caller will need to perform the open themselves.  @path will
3049 * have been updated to point to the new dentry.  This may be negative.
3050 *
3051 * Returns an error code otherwise.
3052 */
3053static int atomic_open(struct nameidata *nd, struct dentry *dentry,
3054                        struct path *path, struct file *file,
3055                        const struct open_flags *op,
3056                        int open_flag, umode_t mode)
3057{
3058        struct dentry *const DENTRY_NOT_SET = (void *) -1UL;
3059        struct inode *dir =  nd->path.dentry->d_inode;
3060        int error;
3061
3062        if (!(~open_flag & (O_EXCL | O_CREAT))) /* both O_EXCL and O_CREAT */
3063                open_flag &= ~O_TRUNC;
3064
3065        if (nd->flags & LOOKUP_DIRECTORY)
3066                open_flag |= O_DIRECTORY;
3067
3068        file->f_path.dentry = DENTRY_NOT_SET;
3069        file->f_path.mnt = nd->path.mnt;
3070        error = dir->i_op->atomic_open(dir, dentry, file,
3071                                       open_to_namei_flags(open_flag), mode);
3072        d_lookup_done(dentry);
3073        if (!error) {
3074                if (file->f_mode & FMODE_OPENED) {
3075                        /*
3076                         * We didn't have the inode before the open, so check open
3077                         * permission here.
3078                         */
3079                        int acc_mode = op->acc_mode;
3080                        if (file->f_mode & FMODE_CREATED) {
3081                                WARN_ON(!(open_flag & O_CREAT));
3082                                fsnotify_create(dir, dentry);
3083                                acc_mode = 0;
3084                        }
3085                        error = may_open(&file->f_path, acc_mode, open_flag);
3086                        if (WARN_ON(error > 0))
3087                                error = -EINVAL;
3088                } else if (WARN_ON(file->f_path.dentry == DENTRY_NOT_SET)) {
3089                        error = -EIO;
3090                } else {
3091                        if (file->f_path.dentry) {
3092                                dput(dentry);
3093                                dentry = file->f_path.dentry;
3094                        }
3095                        if (file->f_mode & FMODE_CREATED)
3096                                fsnotify_create(dir, dentry);
3097                        if (unlikely(d_is_negative(dentry))) {
3098                                error = -ENOENT;
3099                        } else {
3100                                path->dentry = dentry;
3101                                path->mnt = nd->path.mnt;
3102                                return 0;
3103                        }
3104                }
3105        }
3106        dput(dentry);
3107        return error;
3108}
3109
3110/*
3111 * Look up and maybe create and open the last component.
3112 *
3113 * Must be called with parent locked (exclusive in O_CREAT case).
3114 *
3115 * Returns 0 on success, that is, if
3116 *  the file was successfully atomically created (if necessary) and opened, or
3117 *  the file was not completely opened at this time, though lookups and
3118 *  creations were performed.
3119 * These case are distinguished by presence of FMODE_OPENED on file->f_mode.
3120 * In the latter case dentry returned in @path might be negative if O_CREAT
3121 * hadn't been specified.
3122 *
3123 * An error code is returned on failure.
3124 */
3125static int lookup_open(struct nameidata *nd, struct path *path,
3126                        struct file *file,
3127                        const struct open_flags *op,
3128                        bool got_write)
3129{
3130        struct dentry *dir = nd->path.dentry;
3131        struct inode *dir_inode = dir->d_inode;
3132        int open_flag = op->open_flag;
3133        struct dentry *dentry;
3134        int error, create_error = 0;
3135        umode_t mode = op->mode;
3136        DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
3137
3138        if (unlikely(IS_DEADDIR(dir_inode)))
3139                return -ENOENT;
3140
3141        file->f_mode &= ~FMODE_CREATED;
3142        dentry = d_lookup(dir, &nd->last);
3143        for (;;) {
3144                if (!dentry) {
3145                        dentry = d_alloc_parallel(dir, &nd->last, &wq);
3146                        if (IS_ERR(dentry))
3147                                return PTR_ERR(dentry);
3148                }
3149                if (d_in_lookup(dentry))
3150                        break;
3151
3152                error = d_revalidate(dentry, nd->flags);
3153                if (likely(error > 0))
3154                        break;
3155                if (error)
3156                        goto out_dput;
3157                d_invalidate(dentry);
3158                dput(dentry);
3159                dentry = NULL;
3160        }
3161        if (dentry->d_inode) {
3162                /* Cached positive dentry: will open in f_op->open */
3163                goto out_no_open;
3164        }
3165
3166        /*
3167         * Checking write permission is tricky, bacuse we don't know if we are
3168         * going to actually need it: O_CREAT opens should work as long as the
3169         * file exists.  But checking existence breaks atomicity.  The trick is
3170         * to check access and if not granted clear O_CREAT from the flags.
3171         *
3172         * Another problem is returing the "right" error value (e.g. for an
3173         * O_EXCL open we want to return EEXIST not EROFS).
3174         */
3175        if (open_flag & O_CREAT) {
3176                if (!IS_POSIXACL(dir->d_inode))
3177                        mode &= ~current_umask();
3178                if (unlikely(!got_write)) {
3179                        create_error = -EROFS;
3180                        open_flag &= ~O_CREAT;
3181                        if (open_flag & (O_EXCL | O_TRUNC))
3182                                goto no_open;
3183                        /* No side effects, safe to clear O_CREAT */
3184                } else {
3185                        create_error = may_o_create(&nd->path, dentry, mode);
3186                        if (create_error) {
3187                                open_flag &= ~O_CREAT;
3188                                if (open_flag & O_EXCL)
3189                                        goto no_open;
3190                        }
3191                }
3192        } else if ((open_flag & (O_TRUNC|O_WRONLY|O_RDWR)) &&
3193                   unlikely(!got_write)) {
3194                /*
3195                 * No O_CREATE -> atomicity not a requirement -> fall
3196                 * back to lookup + open
3197                 */
3198                goto no_open;
3199        }
3200
3201        if (dir_inode->i_op->atomic_open) {
3202                error = atomic_open(nd, dentry, path, file, op, open_flag,
3203                                    mode);
3204                if (unlikely(error == -ENOENT) && create_error)
3205                        error = create_error;
3206                return error;
3207        }
3208
3209no_open:
3210        if (d_in_lookup(dentry)) {
3211                struct dentry *res = dir_inode->i_op->lookup(dir_inode, dentry,
3212                                                             nd->flags);
3213                d_lookup_done(dentry);
3214                if (unlikely(res)) {
3215                        if (IS_ERR(res)) {
3216                                error = PTR_ERR(res);
3217                                goto out_dput;
3218                        }
3219                        dput(dentry);
3220                        dentry = res;
3221                }
3222        }
3223
3224        /* Negative dentry, just create the file */
3225        if (!dentry->d_inode && (open_flag & O_CREAT)) {
3226                file->f_mode |= FMODE_CREATED;
3227                audit_inode_child(dir_inode, dentry, AUDIT_TYPE_CHILD_CREATE);
3228                if (!dir_inode->i_op->create) {
3229                        error = -EACCES;
3230                        goto out_dput;
3231                }
3232                error = dir_inode->i_op->create(dir_inode, dentry, mode,
3233                                                open_flag & O_EXCL);
3234                if (error)
3235                        goto out_dput;
3236                fsnotify_create(dir_inode, dentry);
3237        }
3238        if (unlikely(create_error) && !dentry->d_inode) {
3239                error = create_error;
3240                goto out_dput;
3241        }
3242out_no_open:
3243        path->dentry = dentry;
3244        path->mnt = nd->path.mnt;
3245        return 0;
3246
3247out_dput:
3248        dput(dentry);
3249        return error;
3250}
3251
3252/*
3253 * Handle the last step of open()
3254 */
3255static int do_last(struct nameidata *nd,
3256                   struct file *file, const struct open_flags *op)
3257{
3258        struct dentry *dir = nd->path.dentry;
3259        int open_flag = op->open_flag;
3260        bool will_truncate = (open_flag & O_TRUNC) != 0;
3261        bool got_write = false;
3262        int acc_mode = op->acc_mode;
3263        unsigned seq;
3264        struct inode *inode;
3265        struct path path;
3266        int error;
3267
3268        nd->flags &= ~LOOKUP_PARENT;
3269        nd->flags |= op->intent;
3270
3271        if (nd->last_type != LAST_NORM) {
3272                error = handle_dots(nd, nd->last_type);
3273                if (unlikely(error))
3274                        return error;
3275                goto finish_open;
3276        }
3277
3278        if (!(open_flag & O_CREAT)) {
3279                if (nd->last.name[nd->last.len])
3280                        nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
3281                /* we _can_ be in RCU mode here */
3282                error = lookup_fast(nd, &path, &inode, &seq);
3283                if (likely(error > 0))
3284                        goto finish_lookup;
3285
3286                if (error < 0)
3287                        return error;
3288
3289                BUG_ON(nd->inode != dir->d_inode);
3290                BUG_ON(nd->flags & LOOKUP_RCU);
3291        } else {
3292                /* create side of things */
3293                /*
3294                 * This will *only* deal with leaving RCU mode - LOOKUP_JUMPED
3295                 * has been cleared when we got to the last component we are
3296                 * about to look up
3297                 */
3298                error = complete_walk(nd);
3299                if (error)
3300                        return error;
3301
3302                audit_inode(nd->name, dir, LOOKUP_PARENT);
3303                /* trailing slashes? */
3304                if (unlikely(nd->last.name[nd->last.len]))
3305                        return -EISDIR;
3306        }
3307
3308        if (open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) {
3309                error = mnt_want_write(nd->path.mnt);
3310                if (!error)
3311                        got_write = true;
3312                /*
3313                 * do _not_ fail yet - we might not need that or fail with
3314                 * a different error; let lookup_open() decide; we'll be
3315                 * dropping this one anyway.
3316                 */
3317        }
3318        if (open_flag & O_CREAT)
3319                inode_lock(dir->d_inode);
3320        else
3321                inode_lock_shared(dir->d_inode);
3322        error = lookup_open(nd, &path, file, op, got_write);
3323        if (open_flag & O_CREAT)
3324                inode_unlock(dir->d_inode);
3325        else
3326                inode_unlock_shared(dir->d_inode);
3327
3328        if (error)
3329                goto out;
3330
3331        if (file->f_mode & FMODE_OPENED) {
3332                if ((file->f_mode & FMODE_CREATED) ||
3333                    !S_ISREG(file_inode(file)->i_mode))
3334                        will_truncate = false;
3335
3336                audit_inode(nd->name, file->f_path.dentry, 0);
3337                goto opened;
3338        }
3339
3340        if (file->f_mode & FMODE_CREATED) {
3341                /* Don't check for write permission, don't truncate */
3342                open_flag &= ~O_TRUNC;
3343                will_truncate = false;
3344                acc_mode = 0;
3345                path_to_nameidata(&path, nd);
3346                goto finish_open_created;
3347        }
3348
3349        /*
3350         * If atomic_open() acquired write access it is dropped now due to
3351         * possible mount and symlink following (this might be optimized away if
3352         * necessary...)
3353         */
3354        if (got_write) {
3355                mnt_drop_write(nd->path.mnt);
3356                got_write = false;
3357        }
3358
3359        error = follow_managed(&path, nd);
3360        if (unlikely(error < 0))
3361                return error;
3362
3363        if (unlikely(d_is_negative(path.dentry))) {
3364                path_to_nameidata(&path, nd);
3365                return -ENOENT;
3366        }
3367
3368        /*
3369         * create/update audit record if it already exists.
3370         */
3371        audit_inode(nd->name, path.dentry, 0);
3372
3373        if (unlikely((open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT))) {
3374                path_to_nameidata(&path, nd);
3375                return -EEXIST;
3376        }
3377
3378        seq = 0;        /* out of RCU mode, so the value doesn't matter */
3379        inode = d_backing_inode(path.dentry);
3380finish_lookup:
3381        error = step_into(nd, &path, 0, inode, seq);
3382        if (unlikely(error))
3383                return error;
3384finish_open:
3385        /* Why this, you ask?  _Now_ we might have grown LOOKUP_JUMPED... */
3386        error = complete_walk(nd);
3387        if (error)
3388                return error;
3389        audit_inode(nd->name, nd->path.dentry, 0);
3390        if (open_flag & O_CREAT) {
3391                error = -EISDIR;
3392                if (d_is_dir(nd->path.dentry))
3393                        goto out;
3394                error = may_create_in_sticky(dir,
3395                                             d_backing_inode(nd->path.dentry));
3396                if (unlikely(error))
3397                        goto out;
3398        }
3399        error = -ENOTDIR;
3400        if ((nd->flags & LOOKUP_DIRECTORY) && !d_can_lookup(nd->path.dentry))
3401                goto out;
3402        if (!d_is_reg(nd->path.dentry))
3403                will_truncate = false;
3404
3405        if (will_truncate) {
3406                error = mnt_want_write(nd->path.mnt);
3407                if (error)
3408                        goto out;
3409                got_write = true;
3410        }
3411finish_open_created:
3412        error = may_open(&nd->path, acc_mode, open_flag);
3413        if (error)
3414                goto out;
3415        BUG_ON(file->f_mode & FMODE_OPENED); /* once it's opened, it's opened */
3416        error = vfs_open(&nd->path, file);
3417        if (error)
3418                goto out;
3419opened:
3420        error = ima_file_check(file, op->acc_mode);
3421        if (!error && will_truncate)
3422                error = handle_truncate(file);
3423out:
3424        if (unlikely(error > 0)) {
3425                WARN_ON(1);
3426                error = -EINVAL;
3427        }
3428        if (got_write)
3429                mnt_drop_write(nd->path.mnt);
3430        return error;
3431}
3432
3433struct dentry *vfs_tmpfile(struct dentry *dentry, umode_t mode, int open_flag)
3434{
3435        struct dentry *child = NULL;
3436        struct inode *dir = dentry->d_inode;
3437        struct inode *inode;
3438        int error;
3439
3440        /* we want directory to be writable */
3441        error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
3442        if (error)
3443                goto out_err;
3444        error = -EOPNOTSUPP;
3445        if (!dir->i_op->tmpfile)
3446                goto out_err;
3447        error = -ENOMEM;
3448        child = d_alloc(dentry, &slash_name);
3449        if (unlikely(!child))
3450                goto out_err;
3451        error = dir->i_op->tmpfile(dir, child, mode);
3452        if (error)
3453                goto out_err;
3454        error = -ENOENT;
3455        inode = child->d_inode;
3456        if (unlikely(!inode))
3457                goto out_err;
3458        if (!(open_flag & O_EXCL)) {
3459                spin_lock(&inode->i_lock);
3460                inode->i_state |= I_LINKABLE;
3461                spin_unlock(&inode->i_lock);
3462        }
3463        ima_post_create_tmpfile(inode);
3464        return child;
3465
3466out_err:
3467        dput(child);
3468        return ERR_PTR(error);
3469}
3470EXPORT_SYMBOL(vfs_tmpfile);
3471
3472static int do_tmpfile(struct nameidata *nd, unsigned flags,
3473                const struct open_flags *op,
3474                struct file *file)
3475{
3476        struct dentry *child;
3477        struct path path;
3478        int error = path_lookupat(nd, flags | LOOKUP_DIRECTORY, &path);
3479        if (unlikely(error))
3480                return error;
3481        error = mnt_want_write(path.mnt);
3482        if (unlikely(error))
3483                goto out;
3484        child = vfs_tmpfile(path.dentry, op->mode, op->open_flag);
3485        error = PTR_ERR(child);
3486        if (IS_ERR(child))
3487                goto out2;
3488        dput(path.dentry);
3489        path.dentry = child;
3490        audit_inode(nd->name, child, 0);
3491        /* Don't check for other permissions, the inode was just created */
3492        error = may_open(&path, 0, op->open_flag);
3493        if (error)
3494                goto out2;
3495        file->f_path.mnt = path.mnt;
3496        error = finish_open(file, child, NULL);
3497out2:
3498        mnt_drop_write(path.mnt);
3499out:
3500        path_put(&path);
3501        return error;
3502}
3503
3504static int do_o_path(struct nameidata *nd, unsigned flags, struct file *file)
3505{
3506        struct path path;
3507        int error = path_lookupat(nd, flags, &path);
3508        if (!error) {
3509                audit_inode(nd->name, path.dentry, 0);
3510                error = vfs_open(&path, file);
3511                path_put(&path);
3512        }
3513        return error;
3514}
3515
3516static struct file *path_openat(struct nameidata *nd,
3517                        const struct open_flags *op, unsigned flags)
3518{
3519        struct file *file;
3520        int error;
3521
3522        file = alloc_empty_file(op->open_flag, current_cred());
3523        if (IS_ERR(file))
3524                return file;
3525
3526        if (unlikely(file->f_flags & __O_TMPFILE)) {
3527                error = do_tmpfile(nd, flags, op, file);
3528        } else if (unlikely(file->f_flags & O_PATH)) {
3529                error = do_o_path(nd, flags, file);
3530        } else {
3531                const char *s = path_init(nd, flags);
3532                while (!(error = link_path_walk(s, nd)) &&
3533                        (error = do_last(nd, file, op)) > 0) {
3534                        nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL);
3535                        s = trailing_symlink(nd);
3536                }
3537                terminate_walk(nd);
3538        }
3539        if (likely(!error)) {
3540                if (likely(file->f_mode & FMODE_OPENED))
3541                        return file;
3542                WARN_ON(1);
3543                error = -EINVAL;
3544        }
3545        fput(file);
3546        if (error == -EOPENSTALE) {
3547                if (flags & LOOKUP_RCU)
3548                        error = -ECHILD;
3549                else
3550                        error = -ESTALE;
3551        }
3552        return ERR_PTR(error);
3553}
3554
3555struct file *do_filp_open(int dfd, struct filename *pathname,
3556                const struct open_flags *op)
3557{
3558        struct nameidata nd;
3559        int flags = op->lookup_flags;
3560        struct file *filp;
3561
3562        set_nameidata(&nd, dfd, pathname);
3563        filp = path_openat(&nd, op, flags | LOOKUP_RCU);
3564        if (unlikely(filp == ERR_PTR(-ECHILD)))
3565                filp = path_openat(&nd, op, flags);
3566        if (unlikely(filp == ERR_PTR(-ESTALE)))
3567                filp = path_openat(&nd, op, flags | LOOKUP_REVAL);
3568        restore_nameidata();
3569        return filp;
3570}
3571
3572struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt,
3573                const char *name, const struct open_flags *op)
3574{
3575        struct nameidata nd;
3576        struct file *file;
3577        struct filename *filename;
3578        int flags = op->lookup_flags | LOOKUP_ROOT;
3579
3580        nd.root.mnt = mnt;
3581        nd.root.dentry = dentry;
3582
3583        if (d_is_symlink(dentry) && op->intent & LOOKUP_OPEN)
3584                return ERR_PTR(-ELOOP);
3585
3586        filename = getname_kernel(name);
3587        if (IS_ERR(filename))
3588                return ERR_CAST(filename);
3589
3590        set_nameidata(&nd, -1, filename);
3591        file = path_openat(&nd, op, flags | LOOKUP_RCU);
3592        if (unlikely(file == ERR_PTR(-ECHILD)))
3593                file = path_openat(&nd, op, flags);
3594        if (unlikely(file == ERR_PTR(-ESTALE)))
3595                file = path_openat(&nd, op, flags | LOOKUP_REVAL);
3596        restore_nameidata();
3597        putname(filename);
3598        return file;
3599}
3600
3601static struct dentry *filename_create(int dfd, struct filename *name,
3602                                struct path *path, unsigned int lookup_flags)
3603{
3604        struct dentry *dentry = ERR_PTR(-EEXIST);
3605        struct qstr last;
3606        int type;
3607        int err2;
3608        int error;
3609        bool is_dir = (lookup_flags & LOOKUP_DIRECTORY);
3610
3611        /*
3612         * Note that only LOOKUP_REVAL and LOOKUP_DIRECTORY matter here. Any
3613         * other flags passed in are ignored!
3614         */
3615        lookup_flags &= LOOKUP_REVAL;
3616
3617        name = filename_parentat(dfd, name, lookup_flags, path, &last, &type);
3618        if (IS_ERR(name))
3619                return ERR_CAST(name);
3620
3621        /*
3622         * Yucky last component or no last component at all?
3623         * (foo/., foo/.., /////)
3624         */
3625        if (unlikely(type != LAST_NORM))
3626                goto out;
3627
3628        /* don't fail immediately if it's r/o, at least try to report other errors */
3629        err2 = mnt_want_write(path->mnt);
3630        /*
3631         * Do the final lookup.
3632         */
3633        lookup_flags |= LOOKUP_CREATE | LOOKUP_EXCL;
3634        inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT);
3635        dentry = __lookup_hash(&last, path->dentry, lookup_flags);
3636        if (IS_ERR(dentry))
3637                goto unlock;
3638
3639        error = -EEXIST;
3640        if (d_is_positive(dentry))
3641                goto fail;
3642
3643        /*
3644         * Special case - lookup gave negative, but... we had foo/bar/
3645         * From the vfs_mknod() POV we just have a negative dentry -
3646         * all is fine. Let's be bastards - you had / on the end, you've
3647         * been asking for (non-existent) directory. -ENOENT for you.
3648         */
3649        if (unlikely(!is_dir && last.name[last.len])) {
3650                error = -ENOENT;
3651                goto fail;
3652        }
3653        if (unlikely(err2)) {
3654                error = err2;
3655                goto fail;
3656        }
3657        putname(name);
3658        return dentry;
3659fail:
3660        dput(dentry);
3661        dentry = ERR_PTR(error);
3662unlock:
3663        inode_unlock(path->dentry->d_inode);
3664        if (!err2)
3665                mnt_drop_write(path->mnt);
3666out:
3667        path_put(path);
3668        putname(name);
3669        return dentry;
3670}
3671
/*
 * filename_create() for a pathname held in kernel memory: the string is
 * wrapped with getname_kernel() and passed on unchanged otherwise.
 */
struct dentry *kern_path_create(int dfd, const char *pathname,
				struct path *path, unsigned int lookup_flags)
{
	struct filename *kname = getname_kernel(pathname);

	return filename_create(dfd, kname, path, lookup_flags);
}
EXPORT_SYMBOL(kern_path_create);
3679
3680void done_path_create(struct path *path, struct dentry *dentry)
3681{
3682        dput(dentry);
3683        inode_unlock(path->dentry->d_inode);
3684        mnt_drop_write(path->mnt);
3685        path_put(path);
3686}
3687EXPORT_SYMBOL(done_path_create);
3688
3689inline struct dentry *user_path_create(int dfd, const char __user *pathname,
3690                                struct path *path, unsigned int lookup_flags)
3691{
3692        return filename_create(dfd, getname(pathname), path, lookup_flags);
3693}
3694EXPORT_SYMBOL(user_path_create);
3695
3696int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
3697{
3698        int error = may_create(dir, dentry);
3699
3700        if (error)
3701                return error;
3702
3703        if ((S_ISCHR(mode) || S_ISBLK(mode)) && !capable(CAP_MKNOD))
3704                return -EPERM;
3705
3706        if (!dir->i_op->mknod)
3707                return -EPERM;
3708
3709        error = devcgroup_inode_mknod(mode, dev);
3710        if (error)
3711                return error;
3712
3713        error = security_inode_mknod(dir, dentry, mode, dev);
3714        if (error)
3715                return error;
3716
3717        error = dir->i_op->mknod(dir, dentry, mode, dev);
3718        if (!error)
3719                fsnotify_create(dir, dentry);
3720        return error;
3721}
3722EXPORT_SYMBOL(vfs_mknod);
3723
3724static int may_mknod(umode_t mode)
3725{
3726        switch (mode & S_IFMT) {
3727        case S_IFREG:
3728        case S_IFCHR:
3729        case S_IFBLK:
3730        case S_IFIFO:
3731        case S_IFSOCK:
3732        case 0: /* zero mode translates to S_IFREG */
3733                return 0;
3734        case S_IFDIR:
3735                return -EPERM;
3736        default:
3737                return -EINVAL;
3738        }
3739}
3740
3741long do_mknodat(int dfd, const char __user *filename, umode_t mode,
3742                unsigned int dev)
3743{
3744        struct dentry *dentry;
3745        struct path path;
3746        int error;
3747        unsigned int lookup_flags = 0;
3748
3749        error = may_mknod(mode);
3750        if (error)
3751                return error;
3752retry:
3753        dentry = user_path_create(dfd, filename, &path, lookup_flags);
3754        if (IS_ERR(dentry))
3755                return PTR_ERR(dentry);
3756
3757        if (!IS_POSIXACL(path.dentry->d_inode))
3758                mode &= ~current_umask();
3759        error = security_path_mknod(&path, dentry, mode, dev);
3760        if (error)
3761                goto out;
3762        switch (mode & S_IFMT) {
3763                case 0: case S_IFREG:
3764                        error = vfs_create(path.dentry->d_inode,dentry,mode,true);
3765                        if (!error)
3766                                ima_post_path_mknod(dentry);
3767                        break;
3768                case S_IFCHR: case S_IFBLK:
3769                        error = vfs_mknod(path.dentry->d_inode,dentry,mode,
3770                                        new_decode_dev(dev));
3771                        break;
3772                case S_IFIFO: case S_IFSOCK:
3773                        error = vfs_mknod(path.dentry->d_inode,dentry,mode,0);
3774                        break;
3775        }
3776out:
3777        done_path_create(&path, dentry);
3778        if (retry_estale(error, lookup_flags)) {
3779                lookup_flags |= LOOKUP_REVAL;
3780                goto retry;
3781        }
3782        return error;
3783}
3784
3785SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode,
3786                unsigned int, dev)
3787{
3788        return do_mknodat(dfd, filename, mode, dev);
3789}
3790
3791SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, dev)
3792{
3793        return do_mknodat(AT_FDCWD, filename, mode, dev);
3794}
3795
3796int vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
3797{
3798        int error = may_create(dir, dentry);
3799        unsigned max_links = dir->i_sb->s_max_links;
3800
3801        if (error)
3802                return error;
3803
3804        if (!dir->i_op->mkdir)
3805                return -EPERM;
3806
3807        mode &= (S_IRWXUGO|S_ISVTX);
3808        error = security_inode_mkdir(dir, dentry, mode);
3809        if (error)
3810                return error;
3811
3812        if (max_links && dir->i_nlink >= max_links)
3813                return -EMLINK;
3814
3815        error = dir->i_op->mkdir(dir, dentry, mode);
3816        if (!error)
3817                fsnotify_mkdir(dir, dentry);
3818        return error;
3819}
3820EXPORT_SYMBOL(vfs_mkdir);
3821
3822long do_mkdirat(int dfd, const char __user *pathname, umode_t mode)
3823{
3824        struct dentry *dentry;
3825        struct path path;
3826        int error;
3827        unsigned int lookup_flags = LOOKUP_DIRECTORY;
3828
3829retry:
3830        dentry = user_path_create(dfd, pathname, &path, lookup_flags);
3831        if (IS_ERR(dentry))
3832                return PTR_ERR(dentry);
3833
3834        if (!IS_POSIXACL(path.dentry->d_inode))
3835                mode &= ~current_umask();
3836        error = security_path_mkdir(&path, dentry, mode);
3837        if (!error)
3838                error = vfs_mkdir(path.dentry->d_inode, dentry, mode);
3839        done_path_create(&path, dentry);
3840        if (retry_estale(error, lookup_flags)) {
3841                lookup_flags |= LOOKUP_REVAL;
3842                goto retry;
3843        }
3844        return error;
3845}
3846
3847SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode)
3848{
3849        return do_mkdirat(dfd, pathname, mode);
3850}
3851
3852SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode)
3853{
3854        return do_mkdirat(AT_FDCWD, pathname, mode);
3855}
3856
3857int vfs_rmdir(struct inode *dir, struct dentry *dentry)
3858{
3859        int error = may_delete(dir, dentry, 1);
3860
3861        if (error)
3862                return error;
3863
3864        if (!dir->i_op->rmdir)
3865                return -EPERM;
3866
3867        dget(dentry);
3868        inode_lock(dentry->d_inode);
3869
3870        error = -EBUSY;
3871        if (is_local_mountpoint(dentry))
3872                goto out;
3873
3874        error = security_inode_rmdir(dir, dentry);
3875        if (error)
3876                goto out;
3877
3878        error = dir->i_op->rmdir(dir, dentry);
3879        if (error)
3880                goto out;
3881
3882        shrink_dcache_parent(dentry);
3883        dentry->d_inode->i_flags |= S_DEAD;
3884        dont_mount(dentry);
3885        detach_mounts(dentry);
3886        fsnotify_rmdir(dir, dentry);
3887
3888out:
3889        inode_unlock(dentry->d_inode);
3890        dput(dentry);
3891        if (!error)
3892                d_delete(dentry);
3893        return error;
3894}
3895EXPORT_SYMBOL(vfs_rmdir);
3896
3897long do_rmdir(int dfd, const char __user *pathname)
3898{
3899        int error = 0;
3900        struct filename *name;
3901        struct dentry *dentry;
3902        struct path path;
3903        struct qstr last;
3904        int type;
3905        unsigned int lookup_flags = 0;
3906retry:
3907        name = filename_parentat(dfd, getname(pathname), lookup_flags,
3908                                &path, &last, &type);
3909        if (IS_ERR(name))
3910                return PTR_ERR(name);
3911
3912        switch (type) {
3913        case LAST_DOTDOT:
3914                error = -ENOTEMPTY;
3915                goto exit1;
3916        case LAST_DOT:
3917                error = -EINVAL;
3918                goto exit1;
3919        case LAST_ROOT:
3920                error = -EBUSY;
3921                goto exit1;
3922        }
3923
3924        error = mnt_want_write(path.mnt);
3925        if (error)
3926                goto exit1;
3927
3928        inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT);
3929        dentry = __lookup_hash(&last, path.dentry, lookup_flags);
3930        error = PTR_ERR(dentry);
3931        if (IS_ERR(dentry))
3932                goto exit2;
3933        if (!dentry->d_inode) {
3934                error = -ENOENT;
3935                goto exit3;
3936        }
3937        error = security_path_rmdir(&path, dentry);
3938        if (error)
3939                goto exit3;
3940        error = vfs_rmdir(path.dentry->d_inode, dentry);
3941exit3:
3942        dput(dentry);
3943exit2:
3944        inode_unlock(path.dentry->d_inode);
3945        mnt_drop_write(path.mnt);
3946exit1:
3947        path_put(&path);
3948        putname(name);
3949        if (retry_estale(error, lookup_flags)) {
3950                lookup_flags |= LOOKUP_REVAL;
3951                goto retry;
3952        }
3953        return error;
3954}
3955
3956SYSCALL_DEFINE1(rmdir, const char __user *, pathname)
3957{
3958        return do_rmdir(AT_FDCWD, pathname);
3959}
3960
3961/**
3962 * vfs_unlink - unlink a filesystem object
3963 * @dir:        parent directory
3964 * @dentry:     victim
3965 * @delegated_inode: returns victim inode, if the inode is delegated.
3966 *
3967 * The caller must hold dir->i_mutex.
3968 *
3969 * If vfs_unlink discovers a delegation, it will return -EWOULDBLOCK and
3970 * return a reference to the inode in delegated_inode.  The caller
3971 * should then break the delegation on that inode and retry.  Because
3972 * breaking a delegation may take a long time, the caller should drop
3973 * dir->i_mutex before doing so.
3974 *
3975 * Alternatively, a caller may pass NULL for delegated_inode.  This may
3976 * be appropriate for callers that expect the underlying filesystem not
3977 * to be NFS exported.
3978 */
3979int vfs_unlink(struct inode *dir, struct dentry *dentry, struct inode **delegated_inode)
3980{
3981        struct inode *target = dentry->d_inode;
3982        int error = may_delete(dir, dentry, 0);
3983
3984        if (error)
3985                return error;
3986
3987        if (!dir->i_op->unlink)
3988                return -EPERM;
3989
3990        inode_lock(target);
3991        if (is_local_mountpoint(dentry))
3992                error = -EBUSY;
3993        else {
3994                error = security_inode_unlink(dir, dentry);
3995                if (!error) {
3996                        error = try_break_deleg(target, delegated_inode);
3997                        if (error)
3998                                goto out;
3999                        error = dir->i_op->unlink(dir, dentry);
4000                        if (!error) {
4001                                dont_mount(dentry);
4002                                detach_mounts(dentry);
4003                                fsnotify_unlink(dir, dentry);
4004                        }
4005                }
4006        }
4007out:
4008        inode_unlock(target);
4009
4010        /* We don't d_delete() NFS sillyrenamed files--they still exist. */
4011        if (!error && !(dentry->d_flags & DCACHE_NFSFS_RENAMED)) {
4012                fsnotify_link_count(target);
4013                d_delete(dentry);
4014        }
4015
4016        return error;
4017}
4018EXPORT_SYMBOL(vfs_unlink);
4019
4020/*
4021 * Make sure that the actual truncation of the file will occur outside its
4022 * directory's i_mutex.  Truncate can take a long time if there is a lot of
4023 * writeout happening, and we don't want to prevent access to the directory
4024 * while waiting on the I/O.
4025 */
/*
 * do_unlinkat - body of unlink(2) and of unlinkat(2) without AT_REMOVEDIR
 *
 * Resolves the parent of @name relative to @dfd, looks the last component
 * up under the parent's inode lock and hands it to vfs_unlink().
 *
 * Returns 0 or a negative errno.  A last component that is not LAST_NORM
 * gives -EISDIR.  On every exit past the parent lookup @name is released
 * with putname().
 *
 * There are two separate retry loops:
 *   retry_deleg - vfs_unlink() reported a delegated inode and
 *                 break_deleg_wait() succeeded; only the locked
 *                 lookup + unlink is repeated, the parent path and the
 *                 mount write access are kept.
 *   retry       - retry_estale() asked for it; everything is redone from
 *                 the parent lookup with LOOKUP_REVAL added.
 */
4026long do_unlinkat(int dfd, struct filename *name)
4027{
4028        int error;
4029        struct dentry *dentry;
4030        struct path path;
4031        struct qstr last;
4032        int type;
4033        struct inode *inode = NULL;
4034        struct inode *delegated_inode = NULL;
4035        unsigned int lookup_flags = 0;
4036retry:
4037        name = filename_parentat(dfd, name, lookup_flags, &path, &last, &type);
4038        if (IS_ERR(name))
4039                return PTR_ERR(name);
4040
4041        error = -EISDIR;
4042        if (type != LAST_NORM)
4043                goto exit1;
4044
4045        error = mnt_want_write(path.mnt);
4046        if (error)
4047                goto exit1;
4048retry_deleg:
4049        inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT);
4050        dentry = __lookup_hash(&last, path.dentry, lookup_flags);
4051        error = PTR_ERR(dentry);
4052        if (!IS_ERR(dentry)) {
4053                /* Why not before? Because we want correct error value */
4054                if (last.name[last.len])
4055                        goto slashes;
4056                inode = dentry->d_inode;
4057                if (d_is_negative(dentry))
4058                        goto slashes;
                    /*
                     * Take our own reference on the victim's inode; it is
                     * dropped by the iput() below, after the parent has
                     * been unlocked.
                     */
4059                ihold(inode);
4060                error = security_path_unlink(&path, dentry);
4061                if (error)
4062                        goto exit2;
4063                error = vfs_unlink(path.dentry->d_inode, dentry, &delegated_inode);
                    /* also the re-entry point for the "slashes" code at the bottom */
4064exit2:
4065                dput(dentry);
4066        }
4067        inode_unlock(path.dentry->d_inode);
4068        if (inode)
4069                iput(inode);    /* truncate the inode here */
4070        inode = NULL;
4071        if (delegated_inode) {
                    /* on success go back and repeat only the locked lookup + unlink */
4072                error = break_deleg_wait(&delegated_inode);
4073                if (!error)
4074                        goto retry_deleg;
4075        }
4076        mnt_drop_write(path.mnt);
4077exit1:
4078        path_put(&path);
4079        if (retry_estale(error, lookup_flags)) {
4080                lookup_flags |= LOOKUP_REVAL;
4081                inode = NULL;
4082                goto retry;
4083        }
4084        putname(name);
4085        return error;
4086
            /*
             * Reached when characters follow the last component
             * (last.name[last.len] is non-zero) or when the dentry is
             * negative.  Pick the errno from what the dentry is, then jump
             * back into the block above so the dentry reference is dropped
             * and the normal unlock/cleanup sequence runs.
             */
4087slashes:
4088        if (d_is_negative(dentry))
4089                error = -ENOENT;
4090        else if (d_is_dir(dentry))
4091                error = -EISDIR;
4092        else
4093                error = -ENOTDIR;
4094        goto exit2;
4095}
4096
4097SYSCALL_DEFINE3(unlinkat, int, dfd, const char __user *, pathname, int, flag)
4098{
4099        if ((flag & ~AT_REMOVEDIR) != 0)
4100                return -EINVAL;
4101
4102        if (flag & AT_REMOVEDIR)
4103                return do_rmdir(dfd, pathname);
4104
4105        return do_unlinkat(dfd, getname(pathname));
4106}
4107
4108SYSCALL_DEFINE1(unlink, const char __user *, pathname)
4109{
4110        return do_unlinkat(AT_FDCWD, getname(pathname));
4111}
4112
4113int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname)
4114{
4115        int error = may_create(dir, dentry);
4116
4117        if (error)
4118                return error;
4119
4120        if (!dir->i_op->symlink)
4121                return -EPERM;
4122
4123        error = security_inode_symlink(dir, dentry, oldname);
4124        if (error)
4125                return error;
4126
4127        error = dir->i_op->symlink(dir, dentry, oldname);
4128        if (!error)
4129                fsnotify_create(dir, dentry);
4130        return error;
4131}
4132EXPORT_SYMBOL(vfs_symlink);
4133
4134long do_symlinkat(const char __user *oldname, int newdfd,
4135                  const char __user *newname)
4136{
4137        int error;
4138        struct filename *from;
4139        struct dentry *dentry;
4140        struct path path;
4141        unsigned int lookup_flags = 0;
4142
4143        from = getname(oldname);
4144        if (IS_ERR(from))
4145                return PTR_ERR(from);
4146retry:
4147        dentry = user_path_create(newdfd, newname, &path, lookup_flags);
4148        error = PTR_ERR(dentry);
4149        if (IS_ERR(dentry))
4150                goto out_putname;
4151
4152        error = security_path_symlink(&path, dentry, from->name);
4153        if (!error)
4154                error = vfs_symlink(path.dentry->d_inode, dentry, from->name);
4155        done_path_create(&path, dentry);
4156        if (retry_estale(error, lookup_flags)) {
4157                lookup_flags |= LOOKUP_REVAL;
4158                goto retry;
4159        }
4160out_putname:
4161        putname(from);
4162        return error;
4163}
4164
4165SYSCALL_DEFINE3(symlinkat, const char __user *, oldname,
4166                int, newdfd, const char __user *, newname)
4167{
4168        return do_symlinkat(oldname, newdfd, newname);
4169}
4170
4171SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newname)
4172{
4173        return do_symlinkat(oldname, AT_FDCWD, newname);
4174}
4175
4176/**
4177 * vfs_link - create a new link
4178 * @old_dentry: object to be linked
4179 * @dir:        new parent
4180 * @new_dentry: where to create the new link
4181 * @delegated_inode: returns inode needing a delegation break
4182 *
4183 * The caller must hold dir->i_mutex
4184 *
4185 * If vfs_link discovers a delegation on the to-be-linked file in need
4186 * of breaking, it will return -EWOULDBLOCK and return a reference to the
4187 * inode in delegated_inode.  The caller should then break the delegation
4188 * and retry.  Because breaking a delegation may take a long time, the
4189 * caller should drop the i_mutex before doing so.
4190 *
4191 * Alternatively, a caller may pass NULL for delegated_inode.  This may
4192 * be appropriate for callers that expect the underlying filesystem not
4193 * to be NFS exported.
4194 */
4195int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry, struct inode **delegated_inode)
4196{
4197        struct inode *inode = old_dentry->d_inode;
4198        unsigned max_links = dir->i_sb->s_max_links;
4199        int error;
4200
4201        if (!inode)
4202                return -ENOENT;
4203
4204        error = may_create(dir, new_dentry);
4205        if (error)
4206                return error;
4207
4208        if (dir->i_sb != inode->i_sb)
4209                return -EXDEV;
4210
4211        /*
4212         * A link to an append-only or immutable file cannot be created.
4213         */
4214        if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
4215                return -EPERM;
4216        /*
4217         * Updating the link count will likely cause i_uid and i_gid to
4218         * be writen back improperly if their true value is unknown to
4219         * the vfs.
4220         */
4221        if (HAS_UNMAPPED_ID(inode))
4222                return -EPERM;
4223        if (!dir->i_op->link)
4224                return -EPERM;
4225        if (S_ISDIR(inode->i_mode))
4226                return -EPERM;
4227
4228        error = security_inode_link(old_dentry, dir, new_dentry);
4229        if (error)
4230                return error;
4231
4232        inode_lock(inode);
4233        /* Make sure we don't allow creating hardlink to an unlinked file */
4234        if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE))
4235                error =  -ENOENT;
4236        else if (max_links && inode->i_nlink >= max_links)
4237                error = -EMLINK;
4238        else {
4239                error = try_break_deleg(inode, delegated_inode);
4240                if (!error)
4241                        error = dir->i_op->link(old_dentry, dir, new_dentry);
4242        }
4243
4244        if (!error && (inode->i_state & I_LINKABLE)) {
4245                spin_lock(&inode->i_lock);
4246                inode->i_state &= ~I_LINKABLE;
4247                spin_unlock(&inode->i_lock);
4248        }
4249        inode_unlock(inode);
4250        if (!error)
4251                fsnotify_link(dir, inode, new_dentry);
4252        return error;
4253}
4254EXPORT_SYMBOL(vfs_link);
4255
4256/*
4257 * Hardlinks are often used in delicate situations.  We avoid
4258 * security-related surprises by not following symlinks on the
4259 * newname.  --KAB
4260 *
4261 * We don't follow them on the oldname either to be compatible
4262 * with linux 2.0, and to avoid hard-linking to directories
4263 * and other special files.  --ADM
4264 */
4265int do_linkat(int olddfd, const char __user *oldname, int newdfd,
4266              const char __user *newname, int flags)
4267{
4268        struct dentry *new_dentry;
4269        struct path old_path, new_path;
4270        struct inode *delegated_inode = NULL;
4271        int how = 0;
4272        int error;
4273
4274        if ((flags & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0)
4275                return -EINVAL;
4276        /*
4277         * To use null names we require CAP_DAC_READ_SEARCH
4278         * This ensures that not everyone will be able to create
4279         * handlink using the passed filedescriptor.
4280         */
4281        if (flags & AT_EMPTY_PATH) {
4282                if (!capable(CAP_DAC_READ_SEARCH))
4283                        return -ENOENT;
4284                how = LOOKUP_EMPTY;
4285        }
4286
4287        if (flags & AT_SYMLINK_FOLLOW)
4288                how |= LOOKUP_FOLLOW;
4289retry:
4290        error = user_path_at(olddfd, oldname, how, &old_path);
4291        if (error)
4292                return error;
4293
4294        new_dentry = user_path_create(newdfd, newname, &new_path,
4295                                        (how & LOOKUP_REVAL));
4296        error = PTR_ERR(new_dentry);
4297        if (IS_ERR(new_dentry))
4298                goto out;
4299
4300        error = -EXDEV;
4301        if (old_path.mnt != new_path.mnt)
4302                goto out_dput;
4303        error = may_linkat(&old_path);
4304        if (unlikely(error))
4305                goto out_dput;
4306        error = security_path_link(old_path.dentry, &new_path, new_dentry);
4307        if (error)
4308                goto out_dput;
4309        error = vfs_link(old_path.dentry, new_path.dentry->d_inode, new_dentry, &delegated_inode);
4310out_dput:
4311        done_path_create(&new_path, new_dentry);
4312        if (delegated_inode) {
4313                error = break_deleg_wait(&delegated_inode);
4314                if (!error) {
4315                        path_put(&old_path);
4316                        goto retry;
4317                }
4318        }
4319        if (retry_estale(error, how)) {
4320                path_put(&old_path);
4321                how |= LOOKUP_REVAL;
4322                goto retry;
4323        }
4324out:
4325        path_put(&old_path);
4326
4327        return error;
4328}
4329
/* linkat(2) entry point: forwards all five arguments unchanged to do_linkat(). */
SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
                int, newdfd, const char __user *, newname, int, flags)
{
        return do_linkat(olddfd, oldname, newdfd, newname, flags);
}
4335
/*
 * link(2) entry point: do_linkat() with both names taken relative to
 * AT_FDCWD and no flags set.
 */
SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname)
{
        return do_linkat(AT_FDCWD, oldname, AT_FDCWD, newname, 0);
}
4340
/**
 * vfs_rename - rename a filesystem object
 * @old_dir:    parent of source
 * @old_dentry: source
 * @new_dir:    parent of destination
 * @new_dentry: destination
 * @delegated_inode: returns an inode needing a delegation break
 * @flags:      rename flags
 *
 * The caller must hold multiple mutexes (see lock_rename()).
 *
 * If vfs_rename discovers a delegation in need of breaking at either
 * the source or destination, it will return -EWOULDBLOCK and return a
 * reference to the inode in delegated_inode.  The caller should then
 * break the delegation and retry.  Because breaking a delegation may
 * take a long time, the caller should drop all locks before doing
 * so.
 *
 * Alternatively, a caller may pass NULL for delegated_inode.  This may
 * be appropriate for callers that expect the underlying filesystem not
 * to be NFS exported.
 *
 * Return: 0 on success (including the case where source and target are
 * already the same inode, which is treated as a no-op), otherwise a
 * negative errno.
 *
 * The worst of all namespace operations - renaming directory. "Perverted"
 * doesn't even start to describe it. Somebody in UCB had a heck of a trip...
 * Problems:
 *
 *      a) we can get into loop creation.
 *      b) race potential - two innocent renames can create a loop together.
 *         That's where 4.4 screws up. Current fix: serialization on
 *         sb->s_vfs_rename_mutex. We might be more accurate, but that's another
 *         story.
 *      c) we have to lock _four_ objects - parents and victim (if it exists),
 *         and source (if it is not a directory).
 *         And that - after we got ->i_mutex on parents (until then we don't know
 *         whether the target exists).  Solution: try to be smart with locking
 *         order for inodes.  We rely on the fact that tree topology may change
 *         only under ->s_vfs_rename_mutex _and_ that parent of the object we
 *         move will be locked.  Thus we can rank directories by the tree
 *         (ancestors first) and rank all non-directories after them.
 *         That works since everybody except rename does "lock parent, lookup,
 *         lock child" and rename is under ->s_vfs_rename_mutex.
 *         HOWEVER, it relies on the assumption that any object with ->lookup()
 *         has no more than 1 dentry.  If "hybrid" objects will ever appear,
 *         we'd better make sure that there's no link(2) for them.
 *      d) conversion from fhandle to dentry may come in the wrong moment - when
 *         we are removing the target. Solution: we will have to grab ->i_mutex
 *         in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on
 *         ->i_mutex on parents, which works but leads to some truly excessive
 *         locking].
 */
int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
               struct inode *new_dir, struct dentry *new_dentry,
               struct inode **delegated_inode, unsigned int flags)
{
        int error;
        bool is_dir = d_is_dir(old_dentry);
        struct inode *source = old_dentry->d_inode;
        struct inode *target = new_dentry->d_inode;
        bool new_is_dir = false;
        unsigned max_links = new_dir->i_sb->s_max_links;
        struct name_snapshot old_name;

        /* Both names already refer to the same inode: nothing to do. */
        if (source == target)
                return 0;

        /* The source entry leaves old_dir, so check deletion rights there. */
        error = may_delete(old_dir, old_dentry, is_dir);
        if (error)
                return error;

        if (!target) {
                /* No existing destination: this creates an entry in new_dir. */
                error = may_create(new_dir, new_dentry);
        } else {
                new_is_dir = d_is_dir(new_dentry);

                /*
                 * A plain rename checks the victim against the source's
                 * type (is_dir); an exchange checks it against its own
                 * type (new_is_dir).
                 */
                if (!(flags & RENAME_EXCHANGE))
                        error = may_delete(new_dir, new_dentry, is_dir);
                else
                        error = may_delete(new_dir, new_dentry, new_is_dir);
        }
        if (error)
                return error;

        /* The filesystem provides no ->rename method. */
        if (!old_dir->i_op->rename)
                return -EPERM;

        /*
         * If we are going to change the parent - check write permissions,
         * we'll need to flip '..'.
         */
        if (new_dir != old_dir) {
                if (is_dir) {
                        error = inode_permission(source, MAY_WRITE);
                        if (error)
                                return error;
                }
                if ((flags & RENAME_EXCHANGE) && new_is_dir) {
                        error = inode_permission(target, MAY_WRITE);
                        if (error)
                                return error;
                }
        }

        error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry,
                                      flags);
        if (error)
                return error;

        /*
         * Snapshot the old name before anything moves; it is what the
         * fsnotify_move() call at the end reports.  The extra reference on
         * new_dentry is dropped again after the "out" label.
         */
        take_dentry_name_snapshot(&old_name, old_dentry);
        dget(new_dentry);
        /*
         * Non-directory source, or any exchange: lock source and target
         * together.  Otherwise (directory source, no exchange) only the
         * target, if there is one, is locked here.
         */
        if (!is_dir || (flags & RENAME_EXCHANGE))
                lock_two_nondirectories(source, target);
        else if (target)
                inode_lock(target);

        /* Neither end of the rename may be a local mountpoint. */
        error = -EBUSY;
        if (is_local_mountpoint(old_dentry) || is_local_mountpoint(new_dentry))
                goto out;

        /*
         * Link-count limit, only when the filesystem sets s_max_links and
         * the parent changes: refuse if a directory would arrive in new_dir
         * without replacing a directory while new_dir is already at the
         * limit, and likewise for old_dir when an exchange brings a
         * directory there in place of a non-directory.
         */
        if (max_links && new_dir != old_dir) {
                error = -EMLINK;
                if (is_dir && !new_is_dir && new_dir->i_nlink >= max_links)
                        goto out;
                if ((flags & RENAME_EXCHANGE) && !is_dir && new_is_dir &&
                    old_dir->i_nlink >= max_links)
                        goto out;
        }
        /* Delegation checks apply to the non-directory source ... */
        if (!is_dir) {
                error = try_break_deleg(source, delegated_inode);
                if (error)
                        goto out;
        }
        /* ... and to an existing non-directory target. */
        if (target && !new_is_dir) {
                error = try_break_deleg(target, delegated_inode);
                if (error)
                        goto out;
        }
        error = old_dir->i_op->rename(old_dir, old_dentry,
                                       new_dir, new_dentry, flags);
        if (error)
                goto out;

        /*
         * A plain rename over an existing target: for a directory, prune
         * its dcache subtree and mark the inode S_DEAD; in every case mark
         * the victim dentry unmountable and detach mounts from it.
         */
        if (!(flags & RENAME_EXCHANGE) && target) {
                if (is_dir) {
                        shrink_dcache_parent(new_dentry);
                        target->i_flags |= S_DEAD;
                }
                dont_mount(new_dentry);
                detach_mounts(new_dentry);
        }
        /*
         * Update the dcache here unless the filesystem type declares
         * FS_RENAME_DOES_D_MOVE.
         */
        if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) {
                if (!(flags & RENAME_EXCHANGE))
                        d_move(old_dentry, new_dentry);
                else
                        d_exchange(old_dentry, new_dentry);
        }
out:
        /* Unlock using the same condition that selected the lock above. */
        if (!is_dir || (flags & RENAME_EXCHANGE))
                unlock_two_nondirectories(source, target);
        else if (target)
                inode_unlock(target);
        dput(new_dentry);
        if (!error) {
                /*
                 * Report the move under the snapshotted old name; the
                 * replaced target is passed along only for a plain rename.
                 */
                fsnotify_move(old_dir, new_dir, &old_name.name, is_dir,
                              !(flags & RENAME_EXCHANGE) ? target : NULL, old_dentry);
                /* An exchange also reports the move in the other direction. */
                if (flags & RENAME_EXCHANGE) {
                        fsnotify_move(new_dir, old_dir, &old_dentry->d_name,
                                      new_is_dir, NULL, new_dentry);
                }
        }
        release_dentry_name_snapshot(&old_name);

        return error;
}
EXPORT_SYMBOL(vfs_rename);
4515
/*
 * do_renameat2 - common implementation behind the rename(), renameat() and
 * renameat2() system calls defined below.
 *
 * Resolves the parent directories of both names, takes the rename locks,
 * looks up the last components, applies the flag-dependent sanity checks and
 * finally calls vfs_rename().  Two retry loops exist: "retry_deleg" redoes
 * only the locked section after a successful break_deleg_wait(), and "retry"
 * redoes everything with LOOKUP_REVAL when retry_estale() asks for it.
 *
 * Throughout the function "error" is preset before a test so that the
 * following goto carries the intended code to the cleanup labels.
 *
 * Returns 0 on success or a negative errno.
 */
static int do_renameat2(int olddfd, const char __user *oldname, int newdfd,
                        const char __user *newname, unsigned int flags)
{
        struct dentry *old_dentry, *new_dentry;
        struct dentry *trap;
        struct path old_path, new_path;
        struct qstr old_last, new_last;
        int old_type, new_type;
        struct inode *delegated_inode = NULL;
        struct filename *from;
        struct filename *to;
        unsigned int lookup_flags = 0, target_flags = LOOKUP_RENAME_TARGET;
        bool should_retry = false;
        int error;

        /* Reject any flag outside the three supported ones. */
        if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
                return -EINVAL;

        /* RENAME_EXCHANGE cannot be combined with either other flag. */
        if ((flags & (RENAME_NOREPLACE | RENAME_WHITEOUT)) &&
            (flags & RENAME_EXCHANGE))
                return -EINVAL;

        /* RENAME_WHITEOUT additionally requires CAP_MKNOD. */
        if ((flags & RENAME_WHITEOUT) && !capable(CAP_MKNOD))
                return -EPERM;

        /* An exchange looks up the new name without LOOKUP_RENAME_TARGET. */
        if (flags & RENAME_EXCHANGE)
                target_flags = 0;

retry:
        /* Resolve each name to its parent path, last component and type. */
        from = filename_parentat(olddfd, getname(oldname), lookup_flags,
                                &old_path, &old_last, &old_type);
        if (IS_ERR(from)) {
                error = PTR_ERR(from);
                goto exit;
        }

        to = filename_parentat(newdfd, getname(newname), lookup_flags,
                                &new_path, &new_last, &new_type);
        if (IS_ERR(to)) {
                error = PTR_ERR(to);
                goto exit1;
        }

        /* Both parents must be on the same mount. */
        error = -EXDEV;
        if (old_path.mnt != new_path.mnt)
                goto exit2;

        /* The old name must end in a normal component. */
        error = -EBUSY;
        if (old_type != LAST_NORM)
                goto exit2;

        /*
         * Same requirement for the new name; the error is -EEXIST instead
         * of -EBUSY when RENAME_NOREPLACE was given.
         */
        if (flags & RENAME_NOREPLACE)
                error = -EEXIST;
        if (new_type != LAST_NORM)
                goto exit2;

        error = mnt_want_write(old_path.mnt);
        if (error)
                goto exit2;

retry_deleg:
        /*
         * Lock both parents.  The returned "trap" dentry is compared
         * against source and target further down.
         */
        trap = lock_rename(new_path.dentry, old_path.dentry);

        old_dentry = __lookup_hash(&old_last, old_path.dentry, lookup_flags);
        error = PTR_ERR(old_dentry);
        if (IS_ERR(old_dentry))
                goto exit3;
        /* source must exist */
        error = -ENOENT;
        if (d_is_negative(old_dentry))
                goto exit4;
        new_dentry = __lookup_hash(&new_last, new_path.dentry, lookup_flags | target_flags);
        error = PTR_ERR(new_dentry);
        if (IS_ERR(new_dentry))
                goto exit4;
        /* RENAME_NOREPLACE: an existing target is an error. */
        error = -EEXIST;
        if ((flags & RENAME_NOREPLACE) && d_is_positive(new_dentry))
                goto exit5;
        if (flags & RENAME_EXCHANGE) {
                /* An exchange needs an existing target ... */
                error = -ENOENT;
                if (d_is_negative(new_dentry))
                        goto exit5;

                /*
                 * ... and a non-directory target must not have anything
                 * following its last component in the supplied name.
                 */
                if (!d_is_dir(new_dentry)) {
                        error = -ENOTDIR;
                        if (new_last.name[new_last.len])
                                goto exit5;
                }
        }
        /* unless the source is a directory trailing slashes give -ENOTDIR */
        if (!d_is_dir(old_dentry)) {
                error = -ENOTDIR;
                if (old_last.name[old_last.len])
                        goto exit5;
                if (!(flags & RENAME_EXCHANGE) && new_last.name[new_last.len])
                        goto exit5;
        }
        /* source should not be ancestor of target */
        error = -EINVAL;
        if (old_dentry == trap)
                goto exit5;
        /*
         * target should not be an ancestor of source; this fails with
         * -ENOTEMPTY for a plain rename and keeps -EINVAL for an exchange
         */
        if (!(flags & RENAME_EXCHANGE))
                error = -ENOTEMPTY;
        if (new_dentry == trap)
                goto exit5;

        error = security_path_rename(&old_path, old_dentry,
                                     &new_path, new_dentry, flags);
        if (error)
                goto exit5;
        error = vfs_rename(old_path.dentry->d_inode, old_dentry,
                           new_path.dentry->d_inode, new_dentry,
                           &delegated_inode, flags);
exit5:
        dput(new_dentry);
exit4:
        dput(old_dentry);
exit3:
        unlock_rename(new_path.dentry, old_path.dentry);
        /*
         * vfs_rename() handed back an inode in delegated_inode: wait in
         * break_deleg_wait() now that the rename locks are dropped, and
         * redo the locked section if the wait succeeded.
         */
        if (delegated_inode) {
                error = break_deleg_wait(&delegated_inode);
                if (!error)
                        goto retry_deleg;
        }
        mnt_drop_write(old_path.mnt);
exit2:
        /*
         * Only record the need for a retry here; the jump happens after
         * both paths and both names have been released below.
         */
        if (retry_estale(error, lookup_flags))
                should_retry = true;
        path_put(&new_path);
        putname(to);
exit1:
        path_put(&old_path);
        putname(from);
        if (should_retry) {
                should_retry = false;
                lookup_flags |= LOOKUP_REVAL;
                goto retry;
        }
exit:
        return error;
}
4658
/* renameat2(2) entry point: forwards all five arguments to do_renameat2(). */
SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname,
                int, newdfd, const char __user *, newname, unsigned int, flags)
{
        return do_renameat2(olddfd, oldname, newdfd, newname, flags);
}
4664
/* renameat(2) entry point: do_renameat2() with no rename flags set. */
SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,
                int, newdfd, const char __user *, newname)
{
        return do_renameat2(olddfd, oldname, newdfd, newname, 0);
}
4670
/*
 * rename(2) entry point: do_renameat2() with both names taken relative to
 * AT_FDCWD and no rename flags set.
 */
SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newname)
{
        return do_renameat2(AT_FDCWD, oldname, AT_FDCWD, newname, 0);
}
4675
4676int vfs_whiteout(struct inode *dir, struct dentry *dentry)
4677{
4678        int error = may_create(dir, dentry);
4679        if (error)
4680                return error;
4681
4682        if (!dir->i_op->mknod)
4683                return -EPERM;
4684
4685        return dir->i_op->mknod(dir, dentry,
4686                                S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV);
4687}
4688EXPORT_SYMBOL(vfs_whiteout);
4689
4690int readlink_copy(char __user *buffer, int buflen, const char *link)
4691{
4692        int len = PTR_ERR(link);
4693        if (IS_ERR(link))
4694                goto out;
4695
4696        len = strlen(link);
4697        if (len > (unsigned) buflen)
4698                len = buflen;
4699        if (copy_to_user(buffer, link, len))
4700                len = -EFAULT;
4701out:
4702        return len;
4703}
4704
4705/**
4706 * vfs_readlink - copy symlink body into userspace buffer
4707 * @dentry: dentry on which to get symbolic link
4708 * @buffer: user memory pointer
4709 * @buflen: size of buffer
4710 *
4711 * Does not touch atime.  That's up to the caller if necessary
4712 *
4713 * Does not call security hook.
4714 */
4715int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen)
4716{
4717        struct inode *inode = d_inode(dentry);
4718        DEFINE_DELAYED_CALL(done);
4719        const char *link;
4720        int res;
4721
4722        if (unlikely(!(inode->i_opflags & IOP_DEFAULT_READLINK))) {
4723                if (unlikely(inode->i_op->readlink))
4724                        return inode->i_op->readlink(dentry, buffer, buflen);
4725
4726                if (!d_is_symlink(dentry))
4727                        return -EINVAL;
4728
4729                spin_lock(&inode->i_lock);
4730                inode->i_opflags |= IOP_DEFAULT_READLINK;
4731                spin_unlock(&inode->i_lock);
4732        }
4733
4734        link = READ_ONCE(inode->i_link);
4735        if (!link) {
4736                link = inode->i_op->get_link(dentry, inode, &done);
4737                if (IS_ERR(link))
4738                        return PTR_ERR(link);
4739        }
4740        res = readlink_copy(buffer, buflen, link);
4741        do_delayed_call(&done);
4742        return res;
4743}
4744EXPORT_SYMBOL(vfs_readlink);
4745
4746/**
4747 * vfs_get_link - get symlink body
4748 * @dentry: dentry on which to get symbolic link
4749 * @done: caller needs to free returned data with this
4750 *
4751 * Calls security hook and i_op->get_link() on the supplied inode.
4752 *
4753 * It does not touch atime.  That's up to the caller if necessary.
4754 *
4755 * Does not work on "special" symlinks like /proc/$$/fd/N
4756 */
4757const char *vfs_get_link(struct dentry *dentry, struct delayed_call *done)
4758{
4759        const char *res = ERR_PTR(-EINVAL);
4760        struct inode *inode = d_inode(dentry);
4761
4762        if (d_is_symlink(dentry)) {
4763                res = ERR_PTR(security_inode_readlink(dentry));
4764                if (!res)
4765                        res = inode->i_op->get_link(dentry, inode, done);
4766        }
4767        return res;
4768}
4769EXPORT_SYMBOL(vfs_get_link);
4770
4771/* get the link contents into pagecache */
4772const char *page_get_link(struct dentry *dentry, struct inode *inode,
4773                          struct delayed_call *callback)
4774{
4775        char *kaddr;
4776        struct page *page;
4777        struct address_space *mapping = inode->i_mapping;
4778
4779        if (!dentry) {
4780                page = find_get_page(mapping, 0);
4781                if (!page)
4782                        return ERR_PTR(-ECHILD);
4783                if (!PageUptodate(page)) {
4784                        put_page(page);
4785                        return ERR_PTR(-ECHILD);
4786                }
4787        } else {
4788                page = read_mapping_page(mapping, 0, NULL);
4789                if (IS_ERR(page))
4790                        return (char*)page;
4791        }
4792        set_delayed_call(callback, page_put_link, page);
4793        BUG_ON(mapping_gfp_mask(mapping) & __GFP_HIGHMEM);
4794        kaddr = page_address(page);
4795        nd_terminate_link(kaddr, inode->i_size, PAGE_SIZE - 1);
4796        return kaddr;
4797}
4798
4799EXPORT_SYMBOL(page_get_link);
4800
/*
 * Delayed-call callback registered by page_get_link(): calls put_page() on
 * @arg, the page whose reference page_get_link() obtained.
 */
void page_put_link(void *arg)
{
        put_page(arg);
}
EXPORT_SYMBOL(page_put_link);
4806
4807int page_readlink(struct dentry *dentry, char __user *buffer, int buflen)
4808{
4809        DEFINE_DELAYED_CALL(done);
4810        int res = readlink_copy(buffer, buflen,
4811                                page_get_link(dentry, d_inode(dentry),
4812                                              &done));
4813        do_delayed_call(&done);
4814        return res;
4815}
4816EXPORT_SYMBOL(page_readlink);
4817
4818/*
4819 * The nofs argument instructs pagecache_write_begin to pass AOP_FLAG_NOFS
4820 */
4821int __page_symlink(struct inode *inode, const char *symname, int len, int nofs)
4822{
4823        struct address_space *mapping = inode->i_mapping;
4824        struct page *page;
4825        void *fsdata;
4826        int err;
4827        unsigned int flags = 0;
4828        if (nofs)
4829                flags |= AOP_FLAG_NOFS;
4830
4831retry:
4832        err = pagecache_write_begin(NULL, mapping, 0, len-1,
4833                                flags, &page, &fsdata);
4834        if (err)
4835                goto fail;
4836
4837        memcpy(page_address(page), symname, len-1);
4838
4839        err = pagecache_write_end(NULL, mapping, 0, len-1, len-1,
4840                                                        page, fsdata);
4841        if (err < 0)
4842                goto fail;
4843        if (err < len-1)
4844                goto retry;
4845
4846        mark_inode_dirty(inode);
4847        return 0;
4848fail:
4849        return err;
4850}
4851EXPORT_SYMBOL(__page_symlink);
4852
4853int page_symlink(struct inode *inode, const char *symname, int len)
4854{
4855        return __page_symlink(inode, symname, len,
4856                        !mapping_gfp_constraint(inode->i_mapping, __GFP_FS));
4857}
4858EXPORT_SYMBOL(page_symlink);
4859
/*
 * Exported inode operations table that sets only ->get_link, to
 * page_get_link(); every other method is left unset.
 */
const struct inode_operations page_symlink_inode_operations = {
        .get_link       = page_get_link,
};
EXPORT_SYMBOL(page_symlink_inode_operations);
4864