linux/fs/namei.c
<<
>>
Prefs
   1/*
   2 *  linux/fs/namei.c
   3 *
   4 *  Copyright (C) 1991, 1992  Linus Torvalds
   5 */
   6
   7/*
   8 * Some corrections by tytso.
   9 */
  10
  11/* [Feb 1997 T. Schoebel-Theuer] Complete rewrite of the pathname
  12 * lookup logic.
  13 */
  14/* [Feb-Apr 2000, AV] Rewrite to the new namespace architecture.
  15 */
  16
  17#include <linux/init.h>
  18#include <linux/export.h>
  19#include <linux/kernel.h>
  20#include <linux/slab.h>
  21#include <linux/fs.h>
  22#include <linux/namei.h>
  23#include <linux/pagemap.h>
  24#include <linux/fsnotify.h>
  25#include <linux/personality.h>
  26#include <linux/security.h>
  27#include <linux/ima.h>
  28#include <linux/syscalls.h>
  29#include <linux/mount.h>
  30#include <linux/audit.h>
  31#include <linux/capability.h>
  32#include <linux/file.h>
  33#include <linux/fcntl.h>
  34#include <linux/device_cgroup.h>
  35#include <linux/fs_struct.h>
  36#include <linux/posix_acl.h>
  37#include <asm/uaccess.h>
  38
  39#include "internal.h"
  40#include "mount.h"
  41
  42/* [Feb-1997 T. Schoebel-Theuer]
  43 * Fundamental changes in the pathname lookup mechanisms (namei)
  44 * were necessary because of omirr.  The reason is that omirr needs
  45 * to know the _real_ pathname, not the user-supplied one, in case
  46 * of symlinks (and also when transname replacements occur).
  47 *
  48 * The new code replaces the old recursive symlink resolution with
  49 * an iterative one (in case of non-nested symlink chains).  It does
  50 * this with calls to <fs>_follow_link().
  51 * As a side effect, dir_namei(), _namei() and follow_link() are now 
  52 * replaced with a single function lookup_dentry() that can handle all 
  53 * the special cases of the former code.
  54 *
  55 * With the new dcache, the pathname is stored at each inode, at least as
  56 * long as the refcount of the inode is positive.  As a side effect, the
  57 * size of the dcache depends on the inode cache and thus is dynamic.
  58 *
  59 * [29-Apr-1998 C. Scott Ananian] Updated above description of symlink
  60 * resolution to correspond with current state of the code.
  61 *
  62 * Note that the symlink resolution is not *completely* iterative.
  63 * There is still a significant amount of tail- and mid- recursion in
  64 * the algorithm.  Also, note that <fs>_readlink() is not used in
  65 * lookup_dentry(): lookup_dentry() on the result of <fs>_readlink()
  66 * may return different results than <fs>_follow_link().  Many virtual
  67 * filesystems (including /proc) exhibit this behavior.
  68 */
  69
  70/* [24-Feb-97 T. Schoebel-Theuer] Side effects caused by new implementation:
  71 * New symlink semantics: when open() is called with flags O_CREAT | O_EXCL
  72 * and the name already exists in form of a symlink, try to create the new
  73 * name indicated by the symlink. The old code always complained that the
  74 * name already exists, due to not following the symlink even if its target
  75 * is nonexistent.  The new semantics affects also mknod() and link() when
  76 * the name is a symlink pointing to a non-existent name.
  77 *
  78 * I don't know which semantics is the right one, since I have no access
  79 * to standards. But I found by trial that HP-UX 9.0 has the full "new"
  80 * semantics implemented, while SunOS 4.1.1 and Solaris (SunOS 5.4) have the
  81 * "old" one. Personally, I think the new semantics is much more logical.
  82 * Note that "ln old new" where "new" is a symlink pointing to a non-existing
  83 * file does succeed in both HP-UX and SunOs, but not in Solaris
  84 * and in the old Linux semantics.
  85 */
  86
  87/* [16-Dec-97 Kevin Buhr] For security reasons, we change some symlink
  88 * semantics.  See the comments in "open_namei" and "do_link" below.
  89 *
  90 * [10-Sep-98 Alan Modra] Another symlink change.
  91 */
  92
  93/* [Feb-Apr 2000 AV] Complete rewrite. Rules for symlinks:
  94 *      inside the path - always follow.
  95 *      in the last component in creation/removal/renaming - never follow.
  96 *      if LOOKUP_FOLLOW passed - follow.
  97 *      if the pathname has trailing slashes - follow.
  98 *      otherwise - don't follow.
  99 * (applied in that order).
 100 *
 101 * [Jun 2000 AV] Inconsistent behaviour of open() in case if flags==O_CREAT
 102 * restored for 2.4. This is the last surviving part of old 4.2BSD bug.
 103 * During the 2.4 we need to fix the userland stuff depending on it -
 104 * hopefully we will be able to get rid of that wart in 2.5. So far only
 105 * XEmacs seems to be relying on it...
 106 */
 107/*
 108 * [Sep 2001 AV] Single-semaphore locking scheme (kudos to David Holland)
 109 * implemented.  Let's see if raised priority of ->s_vfs_rename_mutex gives
 110 * any extra contention...
 111 */
 112
 113/* In order to reduce some races, while at the same time doing additional
 114 * checking and hopefully speeding things up, we copy filenames to the
  115 * kernel data space before using them.
 116 *
 117 * POSIX.1 2.4: an empty pathname is invalid (ENOENT).
 118 * PATH_MAX includes the nul terminator --RR.
 119 */
 120void final_putname(struct filename *name)
 121{
 122        if (name->separate) {
 123                __putname(name->name);
 124                kfree(name);
 125        } else {
 126                __putname(name);
 127        }
 128}
 129
/*
 * Bytes left for the pathname when struct filename is embedded at the
 * start of the names_cache buffer.  (The retry below copies up to PATH_MAX
 * bytes into the same buffer, so the buffer is treated as PATH_MAX bytes.)
 */
#define EMBEDDED_NAME_MAX       (PATH_MAX - sizeof(struct filename))

/*
 * getname_flags - copy a pathname from user space into a struct filename
 * @filename: user-space pointer to the pathname
 * @flags:    only LOOKUP_EMPTY is examined here; when set, an empty
 *            pathname is accepted instead of failing with -ENOENT
 * @empty:    if non-NULL, set to 1 when the pathname is empty (it is never
 *            cleared here, so the caller must initialise it)
 *
 * Returns the struct filename on success, or an ERR_PTR():
 *   -ENOMEM        an allocation failed
 *   (copy error)   whatever negative value strncpy_from_user() returned
 *   -ENOENT        empty pathname without LOOKUP_EMPTY
 *   -ENAMETOOLONG  the copied length reached PATH_MAX
 *
 * If audit_reusename() hands back a struct filename it is returned as is.
 */
static struct filename *
getname_flags(const char __user *filename, int flags, int *empty)
{
        struct filename *result, *err;
        int len;
        long max;
        char *kname;

        result = audit_reusename(filename);
        if (result)
                return result;

        result = __getname();
        if (unlikely(!result))
                return ERR_PTR(-ENOMEM);

        /*
         * First, try to embed the struct filename inside the names_cache
         * allocation
         */
        kname = (char *)result + sizeof(*result);
        result->name = kname;
        result->separate = false;
        max = EMBEDDED_NAME_MAX;

recopy:
        len = strncpy_from_user(kname, filename, max);
        if (unlikely(len < 0)) {
                err = ERR_PTR(len);
                goto error;
        }

        /*
         * Uh-oh. We have a name that's approaching PATH_MAX. Allocate a
         * separate struct filename so we can dedicate the entire
         * names_cache allocation for the pathname, and re-do the copy from
         * userland.
         *
         * The "max == EMBEDDED_NAME_MAX" test makes sure this is attempted
         * only once: after the retry max is PATH_MAX.
         */
        if (len == EMBEDDED_NAME_MAX && max == EMBEDDED_NAME_MAX) {
                kname = (char *)result;

                result = kzalloc(sizeof(*result), GFP_KERNEL);
                if (!result) {
                        err = ERR_PTR(-ENOMEM);
                        /*
                         * Point result back at the names_cache buffer; its
                         * embedded header still has ->separate == false, so
                         * final_putname() frees just that buffer.
                         */
                        result = (struct filename *)kname;
                        goto error;
                }
                result->name = kname;
                result->separate = true;
                max = PATH_MAX;
                goto recopy;
        }

        /* The empty path is special. */
        if (unlikely(!len)) {
                if (empty)
                        *empty = 1;
                err = ERR_PTR(-ENOENT);
                if (!(flags & LOOKUP_EMPTY))
                        goto error;
        }

        /* Only reachable with len == PATH_MAX, i.e. after the retry. */
        err = ERR_PTR(-ENAMETOOLONG);
        if (unlikely(len >= PATH_MAX))
                goto error;

        /* Remember the user pointer the name was copied from. */
        result->uptr = filename;
        audit_getname(result);
        return result;

error:
        /* Frees either layout (embedded or separate) correctly. */
        final_putname(result);
        return err;
}
 206
/*
 * getname - copy a user-space pathname into a struct filename
 * @filename: user-space pointer to the pathname
 *
 * Wrapper around getname_flags() with no flags and no "empty" out
 * parameter, so an empty pathname fails with ERR_PTR(-ENOENT).
 */
struct filename *
getname(const char __user * filename)
{
        return getname_flags(filename, 0, NULL);
}
EXPORT_SYMBOL(getname);
 213
 214#ifdef CONFIG_AUDITSYSCALL
 215void putname(struct filename *name)
 216{
 217        if (unlikely(!audit_dummy_context()))
 218                return audit_putname(name);
 219        final_putname(name);
 220}
 221#endif
 222
 223static int check_acl(struct inode *inode, int mask)
 224{
 225#ifdef CONFIG_FS_POSIX_ACL
 226        struct posix_acl *acl;
 227
 228        if (mask & MAY_NOT_BLOCK) {
 229                acl = get_cached_acl_rcu(inode, ACL_TYPE_ACCESS);
 230                if (!acl)
 231                        return -EAGAIN;
 232                /* no ->get_acl() calls in RCU mode... */
 233                if (acl == ACL_NOT_CACHED)
 234                        return -ECHILD;
 235                return posix_acl_permission(inode, acl, mask & ~MAY_NOT_BLOCK);
 236        }
 237
 238        acl = get_cached_acl(inode, ACL_TYPE_ACCESS);
 239
 240        /*
 241         * A filesystem can force a ACL callback by just never filling the
 242         * ACL cache. But normally you'd fill the cache either at inode
 243         * instantiation time, or on the first ->get_acl call.
 244         *
 245         * If the filesystem doesn't have a get_acl() function at all, we'll
 246         * just create the negative cache entry.
 247         */
 248        if (acl == ACL_NOT_CACHED) {
 249                if (inode->i_op->get_acl) {
 250                        acl = inode->i_op->get_acl(inode, ACL_TYPE_ACCESS);
 251                        if (IS_ERR(acl))
 252                                return PTR_ERR(acl);
 253                } else {
 254                        set_cached_acl(inode, ACL_TYPE_ACCESS, NULL);
 255                        return -EAGAIN;
 256                }
 257        }
 258
 259        if (acl) {
 260                int error = posix_acl_permission(inode, acl, mask);
 261                posix_acl_release(acl);
 262                return error;
 263        }
 264#endif
 265
 266        return -EAGAIN;
 267}
 268
 269/*
 270 * This does the basic permission checking
 271 */
 272static int acl_permission_check(struct inode *inode, int mask)
 273{
 274        unsigned int mode = inode->i_mode;
 275
 276        if (likely(uid_eq(current_fsuid(), inode->i_uid)))
 277                mode >>= 6;
 278        else {
 279                if (IS_POSIXACL(inode) && (mode & S_IRWXG)) {
 280                        int error = check_acl(inode, mask);
 281                        if (error != -EAGAIN)
 282                                return error;
 283                }
 284
 285                if (in_group_p(inode->i_gid))
 286                        mode >>= 3;
 287        }
 288
 289        /*
 290         * If the DACs are ok we don't need any capability check.
 291         */
 292        if ((mask & ~mode & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0)
 293                return 0;
 294        return -EACCES;
 295}
 296
 297/**
 298 * generic_permission -  check for access rights on a Posix-like filesystem
 299 * @inode:      inode to check access rights for
 300 * @mask:       right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC, ...)
 301 *
 302 * Used to check for read/write/execute permissions on a file.
 303 * We use "fsuid" for this, letting us set arbitrary permissions
 304 * for filesystem access without changing the "normal" uids which
 305 * are used for other things.
 306 *
 307 * generic_permission is rcu-walk aware. It returns -ECHILD in case an rcu-walk
 308 * request cannot be satisfied (eg. requires blocking or too much complexity).
 309 * It would then be called again in ref-walk mode.
 310 */
 311int generic_permission(struct inode *inode, int mask)
 312{
 313        int ret;
 314
 315        /*
 316         * Do the basic permission checks.
 317         */
 318        ret = acl_permission_check(inode, mask);
 319        if (ret != -EACCES)
 320                return ret;
 321
 322        if (S_ISDIR(inode->i_mode)) {
 323                /* DACs are overridable for directories */
 324                if (inode_capable(inode, CAP_DAC_OVERRIDE))
 325                        return 0;
 326                if (!(mask & MAY_WRITE))
 327                        if (inode_capable(inode, CAP_DAC_READ_SEARCH))
 328                                return 0;
 329                return -EACCES;
 330        }
 331        /*
 332         * Read/write DACs are always overridable.
 333         * Executable DACs are overridable when there is
 334         * at least one exec bit set.
 335         */
 336        if (!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO))
 337                if (inode_capable(inode, CAP_DAC_OVERRIDE))
 338                        return 0;
 339
 340        /*
 341         * Searching includes executable on directories, else just read.
 342         */
 343        mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
 344        if (mask == MAY_READ)
 345                if (inode_capable(inode, CAP_DAC_READ_SEARCH))
 346                        return 0;
 347
 348        return -EACCES;
 349}
 350
 351/*
 352 * We _really_ want to just do "generic_permission()" without
 353 * even looking at the inode->i_op values. So we keep a cache
 354 * flag in inode->i_opflags, that says "this has not special
 355 * permission function, use the fast case".
 356 */
 357static inline int do_inode_permission(struct inode *inode, int mask)
 358{
 359        if (unlikely(!(inode->i_opflags & IOP_FASTPERM))) {
 360                if (likely(inode->i_op->permission))
 361                        return inode->i_op->permission(inode, mask);
 362
 363                /* This gets set once for the inode lifetime */
 364                spin_lock(&inode->i_lock);
 365                inode->i_opflags |= IOP_FASTPERM;
 366                spin_unlock(&inode->i_lock);
 367        }
 368        return generic_permission(inode, mask);
 369}
 370
 371/**
 372 * __inode_permission - Check for access rights to a given inode
 373 * @inode: Inode to check permission on
 374 * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
 375 *
 376 * Check for read/write/execute permissions on an inode.
 377 *
 378 * When checking for MAY_APPEND, MAY_WRITE must also be set in @mask.
 379 *
 380 * This does not check for a read-only file system.  You probably want
 381 * inode_permission().
 382 */
 383int __inode_permission(struct inode *inode, int mask)
 384{
 385        int retval;
 386
 387        if (unlikely(mask & MAY_WRITE)) {
 388                /*
 389                 * Nobody gets write access to an immutable file.
 390                 */
 391                if (IS_IMMUTABLE(inode))
 392                        return -EACCES;
 393        }
 394
 395        retval = do_inode_permission(inode, mask);
 396        if (retval)
 397                return retval;
 398
 399        retval = devcgroup_inode_permission(inode, mask);
 400        if (retval)
 401                return retval;
 402
 403        return security_inode_permission(inode, mask);
 404}
 405
 406/**
 407 * sb_permission - Check superblock-level permissions
 408 * @sb: Superblock of inode to check permission on
 409 * @inode: Inode to check permission on
 410 * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
 411 *
 412 * Separate out file-system wide checks from inode-specific permission checks.
 413 */
 414static int sb_permission(struct super_block *sb, struct inode *inode, int mask)
 415{
 416        if (unlikely(mask & MAY_WRITE)) {
 417                umode_t mode = inode->i_mode;
 418
 419                /* Nobody gets write access to a read-only fs. */
 420                if ((sb->s_flags & MS_RDONLY) &&
 421                    (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
 422                        return -EROFS;
 423        }
 424        return 0;
 425}
 426
 427/**
 428 * inode_permission - Check for access rights to a given inode
 429 * @inode: Inode to check permission on
 430 * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
 431 *
 432 * Check for read/write/execute permissions on an inode.  We use fs[ug]id for
 433 * this, letting us set arbitrary permissions for filesystem access without
 434 * changing the "normal" UIDs which are used for other things.
 435 *
 436 * When checking for MAY_APPEND, MAY_WRITE must also be set in @mask.
 437 */
 438int inode_permission(struct inode *inode, int mask)
 439{
 440        int retval;
 441
 442        retval = sb_permission(inode->i_sb, inode, mask);
 443        if (retval)
 444                return retval;
 445        return __inode_permission(inode, mask);
 446}
 447
/**
 * path_get - get a reference to a path
 * @path: path to get the reference to
 *
 * Given a path, increment the reference count of both the vfsmount and
 * the dentry it refers to.  Balanced by path_put().
 */
void path_get(const struct path *path)
{
        mntget(path->mnt);
        dget(path->dentry);
}
EXPORT_SYMBOL(path_get);
 460
/**
 * path_put - put a reference to a path
 * @path: path to put the reference to
 *
 * Given a path, decrement the reference count of the dentry and then of
 * the vfsmount, i.e. in the reverse order of path_get().
 */
void path_put(const struct path *path)
{
        dput(path->dentry);
        mntput(path->mnt);
}
EXPORT_SYMBOL(path_put);
 473
 474/*
 475 * Path walking has 2 modes, rcu-walk and ref-walk (see
 476 * Documentation/filesystems/path-lookup.txt).  In situations when we can't
 477 * continue in RCU mode, we attempt to drop out of rcu-walk mode and grab
  478 * normal reference counts on dentries and vfsmounts to transition to ref-walk
 479 * mode.  Refcounts are grabbed at the last known good point before rcu-walk
 480 * got stuck, so ref-walk may continue from there. If this is not successful
 481 * (eg. a seqcount has changed), then failure is returned and it's up to caller
 482 * to restart the path walk from the beginning in ref-walk mode.
 483 */
 484
/*
 * Enter rcu-walk: take the read side of vfsmount_lock, then enter an RCU
 * read-side critical section.  Undone by unlock_rcu_walk().
 */
static inline void lock_rcu_walk(void)
{
        br_read_lock(&vfsmount_lock);
        rcu_read_lock();
}
 490
/*
 * Leave rcu-walk: release what lock_rcu_walk() took, in reverse order
 * (RCU read-side section first, then the read side of vfsmount_lock).
 */
static inline void unlock_rcu_walk(void)
{
        rcu_read_unlock();
        br_read_unlock(&vfsmount_lock);
}
 496
/**
 * unlazy_walk - try to switch to ref-walk mode.
 * @nd: nameidata pathwalk data
 * @dentry: child of nd->path.dentry or NULL
 * Returns: 0 on success, -ECHILD on failure
 *
 * unlazy_walk attempts to legitimize the current nd->path, nd->root and dentry
 * for ref-walk mode.  @dentry must be a path found by a do_lookup call on
 * @nd or NULL.  Must be called from rcu-walk context.
 *
 * On the very first failure (parent reference could not be taken) LOOKUP_RCU
 * is still set on return; on every later failure LOOKUP_RCU has already been
 * cleared, the rcu-walk locks have been dropped and nd->path holds real
 * references.
 */
static int unlazy_walk(struct nameidata *nd, struct dentry *dentry)
{
        struct fs_struct *fs = current->fs;
        struct dentry *parent = nd->path.dentry;

        BUG_ON(!(nd->flags & LOOKUP_RCU));

        /*
         * Get a reference to the parent first: we're
         * going to make "path_put(nd->path)" valid in
         * non-RCU context for "terminate_walk()".
         *
         * If this doesn't work, return immediately with
         * RCU walking still active (and then we will do
         * the RCU walk cleanup in terminate_walk()).
         */
        if (!lockref_get_not_dead(&parent->d_lockref))
                return -ECHILD;

        /*
         * After the mntget(), terminate_walk() will do
         * the right thing for non-RCU mode, and all our
         * subsequent exit cases should unlock_rcu_walk()
         * before returning.
         */
        mntget(nd->path.mnt);
        nd->flags &= ~LOOKUP_RCU;

        /*
         * For a negative lookup, the lookup sequence point is the parents
         * sequence point, and it only needs to revalidate the parent dentry.
         *
         * For a positive lookup, we need to move both the parent and the
         * dentry from the RCU domain to be properly refcounted. And the
         * sequence number in the dentry validates *both* dentry counters,
         * since we checked the sequence number of the parent after we got
         * the child sequence number. So we know the parent must still
         * be valid if the child sequence number is still valid.
         */
        if (!dentry) {
                if (read_seqcount_retry(&parent->d_seq, nd->seq))
                        goto out;
                BUG_ON(nd->inode != parent->d_inode);
        } else {
                if (!lockref_get_not_dead(&dentry->d_lockref))
                        goto out;
                if (read_seqcount_retry(&dentry->d_seq, nd->seq))
                        goto drop_dentry;
        }

        /*
         * Sequence counts matched. Now make sure that the root is
         * still valid and get it if required.
         */
        if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
                spin_lock(&fs->lock);
                if (nd->root.mnt != fs->root.mnt || nd->root.dentry != fs->root.dentry)
                        goto unlock_and_drop_dentry;
                path_get(&nd->root);
                spin_unlock(&fs->lock);
        }

        unlock_rcu_walk();
        return 0;

        /* Root changed under us: fs->lock is still held here. */
unlock_and_drop_dentry:
        spin_unlock(&fs->lock);
        /* Give back the child reference taken above (dentry may be NULL). */
drop_dentry:
        unlock_rcu_walk();
        dput(dentry);
        goto drop_root_mnt;
        /* No child reference was taken; only the rcu-walk locks to drop. */
out:
        unlock_rcu_walk();
        /* nd->root was never pinned, so forget it unless the caller set it. */
drop_root_mnt:
        if (!(nd->flags & LOOKUP_ROOT))
                nd->root.mnt = NULL;
        return -ECHILD;
}
 585
/*
 * Call the dentry's ->d_revalidate() method and return its result.
 * d_op and the method pointer are dereferenced without any check, so this
 * must only be used on dentries that provide ->d_revalidate().
 * NOTE(review): callers presumably test a dentry flag first - confirm.
 */
static inline int d_revalidate(struct dentry *dentry, unsigned int flags)
{
        return dentry->d_op->d_revalidate(dentry, flags);
}
 590
/**
 * complete_walk - successful completion of path walk
 * @nd:  pointer nameidata
 *
 * If we had been in RCU mode, drop out of it and legitimize nd->path.
 * Revalidate the final result, unless we'd already done that during
 * the path walk or the filesystem doesn't ask for it.  Return 0 on
 * success, -error on failure.  In case of failure caller does not
 * need to drop nd->path.
 *
 * Failure values produced here: -ECHILD when leaving RCU mode fails,
 * -ESTALE when ->d_weak_revalidate() returns 0, or the negative value
 * returned by ->d_weak_revalidate().
 */
static int complete_walk(struct nameidata *nd)
{
        struct dentry *dentry = nd->path.dentry;
        int status;

        if (nd->flags & LOOKUP_RCU) {
                nd->flags &= ~LOOKUP_RCU;
                /* nd->root holds no reference in RCU mode; just forget it. */
                if (!(nd->flags & LOOKUP_ROOT))
                        nd->root.mnt = NULL;

                /* Pin the dentry; fails if it is already dead. */
                if (unlikely(!lockref_get_not_dead(&dentry->d_lockref))) {
                        unlock_rcu_walk();
                        return -ECHILD;
                }
                /* The dentry changed since it was sampled: give it back. */
                if (read_seqcount_retry(&dentry->d_seq, nd->seq)) {
                        unlock_rcu_walk();
                        dput(dentry);
                        return -ECHILD;
                }
                mntget(nd->path.mnt);
                unlock_rcu_walk();
        }

        /* Only a walk that jumped (LOOKUP_JUMPED) needs the final check. */
        if (likely(!(nd->flags & LOOKUP_JUMPED)))
                return 0;

        /* ...and only if the dentry is flagged for weak revalidation. */
        if (likely(!(dentry->d_flags & DCACHE_OP_WEAK_REVALIDATE)))
                return 0;

        status = dentry->d_op->d_weak_revalidate(dentry, nd->flags);
        if (status > 0)
                return 0;

        /* 0 means "not valid"; report that as a stale handle. */
        if (!status)
                status = -ESTALE;

        path_put(&nd->path);
        return status;
}
 640
/*
 * Ref-walk helper: if nd->root has not been set yet, fill it in from the
 * current task's fs_struct via get_fs_root().  Does nothing when
 * nd->root.mnt is already non-NULL.
 * NOTE(review): get_fs_root() presumably takes a reference on the root
 * path - confirm against its definition.
 */
static __always_inline void set_root(struct nameidata *nd)
{
        if (!nd->root.mnt)
                get_fs_root(current->fs, &nd->root);
}
 646
/* Forward declaration: follow_link() below calls this before its definition. */
static int link_path_walk(const char *, struct nameidata *);
 648
 649static __always_inline void set_root_rcu(struct nameidata *nd)
 650{
 651        if (!nd->root.mnt) {
 652                struct fs_struct *fs = current->fs;
 653                unsigned seq;
 654
 655                do {
 656                        seq = read_seqcount_begin(&fs->seq);
 657                        nd->root = fs->root;
 658                        nd->seq = __read_seqcount_begin(&nd->root.dentry->d_seq);
 659                } while (read_seqcount_retry(&fs->seq, seq));
 660        }
 661}
 662
 663static void path_put_conditional(struct path *path, struct nameidata *nd)
 664{
 665        dput(path->dentry);
 666        if (path->mnt != nd->path.mnt)
 667                mntput(path->mnt);
 668}
 669
 670static inline void path_to_nameidata(const struct path *path,
 671                                        struct nameidata *nd)
 672{
 673        if (!(nd->flags & LOOKUP_RCU)) {
 674                dput(nd->path.dentry);
 675                if (nd->path.mnt != path->mnt)
 676                        mntput(nd->path.mnt);
 677        }
 678        nd->path.mnt = path->mnt;
 679        nd->path.dentry = path->dentry;
 680}
 681
 682/*
 683 * Helper to directly jump to a known parsed path from ->follow_link,
 684 * caller must have taken a reference to path beforehand.
 685 */
 686void nd_jump_link(struct nameidata *nd, struct path *path)
 687{
 688        path_put(&nd->path);
 689
 690        nd->path = *path;
 691        nd->inode = nd->path.dentry->d_inode;
 692        nd->flags |= LOOKUP_JUMPED;
 693}
 694
 695static inline void put_link(struct nameidata *nd, struct path *link, void *cookie)
 696{
 697        struct inode *inode = link->dentry->d_inode;
 698        if (inode->i_op->put_link)
 699                inode->i_op->put_link(link->dentry, nd, cookie);
 700        path_put(link);
 701}
 702
/*
 * Link-protection switches, both off (0) by default.
 * sysctl_protected_symlinks gates the checks in may_follow_link(),
 * sysctl_protected_hardlinks gates the checks in may_linkat().
 */
int sysctl_protected_symlinks __read_mostly = 0;
int sysctl_protected_hardlinks __read_mostly = 0;
 705
 706/**
 707 * may_follow_link - Check symlink following for unsafe situations
 708 * @link: The path of the symlink
 709 * @nd: nameidata pathwalk data
 710 *
 711 * In the case of the sysctl_protected_symlinks sysctl being enabled,
 712 * CAP_DAC_OVERRIDE needs to be specifically ignored if the symlink is
 713 * in a sticky world-writable directory. This is to protect privileged
 714 * processes from failing races against path names that may change out
 715 * from under them by way of other users creating malicious symlinks.
 716 * It will permit symlinks to be followed only when outside a sticky
 717 * world-writable directory, or when the uid of the symlink and follower
 718 * match, or when the directory owner matches the symlink's owner.
 719 *
 720 * Returns 0 if following the symlink is allowed, -ve on error.
 721 */
 722static inline int may_follow_link(struct path *link, struct nameidata *nd)
 723{
 724        const struct inode *inode;
 725        const struct inode *parent;
 726
 727        if (!sysctl_protected_symlinks)
 728                return 0;
 729
 730        /* Allowed if owner and follower match. */
 731        inode = link->dentry->d_inode;
 732        if (uid_eq(current_cred()->fsuid, inode->i_uid))
 733                return 0;
 734
 735        /* Allowed if parent directory not sticky and world-writable. */
 736        parent = nd->path.dentry->d_inode;
 737        if ((parent->i_mode & (S_ISVTX|S_IWOTH)) != (S_ISVTX|S_IWOTH))
 738                return 0;
 739
 740        /* Allowed if parent directory and link owner match. */
 741        if (uid_eq(parent->i_uid, inode->i_uid))
 742                return 0;
 743
 744        audit_log_link_denied("follow_link", link);
 745        path_put_conditional(link, nd);
 746        path_put(&nd->path);
 747        return -EACCES;
 748}
 749
 750/**
 751 * safe_hardlink_source - Check for safe hardlink conditions
 752 * @inode: the source inode to hardlink from
 753 *
 754 * Return false if at least one of the following conditions:
 755 *    - inode is not a regular file
 756 *    - inode is setuid
 757 *    - inode is setgid and group-exec
 758 *    - access failure for read and write
 759 *
 760 * Otherwise returns true.
 761 */
 762static bool safe_hardlink_source(struct inode *inode)
 763{
 764        umode_t mode = inode->i_mode;
 765
 766        /* Special files should not get pinned to the filesystem. */
 767        if (!S_ISREG(mode))
 768                return false;
 769
 770        /* Setuid files should not get pinned to the filesystem. */
 771        if (mode & S_ISUID)
 772                return false;
 773
 774        /* Executable setgid files should not get pinned to the filesystem. */
 775        if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))
 776                return false;
 777
 778        /* Hardlinking to unreadable or unwritable sources is dangerous. */
 779        if (inode_permission(inode, MAY_READ | MAY_WRITE))
 780                return false;
 781
 782        return true;
 783}
 784
 785/**
 786 * may_linkat - Check permissions for creating a hardlink
 787 * @link: the source to hardlink from
 788 *
 789 * Block hardlink when all of:
 790 *  - sysctl_protected_hardlinks enabled
 791 *  - fsuid does not match inode
 792 *  - hardlink source is unsafe (see safe_hardlink_source() above)
 793 *  - not CAP_FOWNER
 794 *
 795 * Returns 0 if successful, -ve on error.
 796 */
 797static int may_linkat(struct path *link)
 798{
 799        const struct cred *cred;
 800        struct inode *inode;
 801
 802        if (!sysctl_protected_hardlinks)
 803                return 0;
 804
 805        cred = current_cred();
 806        inode = link->dentry->d_inode;
 807
 808        /* Source inode owner (or CAP_FOWNER) can hardlink all they like,
 809         * otherwise, it must be a safe source.
 810         */
 811        if (uid_eq(cred->fsuid, inode->i_uid) || safe_hardlink_source(inode) ||
 812            capable(CAP_FOWNER))
 813                return 0;
 814
 815        audit_log_link_denied("linkat", link);
 816        return -EPERM;
 817}
 818
/*
 * follow_link - follow one symlink during a ref-walk
 * @link: the symlink to follow
 * @nd:   nameidata pathwalk data
 * @p:    out: cookie returned by ->follow_link(), or NULL on early failure
 *
 * Must not be called in rcu-walk mode.  Calls the inode's ->follow_link()
 * and, if that left a link body in @nd (nd_get_link()), walks it with
 * link_path_walk(), restarting from the root when the body begins with '/'.
 *
 * Returns 0 on success; the reference on @link and the cookie in *@p are
 * then still live.  On any failure the reference on @link has been
 * dropped before returning: -ELOOP when the task has already followed 40
 * links, otherwise the error from the security hook, from ->follow_link(),
 * from the link body, or from link_path_walk().
 */
static __always_inline int
follow_link(struct path *link, struct nameidata *nd, void **p)
{
        struct dentry *dentry = link->dentry;
        int error;
        char *s;

        BUG_ON(nd->flags & LOOKUP_RCU);

        /*
         * Same mount as the current position: take a mount reference so
         * that the path_put(link) done later is balanced.
         * NOTE(review): presumably the lookup that produced @link did not
         * take its own mount reference in this case - confirm.
         */
        if (link->mnt == nd->path.mnt)
                mntget(link->mnt);

        error = -ELOOP;
        /* Per-task limit on the total number of links followed. */
        if (unlikely(current->total_link_count >= 40))
                goto out_put_nd_path;

        cond_resched();
        current->total_link_count++;

        touch_atime(link);
        /* Clear any stale link body before asking the filesystem. */
        nd_set_link(nd, NULL);

        error = security_inode_follow_link(link->dentry, nd);
        if (error)
                goto out_put_nd_path;

        nd->last_type = LAST_BIND;
        *p = dentry->d_inode->i_op->follow_link(dentry, nd);
        error = PTR_ERR(*p);
        if (IS_ERR(*p))
                goto out_put_nd_path;

        error = 0;
        s = nd_get_link(nd);
        if (s) {
                /* The filesystem stored an error instead of a link body. */
                if (unlikely(IS_ERR(s))) {
                        path_put(&nd->path);
                        put_link(nd, link, *p);
                        return PTR_ERR(s);
                }
                /* Absolute link body: restart the walk from the root. */
                if (*s == '/') {
                        set_root(nd);
                        path_put(&nd->path);
                        nd->path = nd->root;
                        path_get(&nd->root);
                        nd->flags |= LOOKUP_JUMPED;
                }
                nd->inode = nd->path.dentry->d_inode;
                error = link_path_walk(s, nd);
                /*
                 * NOTE(review): nd->path is not dropped here, so
                 * link_path_walk() presumably cleans it up itself on
                 * failure - confirm.
                 */
                if (unlikely(error))
                        put_link(nd, link, *p);
        }

        return error;

        /* Early failure: no cookie to hand back, drop both paths. */
out_put_nd_path:
        *p = NULL;
        path_put(&nd->path);
        path_put(link);
        return error;
}
 880
 881static int follow_up_rcu(struct path *path)
 882{
 883        struct mount *mnt = real_mount(path->mnt);
 884        struct mount *parent;
 885        struct dentry *mountpoint;
 886
 887        parent = mnt->mnt_parent;
 888        if (&parent->mnt == path->mnt)
 889                return 0;
 890        mountpoint = mnt->mnt_mountpoint;
 891        path->dentry = mountpoint;
 892        path->mnt = &parent->mnt;
 893        return 1;
 894}
 895
 896/*
 897 * follow_up - Find the mountpoint of path's vfsmount
 898 *
 899 * Given a path, find the mountpoint of its source file system.
 900 * Replace @path with the path of the mountpoint in the parent mount.
 901 * Up is towards /.
 902 *
 903 * Return 1 if we went up a level and 0 if we were already at the
 904 * root.
 905 */
 906int follow_up(struct path *path)
 907{
 908        struct mount *mnt = real_mount(path->mnt);
 909        struct mount *parent;
 910        struct dentry *mountpoint;
 911
 912        br_read_lock(&vfsmount_lock);
 913        parent = mnt->mnt_parent;
 914        if (parent == mnt) {
 915                br_read_unlock(&vfsmount_lock);
 916                return 0;
 917        }
 918        mntget(&parent->mnt);
 919        mountpoint = dget(mnt->mnt_mountpoint);
 920        br_read_unlock(&vfsmount_lock);
 921        dput(path->dentry);
 922        path->dentry = mountpoint;
 923        mntput(path->mnt);
 924        path->mnt = &parent->mnt;
 925        return 1;
 926}
 927
 928/*
 929 * Perform an automount
 930 * - return -EISDIR to tell follow_managed() to stop and return the path we
 931 *   were called with.
 932 */
 933static int follow_automount(struct path *path, unsigned flags,
 934                            bool *need_mntput)
 935{
 936        struct vfsmount *mnt;
 937        int err;
 938
 939        if (!path->dentry->d_op || !path->dentry->d_op->d_automount)
 940                return -EREMOTE;
 941
 942        /* We don't want to mount if someone's just doing a stat -
 943         * unless they're stat'ing a directory and appended a '/' to
 944         * the name.
 945         *
 946         * We do, however, want to mount if someone wants to open or
 947         * create a file of any type under the mountpoint, wants to
 948         * traverse through the mountpoint or wants to open the
 949         * mounted directory.  Also, autofs may mark negative dentries
 950         * as being automount points.  These will need the attentions
 951         * of the daemon to instantiate them before they can be used.
 952         */
 953        if (!(flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY |
 954                     LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) &&
 955            path->dentry->d_inode)
 956                return -EISDIR;
 957
 958        current->total_link_count++;
 959        if (current->total_link_count >= 40)
 960                return -ELOOP;
 961
 962        mnt = path->dentry->d_op->d_automount(path);
 963        if (IS_ERR(mnt)) {
 964                /*
 965                 * The filesystem is allowed to return -EISDIR here to indicate
 966                 * it doesn't want to automount.  For instance, autofs would do
 967                 * this so that its userspace daemon can mount on this dentry.
 968                 *
 969                 * However, we can only permit this if it's a terminal point in
 970                 * the path being looked up; if it wasn't then the remainder of
 971                 * the path is inaccessible and we should say so.
 972                 */
 973                if (PTR_ERR(mnt) == -EISDIR && (flags & LOOKUP_PARENT))
 974                        return -EREMOTE;
 975                return PTR_ERR(mnt);
 976        }
 977
 978        if (!mnt) /* mount collision */
 979                return 0;
 980
 981        if (!*need_mntput) {
 982                /* lock_mount() may release path->mnt on error */
 983                mntget(path->mnt);
 984                *need_mntput = true;
 985        }
 986        err = finish_automount(mnt, path);
 987
 988        switch (err) {
 989        case -EBUSY:
 990                /* Someone else made a mount here whilst we were busy */
 991                return 0;
 992        case 0:
 993                path_put(path);
 994                path->mnt = mnt;
 995                path->dentry = dget(mnt->mnt_root);
 996                return 0;
 997        default:
 998                return err;
 999        }
1000
1001}
1002
1003/*
1004 * Handle a dentry that is managed in some way.
1005 * - Flagged for transit management (autofs)
1006 * - Flagged as mountpoint
1007 * - Flagged as automount point
1008 *
1009 * This may only be called in refwalk mode.
1010 *
1011 * Serialization is taken care of in namespace.c
1012 */
1013static int follow_managed(struct path *path, unsigned flags)
1014{
1015        struct vfsmount *mnt = path->mnt; /* held by caller, must be left alone */
1016        unsigned managed;
1017        bool need_mntput = false;
1018        int ret = 0;
1019
1020        /* Given that we're not holding a lock here, we retain the value in a
1021         * local variable for each dentry as we look at it so that we don't see
1022         * the components of that value change under us */
1023        while (managed = ACCESS_ONCE(path->dentry->d_flags),
1024               managed &= DCACHE_MANAGED_DENTRY,
1025               unlikely(managed != 0)) {
1026                /* Allow the filesystem to manage the transit without i_mutex
1027                 * being held. */
1028                if (managed & DCACHE_MANAGE_TRANSIT) {
1029                        BUG_ON(!path->dentry->d_op);
1030                        BUG_ON(!path->dentry->d_op->d_manage);
1031                        ret = path->dentry->d_op->d_manage(path->dentry, false);
1032                        if (ret < 0)
1033                                break;
1034                }
1035
1036                /* Transit to a mounted filesystem. */
1037                if (managed & DCACHE_MOUNTED) {
1038                        struct vfsmount *mounted = lookup_mnt(path);
1039                        if (mounted) {
1040                                dput(path->dentry);
1041                                if (need_mntput)
1042                                        mntput(path->mnt);
1043                                path->mnt = mounted;
1044                                path->dentry = dget(mounted->mnt_root);
1045                                need_mntput = true;
1046                                continue;
1047                        }
1048
1049                        /* Something is mounted on this dentry in another
1050                         * namespace and/or whatever was mounted there in this
1051                         * namespace got unmounted before we managed to get the
1052                         * vfsmount_lock */
1053                }
1054
1055                /* Handle an automount point */
1056                if (managed & DCACHE_NEED_AUTOMOUNT) {
1057                        ret = follow_automount(path, flags, &need_mntput);
1058                        if (ret < 0)
1059                                break;
1060                        continue;
1061                }
1062
1063                /* We didn't change the current path point */
1064                break;
1065        }
1066
1067        if (need_mntput && path->mnt == mnt)
1068                mntput(path->mnt);
1069        if (ret == -EISDIR)
1070                ret = 0;
1071        return ret < 0 ? ret : need_mntput;
1072}
1073
1074int follow_down_one(struct path *path)
1075{
1076        struct vfsmount *mounted;
1077
1078        mounted = lookup_mnt(path);
1079        if (mounted) {
1080                dput(path->dentry);
1081                mntput(path->mnt);
1082                path->mnt = mounted;
1083                path->dentry = dget(mounted->mnt_root);
1084                return 1;
1085        }
1086        return 0;
1087}
1088
1089static inline bool managed_dentry_might_block(struct dentry *dentry)
1090{
1091        return (dentry->d_flags & DCACHE_MANAGE_TRANSIT &&
1092                dentry->d_op->d_manage(dentry, true) < 0);
1093}
1094
1095/*
1096 * Try to skip to top of mountpoint pile in rcuwalk mode.  Fail if
1097 * we meet a managed dentry that would need blocking.
1098 */
1099static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
1100                               struct inode **inode)
1101{
1102        for (;;) {
1103                struct mount *mounted;
1104                /*
1105                 * Don't forget we might have a non-mountpoint managed dentry
1106                 * that wants to block transit.
1107                 */
1108                if (unlikely(managed_dentry_might_block(path->dentry)))
1109                        return false;
1110
1111                if (!d_mountpoint(path->dentry))
1112                        break;
1113
1114                mounted = __lookup_mnt(path->mnt, path->dentry, 1);
1115                if (!mounted)
1116                        break;
1117                path->mnt = &mounted->mnt;
1118                path->dentry = mounted->mnt.mnt_root;
1119                nd->flags |= LOOKUP_JUMPED;
1120                nd->seq = read_seqcount_begin(&path->dentry->d_seq);
1121                /*
1122                 * Update the inode too. We don't need to re-check the
1123                 * dentry sequence number here after this d_inode read,
1124                 * because a mount-point is always pinned.
1125                 */
1126                *inode = path->dentry->d_inode;
1127        }
1128        return true;
1129}
1130
1131static void follow_mount_rcu(struct nameidata *nd)
1132{
1133        while (d_mountpoint(nd->path.dentry)) {
1134                struct mount *mounted;
1135                mounted = __lookup_mnt(nd->path.mnt, nd->path.dentry, 1);
1136                if (!mounted)
1137                        break;
1138                nd->path.mnt = &mounted->mnt;
1139                nd->path.dentry = mounted->mnt.mnt_root;
1140                nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
1141        }
1142}
1143
1144static int follow_dotdot_rcu(struct nameidata *nd)
1145{
1146        set_root_rcu(nd);
1147
1148        while (1) {
1149                if (nd->path.dentry == nd->root.dentry &&
1150                    nd->path.mnt == nd->root.mnt) {
1151                        break;
1152                }
1153                if (nd->path.dentry != nd->path.mnt->mnt_root) {
1154                        struct dentry *old = nd->path.dentry;
1155                        struct dentry *parent = old->d_parent;
1156                        unsigned seq;
1157
1158                        seq = read_seqcount_begin(&parent->d_seq);
1159                        if (read_seqcount_retry(&old->d_seq, nd->seq))
1160                                goto failed;
1161                        nd->path.dentry = parent;
1162                        nd->seq = seq;
1163                        break;
1164                }
1165                if (!follow_up_rcu(&nd->path))
1166                        break;
1167                nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
1168        }
1169        follow_mount_rcu(nd);
1170        nd->inode = nd->path.dentry->d_inode;
1171        return 0;
1172
1173failed:
1174        nd->flags &= ~LOOKUP_RCU;
1175        if (!(nd->flags & LOOKUP_ROOT))
1176                nd->root.mnt = NULL;
1177        unlock_rcu_walk();
1178        return -ECHILD;
1179}
1180
1181/*
1182 * Follow down to the covering mount currently visible to userspace.  At each
1183 * point, the filesystem owning that dentry may be queried as to whether the
1184 * caller is permitted to proceed or not.
1185 */
1186int follow_down(struct path *path)
1187{
1188        unsigned managed;
1189        int ret;
1190
1191        while (managed = ACCESS_ONCE(path->dentry->d_flags),
1192               unlikely(managed & DCACHE_MANAGED_DENTRY)) {
1193                /* Allow the filesystem to manage the transit without i_mutex
1194                 * being held.
1195                 *
1196                 * We indicate to the filesystem if someone is trying to mount
1197                 * something here.  This gives autofs the chance to deny anyone
1198                 * other than its daemon the right to mount on its
1199                 * superstructure.
1200                 *
1201                 * The filesystem may sleep at this point.
1202                 */
1203                if (managed & DCACHE_MANAGE_TRANSIT) {
1204                        BUG_ON(!path->dentry->d_op);
1205                        BUG_ON(!path->dentry->d_op->d_manage);
1206                        ret = path->dentry->d_op->d_manage(
1207                                path->dentry, false);
1208                        if (ret < 0)
1209                                return ret == -EISDIR ? 0 : ret;
1210                }
1211
1212                /* Transit to a mounted filesystem. */
1213                if (managed & DCACHE_MOUNTED) {
1214                        struct vfsmount *mounted = lookup_mnt(path);
1215                        if (!mounted)
1216                                break;
1217                        dput(path->dentry);
1218                        mntput(path->mnt);
1219                        path->mnt = mounted;
1220                        path->dentry = dget(mounted->mnt_root);
1221                        continue;
1222                }
1223
1224                /* Don't handle automount points here */
1225                break;
1226        }
1227        return 0;
1228}
1229
1230/*
1231 * Skip to top of mountpoint pile in refwalk mode for follow_dotdot()
1232 */
1233static void follow_mount(struct path *path)
1234{
1235        while (d_mountpoint(path->dentry)) {
1236                struct vfsmount *mounted = lookup_mnt(path);
1237                if (!mounted)
1238                        break;
1239                dput(path->dentry);
1240                mntput(path->mnt);
1241                path->mnt = mounted;
1242                path->dentry = dget(mounted->mnt_root);
1243        }
1244}
1245
1246static void follow_dotdot(struct nameidata *nd)
1247{
1248        set_root(nd);
1249
1250        while(1) {
1251                struct dentry *old = nd->path.dentry;
1252
1253                if (nd->path.dentry == nd->root.dentry &&
1254                    nd->path.mnt == nd->root.mnt) {
1255                        break;
1256                }
1257                if (nd->path.dentry != nd->path.mnt->mnt_root) {
1258                        /* rare case of legitimate dget_parent()... */
1259                        nd->path.dentry = dget_parent(nd->path.dentry);
1260                        dput(old);
1261                        break;
1262                }
1263                if (!follow_up(&nd->path))
1264                        break;
1265        }
1266        follow_mount(&nd->path);
1267        nd->inode = nd->path.dentry->d_inode;
1268}
1269
1270/*
1271 * This looks up the name in dcache, possibly revalidates the old dentry and
1272 * allocates a new one if not found or not valid.  In the need_lookup argument
1273 * returns whether i_op->lookup is necessary.
1274 *
1275 * dir->d_inode->i_mutex must be held
1276 */
1277static struct dentry *lookup_dcache(struct qstr *name, struct dentry *dir,
1278                                    unsigned int flags, bool *need_lookup)
1279{
1280        struct dentry *dentry;
1281        int error;
1282
1283        *need_lookup = false;
1284        dentry = d_lookup(dir, name);
1285        if (dentry) {
1286                if (dentry->d_flags & DCACHE_OP_REVALIDATE) {
1287                        error = d_revalidate(dentry, flags);
1288                        if (unlikely(error <= 0)) {
1289                                if (error < 0) {
1290                                        dput(dentry);
1291                                        return ERR_PTR(error);
1292                                } else if (!d_invalidate(dentry)) {
1293                                        dput(dentry);
1294                                        dentry = NULL;
1295                                }
1296                        }
1297                }
1298        }
1299
1300        if (!dentry) {
1301                dentry = d_alloc(dir, name);
1302                if (unlikely(!dentry))
1303                        return ERR_PTR(-ENOMEM);
1304
1305                *need_lookup = true;
1306        }
1307        return dentry;
1308}
1309
1310/*
1311 * Call i_op->lookup on the dentry.  The dentry must be negative but may be
1312 * hashed if it was pouplated with DCACHE_NEED_LOOKUP.
1313 *
1314 * dir->d_inode->i_mutex must be held
1315 */
1316static struct dentry *lookup_real(struct inode *dir, struct dentry *dentry,
1317                                  unsigned int flags)
1318{
1319        struct dentry *old;
1320
1321        /* Don't create child dentry for a dead directory. */
1322        if (unlikely(IS_DEADDIR(dir))) {
1323                dput(dentry);
1324                return ERR_PTR(-ENOENT);
1325        }
1326
1327        old = dir->i_op->lookup(dir, dentry, flags);
1328        if (unlikely(old)) {
1329                dput(dentry);
1330                dentry = old;
1331        }
1332        return dentry;
1333}
1334
1335static struct dentry *__lookup_hash(struct qstr *name,
1336                struct dentry *base, unsigned int flags)
1337{
1338        bool need_lookup;
1339        struct dentry *dentry;
1340
1341        dentry = lookup_dcache(name, base, flags, &need_lookup);
1342        if (!need_lookup)
1343                return dentry;
1344
1345        return lookup_real(base->d_inode, dentry, flags);
1346}
1347
1348/*
1349 *  It's more convoluted than I'd like it to be, but... it's still fairly
1350 *  small and for now I'd prefer to have fast path as straight as possible.
1351 *  It _is_ time-critical.
1352 */
1353static int lookup_fast(struct nameidata *nd,
1354                       struct path *path, struct inode **inode)
1355{
1356        struct vfsmount *mnt = nd->path.mnt;
1357        struct dentry *dentry, *parent = nd->path.dentry;
1358        int need_reval = 1;
1359        int status = 1;
1360        int err;
1361
1362        /*
1363         * Rename seqlock is not required here because in the off chance
1364         * of a false negative due to a concurrent rename, we're going to
1365         * do the non-racy lookup, below.
1366         */
1367        if (nd->flags & LOOKUP_RCU) {
1368                unsigned seq;
1369                dentry = __d_lookup_rcu(parent, &nd->last, &seq);
1370                if (!dentry)
1371                        goto unlazy;
1372
1373                /*
1374                 * This sequence count validates that the inode matches
1375                 * the dentry name information from lookup.
1376                 */
1377                *inode = dentry->d_inode;
1378                if (read_seqcount_retry(&dentry->d_seq, seq))
1379                        return -ECHILD;
1380
1381                /*
1382                 * This sequence count validates that the parent had no
1383                 * changes while we did the lookup of the dentry above.
1384                 *
1385                 * The memory barrier in read_seqcount_begin of child is
1386                 *  enough, we can use __read_seqcount_retry here.
1387                 */
1388                if (__read_seqcount_retry(&parent->d_seq, nd->seq))
1389                        return -ECHILD;
1390                nd->seq = seq;
1391
1392                if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) {
1393                        status = d_revalidate(dentry, nd->flags);
1394                        if (unlikely(status <= 0)) {
1395                                if (status != -ECHILD)
1396                                        need_reval = 0;
1397                                goto unlazy;
1398                        }
1399                }
1400                path->mnt = mnt;
1401                path->dentry = dentry;
1402                if (unlikely(!__follow_mount_rcu(nd, path, inode)))
1403                        goto unlazy;
1404                if (unlikely(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT))
1405                        goto unlazy;
1406                return 0;
1407unlazy:
1408                if (unlazy_walk(nd, dentry))
1409                        return -ECHILD;
1410        } else {
1411                dentry = __d_lookup(parent, &nd->last);
1412        }
1413
1414        if (unlikely(!dentry))
1415                goto need_lookup;
1416
1417        if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE) && need_reval)
1418                status = d_revalidate(dentry, nd->flags);
1419        if (unlikely(status <= 0)) {
1420                if (status < 0) {
1421                        dput(dentry);
1422                        return status;
1423                }
1424                if (!d_invalidate(dentry)) {
1425                        dput(dentry);
1426                        goto need_lookup;
1427                }
1428        }
1429
1430        path->mnt = mnt;
1431        path->dentry = dentry;
1432        err = follow_managed(path, nd->flags);
1433        if (unlikely(err < 0)) {
1434                path_put_conditional(path, nd);
1435                return err;
1436        }
1437        if (err)
1438                nd->flags |= LOOKUP_JUMPED;
1439        *inode = path->dentry->d_inode;
1440        return 0;
1441
1442need_lookup:
1443        return 1;
1444}
1445
1446/* Fast lookup failed, do it the slow way */
1447static int lookup_slow(struct nameidata *nd, struct path *path)
1448{
1449        struct dentry *dentry, *parent;
1450        int err;
1451
1452        parent = nd->path.dentry;
1453        BUG_ON(nd->inode != parent->d_inode);
1454
1455        mutex_lock(&parent->d_inode->i_mutex);
1456        dentry = __lookup_hash(&nd->last, parent, nd->flags);
1457        mutex_unlock(&parent->d_inode->i_mutex);
1458        if (IS_ERR(dentry))
1459                return PTR_ERR(dentry);
1460        path->mnt = nd->path.mnt;
1461        path->dentry = dentry;
1462        err = follow_managed(path, nd->flags);
1463        if (unlikely(err < 0)) {
1464                path_put_conditional(path, nd);
1465                return err;
1466        }
1467        if (err)
1468                nd->flags |= LOOKUP_JUMPED;
1469        return 0;
1470}
1471
1472static inline int may_lookup(struct nameidata *nd)
1473{
1474        if (nd->flags & LOOKUP_RCU) {
1475                int err = inode_permission(nd->inode, MAY_EXEC|MAY_NOT_BLOCK);
1476                if (err != -ECHILD)
1477                        return err;
1478                if (unlazy_walk(nd, NULL))
1479                        return -ECHILD;
1480        }
1481        return inode_permission(nd->inode, MAY_EXEC);
1482}
1483
1484static inline int handle_dots(struct nameidata *nd, int type)
1485{
1486        if (type == LAST_DOTDOT) {
1487                if (nd->flags & LOOKUP_RCU) {
1488                        if (follow_dotdot_rcu(nd))
1489                                return -ECHILD;
1490                } else
1491                        follow_dotdot(nd);
1492        }
1493        return 0;
1494}
1495
1496static void terminate_walk(struct nameidata *nd)
1497{
1498        if (!(nd->flags & LOOKUP_RCU)) {
1499                path_put(&nd->path);
1500        } else {
1501                nd->flags &= ~LOOKUP_RCU;
1502                if (!(nd->flags & LOOKUP_ROOT))
1503                        nd->root.mnt = NULL;
1504                unlock_rcu_walk();
1505        }
1506}
1507
1508/*
1509 * Do we need to follow links? We _really_ want to be able
1510 * to do this check without having to look at inode->i_op,
1511 * so we keep a cache of "no, this doesn't need follow_link"
1512 * for the common case.
1513 */
1514static inline int should_follow_link(struct inode *inode, int follow)
1515{
1516        if (unlikely(!(inode->i_opflags & IOP_NOFOLLOW))) {
1517                if (likely(inode->i_op->follow_link))
1518                        return follow;
1519
1520                /* This gets set once for the inode lifetime */
1521                spin_lock(&inode->i_lock);
1522                inode->i_opflags |= IOP_NOFOLLOW;
1523                spin_unlock(&inode->i_lock);
1524        }
1525        return 0;
1526}
1527
1528static inline int walk_component(struct nameidata *nd, struct path *path,
1529                int follow)
1530{
1531        struct inode *inode;
1532        int err;
1533        /*
1534         * "." and ".." are special - ".." especially so because it has
1535         * to be able to know about the current root directory and
1536         * parent relationships.
1537         */
1538        if (unlikely(nd->last_type != LAST_NORM))
1539                return handle_dots(nd, nd->last_type);
1540        err = lookup_fast(nd, path, &inode);
1541        if (unlikely(err)) {
1542                if (err < 0)
1543                        goto out_err;
1544
1545                err = lookup_slow(nd, path);
1546                if (err < 0)
1547                        goto out_err;
1548
1549                inode = path->dentry->d_inode;
1550        }
1551        err = -ENOENT;
1552        if (!inode)
1553                goto out_path_put;
1554
1555        if (should_follow_link(inode, follow)) {
1556                if (nd->flags & LOOKUP_RCU) {
1557                        if (unlikely(unlazy_walk(nd, path->dentry))) {
1558                                err = -ECHILD;
1559                                goto out_err;
1560                        }
1561                }
1562                BUG_ON(inode != path->dentry->d_inode);
1563                return 1;
1564        }
1565        path_to_nameidata(path, nd);
1566        nd->inode = inode;
1567        return 0;
1568
1569out_path_put:
1570        path_to_nameidata(path, nd);
1571out_err:
1572        terminate_walk(nd);
1573        return err;
1574}
1575
1576/*
1577 * This limits recursive symlink follows to 8, while
1578 * limiting consecutive symlinks to 40.
1579 *
1580 * Without that kind of total limit, nasty chains of consecutive
1581 * symlinks can cause almost arbitrarily long lookups.
1582 */
1583static inline int nested_symlink(struct path *path, struct nameidata *nd)
1584{
1585        int res;
1586
1587        if (unlikely(current->link_count >= MAX_NESTED_LINKS)) {
1588                path_put_conditional(path, nd);
1589                path_put(&nd->path);
1590                return -ELOOP;
1591        }
1592        BUG_ON(nd->depth >= MAX_NESTED_LINKS);
1593
1594        nd->depth++;
1595        current->link_count++;
1596
1597        do {
1598                struct path link = *path;
1599                void *cookie;
1600
1601                res = follow_link(&link, nd, &cookie);
1602                if (res)
1603                        break;
1604                res = walk_component(nd, path, LOOKUP_FOLLOW);
1605                put_link(nd, &link, cookie);
1606        } while (res > 0);
1607
1608        current->link_count--;
1609        nd->depth--;
1610        return res;
1611}
1612
1613/*
1614 * We really don't want to look at inode->i_op->lookup
1615 * when we don't have to. So we keep a cache bit in
1616 * the inode ->i_opflags field that says "yes, we can
1617 * do lookup on this inode".
1618 */
1619static inline int can_lookup(struct inode *inode)
1620{
1621        if (likely(inode->i_opflags & IOP_LOOKUP))
1622                return 1;
1623        if (likely(!inode->i_op->lookup))
1624                return 0;
1625
1626        /* We do this once for the lifetime of the inode */
1627        spin_lock(&inode->i_lock);
1628        inode->i_opflags |= IOP_LOOKUP;
1629        spin_unlock(&inode->i_lock);
1630        return 1;
1631}
1632
/*
 * We can do the critical dentry name comparison and hashing
 * operations one word at a time, but we are limited to:
 *
 * - Architectures with fast unaligned word accesses. We could
 *   do a "get_unaligned()" if this helps and is sufficiently
 *   fast.
 *
 * - Little-endian machines (so that we can generate the mask
 *   of low bytes efficiently). Again, we *could* do a byte
 *   swapping load on big-endian architectures if that is not
 *   so expensive that it makes the optimization worthless.
 *
 * - non-CONFIG_DEBUG_PAGEALLOC configurations (so that we
 *   do not trap on the (extremely unlikely) case of a page
 *   crossing operation).
 *
 * - Furthermore, we need an efficient 64-bit compile for the
 *   64-bit case in order to generate the "number of bytes in
 *   the final mask". Again, that could be replaced with an
 *   efficient population count instruction or similar.
 */
1655#ifdef CONFIG_DCACHE_WORD_ACCESS
1656
1657#include <asm/word-at-a-time.h>
1658
1659#ifdef CONFIG_64BIT
1660
/*
 * Fold a word-sized hash into an unsigned int by adding the bits above
 * the width of an int onto the low bits; the result is then truncated
 * by the return type.
 */
static inline unsigned int fold_hash(unsigned long hash)
{
	return hash + (hash >> (8*sizeof(int)));
}
1666
1667#else   /* 32-bit case */
1668
1669#define fold_hash(x) (x)
1670
1671#endif
1672
/*
 * Hash @len bytes at @name one word at a time.
 *
 * Each full word is added to the hash, which is then multiplied by 9.
 * A trailing partial word contributes only its low @len bytes.  The
 * result is folded with fold_hash().
 */
unsigned int full_name_hash(const unsigned char *name, unsigned int len)
{
	unsigned long hash = 0;
	unsigned long word;

	for (;;) {
		word = load_unaligned_zeropad(name);
		if (len < sizeof(unsigned long))
			break;

		hash = (hash + word) * 9;
		name += sizeof(unsigned long);
		len -= sizeof(unsigned long);
		if (!len)
			return fold_hash(hash);
	}

	/* Less than a word remains: mask off everything past @len bytes. */
	hash += word & ~(~0ul << len*8);
	return fold_hash(hash);
}
EXPORT_SYMBOL(full_name_hash);
1695
1696/*
1697 * Calculate the length and hash of the path component, and
1698 * return the length of the component;
1699 */
1700static inline unsigned long hash_name(const char *name, unsigned int *hashp)
1701{
1702        unsigned long a, b, adata, bdata, mask, hash, len;
1703        const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
1704
1705        hash = a = 0;
1706        len = -sizeof(unsigned long);
1707        do {
1708                hash = (hash + a) * 9;
1709                len += sizeof(unsigned long);
1710                a = load_unaligned_zeropad(name+len);
1711                b = a ^ REPEAT_BYTE('/');
1712        } while (!(has_zero(a, &adata, &constants) | has_zero(b, &bdata, &constants)));
1713
1714        adata = prep_zero_mask(a, adata, &constants);
1715        bdata = prep_zero_mask(b, bdata, &constants);
1716
1717        mask = create_zero_mask(adata | bdata);
1718
1719        hash += a & zero_bytemask(mask);
1720        *hashp = fold_hash(hash);
1721
1722        return len + find_zero(mask);
1723}
1724
1725#else
1726
/*
 * Hash @len bytes at @name a byte at a time, feeding each byte through
 * partial_name_hash() and finishing with end_name_hash().
 */
unsigned int full_name_hash(const unsigned char *name, unsigned int len)
{
	const unsigned char *end = name + len;
	unsigned long hash = init_name_hash();

	for (; name != end; name++)
		hash = partial_name_hash(*name, hash);

	return end_name_hash(hash);
}
EXPORT_SYMBOL(full_name_hash);
1735
/*
 * Hash the path component starting at @name, up to but not including
 * the next '/' or NUL.  The hash is stored in *@hashp and the length of
 * the component is returned.
 *
 * We know there's a real path component here of at least
 * one character, so the first byte is hashed unconditionally.
 */
static inline unsigned long hash_name(const char *name, unsigned int *hashp)
{
	unsigned long hash = init_name_hash();
	unsigned long len = 0;
	unsigned long c = (unsigned char)*name;

	do {
		hash = partial_name_hash(c, hash);
		c = (unsigned char)name[++len];
	} while (c != '\0' && c != '/');

	*hashp = end_name_hash(hash);
	return len;
}
1754
1755#endif
1756
/*
 * Name resolution.
 * This is the basic name resolution function, turning a pathname into
 * the final dentry. We expect 'base' to be positive and a directory.
 *
 * Every component except the last one is walked here; the last component
 * is only recorded in nd->last / nd->last_type and left for the caller.
 * If the name consists of slashes only, nd->last_type is left untouched.
 *
 * Returns 0 and nd will have valid dentry and mnt on success.
 * Returns error and drops reference to input namei data on failure.
 */
static int link_path_walk(const char *name, struct nameidata *nd)
{
        struct path next;
        int err;
        
        /* Leading slashes carry no component. */
        while (*name=='/')
                name++;
        if (!*name)
                return 0;

        /* At this point we know we have a real path component. */
        for(;;) {
                struct qstr this;
                long len;
                int type;

                /* Check that we may search the directory we currently stand in. */
                err = may_lookup(nd);
                if (err)
                        break;

                /* Hash the component and learn its length in one pass. */
                len = hash_name(name, &this.hash);
                this.name = name;
                this.len = len;

                /* Classify the component: ".", ".." or a normal name. */
                type = LAST_NORM;
                if (name[0] == '.') switch (len) {
                        case 2:
                                if (name[1] == '.') {
                                        type = LAST_DOTDOT;
                                        nd->flags |= LOOKUP_JUMPED;
                                }
                                break;
                        case 1:
                                type = LAST_DOT;
                }
                if (likely(type == LAST_NORM)) {
                        struct dentry *parent = nd->path.dentry;
                        nd->flags &= ~LOOKUP_JUMPED;
                        /* Let the filesystem replace the hash if it has its own d_hash. */
                        if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
                                err = parent->d_op->d_hash(parent, &this);
                                if (err < 0)
                                        break;
                        }
                }

                /* Record the component; if it is the final one the caller uses it. */
                nd->last = this;
                nd->last_type = type;

                /* Name ends right after this component: it was the last one. */
                if (!name[len])
                        return 0;
                /*
                 * If it wasn't NUL, we know it was '/'. Skip that
                 * slash, and continue until no more slashes.
                 */
                do {
                        len++;
                } while (unlikely(name[len] == '/'));
                /*
                 * Only trailing slashes followed: still the last component.
                 * nd->last.name[nd->last.len] is '/' in this case.
                 */
                if (!name[len])
                        return 0;

                name += len;

                /*
                 * Intermediate component: step into it.  These error paths
                 * return directly without terminate_walk(); presumably
                 * walk_component()/nested_symlink() clean up nd themselves
                 * on failure -- TODO confirm against their definitions.
                 */
                err = walk_component(nd, &next, LOOKUP_FOLLOW);
                if (err < 0)
                        return err;

                /* A positive return is treated as "next is a symlink to follow". */
                if (err) {
                        err = nested_symlink(&next, nd);
                        if (err)
                                return err;
                }
                /* More components follow, so what we stepped into must be searchable. */
                if (!can_lookup(nd->inode)) {
                        err = -ENOTDIR; 
                        break;
                }
        }
        /* Reached via break only: give up the walk state before reporting err. */
        terminate_walk(nd);
        return err;
}
1844
/*
 * Set up @nd for a path walk of @name: reset the walk state and choose the
 * starting point (nd->path / nd->inode).
 *
 * The starting point is, in order of precedence:
 *  - nd->root as pre-filled by the caller, when LOOKUP_ROOT is set;
 *  - the root chosen by set_root()/set_root_rcu(), when @name is absolute;
 *  - the current working directory, when @dfd is AT_FDCWD;
 *  - the directory behind file descriptor @dfd otherwise.
 *
 * With LOOKUP_RCU the start is not reference-counted; instead
 * lock_rcu_walk() is called and nd->seq is sampled from the starting
 * dentry.  In the @dfd case a file that needs a later fput is handed back
 * through *@fp.
 *
 * Returns 0 on success, or -ENOTDIR / -EBADF / the inode_permission()
 * error on failure.
 */
static int path_init(int dfd, const char *name, unsigned int flags,
                     struct nameidata *nd, struct file **fp)
{
        int retval = 0;

        nd->last_type = LAST_ROOT; /* if there are only slashes... */
        nd->flags = flags | LOOKUP_JUMPED;
        nd->depth = 0;
        if (flags & LOOKUP_ROOT) {
                /* Caller supplied nd->root; start (and stay rooted) there. */
                struct inode *inode = nd->root.dentry->d_inode;
                if (*name) {
                        /* Non-empty name: the root must be a searchable directory. */
                        if (!can_lookup(inode))
                                return -ENOTDIR;
                        retval = inode_permission(inode, MAY_EXEC);
                        if (retval)
                                return retval;
                }
                nd->path = nd->root;
                nd->inode = inode;
                if (flags & LOOKUP_RCU) {
                        lock_rcu_walk();
                        nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
                } else {
                        path_get(&nd->path);
                }
                return 0;
        }

        /* No root chosen yet; callers test nd->root.mnt to see whether one was set. */
        nd->root.mnt = NULL;

        if (*name=='/') {
                /* Absolute path: start at the root. */
                if (flags & LOOKUP_RCU) {
                        lock_rcu_walk();
                        set_root_rcu(nd);
                } else {
                        set_root(nd);
                        path_get(&nd->root);
                }
                nd->path = nd->root;
        } else if (dfd == AT_FDCWD) {
                /* Relative path: start at the current working directory. */
                if (flags & LOOKUP_RCU) {
                        struct fs_struct *fs = current->fs;
                        unsigned seq;

                        lock_rcu_walk();

                        /* Retry until pwd and its d_seq were read without fs->seq changing. */
                        do {
                                seq = read_seqcount_begin(&fs->seq);
                                nd->path = fs->pwd;
                                nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
                        } while (read_seqcount_retry(&fs->seq, seq));
                } else {
                        get_fs_pwd(current->fs, &nd->path);
                }
        } else {
                /* Caller must check execute permissions on the starting path component */
                struct fd f = fdget_raw(dfd);
                struct dentry *dentry;

                if (!f.file)
                        return -EBADF;

                dentry = f.file->f_path.dentry;

                /* A non-empty relative name needs a directory to start from. */
                if (*name) {
                        if (!can_lookup(dentry->d_inode)) {
                                fdput(f);
                                return -ENOTDIR;
                        }
                }

                nd->path = f.file->f_path;
                if (flags & LOOKUP_RCU) {
                        /*
                         * No path reference is taken here; if the fd lookup
                         * took a file reference, pass the file to the caller
                         * via *fp so it can be dropped after the walk.
                         */
                        if (f.need_put)
                                *fp = f.file;
                        nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
                        lock_rcu_walk();
                } else {
                        /* Pin the path itself, then the file is no longer needed. */
                        path_get(&nd->path);
                        fdput(f);
                }
        }

        nd->inode = nd->path.dentry->d_inode;
        return 0;
}
1931
1932static inline int lookup_last(struct nameidata *nd, struct path *path)
1933{
1934        if (nd->last_type == LAST_NORM && nd->last.name[nd->last.len])
1935                nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
1936
1937        nd->flags &= ~LOOKUP_PARENT;
1938        return walk_component(nd, path, nd->flags & LOOKUP_FOLLOW);
1939}
1940
/*
 * Resolve @name starting from @dfd in a single pass (either rcu-walk or
 * ref-walk, depending on LOOKUP_RCU in @flags).
 *
 * Returns 0 and nd will be valid on success; returns error otherwise.
 */
static int path_lookupat(int dfd, const char *name,
                                unsigned int flags, struct nameidata *nd)
{
        struct file *base = NULL;
        struct path path;
        int err;

        /*
         * Path walking is largely split up into 2 different synchronisation
         * schemes, rcu-walk and ref-walk (explained in
         * Documentation/filesystems/path-lookup.txt). These share much of the
         * path walk code, but some things particularly setup, cleanup, and
         * following mounts are sufficiently divergent that functions are
         * duplicated. Typically there is a function foo(), and its RCU
         * analogue, foo_rcu().
         *
         * -ECHILD is the error number of choice (just to avoid clashes) that
         * is returned if some aspect of an rcu-walk fails. Such an error must
         * be handled by restarting a traditional ref-walk (which will always
         * be able to complete).
         */
        /*
         * LOOKUP_PARENT is always set in nd->flags for the walk itself; the
         * caller's own request is remembered in the local "flags" and tested
         * below.
         */
        err = path_init(dfd, name, flags | LOOKUP_PARENT, nd, &base);

        if (unlikely(err))
                return err;

        current->total_link_count = 0;
        err = link_path_walk(name, nd);

        /* Caller wants the object itself, not just its parent: resolve the last component. */
        if (!err && !(flags & LOOKUP_PARENT)) {
                err = lookup_last(nd, &path);
                /* A positive result means the last component is a symlink to follow. */
                while (err > 0) {
                        void *cookie;
                        struct path link = path;
                        err = may_follow_link(&link, nd);
                        if (unlikely(err))
                                break;
                        /* Walk the link body like a fresh name: parent first, then last. */
                        nd->flags |= LOOKUP_PARENT;
                        err = follow_link(&link, nd, &cookie);
                        if (err)
                                break;
                        err = lookup_last(nd, &path);
                        /*
                         * Released only after lookup_last(); presumably nd->last
                         * may still point into the link body until then.
                         */
                        put_link(nd, &link, cookie);
                }
        }

        if (!err)
                err = complete_walk(nd);

        /* LOOKUP_DIRECTORY (requested or set by lookup_last) needs a searchable result. */
        if (!err && nd->flags & LOOKUP_DIRECTORY) {
                if (!can_lookup(nd->inode)) {
                        path_put(&nd->path);
                        err = -ENOTDIR;
                }
        }

        /* Drop the file handed back by path_init() for the dfd case, if any. */
        if (base)
                fput(base);

        /* Drop a root set during this walk; a caller-supplied LOOKUP_ROOT root is kept. */
        if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
                path_put(&nd->root);
                nd->root.mnt = NULL;
        }
        return err;
}
2007
2008static int filename_lookup(int dfd, struct filename *name,
2009                                unsigned int flags, struct nameidata *nd)
2010{
2011        int retval = path_lookupat(dfd, name->name, flags | LOOKUP_RCU, nd);
2012        if (unlikely(retval == -ECHILD))
2013                retval = path_lookupat(dfd, name->name, flags, nd);
2014        if (unlikely(retval == -ESTALE))
2015                retval = path_lookupat(dfd, name->name,
2016                                                flags | LOOKUP_REVAL, nd);
2017
2018        if (likely(!retval))
2019                audit_inode(name, nd->path.dentry, flags & LOOKUP_PARENT);
2020        return retval;
2021}
2022
2023static int do_path_lookup(int dfd, const char *name,
2024                                unsigned int flags, struct nameidata *nd)
2025{
2026        struct filename filename = { .name = name };
2027
2028        return filename_lookup(dfd, &filename, flags, nd);
2029}
2030
2031/* does lookup, returns the object with parent locked */
2032struct dentry *kern_path_locked(const char *name, struct path *path)
2033{
2034        struct nameidata nd;
2035        struct dentry *d;
2036        int err = do_path_lookup(AT_FDCWD, name, LOOKUP_PARENT, &nd);
2037        if (err)
2038                return ERR_PTR(err);
2039        if (nd.last_type != LAST_NORM) {
2040                path_put(&nd.path);
2041                return ERR_PTR(-EINVAL);
2042        }
2043        mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
2044        d = __lookup_hash(&nd.last, nd.path.dentry, 0);
2045        if (IS_ERR(d)) {
2046                mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
2047                path_put(&nd.path);
2048                return d;
2049        }
2050        *path = nd.path;
2051        return d;
2052}
2053
2054int kern_path(const char *name, unsigned int flags, struct path *path)
2055{
2056        struct nameidata nd;
2057        int res = do_path_lookup(AT_FDCWD, name, flags, &nd);
2058        if (!res)
2059                *path = nd.path;
2060        return res;
2061}
2062
2063/**
2064 * vfs_path_lookup - lookup a file path relative to a dentry-vfsmount pair
2065 * @dentry:  pointer to dentry of the base directory
2066 * @mnt: pointer to vfs mount of the base directory
2067 * @name: pointer to file name
2068 * @flags: lookup flags
2069 * @path: pointer to struct path to fill
2070 */
2071int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
2072                    const char *name, unsigned int flags,
2073                    struct path *path)
2074{
2075        struct nameidata nd;
2076        int err;
2077        nd.root.dentry = dentry;
2078        nd.root.mnt = mnt;
2079        BUG_ON(flags & LOOKUP_PARENT);
2080        /* the first argument of do_path_lookup() is ignored with LOOKUP_ROOT */
2081        err = do_path_lookup(AT_FDCWD, name, flags | LOOKUP_ROOT, &nd);
2082        if (!err)
2083                *path = nd.path;
2084        return err;
2085}
2086
2087/*
2088 * Restricted form of lookup. Doesn't follow links, single-component only,
2089 * needs parent already locked. Doesn't follow mounts.
2090 * SMP-safe.
2091 */
2092static struct dentry *lookup_hash(struct nameidata *nd)
2093{
2094        return __lookup_hash(&nd->last, nd->path.dentry, nd->flags);
2095}
2096
2097/**
2098 * lookup_one_len - filesystem helper to lookup single pathname component
2099 * @name:       pathname component to lookup
2100 * @base:       base directory to lookup from
2101 * @len:        maximum length @len should be interpreted to
2102 *
2103 * Note that this routine is purely a helper for filesystem usage and should
2104 * not be called by generic code.  Also note that by using this function the
2105 * nameidata argument is passed to the filesystem methods and a filesystem
2106 * using this helper needs to be prepared for that.
2107 */
2108struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
2109{
2110        struct qstr this;
2111        unsigned int c;
2112        int err;
2113
2114        WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex));
2115
2116        this.name = name;
2117        this.len = len;
2118        this.hash = full_name_hash(name, len);
2119        if (!len)
2120                return ERR_PTR(-EACCES);
2121
2122        if (unlikely(name[0] == '.')) {
2123                if (len < 2 || (len == 2 && name[1] == '.'))
2124                        return ERR_PTR(-EACCES);
2125        }
2126
2127        while (len--) {
2128                c = *(const unsigned char *)name++;
2129                if (c == '/' || c == '\0')
2130                        return ERR_PTR(-EACCES);
2131        }
2132        /*
2133         * See if the low-level filesystem might want
2134         * to use its own hash..
2135         */
2136        if (base->d_flags & DCACHE_OP_HASH) {
2137                int err = base->d_op->d_hash(base, &this);
2138                if (err < 0)
2139                        return ERR_PTR(err);
2140        }
2141
2142        err = inode_permission(base->d_inode, MAY_EXEC);
2143        if (err)
2144                return ERR_PTR(err);
2145
2146        return __lookup_hash(&this, base, 0);
2147}
2148
2149int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
2150                 struct path *path, int *empty)
2151{
2152        struct nameidata nd;
2153        struct filename *tmp = getname_flags(name, flags, empty);
2154        int err = PTR_ERR(tmp);
2155        if (!IS_ERR(tmp)) {
2156
2157                BUG_ON(flags & LOOKUP_PARENT);
2158
2159                err = filename_lookup(dfd, tmp, flags, &nd);
2160                putname(tmp);
2161                if (!err)
2162                        *path = nd.path;
2163        }
2164        return err;
2165}
2166
2167int user_path_at(int dfd, const char __user *name, unsigned flags,
2168                 struct path *path)
2169{
2170        return user_path_at_empty(dfd, name, flags, path, NULL);
2171}
2172
2173/*
2174 * NB: most callers don't do anything directly with the reference to the
2175 *     to struct filename, but the nd->last pointer points into the name string
2176 *     allocated by getname. So we must hold the reference to it until all
2177 *     path-walking is complete.
2178 */
2179static struct filename *
2180user_path_parent(int dfd, const char __user *path, struct nameidata *nd,
2181                 unsigned int flags)
2182{
2183        struct filename *s = getname(path);
2184        int error;
2185
2186        /* only LOOKUP_REVAL is allowed in extra flags */
2187        flags &= LOOKUP_REVAL;
2188
2189        if (IS_ERR(s))
2190                return s;
2191
2192        error = filename_lookup(dfd, s, flags | LOOKUP_PARENT, nd);
2193        if (error) {
2194                putname(s);
2195                return ERR_PTR(error);
2196        }
2197
2198        return s;
2199}
2200
2201/**
2202 * mountpoint_last - look up last component for umount
2203 * @nd:   pathwalk nameidata - currently pointing at parent directory of "last"
2204 * @path: pointer to container for result
2205 *
2206 * This is a special lookup_last function just for umount. In this case, we
2207 * need to resolve the path without doing any revalidation.
2208 *
2209 * The nameidata should be the result of doing a LOOKUP_PARENT pathwalk. Since
2210 * mountpoints are always pinned in the dcache, their ancestors are too. Thus,
2211 * in almost all cases, this lookup will be served out of the dcache. The only
2212 * cases where it won't are if nd->last refers to a symlink or the path is
2213 * bogus and it doesn't exist.
2214 *
2215 * Returns:
2216 * -error: if there was an error during lookup. This includes -ENOENT if the
2217 *         lookup found a negative dentry. The nd->path reference will also be
2218 *         put in this case.
2219 *
2220 * 0:      if we successfully resolved nd->path and found it to not to be a
2221 *         symlink that needs to be followed. "path" will also be populated.
2222 *         The nd->path reference will also be put.
2223 *
2224 * 1:      if we successfully resolved nd->last and found it to be a symlink
2225 *         that needs to be followed. "path" will be populated with the path
2226 *         to the link, and nd->path will *not* be put.
2227 */
2228static int
2229mountpoint_last(struct nameidata *nd, struct path *path)
2230{
2231        int error = 0;
2232        struct dentry *dentry;
2233        struct dentry *dir = nd->path.dentry;
2234
2235        /* If we're in rcuwalk, drop out of it to handle last component */
2236        if (nd->flags & LOOKUP_RCU) {
2237                if (unlazy_walk(nd, NULL)) {
2238                        error = -ECHILD;
2239                        goto out;
2240                }
2241        }
2242
2243        nd->flags &= ~LOOKUP_PARENT;
2244
2245        if (unlikely(nd->last_type != LAST_NORM)) {
2246                error = handle_dots(nd, nd->last_type);
2247                if (error)
2248                        goto out;
2249                dentry = dget(nd->path.dentry);
2250                goto done;
2251        }
2252
2253        mutex_lock(&dir->d_inode->i_mutex);
2254        dentry = d_lookup(dir, &nd->last);
2255        if (!dentry) {
2256                /*
2257                 * No cached dentry. Mounted dentries are pinned in the cache,
2258                 * so that means that this dentry is probably a symlink or the
2259                 * path doesn't actually point to a mounted dentry.
2260                 */
2261                dentry = d_alloc(dir, &nd->last);
2262                if (!dentry) {
2263                        error = -ENOMEM;
2264                        mutex_unlock(&dir->d_inode->i_mutex);
2265                        goto out;
2266                }
2267                dentry = lookup_real(dir->d_inode, dentry, nd->flags);
2268                error = PTR_ERR(dentry);
2269                if (IS_ERR(dentry)) {
2270                        mutex_unlock(&dir->d_inode->i_mutex);
2271                        goto out;
2272                }
2273        }
2274        mutex_unlock(&dir->d_inode->i_mutex);
2275
2276done:
2277        if (!dentry->d_inode) {
2278                error = -ENOENT;
2279                dput(dentry);
2280                goto out;
2281        }
2282        path->dentry = dentry;
2283        path->mnt = mntget(nd->path.mnt);
2284        if (should_follow_link(dentry->d_inode, nd->flags & LOOKUP_FOLLOW))
2285                return 1;
2286        follow_mount(path);
2287        error = 0;
2288out:
2289        terminate_walk(nd);
2290        return error;
2291}
2292
/**
 * path_mountpoint - look up a path to be umounted
 * @dfd:        directory file descriptor to start walk from
 * @name:       full pathname to walk
 * @path:       pointer to container for result
 * @flags:      lookup flags
 *
 * Look up the given name, but don't attempt to revalidate the last component.
 * Returns 0 and "path" will be valid on success; Returns error otherwise.
 */
static int
path_mountpoint(int dfd, const char *name, struct path *path, unsigned int flags)
{
        struct file *base = NULL;
        struct nameidata nd;
        int err;

        /* Walk to the parent of the last component first. */
        err = path_init(dfd, name, flags | LOOKUP_PARENT, &nd, &base);
        if (unlikely(err))
                return err;

        current->total_link_count = 0;
        err = link_path_walk(name, &nd);
        if (err)
                goto out;

        /* Resolve the last component; a positive result is a symlink to follow. */
        err = mountpoint_last(&nd, path);
        while (err > 0) {
                void *cookie;
                struct path link = *path;
                err = may_follow_link(&link, &nd);
                if (unlikely(err))
                        break;
                /* Walk the link body parent-first, then redo the last component. */
                nd.flags |= LOOKUP_PARENT;
                err = follow_link(&link, &nd, &cookie);
                if (err)
                        break;
                err = mountpoint_last(&nd, path);
                put_link(&nd, &link, cookie);
        }
out:
        /* Drop the file handed back by path_init() for the dfd case, if any. */
        if (base)
                fput(base);

        /* Drop a root set during this walk; a caller-supplied LOOKUP_ROOT root is kept. */
        if (nd.root.mnt && !(nd.flags & LOOKUP_ROOT))
                path_put(&nd.root);

        return err;
}
2342
2343static int
2344filename_mountpoint(int dfd, struct filename *s, struct path *path,
2345                        unsigned int flags)
2346{
2347        int error = path_mountpoint(dfd, s->name, path, flags | LOOKUP_RCU);
2348        if (unlikely(error == -ECHILD))
2349                error = path_mountpoint(dfd, s->name, path, flags);
2350        if (unlikely(error == -ESTALE))
2351                error = path_mountpoint(dfd, s->name, path, flags | LOOKUP_REVAL);
2352        if (likely(!error))
2353                audit_inode(s, path->dentry, 0);
2354        return error;
2355}
2356
2357/**
2358 * user_path_mountpoint_at - lookup a path from userland in order to umount it
2359 * @dfd:        directory file descriptor
2360 * @name:       pathname from userland
2361 * @flags:      lookup flags
2362 * @path:       pointer to container to hold result
2363 *
2364 * A umount is a special case for path walking. We're not actually interested
2365 * in the inode in this situation, and ESTALE errors can be a problem. We
2366 * simply want track down the dentry and vfsmount attached at the mountpoint
2367 * and avoid revalidating the last component.
2368 *
2369 * Returns 0 and populates "path" on success.
2370 */
2371int
2372user_path_mountpoint_at(int dfd, const char __user *name, unsigned int flags,
2373                        struct path *path)
2374{
2375        struct filename *s = getname(name);
2376        int error;
2377        if (IS_ERR(s))
2378                return PTR_ERR(s);
2379        error = filename_mountpoint(dfd, s, path, flags);
2380        putname(s);
2381        return error;
2382}
2383
2384int
2385kern_path_mountpoint(int dfd, const char *name, struct path *path,
2386                        unsigned int flags)
2387{
2388        struct filename s = {.name = name};
2389        return filename_mountpoint(dfd, &s, path, flags);
2390}
2391EXPORT_SYMBOL(kern_path_mountpoint);
2392
2393/*
2394 * It's inline, so penalty for filesystems that don't use sticky bit is
2395 * minimal.
2396 */
2397static inline int check_sticky(struct inode *dir, struct inode *inode)
2398{
2399        kuid_t fsuid = current_fsuid();
2400
2401        if (!(dir->i_mode & S_ISVTX))
2402                return 0;
2403        if (uid_eq(inode->i_uid, fsuid))
2404                return 0;
2405        if (uid_eq(dir->i_uid, fsuid))
2406                return 0;
2407        return !inode_capable(inode, CAP_FOWNER);
2408}
2409
2410/*
2411 *      Check whether we can remove a link victim from directory dir, check
2412 *  whether the type of victim is right.
2413 *  1. We can't do it if dir is read-only (done in permission())
2414 *  2. We should have write and exec permissions on dir
2415 *  3. We can't remove anything from append-only dir
2416 *  4. We can't do anything with immutable dir (done in permission())
2417 *  5. If the sticky bit on dir is set we should either
2418 *      a. be owner of dir, or
2419 *      b. be owner of victim, or
2420 *      c. have CAP_FOWNER capability
2421 *  6. If the victim is append-only or immutable we can't do antyhing with
2422 *     links pointing to it.
2423 *  7. If we were asked to remove a directory and victim isn't one - ENOTDIR.
2424 *  8. If we were asked to remove a non-directory and victim isn't one - EISDIR.
2425 *  9. We can't remove a root or mountpoint.
2426 * 10. We don't allow removal of NFS sillyrenamed files; it's handled by
2427 *     nfs_async_unlink().
2428 */
2429static int may_delete(struct inode *dir,struct dentry *victim,int isdir)
2430{
2431        int error;
2432
2433        if (!victim->d_inode)
2434                return -ENOENT;
2435
2436        BUG_ON(victim->d_parent->d_inode != dir);
2437        audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);
2438
2439        error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
2440        if (error)
2441                return error;
2442        if (IS_APPEND(dir))
2443                return -EPERM;
2444        if (check_sticky(dir, victim->d_inode)||IS_APPEND(victim->d_inode)||
2445            IS_IMMUTABLE(victim->d_inode) || IS_SWAPFILE(victim->d_inode))
2446                return -EPERM;
2447        if (isdir) {
2448                if (!S_ISDIR(victim->d_inode->i_mode))
2449                        return -ENOTDIR;
2450                if (IS_ROOT(victim))
2451                        return -EBUSY;
2452        } else if (S_ISDIR(victim->d_inode->i_mode))
2453                return -EISDIR;
2454        if (IS_DEADDIR(dir))
2455                return -ENOENT;
2456        if (victim->d_flags & DCACHE_NFSFS_RENAMED)
2457                return -EBUSY;
2458        return 0;
2459}
2460
2461/*      Check whether we can create an object with dentry child in directory
2462 *  dir.
2463 *  1. We can't do it if child already exists (open has special treatment for
2464 *     this case, but since we are inlined it's OK)
2465 *  2. We can't do it if dir is read-only (done in permission())
2466 *  3. We should have write and exec permissions on dir
2467 *  4. We can't do it if dir is immutable (done in permission())
2468 */
2469static inline int may_create(struct inode *dir, struct dentry *child)
2470{
2471        if (child->d_inode)
2472                return -EEXIST;
2473        if (IS_DEADDIR(dir))
2474                return -ENOENT;
2475        return inode_permission(dir, MAY_WRITE | MAY_EXEC);
2476}
2477
2478/*
2479 * p1 and p2 should be directories on the same fs.
2480 */
2481struct dentry *lock_rename(struct dentry *p1, struct dentry *p2)
2482{
2483        struct dentry *p;
2484
2485        if (p1 == p2) {
2486                mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT);
2487                return NULL;
2488        }
2489
2490        mutex_lock(&p1->d_inode->i_sb->s_vfs_rename_mutex);
2491
2492        p = d_ancestor(p2, p1);
2493        if (p) {
2494                mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_PARENT);
2495                mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_CHILD);
2496                return p;
2497        }
2498
2499        p = d_ancestor(p1, p2);
2500        if (p) {
2501                mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT);
2502                mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_CHILD);
2503                return p;
2504        }
2505
2506        mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT);
2507        mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_CHILD);
2508        return NULL;
2509}
2510
2511void unlock_rename(struct dentry *p1, struct dentry *p2)
2512{
2513        mutex_unlock(&p1->d_inode->i_mutex);
2514        if (p1 != p2) {
2515                mutex_unlock(&p2->d_inode->i_mutex);
2516                mutex_unlock(&p1->d_inode->i_sb->s_vfs_rename_mutex);
2517        }
2518}
2519
2520int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
2521                bool want_excl)
2522{
2523        int error = may_create(dir, dentry);
2524        if (error)
2525                return error;
2526
2527        if (!dir->i_op->create)
2528                return -EACCES; /* shouldn't it be ENOSYS? */
2529        mode &= S_IALLUGO;
2530        mode |= S_IFREG;
2531        error = security_inode_create(dir, dentry, mode);
2532        if (error)
2533                return error;
2534        error = dir->i_op->create(dir, dentry, mode, want_excl);
2535        if (!error)
2536                fsnotify_create(dir, dentry);
2537        return error;
2538}
2539
2540static int may_open(struct path *path, int acc_mode, int flag)
2541{
2542        struct dentry *dentry = path->dentry;
2543        struct inode *inode = dentry->d_inode;
2544        int error;
2545
2546        /* O_PATH? */
2547        if (!acc_mode)
2548                return 0;
2549
2550        if (!inode)
2551                return -ENOENT;
2552
2553        switch (inode->i_mode & S_IFMT) {
2554        case S_IFLNK:
2555                return -ELOOP;
2556        case S_IFDIR:
2557                if (acc_mode & MAY_WRITE)
2558                        return -EISDIR;
2559                break;
2560        case S_IFBLK:
2561        case S_IFCHR:
2562                if (path->mnt->mnt_flags & MNT_NODEV)
2563                        return -EACCES;
2564                /*FALLTHRU*/
2565        case S_IFIFO:
2566        case S_IFSOCK:
2567                flag &= ~O_TRUNC;
2568                break;
2569        }
2570
2571        error = inode_permission(inode, acc_mode);
2572        if (error)
2573                return error;
2574
2575        /*
2576         * An append-only file must be opened in append mode for writing.
2577         */
2578        if (IS_APPEND(inode)) {
2579                if  ((flag & O_ACCMODE) != O_RDONLY && !(flag & O_APPEND))
2580                        return -EPERM;
2581                if (flag & O_TRUNC)
2582                        return -EPERM;
2583        }
2584
2585        /* O_NOATIME can only be set by the owner or superuser */
2586        if (flag & O_NOATIME && !inode_owner_or_capable(inode))
2587                return -EPERM;
2588
2589        return 0;
2590}
2591
2592static int handle_truncate(struct file *filp)
2593{
2594        struct path *path = &filp->f_path;
2595        struct inode *inode = path->dentry->d_inode;
2596        int error = get_write_access(inode);
2597        if (error)
2598                return error;
2599        /*
2600         * Refuse to truncate files with mandatory locks held on them.
2601         */
2602        error = locks_verify_locked(inode);
2603        if (!error)
2604                error = security_path_truncate(path);
2605        if (!error) {
2606                error = do_truncate(path->dentry, 0,
2607                                    ATTR_MTIME|ATTR_CTIME|ATTR_OPEN,
2608                                    filp);
2609        }
2610        put_write_access(inode);
2611        return error;
2612}
2613
/*
 * Convert open(2) flags for use by the lookup code: if both access-mode
 * bits are set (the O_ACCMODE field equals 3) the value is decremented by
 * one, turning that field into 2; all other flags pass through unchanged.
 */
static inline int open_to_namei_flags(int flag)
{
        return ((flag & O_ACCMODE) == 3) ? flag - 1 : flag;
}
2620
2621static int may_o_create(struct path *dir, struct dentry *dentry, umode_t mode)
2622{
2623        int error = security_path_mknod(dir, dentry, mode, 0);
2624        if (error)
2625                return error;
2626
2627        error = inode_permission(dir->dentry->d_inode, MAY_WRITE | MAY_EXEC);
2628        if (error)
2629                return error;
2630
2631        return security_inode_create(dir->dentry->d_inode, dentry, mode);
2632}
2633
2634/*
2635 * Attempt to atomically look up, create and open a file from a negative
2636 * dentry.
2637 *
2638 * Returns 0 if successful.  The file will have been created and attached to
2639 * @file by the filesystem calling finish_open().
2640 *
2641 * Returns 1 if the file was looked up only or didn't need creating.  The
2642 * caller will need to perform the open themselves.  @path will have been
2643 * updated to point to the new dentry.  This may be negative.
2644 *
2645 * Returns an error code otherwise.
     *
     * @dentry must be negative on entry (enforced with BUG_ON).  Every exit
     * through the "out" label drops the reference to @dentry; a return of 1
     * instead hands the (possibly replaced) dentry to the caller in
     * @path->dentry.
     *
     * @got_write tells whether the caller holds write access to the mount.
     * Without it O_CREAT is never passed to ->atomic_open(): the function
     * either falls back to a plain lookup or clears O_CREAT and remembers
     * -EROFS in create_error.
     *
     * @need_lookup is only consulted on the "no_open" fallback, where it
     * decides whether @dentry still has to go through lookup_real().
2646 */
2647static int atomic_open(struct nameidata *nd, struct dentry *dentry,
2648                        struct path *path, struct file *file,
2649                        const struct open_flags *op,
2650                        bool got_write, bool need_lookup,
2651                        int *opened)
2652{
2653        struct inode *dir =  nd->path.dentry->d_inode;
2654        unsigned open_flag = open_to_namei_flags(op->open_flag);
2655        umode_t mode;
2656        int error;
2657        int acc_mode;
            /* Error to report later if the file turns out not to exist. */
2658        int create_error = 0;
            /*
             * Sentinel stored in file->f_path.dentry before calling
             * ->atomic_open() so that a "returned 1" result which never
             * set the dentry can be detected.
             */
2659        struct dentry *const DENTRY_NOT_SET = (void *) -1UL;
2660        bool excl;
2661
2662        BUG_ON(dentry->d_inode);
2663
2664        /* Don't create child dentry for a dead directory. */
2665        if (unlikely(IS_DEADDIR(dir))) {
2666                error = -ENOENT;
2667                goto out;
2668        }
2669
2670        mode = op->mode;
2671        if ((open_flag & O_CREAT) && !IS_POSIXACL(dir))
2672                mode &= ~current_umask();
2673
            /* An exclusive create never truncates: drop O_TRUNC up front. */
2674        excl = (open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT);
2675        if (excl)
2676                open_flag &= ~O_TRUNC;
2677
2678        /*
2679         * Checking write permission is tricky, because we don't know if we are
2680         * going to actually need it: O_CREAT opens should work as long as the
2681         * file exists.  But checking existence breaks atomicity.  The trick is
2682         * to check access and if not granted clear O_CREAT from the flags.
2683         *
2684         * Another problem is returning the "right" error value (e.g. for an
2685         * O_EXCL open we want to return EEXIST not EROFS).
2686         */
2687        if (((open_flag & (O_CREAT | O_TRUNC)) ||
2688            (open_flag & O_ACCMODE) != O_RDONLY) && unlikely(!got_write)) {
2689                if (!(open_flag & O_CREAT)) {
2690                        /*
2691                         * No O_CREAT -> atomicity not a requirement -> fall
2692                         * back to lookup + open
2693                         */
2694                        goto no_open;
2695                } else if (open_flag & (O_EXCL | O_TRUNC)) {
2696                        /* Fall back and fail with the right error */
2697                        create_error = -EROFS;
2698                        goto no_open;
2699                } else {
2700                        /* No side effects, safe to clear O_CREAT */
2701                        create_error = -EROFS;
2702                        open_flag &= ~O_CREAT;
2703                }
2704        }
2705
            /*
             * Creation not permitted: remember why.  For O_EXCL fall back to
             * a plain lookup, otherwise retry as a non-creating open.
             */
2706        if (open_flag & O_CREAT) {
2707                error = may_o_create(&nd->path, dentry, mode);
2708                if (error) {
2709                        create_error = error;
2710                        if (open_flag & O_EXCL)
2711                                goto no_open;
2712                        open_flag &= ~O_CREAT;
2713                }
2714        }
2715
2716        if (nd->flags & LOOKUP_DIRECTORY)
2717                open_flag |= O_DIRECTORY;
2718
2719        file->f_path.dentry = DENTRY_NOT_SET;
2720        file->f_path.mnt = nd->path.mnt;
2721        error = dir->i_op->atomic_open(dir, dentry, file, open_flag, mode,
2722                                      opened);
2723        if (error < 0) {
                    /* -ENOENT after O_CREAT was stripped: report the real reason. */
2724                if (create_error && error == -ENOENT)
2725                        error = create_error;
2726                goto out;
2727        }
2728
2729        if (error) {    /* returned 1, that is */
2730                if (WARN_ON(file->f_path.dentry == DENTRY_NOT_SET)) {
2731                        error = -EIO;
2732                        goto out;
2733                }
                    /* The filesystem supplied a different dentry: switch to it. */
2734                if (file->f_path.dentry) {
2735                        dput(dentry);
2736                        dentry = file->f_path.dentry;
2737                }
2738                if (*opened & FILE_CREATED)
2739                        fsnotify_create(dir, dentry);
2740                if (!dentry->d_inode) {
2741                        WARN_ON(*opened & FILE_CREATED);
2742                        if (create_error) {
2743                                error = create_error;
2744                                goto out;
2745                        }
2746                } else {
2747                        if (excl && !(*opened & FILE_CREATED)) {
2748                                error = -EEXIST;
2749                                goto out;
2750                        }
2751                }
2752                goto looked_up;
2753        }
2754
2755        /*
2756         * We didn't have the inode before the open, so check open permission
2757         * here.
2758         */
2759        acc_mode = op->acc_mode;
2760        if (*opened & FILE_CREATED) {
2761                WARN_ON(!(open_flag & O_CREAT));
2762                fsnotify_create(dir, dentry);
2763                acc_mode = MAY_OPEN;
2764        }
2765        error = may_open(&file->f_path, acc_mode, open_flag);
2766        if (error)
2767                fput(file);
2768
2769out:
2770        dput(dentry);
2771        return error;
2772
            /* Fallback: no ->atomic_open() call, plain lookup if still needed. */
2773no_open:
2774        if (need_lookup) {
2775                dentry = lookup_real(dir, dentry, nd->flags);
2776                if (IS_ERR(dentry))
2777                        return PTR_ERR(dentry);
2778
2779                if (create_error) {
                            /* Note: uses the caller's original flags, not the local copy. */
2780                        int open_flag = op->open_flag;
2781
2782                        error = create_error;
2783                        if ((open_flag & O_EXCL)) {
2784                                if (!dentry->d_inode)
2785                                        goto out;
2786                        } else if (!dentry->d_inode) {
2787                                goto out;
2788                        } else if ((open_flag & O_TRUNC) &&
2789                                   S_ISREG(dentry->d_inode->i_mode)) {
2790                                goto out;
2791                        }
2792                        /* will fail later, go on to get the right error */
2793                }
2794        }
2795looked_up:
2796        path->dentry = dentry;
2797        path->mnt = nd->path.mnt;
2798        return 1;
2799}
2800
2801/*
2802 * Look up and maybe create and open the last component.
2803 *
2804 * Must be called with i_mutex held on parent.
2805 *
2806 * Returns 0 if the file was successfully atomically created (if necessary) and
2807 * opened.  In this case the file will be returned attached to @file.
2808 *
2809 * Returns 1 if the file was not completely opened at this time, though lookups
2810 * and creations will have been performed and the dentry returned in @path will
2811 * be positive upon return if O_CREAT was specified.  If O_CREAT wasn't
2812 * specified then a negative dentry may be returned.
2813 *
2814 * An error code is returned otherwise.
2815 *
2816 * FILE_CREATE will be set in @*opened if the dentry was created and will be
2817 * cleared otherwise prior to returning.
2818 */
2819static int lookup_open(struct nameidata *nd, struct path *path,
2820                        struct file *file,
2821                        const struct open_flags *op,
2822                        bool got_write, int *opened)
2823{
2824        struct dentry *dir = nd->path.dentry;
2825        struct inode *dir_inode = dir->d_inode;
2826        struct dentry *dentry;
2827        int error;
2828        bool need_lookup;
2829
2830        *opened &= ~FILE_CREATED;
2831        dentry = lookup_dcache(&nd->last, dir, nd->flags, &need_lookup);
2832        if (IS_ERR(dentry))
2833                return PTR_ERR(dentry);
2834
2835        /* Cached positive dentry: will open in f_op->open */
2836        if (!need_lookup && dentry->d_inode)
2837                goto out_no_open;
2838
2839        if ((nd->flags & LOOKUP_OPEN) && dir_inode->i_op->atomic_open) {
2840                return atomic_open(nd, dentry, path, file, op, got_write,
2841                                   need_lookup, opened);
2842        }
2843
2844        if (need_lookup) {
2845                BUG_ON(dentry->d_inode);
2846
2847                dentry = lookup_real(dir_inode, dentry, nd->flags);
2848                if (IS_ERR(dentry))
2849                        return PTR_ERR(dentry);
2850        }
2851
2852        /* Negative dentry, just create the file */
2853        if (!dentry->d_inode && (op->open_flag & O_CREAT)) {
2854                umode_t mode = op->mode;
2855                if (!IS_POSIXACL(dir->d_inode))
2856                        mode &= ~current_umask();
2857                /*
2858                 * This write is needed to ensure that a
2859                 * rw->ro transition does not occur between
2860                 * the time when the file is created and when
2861                 * a permanent write count is taken through
2862                 * the 'struct file' in finish_open().
2863                 */
2864                if (!got_write) {
2865                        error = -EROFS;
2866                        goto out_dput;
2867                }
2868                *opened |= FILE_CREATED;
2869                error = security_path_mknod(&nd->path, dentry, mode, 0);
2870                if (error)
2871                        goto out_dput;
2872                error = vfs_create(dir->d_inode, dentry, mode,
2873                                   nd->flags & LOOKUP_EXCL);
2874                if (error)
2875                        goto out_dput;
2876        }
2877out_no_open:
2878        path->dentry = dentry;
2879        path->mnt = nd->path.mnt;
2880        return 1;
2881
2882out_dput:
2883        dput(dentry);
2884        return error;
2885}
2886
2887/*
2888 * Handle the last step of open()
     *
     * Looks up (and, with O_CREAT, possibly creates) the final path component
     * held in @nd, then opens it into @file.
     *
     * Returns 0 once @file has been opened, a negative error code on failure,
     * or 1 when should_follow_link() says the final component has to be
     * followed as a symlink; in that last case the function returns without
     * calling terminate_walk().
     *
     * @opened is passed down to lookup_open()/finish_open() and inspected
     * for FILE_CREATED; @name is only used for audit records.
2889 */
2890static int do_last(struct nameidata *nd, struct path *path,
2891                   struct file *file, const struct open_flags *op,
2892                   int *opened, struct filename *name)
2893{
2894        struct dentry *dir = nd->path.dentry;
2895        int open_flag = op->open_flag;
2896        bool will_truncate = (open_flag & O_TRUNC) != 0;
            /* True while this function holds mnt_want_write() on nd->path.mnt. */
2897        bool got_write = false;
2898        int acc_mode = op->acc_mode;
2899        struct inode *inode;
2900        bool symlink_ok = false;
            /* Parent kept around so that an -EOPENSTALE open can be retried once. */
2901        struct path save_parent = { .dentry = NULL, .mnt = NULL };
2902        bool retried = false;
2903        int error;
2904
2905        nd->flags &= ~LOOKUP_PARENT;
2906        nd->flags |= op->intent;
2907
            /* Last component is not a normal name: no lookup needed. */
2908        if (nd->last_type != LAST_NORM) {
2909                error = handle_dots(nd, nd->last_type);
2910                if (error)
2911                        return error;
2912                goto finish_open;
2913        }
2914
2915        if (!(open_flag & O_CREAT)) {
                    /* Something follows the last component (trailing slash). */
2916                if (nd->last.name[nd->last.len])
2917                        nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
2918                if (open_flag & O_PATH && !(nd->flags & LOOKUP_FOLLOW))
2919                        symlink_ok = true;
2920                /* we _can_ be in RCU mode here */
2921                error = lookup_fast(nd, path, &inode);
2922                if (likely(!error))
2923                        goto finish_lookup;
2924
2925                if (error < 0)
2926                        goto out;
2927
                    /* Positive return from lookup_fast(): take the slow path below. */
2928                BUG_ON(nd->inode != dir->d_inode);
2929        } else {
2930                /* create side of things */
2931                /*
2932                 * This will *only* deal with leaving RCU mode - LOOKUP_JUMPED
2933                 * has been cleared when we got to the last component we are
2934                 * about to look up
2935                 */
2936                error = complete_walk(nd);
2937                if (error)
2938                        return error;
2939
2940                audit_inode(name, dir, LOOKUP_PARENT);
2941                error = -EISDIR;
2942                /* trailing slashes? */
2943                if (nd->last.name[nd->last.len])
2944                        goto out;
2945        }
2946
2947retry_lookup:
2948        if (op->open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) {
2949                error = mnt_want_write(nd->path.mnt);
2950                if (!error)
2951                        got_write = true;
2952                /*
2953                 * do _not_ fail yet - we might not need that or fail with
2954                 * a different error; let lookup_open() decide; we'll be
2955                 * dropping this one anyway.
2956                 */
2957        }
2958        mutex_lock(&dir->d_inode->i_mutex);
2959        error = lookup_open(nd, path, file, op, got_write, opened);
2960        mutex_unlock(&dir->d_inode->i_mutex);
2961
            /* 0: lookup_open() already opened the file; negative: it failed. */
2962        if (error <= 0) {
2963                if (error)
2964                        goto out;
2965
2966                if ((*opened & FILE_CREATED) ||
2967                    !S_ISREG(file_inode(file)->i_mode))
2968                        will_truncate = false;
2969
2970                audit_inode(name, file->f_path.dentry, 0);
2971                goto opened;
2972        }
2973
2974        if (*opened & FILE_CREATED) {
2975                /* Don't check for write permission, don't truncate */
2976                open_flag &= ~O_TRUNC;
2977                will_truncate = false;
2978                acc_mode = MAY_OPEN;
2979                path_to_nameidata(path, nd);
2980                goto finish_open_created;
2981        }
2982
2983        /*
2984         * create/update audit record if it already exists.
2985         */
2986        if (path->dentry->d_inode)
2987                audit_inode(name, path->dentry, 0);
2988
2989        /*
2990         * If atomic_open() acquired write access it is dropped now due to
2991         * possible mount and symlink following (this might be optimized away if
2992         * necessary...)
2993         */
2994        if (got_write) {
2995                mnt_drop_write(nd->path.mnt);
2996                got_write = false;
2997        }
2998
            /* The file was not created by us, so O_CREAT|O_EXCL must fail. */
2999        error = -EEXIST;
3000        if ((open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT))
3001                goto exit_dput;
3002
3003        error = follow_managed(path, nd->flags);
3004        if (error < 0)
3005                goto exit_dput;
3006
3007        if (error)
3008                nd->flags |= LOOKUP_JUMPED;
3009
3010        BUG_ON(nd->flags & LOOKUP_RCU);
3011        inode = path->dentry->d_inode;
3012finish_lookup:
3013        /* we _can_ be in RCU mode here */
3014        error = -ENOENT;
3015        if (!inode) {
3016                path_to_nameidata(path, nd);
3017                goto out;
3018        }
3019
3020        if (should_follow_link(inode, !symlink_ok)) {
3021                if (nd->flags & LOOKUP_RCU) {
3022                        if (unlikely(unlazy_walk(nd, path->dentry))) {
3023                                error = -ECHILD;
3024                                goto out;
3025                        }
3026                }
3027                BUG_ON(inode != path->dentry->d_inode);
3028                return 1;
3029        }
3030
3031        if ((nd->flags & LOOKUP_RCU) || nd->path.mnt != path->mnt) {
3032                path_to_nameidata(path, nd);
3033        } else {
                    /* Same mount, not RCU: keep the parent for the stale_open retry. */
3034                save_parent.dentry = nd->path.dentry;
3035                save_parent.mnt = mntget(path->mnt);
3036                nd->path.dentry = path->dentry;
3037
3038        }
3039        nd->inode = inode;
3040        /* Why this, you ask?  _Now_ we might have grown LOOKUP_JUMPED... */
3041finish_open:
3042        error = complete_walk(nd);
3043        if (error) {
3044                path_put(&save_parent);
3045                return error;
3046        }
3047        audit_inode(name, nd->path.dentry, 0);
3048        error = -EISDIR;
3049        if ((open_flag & O_CREAT) && S_ISDIR(nd->inode->i_mode))
3050                goto out;
3051        error = -ENOTDIR;
3052        if ((nd->flags & LOOKUP_DIRECTORY) && !can_lookup(nd->inode))
3053                goto out;
            /* Only regular files are truncated. */
3054        if (!S_ISREG(nd->inode->i_mode))
3055                will_truncate = false;
3056
3057        if (will_truncate) {
3058                error = mnt_want_write(nd->path.mnt);
3059                if (error)
3060                        goto out;
3061                got_write = true;
3062        }
3063finish_open_created:
3064        error = may_open(&nd->path, acc_mode, open_flag);
3065        if (error)
3066                goto out;
3067        file->f_path.mnt = nd->path.mnt;
3068        error = finish_open(file, nd->path.dentry, NULL, opened);
3069        if (error) {
3070                if (error == -EOPENSTALE)
3071                        goto stale_open;
3072                goto out;
3073        }
3074opened:
3075        error = open_check_o_direct(file);
3076        if (error)
3077                goto exit_fput;
3078        error = ima_file_check(file, op->acc_mode);
3079        if (error)
3080                goto exit_fput;
3081
3082        if (will_truncate) {
3083                error = handle_truncate(file);
3084                if (error)
3085                        goto exit_fput;
3086        }
            /* Common exit: drop write access and the saved parent, end the walk. */
3087out:
3088        if (got_write)
3089                mnt_drop_write(nd->path.mnt);
3090        path_put(&save_parent);
3091        terminate_walk(nd);
3092        return error;
3093
3094exit_dput:
3095        path_put_conditional(path, nd);
3096        goto out;
3097exit_fput:
3098        fput(file);
3099        goto out;
3100
3101stale_open:
3102        /* If no saved parent or already retried then can't retry */
3103        if (!save_parent.dentry || retried)
3104                goto out;
3105
            /* Rewind nd to the saved parent and redo the lookup exactly once. */
3106        BUG_ON(save_parent.dentry != dir);
3107        path_put(&nd->path);
3108        nd->path = save_parent;
3109        nd->inode = dir->d_inode;
3110        save_parent.mnt = NULL;
3111        save_parent.dentry = NULL;
3112        if (got_write) {
3113                mnt_drop_write(nd->path.mnt);
3114                got_write = false;
3115        }
3116        retried = true;
3117        goto retry_lookup;
3118}
3119
3120static int do_tmpfile(int dfd, struct filename *pathname,
3121                struct nameidata *nd, int flags,
3122                const struct open_flags *op,
3123                struct file *file, int *opened)
3124{
3125        static const struct qstr name = QSTR_INIT("/", 1);
3126        struct dentry *dentry, *child;
3127        struct inode *dir;
3128        int error = path_lookupat(dfd, pathname->name,
3129                                  flags | LOOKUP_DIRECTORY, nd);
3130        if (unlikely(error))
3131                return error;
3132        error = mnt_want_write(nd->path.mnt);
3133        if (unlikely(error))
3134                goto out;
3135        /* we want directory to be writable */
3136        error = inode_permission(nd->inode, MAY_WRITE | MAY_EXEC);
3137        if (error)
3138                goto out2;
3139        dentry = nd->path.dentry;
3140        dir = dentry->d_inode;
3141        if (!dir->i_op->tmpfile) {
3142                error = -EOPNOTSUPP;
3143                goto out2;
3144        }
3145        child = d_alloc(dentry, &name);
3146        if (unlikely(!child)) {
3147                error = -ENOMEM;
3148                goto out2;
3149        }
3150        nd->flags &= ~LOOKUP_DIRECTORY;
3151        nd->flags |= op->intent;
3152        dput(nd->path.dentry);
3153        nd->path.dentry = child;
3154        error = dir->i_op->tmpfile(dir, nd->path.dentry, op->mode);
3155        if (error)
3156                goto out2;
3157        audit_inode(pathname, nd->path.dentry, 0);
3158        error = may_open(&nd->path, op->acc_mode, op->open_flag);
3159        if (error)
3160                goto out2;
3161        file->f_path.mnt = nd->path.mnt;
3162        error = finish_open(file, nd->path.dentry, NULL, opened);
3163        if (error)
3164                goto out2;
3165        error = open_check_o_direct(file);
3166        if (error) {
3167                fput(file);
3168        } else if (!(op->open_flag & O_EXCL)) {
3169                struct inode *inode = file_inode(file);
3170                spin_lock(&inode->i_lock);
3171                inode->i_state |= I_LINKABLE;
3172                spin_unlock(&inode->i_lock);
3173        }
3174out2:
3175        mnt_drop_write(nd->path.mnt);
3176out:
3177        path_put(&nd->path);
3178        return error;
3179}
3180
3181static struct file *path_openat(int dfd, struct filename *pathname,
3182                struct nameidata *nd, const struct open_flags *op, int flags)
3183{
3184        struct file *base = NULL;
3185        struct file *file;
3186        struct path path;
3187        int opened = 0;
3188        int error;
3189
3190        file = get_empty_filp();
3191        if (IS_ERR(file))
3192                return file;
3193
3194        file->f_flags = op->open_flag;
3195
3196        if (unlikely(file->f_flags & __O_TMPFILE)) {
3197                error = do_tmpfile(dfd, pathname, nd, flags, op, file, &opened);
3198                goto out;
3199        }
3200
3201        error = path_init(dfd, pathname->name, flags | LOOKUP_PARENT, nd, &base);
3202        if (unlikely(error))
3203                goto out;
3204
3205        current->total_link_count = 0;
3206        error = link_path_walk(pathname->name, nd);
3207        if (unlikely(error))
3208                goto out;
3209
3210        error = do_last(nd, &path, file, op, &opened, pathname);
3211        while (unlikely(error > 0)) { /* trailing symlink */
3212                struct path link = path;
3213                void *cookie;
3214                if (!(nd->flags & LOOKUP_FOLLOW)) {
3215                        path_put_conditional(&path, nd);
3216                        path_put(&nd->path);
3217                        error = -ELOOP;
3218                        break;
3219                }
3220                error = may_follow_link(&link, nd);
3221                if (unlikely(error))
3222                        break;
3223                nd->flags |= LOOKUP_PARENT;
3224                nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL);
3225                error = follow_link(&link, nd, &cookie);
3226                if (unlikely(error))
3227                        break;
3228                error = do_last(nd, &path, file, op, &opened, pathname);
3229                put_link(nd, &link, cookie);
3230        }
3231out:
3232        if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT))
3233                path_put(&nd->root);
3234        if (base)
3235                fput(base);
3236        if (!(opened & FILE_OPENED)) {
3237                BUG_ON(!error);
3238                put_filp(file);
3239        }
3240        if (unlikely(error)) {
3241                if (error == -EOPENSTALE) {
3242                        if (flags & LOOKUP_RCU)
3243                                error = -ECHILD;
3244                        else
3245                                error = -ESTALE;
3246                }
3247                file = ERR_PTR(error);
3248        }
3249        return file;
3250}
3251
3252struct file *do_filp_open(int dfd, struct filename *pathname,
3253                const struct open_flags *op)
3254{
3255        struct nameidata nd;
3256        int flags = op->lookup_flags;
3257        struct file *filp;
3258
3259        filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_RCU);
3260        if (unlikely(filp == ERR_PTR(-ECHILD)))
3261                filp = path_openat(dfd, pathname, &nd, op, flags);
3262        if (unlikely(filp == ERR_PTR(-ESTALE)))
3263                filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_REVAL);
3264        return filp;
3265}
3266
3267struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt,
3268                const char *name, const struct open_flags *op)
3269{
3270        struct nameidata nd;
3271        struct file *file;
3272        struct filename filename = { .name = name };
3273        int flags = op->lookup_flags | LOOKUP_ROOT;
3274
3275        nd.root.mnt = mnt;
3276        nd.root.dentry = dentry;
3277
3278        if (dentry->d_inode->i_op->follow_link && op->intent & LOOKUP_OPEN)
3279                return ERR_PTR(-ELOOP);
3280
3281        file = path_openat(-1, &filename, &nd, op, flags | LOOKUP_RCU);
3282        if (unlikely(file == ERR_PTR(-ECHILD)))
3283                file = path_openat(-1, &filename, &nd, op, flags);
3284        if (unlikely(file == ERR_PTR(-ESTALE)))
3285                file = path_openat(-1, &filename, &nd, op, flags | LOOKUP_REVAL);
3286        return file;
3287}
3288
3289struct dentry *kern_path_create(int dfd, const char *pathname,
3290                                struct path *path, unsigned int lookup_flags)
3291{
3292        struct dentry *dentry = ERR_PTR(-EEXIST);
3293        struct nameidata nd;
3294        int err2;
3295        int error;
3296        bool is_dir = (lookup_flags & LOOKUP_DIRECTORY);
3297
3298        /*
3299         * Note that only LOOKUP_REVAL and LOOKUP_DIRECTORY matter here. Any
3300         * other flags passed in are ignored!
3301         */
3302        lookup_flags &= LOOKUP_REVAL;
3303
3304        error = do_path_lookup(dfd, pathname, LOOKUP_PARENT|lookup_flags, &nd);
3305        if (error)
3306                return ERR_PTR(error);
3307
3308        /*
3309         * Yucky last component or no last component at all?
3310         * (foo/., foo/.., /////)
3311         */
3312        if (nd.last_type != LAST_NORM)
3313                goto out;
3314        nd.flags &= ~LOOKUP_PARENT;
3315        nd.flags |= LOOKUP_CREATE | LOOKUP_EXCL;
3316
3317        /* don't fail immediately if it's r/o, at least try to report other errors */
3318        err2 = mnt_want_write(nd.path.mnt);
3319        /*
3320         * Do the final lookup.
3321         */
3322        mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
3323        dentry = lookup_hash(&nd);
3324        if (IS_ERR(dentry))
3325                goto unlock;
3326
3327        error = -EEXIST;
3328        if (dentry->d_inode)
3329                goto fail;
3330        /*
3331         * Special case - lookup gave negative, but... we had foo/bar/
3332         * From the vfs_mknod() POV we just have a negative dentry -
3333         * all is fine. Let's be bastards - you had / on the end, you've
3334         * been asking for (non-existent) directory. -ENOENT for you.
3335         */
3336        if (unlikely(!is_dir && nd.last.name[nd.last.len])) {
3337                error = -ENOENT;
3338                goto fail;
3339        }
3340        if (unlikely(err2)) {
3341                error = err2;
3342                goto fail;
3343        }
3344        *path = nd.path;
3345        return dentry;
3346fail:
3347        dput(dentry);
3348        dentry = ERR_PTR(error);
3349unlock:
3350        mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
3351        if (!err2)
3352                mnt_drop_write(nd.path.mnt);
3353out:
3354        path_put(&nd.path);
3355        return dentry;
3356}
3357EXPORT_SYMBOL(kern_path_create);
3358
3359void done_path_create(struct path *path, struct dentry *dentry)
3360{
3361        dput(dentry);
3362        mutex_unlock(&path->dentry->d_inode->i_mutex);
3363        mnt_drop_write(path->mnt);
3364        path_put(path);
3365}
3366EXPORT_SYMBOL(done_path_create);
3367
3368struct dentry *user_path_create(int dfd, const char __user *pathname,
3369                                struct path *path, unsigned int lookup_flags)
3370{
3371        struct filename *tmp = getname(pathname);
3372        struct dentry *res;
3373        if (IS_ERR(tmp))
3374                return ERR_CAST(tmp);
3375        res = kern_path_create(dfd, tmp->name, path, lookup_flags);
3376        putname(tmp);
3377        return res;
3378}
3379EXPORT_SYMBOL(user_path_create);
3380
3381int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
3382{
3383        int error = may_create(dir, dentry);
3384
3385        if (error)
3386                return error;
3387
3388        if ((S_ISCHR(mode) || S_ISBLK(mode)) && !capable(CAP_MKNOD))
3389                return -EPERM;
3390
3391        if (!dir->i_op->mknod)
3392                return -EPERM;
3393
3394        error = devcgroup_inode_mknod(mode, dev);
3395        if (error)
3396                return error;
3397
3398        error = security_inode_mknod(dir, dentry, mode, dev);
3399        if (error)
3400                return error;
3401
3402        error = dir->i_op->mknod(dir, dentry, mode, dev);
3403        if (!error)
3404                fsnotify_create(dir, dentry);
3405        return error;
3406}
3407
3408static int may_mknod(umode_t mode)
3409{
3410        switch (mode & S_IFMT) {
3411        case S_IFREG:
3412        case S_IFCHR:
3413        case S_IFBLK:
3414        case S_IFIFO:
3415        case S_IFSOCK:
3416        case 0: /* zero mode translates to S_IFREG */
3417                return 0;
3418        case S_IFDIR:
3419                return -EPERM;
3420        default:
3421                return -EINVAL;
3422        }
3423}
3424
3425SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode,
3426                unsigned, dev)
3427{
3428        struct dentry *dentry;
3429        struct path path;
3430        int error;
3431        unsigned int lookup_flags = 0;
3432
3433        error = may_mknod(mode);
3434        if (error)
3435                return error;
3436retry:
3437        dentry = user_path_create(dfd, filename, &path, lookup_flags);
3438        if (IS_ERR(dentry))
3439                return PTR_ERR(dentry);
3440
3441        if (!IS_POSIXACL(path.dentry->d_inode))
3442                mode &= ~current_umask();
3443        error = security_path_mknod(&path, dentry, mode, dev);
3444        if (error)
3445                goto out;
3446        switch (mode & S_IFMT) {
3447                case 0: case S_IFREG:
3448                        error = vfs_create(path.dentry->d_inode,dentry,mode,true);
3449                        break;
3450                case S_IFCHR: case S_IFBLK:
3451                        error = vfs_mknod(path.dentry->d_inode,dentry,mode,
3452                                        new_decode_dev(dev));
3453                        break;
3454                case S_IFIFO: case S_IFSOCK:
3455                        error = vfs_mknod(path.dentry->d_inode,dentry,mode,0);
3456                        break;
3457        }
3458out:
3459        done_path_create(&path, dentry);
3460        if (retry_estale(error, lookup_flags)) {
3461                lookup_flags |= LOOKUP_REVAL;
3462                goto retry;
3463        }
3464        return error;
3465}
3466
3467SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, dev)
3468{
3469        return sys_mknodat(AT_FDCWD, filename, mode, dev);
3470}
3471
3472int vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
3473{
3474        int error = may_create(dir, dentry);
3475        unsigned max_links = dir->i_sb->s_max_links;
3476
3477        if (error)
3478                return error;
3479
3480        if (!dir->i_op->mkdir)
3481                return -EPERM;
3482
3483        mode &= (S_IRWXUGO|S_ISVTX);
3484        error = security_inode_mkdir(dir, dentry, mode);
3485        if (error)
3486                return error;
3487
3488        if (max_links && dir->i_nlink >= max_links)
3489                return -EMLINK;
3490
3491        error = dir->i_op->mkdir(dir, dentry, mode);
3492        if (!error)
3493                fsnotify_mkdir(dir, dentry);
3494        return error;
3495}
3496
3497SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode)
3498{
3499        struct dentry *dentry;
3500        struct path path;
3501        int error;
3502        unsigned int lookup_flags = LOOKUP_DIRECTORY;
3503
3504retry:
3505        dentry = user_path_create(dfd, pathname, &path, lookup_flags);
3506        if (IS_ERR(dentry))
3507                return PTR_ERR(dentry);
3508
3509        if (!IS_POSIXACL(path.dentry->d_inode))
3510                mode &= ~current_umask();
3511        error = security_path_mkdir(&path, dentry, mode);
3512        if (!error)
3513                error = vfs_mkdir(path.dentry->d_inode, dentry, mode);
3514        done_path_create(&path, dentry);
3515        if (retry_estale(error, lookup_flags)) {
3516                lookup_flags |= LOOKUP_REVAL;
3517                goto retry;
3518        }
3519        return error;
3520}
3521
3522SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode)
3523{
3524        return sys_mkdirat(AT_FDCWD, pathname, mode);
3525}
3526
3527/*
3528 * The dentry_unhash() helper will try to drop the dentry early: we
3529 * should have a usage count of 1 if we're the only user of this
3530 * dentry, and if that is true (possibly after pruning the dcache),
3531 * then we drop the dentry now.
3532 *
3533 * A low-level filesystem can, if it choses, legally
3534 * do a
3535 *
3536 *      if (!d_unhashed(dentry))
3537 *              return -EBUSY;
3538 *
3539 * if it cannot handle the case of removing a directory
3540 * that is still in use by something else..
3541 */
3542void dentry_unhash(struct dentry *dentry)
3543{
3544        shrink_dcache_parent(dentry);
3545        spin_lock(&dentry->d_lock);
3546        if (dentry->d_lockref.count == 1)
3547                __d_drop(dentry);
3548        spin_unlock(&dentry->d_lock);
3549}
3550
3551int vfs_rmdir(struct inode *dir, struct dentry *dentry)
3552{
3553        int error = may_delete(dir, dentry, 1);
3554
3555        if (error)
3556                return error;
3557
3558        if (!dir->i_op->rmdir)
3559                return -EPERM;
3560
3561        dget(dentry);
3562        mutex_lock(&dentry->d_inode->i_mutex);
3563
3564        error = -EBUSY;
3565        if (d_mountpoint(dentry))
3566                goto out;
3567
3568        error = security_inode_rmdir(dir, dentry);
3569        if (error)
3570                goto out;
3571
3572        shrink_dcache_parent(dentry);
3573        error = dir->i_op->rmdir(dir, dentry);
3574        if (error)
3575                goto out;
3576
3577        dentry->d_inode->i_flags |= S_DEAD;
3578        dont_mount(dentry);
3579
3580out:
3581        mutex_unlock(&dentry->d_inode->i_mutex);
3582        dput(dentry);
3583        if (!error)
3584                d_delete(dentry);
3585        return error;
3586}
3587
3588static long do_rmdir(int dfd, const char __user *pathname)
3589{
3590        int error = 0;
3591        struct filename *name;
3592        struct dentry *dentry;
3593        struct nameidata nd;
3594        unsigned int lookup_flags = 0;
3595retry:
3596        name = user_path_parent(dfd, pathname, &nd, lookup_flags);
3597        if (IS_ERR(name))
3598                return PTR_ERR(name);
3599
3600        switch(nd.last_type) {
3601        case LAST_DOTDOT:
3602                error = -ENOTEMPTY;
3603                goto exit1;
3604        case LAST_DOT:
3605                error = -EINVAL;
3606                goto exit1;
3607        case LAST_ROOT:
3608                error = -EBUSY;
3609                goto exit1;
3610        }
3611
3612        nd.flags &= ~LOOKUP_PARENT;
3613        error = mnt_want_write(nd.path.mnt);
3614        if (error)
3615                goto exit1;
3616
3617        mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
3618        dentry = lookup_hash(&nd);
3619        error = PTR_ERR(dentry);
3620        if (IS_ERR(dentry))
3621                goto exit2;
3622        if (!dentry->d_inode) {
3623                error = -ENOENT;
3624                goto exit3;
3625        }
3626        error = security_path_rmdir(&nd.path, dentry);
3627        if (error)
3628                goto exit3;
3629        error = vfs_rmdir(nd.path.dentry->d_inode, dentry);
3630exit3:
3631        dput(dentry);
3632exit2:
3633        mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
3634        mnt_drop_write(nd.path.mnt);
3635exit1:
3636        path_put(&nd.path);
3637        putname(name);
3638        if (retry_estale(error, lookup_flags)) {
3639                lookup_flags |= LOOKUP_REVAL;
3640                goto retry;
3641        }
3642        return error;
3643}
3644
3645SYSCALL_DEFINE1(rmdir, const char __user *, pathname)
3646{
3647        return do_rmdir(AT_FDCWD, pathname);
3648}
3649
3650int vfs_unlink(struct inode *dir, struct dentry *dentry)
3651{
3652        int error = may_delete(dir, dentry, 0);
3653
3654        if (error)
3655                return error;
3656
3657        if (!dir->i_op->unlink)
3658                return -EPERM;
3659
3660        mutex_lock(&dentry->d_inode->i_mutex);
3661        if (d_mountpoint(dentry))
3662                error = -EBUSY;
3663        else {
3664                error = security_inode_unlink(dir, dentry);
3665                if (!error) {
3666                        error = dir->i_op->unlink(dir, dentry);
3667                        if (!error)
3668                                dont_mount(dentry);
3669                }
3670        }
3671        mutex_unlock(&dentry->d_inode->i_mutex);
3672
3673        /* We don't d_delete() NFS sillyrenamed files--they still exist. */
3674        if (!error && !(dentry->d_flags & DCACHE_NFSFS_RENAMED)) {
3675                fsnotify_link_count(dentry->d_inode);
3676                d_delete(dentry);
3677        }
3678
3679        return error;
3680}
3681
3682/*
3683 * Make sure that the actual truncation of the file will occur outside its
3684 * directory's i_mutex.  Truncate can take a long time if there is a lot of
3685 * writeout happening, and we don't want to prevent access to the directory
3686 * while waiting on the I/O.
3687 */
3688static long do_unlinkat(int dfd, const char __user *pathname)
3689{
3690        int error;
3691        struct filename *name;
3692        struct dentry *dentry;
3693        struct nameidata nd;
3694        struct inode *inode = NULL;
3695        unsigned int lookup_flags = 0;
3696retry:
3697        name = user_path_parent(dfd, pathname, &nd, lookup_flags);
3698        if (IS_ERR(name))
3699                return PTR_ERR(name);
3700
3701        error = -EISDIR;
3702        if (nd.last_type != LAST_NORM)
3703                goto exit1;
3704
3705        nd.flags &= ~LOOKUP_PARENT;
3706        error = mnt_want_write(nd.path.mnt);
3707        if (error)
3708                goto exit1;
3709
3710        mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
3711        dentry = lookup_hash(&nd);
3712        error = PTR_ERR(dentry);
3713        if (!IS_ERR(dentry)) {
3714                /* Why not before? Because we want correct error value */
3715                if (nd.last.name[nd.last.len])
3716                        goto slashes;
3717                inode = dentry->d_inode;
3718                if (!inode)
3719                        goto slashes;
3720                ihold(inode);
3721                error = security_path_unlink(&nd.path, dentry);
3722                if (error)
3723                        goto exit2;
3724                error = vfs_unlink(nd.path.dentry->d_inode, dentry);
3725exit2:
3726                dput(dentry);
3727        }
3728        mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
3729        if (inode)
3730                iput(inode);    /* truncate the inode here */
3731        mnt_drop_write(nd.path.mnt);
3732exit1:
3733        path_put(&nd.path);
3734        putname(name);
3735        if (retry_estale(error, lookup_flags)) {
3736                lookup_flags |= LOOKUP_REVAL;
3737                inode = NULL;
3738                goto retry;
3739        }
3740        return error;
3741
3742slashes:
3743        error = !dentry->d_inode ? -ENOENT :
3744                S_ISDIR(dentry->d_inode->i_mode) ? -EISDIR : -ENOTDIR;
3745        goto exit2;
3746}
3747
3748SYSCALL_DEFINE3(unlinkat, int, dfd, const char __user *, pathname, int, flag)
3749{
3750        if ((flag & ~AT_REMOVEDIR) != 0)
3751                return -EINVAL;
3752
3753        if (flag & AT_REMOVEDIR)
3754                return do_rmdir(dfd, pathname);
3755
3756        return do_unlinkat(dfd, pathname);
3757}
3758
3759SYSCALL_DEFINE1(unlink, const char __user *, pathname)
3760{
3761        return do_unlinkat(AT_FDCWD, pathname);
3762}
3763
3764int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname)
3765{
3766        int error = may_create(dir, dentry);
3767
3768        if (error)
3769                return error;
3770
3771        if (!dir->i_op->symlink)
3772                return -EPERM;
3773
3774        error = security_inode_symlink(dir, dentry, oldname);
3775        if (error)
3776                return error;
3777
3778        error = dir->i_op->symlink(dir, dentry, oldname);
3779        if (!error)
3780                fsnotify_create(dir, dentry);
3781        return error;
3782}
3783
3784SYSCALL_DEFINE3(symlinkat, const char __user *, oldname,
3785                int, newdfd, const char __user *, newname)
3786{
3787        int error;
3788        struct filename *from;
3789        struct dentry *dentry;
3790        struct path path;
3791        unsigned int lookup_flags = 0;
3792
3793        from = getname(oldname);
3794        if (IS_ERR(from))
3795                return PTR_ERR(from);
3796retry:
3797        dentry = user_path_create(newdfd, newname, &path, lookup_flags);
3798        error = PTR_ERR(dentry);
3799        if (IS_ERR(dentry))
3800                goto out_putname;
3801
3802        error = security_path_symlink(&path, dentry, from->name);
3803        if (!error)
3804                error = vfs_symlink(path.dentry->d_inode, dentry, from->name);
3805        done_path_create(&path, dentry);
3806        if (retry_estale(error, lookup_flags)) {
3807                lookup_flags |= LOOKUP_REVAL;
3808                goto retry;
3809        }
3810out_putname:
3811        putname(from);
3812        return error;
3813}
3814
3815SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newname)
3816{
3817        return sys_symlinkat(oldname, AT_FDCWD, newname);
3818}
3819
3820int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry)
3821{
3822        struct inode *inode = old_dentry->d_inode;
3823        unsigned max_links = dir->i_sb->s_max_links;
3824        int error;
3825
3826        if (!inode)
3827                return -ENOENT;
3828
3829        error = may_create(dir, new_dentry);
3830        if (error)
3831                return error;
3832
3833        if (dir->i_sb != inode->i_sb)
3834                return -EXDEV;
3835
3836        /*
3837         * A link to an append-only or immutable file cannot be created.
3838         */
3839        if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
3840                return -EPERM;
3841        if (!dir->i_op->link)
3842                return -EPERM;
3843        if (S_ISDIR(inode->i_mode))
3844                return -EPERM;
3845
3846        error = security_inode_link(old_dentry, dir, new_dentry);
3847        if (error)
3848                return error;
3849
3850        mutex_lock(&inode->i_mutex);
3851        /* Make sure we don't allow creating hardlink to an unlinked file */
3852        if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE))
3853                error =  -ENOENT;
3854        else if (max_links && inode->i_nlink >= max_links)
3855                error = -EMLINK;
3856        else
3857                error = dir->i_op->link(old_dentry, dir, new_dentry);
3858
3859        if (!error && (inode->i_state & I_LINKABLE)) {
3860                spin_lock(&inode->i_lock);
3861                inode->i_state &= ~I_LINKABLE;
3862                spin_unlock(&inode->i_lock);
3863        }
3864        mutex_unlock(&inode->i_mutex);
3865        if (!error)
3866                fsnotify_link(dir, inode, new_dentry);
3867        return error;
3868}
3869
3870/*
3871 * Hardlinks are often used in delicate situations.  We avoid
3872 * security-related surprises by not following symlinks on the
3873 * newname.  --KAB
3874 *
3875 * We don't follow them on the oldname either to be compatible
3876 * with linux 2.0, and to avoid hard-linking to directories
3877 * and other special files.  --ADM
3878 */
3879SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
3880                int, newdfd, const char __user *, newname, int, flags)
3881{
3882        struct dentry *new_dentry;
3883        struct path old_path, new_path;
3884        int how = 0;
3885        int error;
3886
3887        if ((flags & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0)
3888                return -EINVAL;
3889        /*
3890         * To use null names we require CAP_DAC_READ_SEARCH
3891         * This ensures that not everyone will be able to create
3892         * handlink using the passed filedescriptor.
3893         */
3894        if (flags & AT_EMPTY_PATH) {
3895                if (!capable(CAP_DAC_READ_SEARCH))
3896                        return -ENOENT;
3897                how = LOOKUP_EMPTY;
3898        }
3899
3900        if (flags & AT_SYMLINK_FOLLOW)
3901                how |= LOOKUP_FOLLOW;
3902retry:
3903        error = user_path_at(olddfd, oldname, how, &old_path);
3904        if (error)
3905                return error;
3906
3907        new_dentry = user_path_create(newdfd, newname, &new_path,
3908                                        (how & LOOKUP_REVAL));
3909        error = PTR_ERR(new_dentry);
3910        if (IS_ERR(new_dentry))
3911                goto out;
3912
3913        error = -EXDEV;
3914        if (old_path.mnt != new_path.mnt)
3915                goto out_dput;
3916        error = may_linkat(&old_path);
3917        if (unlikely(error))
3918                goto out_dput;
3919        error = security_path_link(old_path.dentry, &new_path, new_dentry);
3920        if (error)
3921                goto out_dput;
3922        error = vfs_link(old_path.dentry, new_path.dentry->d_inode, new_dentry);
3923out_dput:
3924        done_path_create(&new_path, new_dentry);
3925        if (retry_estale(error, how)) {
3926                how |= LOOKUP_REVAL;
3927                goto retry;
3928        }
3929out:
3930        path_put(&old_path);
3931
3932        return error;
3933}
3934
3935SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname)
3936{
3937        return sys_linkat(AT_FDCWD, oldname, AT_FDCWD, newname, 0);
3938}
3939
3940/*
3941 * The worst of all namespace operations - renaming directory. "Perverted"
3942 * doesn't even start to describe it. Somebody in UCB had a heck of a trip...
3943 * Problems:
3944 *      a) we can get into loop creation. Check is done in is_subdir().
3945 *      b) race potential - two innocent renames can create a loop together.
3946 *         That's where 4.4 screws up. Current fix: serialization on
3947 *         sb->s_vfs_rename_mutex. We might be more accurate, but that's another
3948 *         story.
3949 *      c) we have to lock _three_ objects - parents and victim (if it exists).
3950 *         And that - after we got ->i_mutex on parents (until then we don't know
3951 *         whether the target exists).  Solution: try to be smart with locking
3952 *         order for inodes.  We rely on the fact that tree topology may change
3953 *         only under ->s_vfs_rename_mutex _and_ that parent of the object we
3954 *         move will be locked.  Thus we can rank directories by the tree
3955 *         (ancestors first) and rank all non-directories after them.
3956 *         That works since everybody except rename does "lock parent, lookup,
3957 *         lock child" and rename is under ->s_vfs_rename_mutex.
3958 *         HOWEVER, it relies on the assumption that any object with ->lookup()
3959 *         has no more than 1 dentry.  If "hybrid" objects will ever appear,
3960 *         we'd better make sure that there's no link(2) for them.
3961 *      d) conversion from fhandle to dentry may come in the wrong moment - when
3962 *         we are removing the target. Solution: we will have to grab ->i_mutex
3963 *         in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on
3964 *         ->i_mutex on parents, which works but leads to some truly excessive
3965 *         locking].
3966 */
3967static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
3968                          struct inode *new_dir, struct dentry *new_dentry)
3969{
3970        int error = 0;
3971        struct inode *target = new_dentry->d_inode;
3972        unsigned max_links = new_dir->i_sb->s_max_links;
3973
3974        /*
3975         * If we are going to change the parent - check write permissions,
3976         * we'll need to flip '..'.
3977         */
3978        if (new_dir != old_dir) {
3979                error = inode_permission(old_dentry->d_inode, MAY_WRITE);
3980                if (error)
3981                        return error;
3982        }
3983
3984        error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry);
3985        if (error)
3986                return error;
3987
3988        dget(new_dentry);
3989        if (target)
3990                mutex_lock(&target->i_mutex);
3991
3992        error = -EBUSY;
3993        if (d_mountpoint(old_dentry) || d_mountpoint(new_dentry))
3994                goto out;
3995
3996        error = -EMLINK;
3997        if (max_links && !target && new_dir != old_dir &&
3998            new_dir->i_nlink >= max_links)
3999                goto out;
4000
4001        if (target)
4002                shrink_dcache_parent(new_dentry);
4003        error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
4004        if (error)
4005                goto out;
4006
4007        if (target) {
4008                target->i_flags |= S_DEAD;
4009                dont_mount(new_dentry);
4010        }
4011out:
4012        if (target)
4013                mutex_unlock(&target->i_mutex);
4014        dput(new_dentry);
4015        if (!error)
4016                if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE))
4017                        d_move(old_dentry,new_dentry);
4018        return error;
4019}
4020
4021static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry,
4022                            struct inode *new_dir, struct dentry *new_dentry)
4023{
4024        struct inode *target = new_dentry->d_inode;
4025        int error;
4026
4027        error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry);
4028        if (error)
4029                return error;
4030
4031        dget(new_dentry);
4032        if (target)
4033                mutex_lock(&target->i_mutex);
4034
4035        error = -EBUSY;
4036        if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry))
4037                goto out;
4038
4039        error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
4040        if (error)
4041                goto out;
4042
4043        if (target)
4044                dont_mount(new_dentry);
4045        if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE))
4046                d_move(old_dentry, new_dentry);
4047out:
4048        if (target)
4049                mutex_unlock(&target->i_mutex);
4050        dput(new_dentry);
4051        return error;
4052}
4053
4054int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
4055               struct inode *new_dir, struct dentry *new_dentry)
4056{
4057        int error;
4058        int is_dir = S_ISDIR(old_dentry->d_inode->i_mode);
4059        const unsigned char *old_name;
4060
4061        if (old_dentry->d_inode == new_dentry->d_inode)
4062                return 0;
4063 
4064        error = may_delete(old_dir, old_dentry, is_dir);
4065        if (error)
4066                return error;
4067
4068        if (!new_dentry->d_inode)
4069                error = may_create(new_dir, new_dentry);
4070        else
4071                error = may_delete(new_dir, new_dentry, is_dir);
4072        if (error)
4073                return error;
4074
4075        if (!old_dir->i_op->rename)
4076                return -EPERM;
4077
4078        old_name = fsnotify_oldname_init(old_dentry->d_name.name);
4079
4080        if (is_dir)
4081                error = vfs_rename_dir(old_dir,old_dentry,new_dir,new_dentry);
4082        else
4083                error = vfs_rename_other(old_dir,old_dentry,new_dir,new_dentry);
4084        if (!error)
4085                fsnotify_move(old_dir, new_dir, old_name, is_dir,
4086                              new_dentry->d_inode, old_dentry);
4087        fsnotify_oldname_free(old_name);
4088
4089        return error;
4090}
4091
4092SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,
4093                int, newdfd, const char __user *, newname)
4094{
4095        struct dentry *old_dir, *new_dir;
4096        struct dentry *old_dentry, *new_dentry;
4097        struct dentry *trap;
4098        struct nameidata oldnd, newnd;
4099        struct filename *from;
4100        struct filename *to;
4101        unsigned int lookup_flags = 0;
4102        bool should_retry = false;
4103        int error;
4104retry:
4105        from = user_path_parent(olddfd, oldname, &oldnd, lookup_flags);
4106        if (IS_ERR(from)) {
4107                error = PTR_ERR(from);
4108                goto exit;
4109        }
4110
4111        to = user_path_parent(newdfd, newname, &newnd, lookup_flags);
4112        if (IS_ERR(to)) {
4113                error = PTR_ERR(to);
4114                goto exit1;
4115        }
4116
4117        error = -EXDEV;
4118        if (oldnd.path.mnt != newnd.path.mnt)
4119                goto exit2;
4120
4121        old_dir = oldnd.path.dentry;
4122        error = -EBUSY;
4123        if (oldnd.last_type != LAST_NORM)
4124                goto exit2;
4125
4126        new_dir = newnd.path.dentry;
4127        if (newnd.last_type != LAST_NORM)
4128                goto exit2;
4129
4130        error = mnt_want_write(oldnd.path.mnt);
4131        if (error)
4132                goto exit2;
4133
4134        oldnd.flags &= ~LOOKUP_PARENT;
4135        newnd.flags &= ~LOOKUP_PARENT;
4136        newnd.flags |= LOOKUP_RENAME_TARGET;
4137
4138        trap = lock_rename(new_dir, old_dir);
4139
4140        old_dentry = lookup_hash(&oldnd);
4141        error = PTR_ERR(old_dentry);
4142        if (IS_ERR(old_dentry))
4143                goto exit3;
4144        /* source must exist */
4145        error = -ENOENT;
4146        if (!old_dentry->d_inode)
4147                goto exit4;
4148        /* unless the source is a directory trailing slashes give -ENOTDIR */
4149        if (!S_ISDIR(old_dentry->d_inode->i_mode)) {
4150                error = -ENOTDIR;
4151                if (oldnd.last.name[oldnd.last.len])
4152                        goto exit4;
4153                if (newnd.last.name[newnd.last.len])
4154                        goto exit4;
4155        }
4156        /* source should not be ancestor of target */
4157        error = -EINVAL;
4158        if (old_dentry == trap)
4159                goto exit4;
4160        new_dentry = lookup_hash(&newnd);
4161        error = PTR_ERR(new_dentry);
4162        if (IS_ERR(new_dentry))
4163                goto exit4;
4164        /* target should not be an ancestor of source */
4165        error = -ENOTEMPTY;
4166        if (new_dentry == trap)
4167                goto exit5;
4168
4169        error = security_path_rename(&oldnd.path, old_dentry,
4170                                     &newnd.path, new_dentry);
4171        if (error)
4172                goto exit5;
4173        error = vfs_rename(old_dir->d_inode, old_dentry,
4174                                   new_dir->d_inode, new_dentry);
4175exit5:
4176        dput(new_dentry);
4177exit4:
4178        dput(old_dentry);
4179exit3:
4180        unlock_rename(new_dir, old_dir);
4181        mnt_drop_write(oldnd.path.mnt);
4182exit2:
4183        if (retry_estale(error, lookup_flags))
4184                should_retry = true;
4185        path_put(&newnd.path);
4186        putname(to);
4187exit1:
4188        path_put(&oldnd.path);
4189        putname(from);
4190        if (should_retry) {
4191                should_retry = false;
4192                lookup_flags |= LOOKUP_REVAL;
4193                goto retry;
4194        }
4195exit:
4196        return error;
4197}
4198
4199SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newname)
4200{
4201        return sys_renameat(AT_FDCWD, oldname, AT_FDCWD, newname);
4202}
4203
4204int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen, const char *link)
4205{
4206        int len;
4207
4208        len = PTR_ERR(link);
4209        if (IS_ERR(link))
4210                goto out;
4211
4212        len = strlen(link);
4213        if (len > (unsigned) buflen)
4214                len = buflen;
4215        if (copy_to_user(buffer, link, len))
4216                len = -EFAULT;
4217out:
4218        return len;
4219}
4220
4221/*
4222 * A helper for ->readlink().  This should be used *ONLY* for symlinks that
4223 * have ->follow_link() touching nd only in nd_set_link().  Using (or not
4224 * using) it for any given inode is up to filesystem.
4225 */
4226int generic_readlink(struct dentry *dentry, char __user *buffer, int buflen)
4227{
4228        struct nameidata nd;
4229        void *cookie;
4230        int res;
4231
4232        nd.depth = 0;
4233        cookie = dentry->d_inode->i_op->follow_link(dentry, &nd);
4234        if (IS_ERR(cookie))
4235                return PTR_ERR(cookie);
4236
4237        res = vfs_readlink(dentry, buffer, buflen, nd_get_link(&nd));
4238        if (dentry->d_inode->i_op->put_link)
4239                dentry->d_inode->i_op->put_link(dentry, &nd, cookie);
4240        return res;
4241}
4242
4243/* get the link contents into pagecache */
4244static char *page_getlink(struct dentry * dentry, struct page **ppage)
4245{
4246        char *kaddr;
4247        struct page *page;
4248        struct address_space *mapping = dentry->d_inode->i_mapping;
4249        page = read_mapping_page(mapping, 0, NULL);
4250        if (IS_ERR(page))
4251                return (char*)page;
4252        *ppage = page;
4253        kaddr = kmap(page);
4254        nd_terminate_link(kaddr, dentry->d_inode->i_size, PAGE_SIZE - 1);
4255        return kaddr;
4256}
4257
4258int page_readlink(struct dentry *dentry, char __user *buffer, int buflen)
4259{
4260        struct page *page = NULL;
4261        char *s = page_getlink(dentry, &page);
4262        int res = vfs_readlink(dentry,buffer,buflen,s);
4263        if (page) {
4264                kunmap(page);
4265                page_cache_release(page);
4266        }
4267        return res;
4268}
4269
4270void *page_follow_link_light(struct dentry *dentry, struct nameidata *nd)
4271{
4272        struct page *page = NULL;
4273        nd_set_link(nd, page_getlink(dentry, &page));
4274        return page;
4275}
4276
/*
 * ->put_link() counterpart of page_follow_link_light(): @cookie is the
 * page returned there, or NULL when there is nothing to release.
 */
void page_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
{
	struct page *link_page = cookie;

	if (!link_page)
		return;

	kunmap(link_page);
	page_cache_release(link_page);
}
4286
4287/*
4288 * The nofs argument instructs pagecache_write_begin to pass AOP_FLAG_NOFS
4289 */
4290int __page_symlink(struct inode *inode, const char *symname, int len, int nofs)
4291{
4292        struct address_space *mapping = inode->i_mapping;
4293        struct page *page;
4294        void *fsdata;
4295        int err;
4296        char *kaddr;
4297        unsigned int flags = AOP_FLAG_UNINTERRUPTIBLE;
4298        if (nofs)
4299                flags |= AOP_FLAG_NOFS;
4300
4301retry:
4302        err = pagecache_write_begin(NULL, mapping, 0, len-1,
4303                                flags, &page, &fsdata);
4304        if (err)
4305                goto fail;
4306
4307        kaddr = kmap_atomic(page);
4308        memcpy(kaddr, symname, len-1);
4309        kunmap_atomic(kaddr);
4310
4311        err = pagecache_write_end(NULL, mapping, 0, len-1, len-1,
4312                                                        page, fsdata);
4313        if (err < 0)
4314                goto fail;
4315        if (err < len-1)
4316                goto retry;
4317
4318        mark_inode_dirty(inode);
4319        return 0;
4320fail:
4321        return err;
4322}
4323
4324int page_symlink(struct inode *inode, const char *symname, int len)
4325{
4326        return __page_symlink(inode, symname, len,
4327                        !(mapping_gfp_mask(inode->i_mapping) & __GFP_FS));
4328}
4329
4330const struct inode_operations page_symlink_inode_operations = {
4331        .readlink       = generic_readlink,
4332        .follow_link    = page_follow_link_light,
4333        .put_link       = page_put_link,
4334};
4335
/* Symbols exported from this file, grouped by name family. */

/* path lookup entry points */
EXPORT_SYMBOL(user_path_at);
EXPORT_SYMBOL(kern_path);
EXPORT_SYMBOL(vfs_path_lookup);
EXPORT_SYMBOL(lookup_one_len);

/* follow_* helpers */
EXPORT_SYMBOL(follow_down_one);
EXPORT_SYMBOL(follow_down);
EXPORT_SYMBOL(follow_up);

/* permission and access helpers */
EXPORT_SYMBOL(inode_permission);
EXPORT_SYMBOL(generic_permission);
EXPORT_SYMBOL(get_write_access); /* nfsd */

/* rename locking */
EXPORT_SYMBOL(lock_rename);
EXPORT_SYMBOL(unlock_rename);

/* vfs_* operations */
EXPORT_SYMBOL(vfs_create);
EXPORT_SYMBOL(vfs_link);
EXPORT_SYMBOL(vfs_mkdir);
EXPORT_SYMBOL(vfs_mknod);
EXPORT_SYMBOL(vfs_readlink);
EXPORT_SYMBOL(vfs_rename);
EXPORT_SYMBOL(vfs_rmdir);
EXPORT_SYMBOL(vfs_symlink);
EXPORT_SYMBOL(vfs_unlink);

/* dentry helper */
EXPORT_SYMBOL(dentry_unhash);

/* symlink helpers */
EXPORT_SYMBOL(generic_readlink);
EXPORT_SYMBOL(page_follow_link_light);
EXPORT_SYMBOL(page_put_link);
EXPORT_SYMBOL(page_readlink);
EXPORT_SYMBOL(__page_symlink);
EXPORT_SYMBOL(page_symlink);
EXPORT_SYMBOL(page_symlink_inode_operations);
4365