linux/fs/open.c
<<
>>
Prefs
   1/*
   2 *  linux/fs/open.c
   3 *
   4 *  Copyright (C) 1991, 1992  Linus Torvalds
   5 */
   6
   7#include <linux/string.h>
   8#include <linux/mm.h>
   9#include <linux/file.h>
  10#include <linux/fdtable.h>
  11#include <linux/fsnotify.h>
  12#include <linux/module.h>
  13#include <linux/tty.h>
  14#include <linux/namei.h>
  15#include <linux/backing-dev.h>
  16#include <linux/capability.h>
  17#include <linux/securebits.h>
  18#include <linux/security.h>
  19#include <linux/mount.h>
  20#include <linux/fcntl.h>
  21#include <linux/slab.h>
  22#include <asm/uaccess.h>
  23#include <linux/fs.h>
  24#include <linux/personality.h>
  25#include <linux/pagemap.h>
  26#include <linux/syscalls.h>
  27#include <linux/rcupdate.h>
  28#include <linux/audit.h>
  29#include <linux/falloc.h>
  30#include <linux/fs_struct.h>
  31#include <linux/ima.h>
  32#include <linux/dnotify.h>
  33#include <linux/compat.h>
  34
  35#include "internal.h"
  36
  37int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,
  38        struct file *filp)
  39{
  40        int ret;
  41        struct iattr newattrs;
  42
  43        /* Not pretty: "inode->i_size" shouldn't really be signed. But it is. */
  44        if (length < 0)
  45                return -EINVAL;
  46
  47        newattrs.ia_size = length;
  48        newattrs.ia_valid = ATTR_SIZE | time_attrs;
  49        if (filp) {
  50                newattrs.ia_file = filp;
  51                newattrs.ia_valid |= ATTR_FILE;
  52        }
  53
  54        /* Remove suid, sgid, and file capabilities on truncate too */
  55        ret = dentry_needs_remove_privs(dentry);
  56        if (ret < 0)
  57                return ret;
  58        if (ret)
  59                newattrs.ia_valid |= ret | ATTR_FORCE;
  60
  61        mutex_lock(&dentry->d_inode->i_mutex);
  62        /* Note any delegations or leases have already been broken: */
  63        ret = notify_change(dentry, &newattrs, NULL);
  64        mutex_unlock(&dentry->d_inode->i_mutex);
  65        return ret;
  66}
  67
  68long vfs_truncate(struct path *path, loff_t length)
  69{
  70        struct inode *inode;
  71        struct dentry *upperdentry;
  72        long error;
  73
  74        inode = path->dentry->d_inode;
  75
  76        /* For directories it's -EISDIR, for other non-regulars - -EINVAL */
  77        if (S_ISDIR(inode->i_mode))
  78                return -EISDIR;
  79        if (!S_ISREG(inode->i_mode))
  80                return -EINVAL;
  81
  82        error = mnt_want_write(path->mnt);
  83        if (error)
  84                goto out;
  85
  86        error = inode_permission(inode, MAY_WRITE);
  87        if (error)
  88                goto mnt_drop_write_and_out;
  89
  90        error = -EPERM;
  91        if (IS_APPEND(inode))
  92                goto mnt_drop_write_and_out;
  93
  94        /*
  95         * If this is an overlayfs then do as if opening the file so we get
  96         * write access on the upper inode, not on the overlay inode.  For
  97         * non-overlay filesystems d_real() is an identity function.
  98         */
  99        upperdentry = d_real(path->dentry, NULL, O_WRONLY);
 100        error = PTR_ERR(upperdentry);
 101        if (IS_ERR(upperdentry))
 102                goto mnt_drop_write_and_out;
 103
 104        error = get_write_access(upperdentry->d_inode);
 105        if (error)
 106                goto mnt_drop_write_and_out;
 107
 108        /*
 109         * Make sure that there are no leases.  get_write_access() protects
 110         * against the truncate racing with a lease-granting setlease().
 111         */
 112        error = break_lease(inode, O_WRONLY);
 113        if (error)
 114                goto put_write_and_out;
 115
 116        error = locks_verify_truncate(inode, NULL, length);
 117        if (!error)
 118                error = security_path_truncate(path);
 119        if (!error)
 120                error = do_truncate(path->dentry, length, 0, NULL);
 121
 122put_write_and_out:
 123        put_write_access(upperdentry->d_inode);
 124mnt_drop_write_and_out:
 125        mnt_drop_write(path->mnt);
 126out:
 127        return error;
 128}
 129EXPORT_SYMBOL_GPL(vfs_truncate);
 130
 131static long do_sys_truncate(const char __user *pathname, loff_t length)
 132{
 133        unsigned int lookup_flags = LOOKUP_FOLLOW;
 134        struct path path;
 135        int error;
 136
 137        if (length < 0) /* sorry, but loff_t says... */
 138                return -EINVAL;
 139
 140retry:
 141        error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
 142        if (!error) {
 143                error = vfs_truncate(&path, length);
 144                path_put(&path);
 145        }
 146        if (retry_estale(error, lookup_flags)) {
 147                lookup_flags |= LOOKUP_REVAL;
 148                goto retry;
 149        }
 150        return error;
 151}
 152
 153SYSCALL_DEFINE2(truncate, const char __user *, path, long, length)
 154{
 155        return do_sys_truncate(path, length);
 156}
 157
 158#ifdef CONFIG_COMPAT
 159COMPAT_SYSCALL_DEFINE2(truncate, const char __user *, path, compat_off_t, length)
 160{
 161        return do_sys_truncate(path, length);
 162}
 163#endif
 164
 165static long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
 166{
 167        struct inode *inode;
 168        struct dentry *dentry;
 169        struct fd f;
 170        int error;
 171
 172        error = -EINVAL;
 173        if (length < 0)
 174                goto out;
 175        error = -EBADF;
 176        f = fdget(fd);
 177        if (!f.file)
 178                goto out;
 179
 180        /* explicitly opened as large or we are on 64-bit box */
 181        if (f.file->f_flags & O_LARGEFILE)
 182                small = 0;
 183
 184        dentry = f.file->f_path.dentry;
 185        inode = dentry->d_inode;
 186        error = -EINVAL;
 187        if (!S_ISREG(inode->i_mode) || !(f.file->f_mode & FMODE_WRITE))
 188                goto out_putf;
 189
 190        error = -EINVAL;
 191        /* Cannot ftruncate over 2^31 bytes without large file support */
 192        if (small && length > MAX_NON_LFS)
 193                goto out_putf;
 194
 195        error = -EPERM;
 196        if (IS_APPEND(inode))
 197                goto out_putf;
 198
 199        sb_start_write(inode->i_sb);
 200        error = locks_verify_truncate(inode, f.file, length);
 201        if (!error)
 202                error = security_path_truncate(&f.file->f_path);
 203        if (!error)
 204                error = do_truncate(dentry, length, ATTR_MTIME|ATTR_CTIME, f.file);
 205        sb_end_write(inode->i_sb);
 206out_putf:
 207        fdput(f);
 208out:
 209        return error;
 210}
 211
 212SYSCALL_DEFINE2(ftruncate, unsigned int, fd, unsigned long, length)
 213{
 214        return do_sys_ftruncate(fd, length, 1);
 215}
 216
 217#ifdef CONFIG_COMPAT
 218COMPAT_SYSCALL_DEFINE2(ftruncate, unsigned int, fd, compat_ulong_t, length)
 219{
 220        return do_sys_ftruncate(fd, length, 1);
 221}
 222#endif
 223
 224/* LFS versions of truncate are only needed on 32 bit machines */
 225#if BITS_PER_LONG == 32
 226SYSCALL_DEFINE2(truncate64, const char __user *, path, loff_t, length)
 227{
 228        return do_sys_truncate(path, length);
 229}
 230
 231SYSCALL_DEFINE2(ftruncate64, unsigned int, fd, loff_t, length)
 232{
 233        return do_sys_ftruncate(fd, length, 0);
 234}
 235#endif /* BITS_PER_LONG == 32 */
 236
 237
 238int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 239{
 240        struct inode *inode = file_inode(file);
 241        long ret;
 242
 243        if (offset < 0 || len <= 0)
 244                return -EINVAL;
 245
 246        /* Return error if mode is not supported */
 247        if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
 248                     FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE))
 249                return -EOPNOTSUPP;
 250
 251        /* Punch hole and zero range are mutually exclusive */
 252        if ((mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)) ==
 253            (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE))
 254                return -EOPNOTSUPP;
 255
 256        /* Punch hole must have keep size set */
 257        if ((mode & FALLOC_FL_PUNCH_HOLE) &&
 258            !(mode & FALLOC_FL_KEEP_SIZE))
 259                return -EOPNOTSUPP;
 260
 261        /* Collapse range should only be used exclusively. */
 262        if ((mode & FALLOC_FL_COLLAPSE_RANGE) &&
 263            (mode & ~FALLOC_FL_COLLAPSE_RANGE))
 264                return -EINVAL;
 265
 266        if (!(file->f_mode & FMODE_WRITE))
 267                return -EBADF;
 268
 269        /*
 270         * It's not possible to punch hole or perform collapse range
 271         * on append only file
 272         */
 273        if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE)
 274            && IS_APPEND(inode))
 275                return -EPERM;
 276
 277        if (IS_IMMUTABLE(inode))
 278                return -EPERM;
 279
 280        /*
 281         * We can not allow to do any fallocate operation on an active
 282         * swapfile
 283         */
 284        if (IS_SWAPFILE(inode))
 285                ret = -ETXTBSY;
 286
 287        /*
 288         * Revalidate the write permissions, in case security policy has
 289         * changed since the files were opened.
 290         */
 291        ret = security_file_permission(file, MAY_WRITE);
 292        if (ret)
 293                return ret;
 294
 295        if (S_ISFIFO(inode->i_mode))
 296                return -ESPIPE;
 297
 298        /*
 299         * Let individual file system decide if it supports preallocation
 300         * for directories or not.
 301         */
 302        if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
 303                return -ENODEV;
 304
 305        /* Check for wrap through zero too */
 306        if (((offset + len) > inode->i_sb->s_maxbytes) || ((offset + len) < 0))
 307                return -EFBIG;
 308
 309        if (!file->f_op->fallocate)
 310                return -EOPNOTSUPP;
 311
 312        sb_start_write(inode->i_sb);
 313        ret = file->f_op->fallocate(file, mode, offset, len);
 314        sb_end_write(inode->i_sb);
 315        return ret;
 316}
 317EXPORT_SYMBOL_GPL(vfs_fallocate);
 318
 319SYSCALL_DEFINE4(fallocate, int, fd, int, mode, loff_t, offset, loff_t, len)
 320{
 321        struct fd f = fdget(fd);
 322        int error = -EBADF;
 323
 324        if (f.file) {
 325                error = vfs_fallocate(f.file, mode, offset, len);
 326                fdput(f);
 327        }
 328        return error;
 329}
 330
 331/*
 332 * access() needs to use the real uid/gid, not the effective uid/gid.
 333 * We do this by temporarily clearing all FS-related capabilities and
 334 * switching the fsuid/fsgid around to the real ones.
 335 */
 336SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode)
 337{
 338        const struct cred *old_cred;
 339        struct cred *override_cred;
 340        struct path path;
 341        struct inode *inode;
 342        int res;
 343        unsigned int lookup_flags = LOOKUP_FOLLOW;
 344
 345        if (mode & ~S_IRWXO)    /* where's F_OK, X_OK, W_OK, R_OK? */
 346                return -EINVAL;
 347
 348        override_cred = prepare_creds();
 349        if (!override_cred)
 350                return -ENOMEM;
 351
 352        override_cred->fsuid = override_cred->uid;
 353        override_cred->fsgid = override_cred->gid;
 354
 355        if (!issecure(SECURE_NO_SETUID_FIXUP)) {
 356                /* Clear the capabilities if we switch to a non-root user */
 357                kuid_t root_uid = make_kuid(override_cred->user_ns, 0);
 358                if (!uid_eq(override_cred->uid, root_uid))
 359                        cap_clear(override_cred->cap_effective);
 360                else
 361                        override_cred->cap_effective =
 362                                override_cred->cap_permitted;
 363        }
 364
 365        old_cred = override_creds(override_cred);
 366retry:
 367        res = user_path_at(dfd, filename, lookup_flags, &path);
 368        if (res)
 369                goto out;
 370
 371        inode = path.dentry->d_inode;
 372
 373        if ((mode & MAY_EXEC) && S_ISREG(inode->i_mode)) {
 374                /*
 375                 * MAY_EXEC on regular files is denied if the fs is mounted
 376                 * with the "noexec" flag.
 377                 */
 378                res = -EACCES;
 379                if (path.mnt->mnt_flags & MNT_NOEXEC)
 380                        goto out_path_release;
 381        }
 382
 383        res = inode_permission(inode, mode | MAY_ACCESS);
 384        /* SuS v2 requires we report a read only fs too */
 385        if (res || !(mode & S_IWOTH) || special_file(inode->i_mode))
 386                goto out_path_release;
 387        /*
 388         * This is a rare case where using __mnt_is_readonly()
 389         * is OK without a mnt_want/drop_write() pair.  Since
 390         * no actual write to the fs is performed here, we do
 391         * not need to telegraph to that to anyone.
 392         *
 393         * By doing this, we accept that this access is
 394         * inherently racy and know that the fs may change
 395         * state before we even see this result.
 396         */
 397        if (__mnt_is_readonly(path.mnt))
 398                res = -EROFS;
 399
 400out_path_release:
 401        path_put(&path);
 402        if (retry_estale(res, lookup_flags)) {
 403                lookup_flags |= LOOKUP_REVAL;
 404                goto retry;
 405        }
 406out:
 407        revert_creds(old_cred);
 408        put_cred(override_cred);
 409        return res;
 410}
 411
 412SYSCALL_DEFINE2(access, const char __user *, filename, int, mode)
 413{
 414        return sys_faccessat(AT_FDCWD, filename, mode);
 415}
 416
 417SYSCALL_DEFINE1(chdir, const char __user *, filename)
 418{
 419        struct path path;
 420        int error;
 421        unsigned int lookup_flags = LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
 422retry:
 423        error = user_path_at(AT_FDCWD, filename, lookup_flags, &path);
 424        if (error)
 425                goto out;
 426
 427        error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_CHDIR);
 428        if (error)
 429                goto dput_and_out;
 430
 431        set_fs_pwd(current->fs, &path);
 432
 433dput_and_out:
 434        path_put(&path);
 435        if (retry_estale(error, lookup_flags)) {
 436                lookup_flags |= LOOKUP_REVAL;
 437                goto retry;
 438        }
 439out:
 440        return error;
 441}
 442
 443SYSCALL_DEFINE1(fchdir, unsigned int, fd)
 444{
 445        struct fd f = fdget_raw(fd);
 446        struct inode *inode;
 447        int error = -EBADF;
 448
 449        error = -EBADF;
 450        if (!f.file)
 451                goto out;
 452
 453        inode = file_inode(f.file);
 454
 455        error = -ENOTDIR;
 456        if (!S_ISDIR(inode->i_mode))
 457                goto out_putf;
 458
 459        error = inode_permission(inode, MAY_EXEC | MAY_CHDIR);
 460        if (!error)
 461                set_fs_pwd(current->fs, &f.file->f_path);
 462out_putf:
 463        fdput(f);
 464out:
 465        return error;
 466}
 467
 468SYSCALL_DEFINE1(chroot, const char __user *, filename)
 469{
 470        struct path path;
 471        int error;
 472        unsigned int lookup_flags = LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
 473retry:
 474        error = user_path_at(AT_FDCWD, filename, lookup_flags, &path);
 475        if (error)
 476                goto out;
 477
 478        error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_CHDIR);
 479        if (error)
 480                goto dput_and_out;
 481
 482        error = -EPERM;
 483        if (!ns_capable(current_user_ns(), CAP_SYS_CHROOT))
 484                goto dput_and_out;
 485        error = security_path_chroot(&path);
 486        if (error)
 487                goto dput_and_out;
 488
 489        set_fs_root(current->fs, &path);
 490        error = 0;
 491dput_and_out:
 492        path_put(&path);
 493        if (retry_estale(error, lookup_flags)) {
 494                lookup_flags |= LOOKUP_REVAL;
 495                goto retry;
 496        }
 497out:
 498        return error;
 499}
 500
 501static int chmod_common(struct path *path, umode_t mode)
 502{
 503        struct inode *inode = path->dentry->d_inode;
 504        struct inode *delegated_inode = NULL;
 505        struct iattr newattrs;
 506        int error;
 507
 508        error = mnt_want_write(path->mnt);
 509        if (error)
 510                return error;
 511retry_deleg:
 512        mutex_lock(&inode->i_mutex);
 513        error = security_path_chmod(path, mode);
 514        if (error)
 515                goto out_unlock;
 516        newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO);
 517        newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
 518        error = notify_change(path->dentry, &newattrs, &delegated_inode);
 519out_unlock:
 520        mutex_unlock(&inode->i_mutex);
 521        if (delegated_inode) {
 522                error = break_deleg_wait(&delegated_inode);
 523                if (!error)
 524                        goto retry_deleg;
 525        }
 526        mnt_drop_write(path->mnt);
 527        return error;
 528}
 529
 530SYSCALL_DEFINE2(fchmod, unsigned int, fd, umode_t, mode)
 531{
 532        struct file * file;
 533        int err = -EBADF;
 534
 535        file = fget(fd);
 536        if (file) {
 537                audit_inode(NULL, file->f_path.dentry, 0);
 538                err = chmod_common(&file->f_path, mode);
 539                fput(file);
 540        }
 541        return err;
 542}
 543
 544SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename, umode_t, mode)
 545{
 546        struct path path;
 547        int error;
 548        unsigned int lookup_flags = LOOKUP_FOLLOW;
 549retry:
 550        error = user_path_at(dfd, filename, lookup_flags, &path);
 551        if (!error) {
 552                error = chmod_common(&path, mode);
 553                path_put(&path);
 554                if (retry_estale(error, lookup_flags)) {
 555                        lookup_flags |= LOOKUP_REVAL;
 556                        goto retry;
 557                }
 558        }
 559        return error;
 560}
 561
 562SYSCALL_DEFINE2(chmod, const char __user *, filename, umode_t, mode)
 563{
 564        return sys_fchmodat(AT_FDCWD, filename, mode);
 565}
 566
 567static int chown_common(struct path *path, uid_t user, gid_t group)
 568{
 569        struct inode *inode = path->dentry->d_inode;
 570        struct inode *delegated_inode = NULL;
 571        int error;
 572        struct iattr newattrs;
 573        kuid_t uid;
 574        kgid_t gid;
 575
 576        uid = make_kuid(current_user_ns(), user);
 577        gid = make_kgid(current_user_ns(), group);
 578
 579retry_deleg:
 580        newattrs.ia_valid =  ATTR_CTIME;
 581        if (user != (uid_t) -1) {
 582                if (!uid_valid(uid))
 583                        return -EINVAL;
 584                newattrs.ia_valid |= ATTR_UID;
 585                newattrs.ia_uid = uid;
 586        }
 587        if (group != (gid_t) -1) {
 588                if (!gid_valid(gid))
 589                        return -EINVAL;
 590                newattrs.ia_valid |= ATTR_GID;
 591                newattrs.ia_gid = gid;
 592        }
 593        if (!S_ISDIR(inode->i_mode))
 594                newattrs.ia_valid |=
 595                        ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_KILL_PRIV;
 596        mutex_lock(&inode->i_mutex);
 597        error = security_path_chown(path, uid, gid);
 598        if (!error)
 599                error = notify_change(path->dentry, &newattrs, &delegated_inode);
 600        mutex_unlock(&inode->i_mutex);
 601        if (delegated_inode) {
 602                error = break_deleg_wait(&delegated_inode);
 603                if (!error)
 604                        goto retry_deleg;
 605        }
 606        return error;
 607}
 608
 609SYSCALL_DEFINE5(fchownat, int, dfd, const char __user *, filename, uid_t, user,
 610                gid_t, group, int, flag)
 611{
 612        struct path path;
 613        int error = -EINVAL;
 614        int lookup_flags;
 615
 616        if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0)
 617                goto out;
 618
 619        lookup_flags = (flag & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW;
 620        if (flag & AT_EMPTY_PATH)
 621                lookup_flags |= LOOKUP_EMPTY;
 622retry:
 623        error = user_path_at(dfd, filename, lookup_flags, &path);
 624        if (error)
 625                goto out;
 626        error = mnt_want_write(path.mnt);
 627        if (error)
 628                goto out_release;
 629        error = chown_common(&path, user, group);
 630        mnt_drop_write(path.mnt);
 631out_release:
 632        path_put(&path);
 633        if (retry_estale(error, lookup_flags)) {
 634                lookup_flags |= LOOKUP_REVAL;
 635                goto retry;
 636        }
 637out:
 638        return error;
 639}
 640
 641SYSCALL_DEFINE3(chown, const char __user *, filename, uid_t, user, gid_t, group)
 642{
 643        return sys_fchownat(AT_FDCWD, filename, user, group, 0);
 644}
 645
 646SYSCALL_DEFINE3(lchown, const char __user *, filename, uid_t, user, gid_t, group)
 647{
 648        return sys_fchownat(AT_FDCWD, filename, user, group,
 649                            AT_SYMLINK_NOFOLLOW);
 650}
 651
 652SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group)
 653{
 654        struct fd f = fdget(fd);
 655        int error = -EBADF;
 656
 657        if (!f.file)
 658                goto out;
 659
 660        error = mnt_want_write_file(f.file);
 661        if (error)
 662                goto out_fput;
 663        audit_inode(NULL, f.file->f_path.dentry, 0);
 664        error = chown_common(&f.file->f_path, user, group);
 665        mnt_drop_write_file(f.file);
 666out_fput:
 667        fdput(f);
 668out:
 669        return error;
 670}
 671
 672/*
 673 * You have to be very careful that these write
 674 * counts get cleaned up in error cases and
 675 * upon __fput().  This should probably never
 676 * be called outside of __dentry_open().
 677 */
 678static inline int __get_file_write_access(struct inode *inode,
 679                                          struct vfsmount *mnt)
 680{
 681        int error;
 682        error = get_write_access(inode);
 683        if (error)
 684                return error;
 685        /*
 686         * Do not take mount writer counts on
 687         * special files since no writes to
 688         * the mount itself will occur.
 689         */
 690        if (!special_file(inode->i_mode)) {
 691                /*
 692                 * Balanced in __fput()
 693                 */
 694                error = __mnt_want_write(mnt);
 695                if (error)
 696                        put_write_access(inode);
 697        }
 698        return error;
 699}
 700
 701int open_check_o_direct(struct file *f)
 702{
 703        /* NB: we're sure to have correct a_ops only after f_op->open */
 704        if (f->f_flags & O_DIRECT) {
 705                if (!f->f_mapping->a_ops || !f->f_mapping->a_ops->direct_IO)
 706                        return -EINVAL;
 707        }
 708        return 0;
 709}
 710
 711static int do_dentry_open(struct file *f,
 712                          struct inode *inode,
 713                          int (*open)(struct inode *, struct file *),
 714                          const struct cred *cred)
 715{
 716        static const struct file_operations empty_fops = {};
 717        int error;
 718
 719        f->f_mode = OPEN_FMODE(f->f_flags) | FMODE_LSEEK |
 720                                FMODE_PREAD | FMODE_PWRITE;
 721
 722        if (unlikely(f->f_flags & O_PATH))
 723                f->f_mode = FMODE_PATH;
 724
 725        path_get(&f->f_path);
 726        f->f_inode = inode;
 727        if (f->f_mode & FMODE_WRITE) {
 728                error = __get_file_write_access(inode, f->f_path.mnt);
 729                if (error)
 730                        goto cleanup_file;
 731                if (!special_file(inode->i_mode))
 732                        file_take_write(f);
 733        }
 734
 735        f->f_mapping = inode->i_mapping;
 736
 737        if (unlikely(f->f_mode & FMODE_PATH)) {
 738                f->f_op = &empty_fops;
 739                return 0;
 740        }
 741
 742        /* POSIX.1-2008/SUSv4 Section XSI 2.9.7 */
 743        if (S_ISREG(inode->i_mode))
 744                f->f_mode |= FMODE_ATOMIC_POS;
 745
 746        f->f_op = fops_get(inode->i_fop);
 747
 748        error = security_file_open(f, cred);
 749        if (error)
 750                goto cleanup_all;
 751
 752        error = break_lease(locks_inode(f), f->f_flags);
 753        if (error)
 754                goto cleanup_all;
 755
 756        if (!open && f->f_op)
 757                open = f->f_op->open;
 758        if (open) {
 759                error = open(inode, f);
 760                if (error)
 761                        goto cleanup_all;
 762        }
 763        if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
 764                i_readcount_inc(inode);
 765
 766        f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
 767
 768        file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping);
 769
 770        return 0;
 771
 772cleanup_all:
 773        fops_put(f->f_op);
 774        if (f->f_mode & FMODE_WRITE) {
 775                put_write_access(inode);
 776                if (!special_file(inode->i_mode)) {
 777                        /*
 778                         * We don't consider this a real
 779                         * mnt_want/drop_write() pair
 780                         * because it all happenend right
 781                         * here, so just reset the state.
 782                         */
 783                        file_reset_write(f);
 784                        __mnt_drop_write(f->f_path.mnt);
 785                }
 786        }
 787cleanup_file:
 788        path_put(&f->f_path);
 789        f->f_path.mnt = NULL;
 790        f->f_path.dentry = NULL;
 791        f->f_inode = NULL;
 792        return error;
 793}
 794
 795/**
 796 * finish_open - finish opening a file
 797 * @file: file pointer
 798 * @dentry: pointer to dentry
 799 * @open: open callback
 800 * @opened: state of open
 801 *
 802 * This can be used to finish opening a file passed to i_op->atomic_open().
 803 *
 804 * If the open callback is set to NULL, then the standard f_op->open()
 805 * filesystem callback is substituted.
 806 *
 807 * NB: the dentry reference is _not_ consumed.  If, for example, the dentry is
 808 * the return value of d_splice_alias(), then the caller needs to perform dput()
 809 * on it after finish_open().
 810 *
 811 * On successful return @file is a fully instantiated open file.  After this, if
 812 * an error occurs in ->atomic_open(), it needs to clean up with fput().
 813 *
 814 * Returns zero on success or -errno if the open failed.
 815 */
 816int finish_open(struct file *file, struct dentry *dentry,
 817                int (*open)(struct inode *, struct file *),
 818                int *opened)
 819{
 820        int error;
 821        BUG_ON(*opened & FILE_OPENED); /* once it's opened, it's opened */
 822
 823        file->f_path.dentry = dentry;
 824        error = do_dentry_open(file, dentry->d_inode, open,
 825                               current_cred());
 826        if (!error)
 827                *opened |= FILE_OPENED;
 828
 829        return error;
 830}
 831EXPORT_SYMBOL(finish_open);
 832
 833/**
 834 * finish_no_open - finish ->atomic_open() without opening the file
 835 *
 836 * @file: file pointer
 837 * @dentry: dentry or NULL (as returned from ->lookup())
 838 *
 839 * This can be used to set the result of a successful lookup in ->atomic_open().
 840 *
 841 * NB: unlike finish_open() this function does consume the dentry reference and
 842 * the caller need not dput() it.
 843 *
 844 * Returns "1" which must be the return value of ->atomic_open() after having
 845 * called this function.
 846 */
 847int finish_no_open(struct file *file, struct dentry *dentry)
 848{
 849        file->f_path.dentry = dentry;
 850        return 1;
 851}
 852EXPORT_SYMBOL(finish_no_open);
 853
 854struct file *dentry_open(const struct path *path, int flags,
 855                         const struct cred *cred)
 856{
 857        int error;
 858        struct file *f;
 859
 860        validate_creds(cred);
 861
 862        /* We must always pass in a valid mount pointer. */
 863        BUG_ON(!path->mnt);
 864
 865        f = get_empty_filp();
 866        if (!IS_ERR(f)) {
 867                f->f_flags = flags;
 868                error = vfs_open(path, f, cred);
 869                if (!error) {
 870                        /* from now on we need fput() to dispose of f */
 871                        error = open_check_o_direct(f);
 872                        if (error) {
 873                                fput(f);
 874                                f = ERR_PTR(error);
 875                        }
 876                } else { 
 877                        put_filp(f);
 878                        f = ERR_PTR(error);
 879                }
 880        }
 881        return f;
 882}
 883EXPORT_SYMBOL(dentry_open);
 884
 885/**
 886 * vfs_open - open the file at the given path
 887 * @path: path to open
 888 * @filp: newly allocated file with f_flag initialized
 889 * @cred: credentials to use
 890 */
 891int vfs_open(const struct path *path, struct file *filp,
 892             const struct cred *cred)
 893{
 894        struct inode *inode = path->dentry->d_inode;
 895        iop_dentry_open_t dentry_open = get_dentry_open_iop(inode);
 896
 897        if (dentry_open)
 898                return dentry_open(path->dentry, filp, cred);
 899        else {
 900                struct dentry *dentry = d_real(path->dentry, NULL, filp->f_flags);
 901
 902                if (IS_ERR(dentry))
 903                        return PTR_ERR(dentry);
 904
 905                filp->f_path = *path;
 906                return do_dentry_open(filp, dentry->d_inode, NULL, cred);
 907        }
 908}
 909EXPORT_SYMBOL(vfs_open);
 910
 911static inline int build_open_flags(int flags, umode_t mode, struct open_flags *op)
 912{
 913        int lookup_flags = 0;
 914        int acc_mode;
 915
 916        if (flags & O_CREAT)
 917                op->mode = (mode & S_IALLUGO) | S_IFREG;
 918        else
 919                op->mode = 0;
 920
 921        /* Must never be set by userspace */
 922        flags &= ~FMODE_NONOTIFY & ~O_CLOEXEC;
 923
 924        /*
 925         * O_SYNC is implemented as __O_SYNC|O_DSYNC.  As many places only
 926         * check for O_DSYNC if the need any syncing at all we enforce it's
 927         * always set instead of having to deal with possibly weird behaviour
 928         * for malicious applications setting only __O_SYNC.
 929         */
 930        if (flags & __O_SYNC)
 931                flags |= O_DSYNC;
 932
 933        /*
 934         * If we have O_PATH in the open flag. Then we
 935         * cannot have anything other than the below set of flags
 936         */
 937        if (flags & O_PATH) {
 938                flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH;
 939                acc_mode = 0;
 940        } else {
 941                acc_mode = MAY_OPEN | ACC_MODE(flags);
 942        }
 943
 944        op->open_flag = flags;
 945
 946        /* O_TRUNC implies we need access checks for write permissions */
 947        if (flags & O_TRUNC)
 948                acc_mode |= MAY_WRITE;
 949
 950        /* Allow the LSM permission hook to distinguish append
 951           access from general write access. */
 952        if (flags & O_APPEND)
 953                acc_mode |= MAY_APPEND;
 954
 955        op->acc_mode = acc_mode;
 956
 957        op->intent = flags & O_PATH ? 0 : LOOKUP_OPEN;
 958
 959        if (flags & O_CREAT) {
 960                op->intent |= LOOKUP_CREATE;
 961                if (flags & O_EXCL)
 962                        op->intent |= LOOKUP_EXCL;
 963        }
 964
 965        if (flags & O_DIRECTORY)
 966                lookup_flags |= LOOKUP_DIRECTORY;
 967        if (!(flags & O_NOFOLLOW))
 968                lookup_flags |= LOOKUP_FOLLOW;
 969        return lookup_flags;
 970}
 971
 972/**
 973 * file_open_name - open file and return file pointer
 974 *
 975 * @name:       struct filename containing path to open
 976 * @flags:      open flags as per the open(2) second argument
 977 * @mode:       mode for the new file if O_CREAT is set, else ignored
 978 *
 979 * This is the helper to open a file from kernelspace if you really
 980 * have to.  But in generally you should not do this, so please move
 981 * along, nothing to see here..
 982 */
 983struct file *file_open_name(struct filename *name, int flags, umode_t mode)
 984{
 985        struct open_flags op;
 986        int lookup = build_open_flags(flags, mode, &op);
 987        return do_filp_open(AT_FDCWD, name, &op, lookup);
 988}
 989
 990/**
 991 * filp_open - open file and return file pointer
 992 *
 993 * @filename:   path to open
 994 * @flags:      open flags as per the open(2) second argument
 995 * @mode:       mode for the new file if O_CREAT is set, else ignored
 996 *
 997 * This is the helper to open a file from kernelspace if you really
 998 * have to.  But in generally you should not do this, so please move
 999 * along, nothing to see here..
1000 */
1001struct file *filp_open(const char *filename, int flags, umode_t mode)
1002{
1003        struct filename *name = getname_kernel(filename);
1004        struct file *file = ERR_CAST(name);
1005        
1006        if (!IS_ERR(name)) {
1007                file = file_open_name(name, flags, mode);
1008                putname(name);
1009        }
1010        return file;
1011}
1012EXPORT_SYMBOL(filp_open);
1013
1014struct file *file_open_root(struct dentry *dentry, struct vfsmount *mnt,
1015                            const char *filename, int flags)
1016{
1017        struct open_flags op;
1018        int lookup = build_open_flags(flags, 0, &op);
1019        if (flags & O_CREAT)
1020                return ERR_PTR(-EINVAL);
1021        if (!filename && (flags & O_DIRECTORY))
1022                if (!dentry->d_inode->i_op->lookup)
1023                        return ERR_PTR(-ENOTDIR);
1024        return do_file_open_root(dentry, mnt, filename, &op, lookup);
1025}
1026EXPORT_SYMBOL(file_open_root);
1027
1028long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode)
1029{
1030        struct open_flags op;
1031        int lookup = build_open_flags(flags, mode, &op);
1032        struct filename *tmp = getname(filename);
1033        int fd = PTR_ERR(tmp);
1034
1035        if (!IS_ERR(tmp)) {
1036                fd = get_unused_fd_flags(flags);
1037                if (fd >= 0) {
1038                        struct file *f = do_filp_open(dfd, tmp, &op, lookup);
1039                        if (IS_ERR(f)) {
1040                                put_unused_fd(fd);
1041                                fd = PTR_ERR(f);
1042                        } else {
1043                                fsnotify_open(f);
1044                                fd_install(fd, f);
1045                        }
1046                }
1047                putname(tmp);
1048        }
1049        return fd;
1050}
1051
1052SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode)
1053{
1054        if (force_o_largefile())
1055                flags |= O_LARGEFILE;
1056
1057        return do_sys_open(AT_FDCWD, filename, flags, mode);
1058}
1059
1060SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags,
1061                umode_t, mode)
1062{
1063        if (force_o_largefile())
1064                flags |= O_LARGEFILE;
1065
1066        return do_sys_open(dfd, filename, flags, mode);
1067}
1068
1069#ifndef __alpha__
1070
1071/*
1072 * For backward compatibility?  Maybe this should be moved
1073 * into arch/i386 instead?
1074 */
1075SYSCALL_DEFINE2(creat, const char __user *, pathname, umode_t, mode)
1076{
1077        return sys_open(pathname, O_CREAT | O_WRONLY | O_TRUNC, mode);
1078}
1079
1080#endif
1081
1082/*
1083 * "id" is the POSIX thread ID. We use the
1084 * files pointer for this..
1085 */
1086int filp_close(struct file *filp, fl_owner_t id)
1087{
1088        int retval = 0;
1089
1090        if (!file_count(filp)) {
1091                printk(KERN_ERR "VFS: Close: file count is 0\n");
1092                return 0;
1093        }
1094
1095        if (filp->f_op && filp->f_op->flush)
1096                retval = filp->f_op->flush(filp, id);
1097
1098        if (likely(!(filp->f_mode & FMODE_PATH))) {
1099                dnotify_flush(filp, id);
1100                locks_remove_posix(filp, id);
1101        }
1102        fput(filp);
1103        return retval;
1104}
1105
1106EXPORT_SYMBOL(filp_close);
1107
1108/*
1109 * Careful here! We test whether the file pointer is NULL before
1110 * releasing the fd. This ensures that one clone task can't release
1111 * an fd while another clone is opening it.
1112 */
1113SYSCALL_DEFINE1(close, unsigned int, fd)
1114{
1115        int retval = __close_fd(current->files, fd);
1116
1117        /* can't restart close syscall because file table entry was cleared */
1118        if (unlikely(retval == -ERESTARTSYS ||
1119                     retval == -ERESTARTNOINTR ||
1120                     retval == -ERESTARTNOHAND ||
1121                     retval == -ERESTART_RESTARTBLOCK))
1122                retval = -EINTR;
1123
1124        return retval;
1125}
1126EXPORT_SYMBOL(sys_close);
1127
1128/*
1129 * This routine simulates a hangup on the tty, to arrange that users
1130 * are given clean terminals at login time.
1131 */
1132SYSCALL_DEFINE0(vhangup)
1133{
1134        if (capable(CAP_SYS_TTY_CONFIG)) {
1135                tty_vhangup_self();
1136                return 0;
1137        }
1138        return -EPERM;
1139}
1140
1141/*
1142 * Called when an inode is about to be open.
1143 * We use this to disallow opening large files on 32bit systems if
1144 * the caller didn't specify O_LARGEFILE.  On 64bit systems we force
1145 * on this flag in sys_open.
1146 */
1147int generic_file_open(struct inode * inode, struct file * filp)
1148{
1149        if (!(filp->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS)
1150                return -EOVERFLOW;
1151        return 0;
1152}
1153
1154EXPORT_SYMBOL(generic_file_open);
1155
1156/*
1157 * This is used by subsystems that don't want seekable
1158 * file descriptors. The function is not supposed to ever fail, the only
1159 * reason it returns an 'int' and not 'void' is so that it can be plugged
1160 * directly into file_operations structure.
1161 */
1162int nonseekable_open(struct inode *inode, struct file *filp)
1163{
1164        filp->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE);
1165        return 0;
1166}
1167
1168EXPORT_SYMBOL(nonseekable_open);
1169