LXR linux/fs/fcntl.c

   1/*
   2 *  linux/fs/fcntl.c
   3 *
   4 *  Copyright (C) 1991, 1992  Linus Torvalds
   5 */
   6
   7#include <linux/syscalls.h>
   8#include <linux/init.h>
   9#include <linux/mm.h>
  10#include <linux/fs.h>
  11#include <linux/file.h>
  12#include <linux/fdtable.h>
  13#include <linux/capability.h>
  14#include <linux/dnotify.h>
  15#include <linux/slab.h>
  16#include <linux/module.h>
  17#include <linux/security.h>
  18#include <linux/ptrace.h>
  19#include <linux/signal.h>
  20#include <linux/rcupdate.h>
  21#include <linux/pid_namespace.h>
  22
  23#include <asm/poll.h>
  24#include <asm/siginfo.h>
  25#include <asm/uaccess.h>
  26
  27void set_close_on_exec(unsigned int fd, int flag)
  28{
  29        struct files_struct *files = current->files;
  30        struct fdtable *fdt;
  31        spin_lock(&files->file_lock);
  32        fdt = files_fdtable(files);
  33        if (flag)
  34                FD_SET(fd, fdt->close_on_exec);
  35        else
  36                FD_CLR(fd, fdt->close_on_exec);
  37        spin_unlock(&files->file_lock);
  38}
  39
  40static int get_close_on_exec(unsigned int fd)
  41{
  42        struct files_struct *files = current->files;
  43        struct fdtable *fdt;
  44        int res;
  45        rcu_read_lock();
  46        fdt = files_fdtable(files);
  47        res = FD_ISSET(fd, fdt->close_on_exec);
  48        rcu_read_unlock();
  49        return res;
  50}
  51
  52SYSCALL_DEFINE3(dup3, unsigned int, oldfd, unsigned int, newfd, int, flags)
  53{
  54        int err = -EBADF;
  55        struct file * file, *tofree;
  56        struct files_struct * files = current->files;
  57        struct fdtable *fdt;
  58
  59        if ((flags & ~O_CLOEXEC) != 0)
  60                return -EINVAL;
  61
  62        if (unlikely(oldfd == newfd))
  63                return -EINVAL;
  64
  65        spin_lock(&files->file_lock);
  66        err = expand_files(files, newfd);
  67        file = fcheck(oldfd);
  68        if (unlikely(!file))
  69                goto Ebadf;
  70        if (unlikely(err < 0)) {
  71                if (err == -EMFILE)
  72                        goto Ebadf;
  73                goto out_unlock;
  74        }
  75        /*
  76         * We need to detect attempts to do dup2() over allocated but still
  77         * not finished descriptor.  NB: OpenBSD avoids that at the price of
  78         * extra work in their equivalent of fget() - they insert struct
  79         * file immediately after grabbing descriptor, mark it larval if
  80         * more work (e.g. actual opening) is needed and make sure that
  81         * fget() treats larval files as absent.  Potentially interesting,
  82         * but while extra work in fget() is trivial, locking implications
  83         * and amount of surgery on open()-related paths in VFS are not.
  84         * FreeBSD fails with -EBADF in the same situation, NetBSD "solution"
  85         * deadlocks in rather amusing ways, AFAICS.  All of that is out of
  86         * scope of POSIX or SUS, since neither considers shared descriptor
  87         * tables and this condition does not arise without those.
  88         */
  89        err = -EBUSY;
  90        fdt = files_fdtable(files);
  91        tofree = fdt->fd[newfd];
  92        if (!tofree && FD_ISSET(newfd, fdt->open_fds))
  93                goto out_unlock;
  94        get_file(file);
  95        rcu_assign_pointer(fdt->fd[newfd], file);
  96        FD_SET(newfd, fdt->open_fds);
  97        if (flags & O_CLOEXEC)
  98                FD_SET(newfd, fdt->close_on_exec);
  99        else
 100                FD_CLR(newfd, fdt->close_on_exec);
 101        spin_unlock(&files->file_lock);
 102
 103        if (tofree)
 104                filp_close(tofree, files);
 105
 106        return newfd;
 107
 108Ebadf:
 109        err = -EBADF;
 110out_unlock:
 111        spin_unlock(&files->file_lock);
 112        return err;
 113}
 114
 115SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd)
 116{
 117        if (unlikely(newfd == oldfd)) { /* corner case */
 118                struct files_struct *files = current->files;
 119                int retval = oldfd;
 120
 121                rcu_read_lock();
 122                if (!fcheck_files(files, oldfd))
 123                        retval = -EBADF;
 124                rcu_read_unlock();
 125                return retval;
 126        }
 127        return sys_dup3(oldfd, newfd, 0);
 128}
 129
 130SYSCALL_DEFINE1(dup, unsigned int, fildes)
 131{
 132        int ret = -EBADF;
 133        struct file *file = fget(fildes);
 134
 135        if (file) {
 136                ret = get_unused_fd();
 137                if (ret >= 0)
 138                        fd_install(ret, file);
 139                else
 140                        fput(file);
 141        }
 142        return ret;
 143}
 144
 145#define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | O_DIRECT | O_NOATIME)
 146
 147static int setfl(int fd, struct file * filp, unsigned long arg)
 148{
 149        struct inode * inode = filp->f_path.dentry->d_inode;
 150        int error = 0;
 151
 152        /*
 153         * O_APPEND cannot be cleared if the file is marked as append-only
 154         * and the file is open for write.
 155         */
 156        if (((arg ^ filp->f_flags) & O_APPEND) && IS_APPEND(inode))
 157                return -EPERM;
 158
 159        /* O_NOATIME can only be set by the owner or superuser */
 160        if ((arg & O_NOATIME) && !(filp->f_flags & O_NOATIME))
 161                if (!is_owner_or_cap(inode))
 162                        return -EPERM;
 163
 164        /* required for strict SunOS emulation */
 165        if (O_NONBLOCK != O_NDELAY)
 166               if (arg & O_NDELAY)
 167                   arg |= O_NONBLOCK;
 168
 169        if (arg & O_DIRECT) {
 170                if (!filp->f_mapping || !filp->f_mapping->a_ops ||
 171                        !filp->f_mapping->a_ops->direct_IO)
 172                                return -EINVAL;
 173        }
 174
 175        if (filp->f_op && filp->f_op->check_flags)
 176                error = filp->f_op->check_flags(arg);
 177        if (error)
 178                return error;
 179
 180        /*
 181         * ->fasync() is responsible for setting the FASYNC bit.
 182         */
 183        if (((arg ^ filp->f_flags) & FASYNC) && filp->f_op &&
 184                        filp->f_op->fasync) {
 185                error = filp->f_op->fasync(fd, filp, (arg & FASYNC) != 0);
 186                if (error < 0)
 187                        goto out;
 188                if (error > 0)
 189                        error = 0;
 190        }
 191        spin_lock(&filp->f_lock);
 192        filp->f_flags = (arg & SETFL_MASK) | (filp->f_flags & ~SETFL_MASK);
 193        spin_unlock(&filp->f_lock);
 194
 195 out:
 196        return error;
 197}
 198
 199static void f_modown(struct file *filp, struct pid *pid, enum pid_type type,
 200                     int force)
 201{
 202        write_lock_irq(&filp->f_owner.lock);
 203        if (force || !filp->f_owner.pid) {
 204                put_pid(filp->f_owner.pid);
 205                filp->f_owner.pid = get_pid(pid);
 206                filp->f_owner.pid_type = type;
 207
 208                if (pid) {
 209                        const struct cred *cred = current_cred();
 210                        filp->f_owner.uid = cred->uid;
 211                        filp->f_owner.euid = cred->euid;
 212                }
 213        }
 214        write_unlock_irq(&filp->f_owner.lock);
 215}
 216
 217int __f_setown(struct file *filp, struct pid *pid, enum pid_type type,
 218                int force)
 219{
 220        int err;
 221
 222        err = security_file_set_fowner(filp);
 223        if (err)
 224                return err;
 225
 226        f_modown(filp, pid, type, force);
 227        return 0;
 228}
 229EXPORT_SYMBOL(__f_setown);
 230
 231int f_setown(struct file *filp, unsigned long arg, int force)
 232{
 233        enum pid_type type;
 234        struct pid *pid;
 235        int who = arg;
 236        int result;
 237        type = PIDTYPE_PID;
 238        if (who < 0) {
 239                type = PIDTYPE_PGID;
 240                who = -who;
 241        }
 242        rcu_read_lock();
 243        pid = find_vpid(who);
 244        result = __f_setown(filp, pid, type, force);
 245        rcu_read_unlock();
 246        return result;
 247}
 248EXPORT_SYMBOL(f_setown);
 249
 250void f_delown(struct file *filp)
 251{
 252        f_modown(filp, NULL, PIDTYPE_PID, 1);
 253}
 254
 255pid_t f_getown(struct file *filp)
 256{
 257        pid_t pid;
 258        read_lock(&filp->f_owner.lock);
 259        pid = pid_vnr(filp->f_owner.pid);
 260        if (filp->f_owner.pid_type == PIDTYPE_PGID)
 261                pid = -pid;
 262        read_unlock(&filp->f_owner.lock);
 263        return pid;
 264}
 265
 266static int f_setown_ex(struct file *filp, unsigned long arg)
 267{
 268        struct f_owner_ex * __user owner_p = (void * __user)arg;
 269        struct f_owner_ex owner;
 270        struct pid *pid;
 271        int type;
 272        int ret;
 273
 274        ret = copy_from_user(&owner, owner_p, sizeof(owner));
 275        if (ret)
 276                return ret;
 277
 278        switch (owner.type) {
 279        case F_OWNER_TID:
 280                type = PIDTYPE_MAX;
 281                break;
 282
 283        case F_OWNER_PID:
 284                type = PIDTYPE_PID;
 285                break;
 286
 287        case F_OWNER_PGRP:
 288                type = PIDTYPE_PGID;
 289                break;
 290
 291        default:
 292                return -EINVAL;
 293        }
 294
 295        rcu_read_lock();
 296        pid = find_vpid(owner.pid);
 297        if (owner.pid && !pid)
 298                ret = -ESRCH;
 299        else
 300                ret = __f_setown(filp, pid, type, 1);
 301        rcu_read_unlock();
 302
 303        return ret;
 304}
 305
 306static int f_getown_ex(struct file *filp, unsigned long arg)
 307{
 308        struct f_owner_ex * __user owner_p = (void * __user)arg;
 309        struct f_owner_ex owner;
 310        int ret = 0;
 311
 312        read_lock(&filp->f_owner.lock);
 313        owner.pid = pid_vnr(filp->f_owner.pid);
 314        switch (filp->f_owner.pid_type) {
 315        case PIDTYPE_MAX:
 316                owner.type = F_OWNER_TID;
 317                break;
 318
 319        case PIDTYPE_PID:
 320                owner.type = F_OWNER_PID;
 321                break;
 322
 323        case PIDTYPE_PGID:
 324                owner.type = F_OWNER_PGRP;
 325                break;
 326
 327        default:
 328                WARN_ON(1);
 329                ret = -EINVAL;
 330                break;
 331        }
 332        read_unlock(&filp->f_owner.lock);
 333
 334        if (!ret)
 335                ret = copy_to_user(owner_p, &owner, sizeof(owner));
 336        return ret;
 337}
 338
 339static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
 340                struct file *filp)
 341{
 342        long err = -EINVAL;
 343
 344        switch (cmd) {
 345        case F_DUPFD:
 346        case F_DUPFD_CLOEXEC:
 347                if (arg >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur)
 348                        break;
 349                err = alloc_fd(arg, cmd == F_DUPFD_CLOEXEC ? O_CLOEXEC : 0);
 350                if (err >= 0) {
 351                        get_file(filp);
 352                        fd_install(err, filp);
 353                }
 354                break;
 355        case F_GETFD:
 356                err = get_close_on_exec(fd) ? FD_CLOEXEC : 0;
 357                break;
 358        case F_SETFD:
 359                err = 0;
 360                set_close_on_exec(fd, arg & FD_CLOEXEC);
 361                break;
 362        case F_GETFL:
 363                err = filp->f_flags;
 364                break;
 365        case F_SETFL:
 366                err = setfl(fd, filp, arg);
 367                break;
 368        case F_GETLK:
 369                err = fcntl_getlk(filp, (struct flock __user *) arg);
 370                break;
 371        case F_SETLK:
 372        case F_SETLKW:
 373                err = fcntl_setlk(fd, filp, cmd, (struct flock __user *) arg);
 374                break;
 375        case F_GETOWN:
 376                /*
 377                 * XXX If f_owner is a process group, the
 378                 * negative return value will get converted
 379                 * into an error.  Oops.  If we keep the
 380                 * current syscall conventions, the only way
 381                 * to fix this will be in libc.
 382                 */
 383                err = f_getown(filp);
 384                force_successful_syscall_return();
 385                break;
 386        case F_SETOWN:
 387                err = f_setown(filp, arg, 1);
 388                break;
 389        case F_GETOWN_EX:
 390                err = f_getown_ex(filp, arg);
 391                break;
 392        case F_SETOWN_EX:
 393                err = f_setown_ex(filp, arg);
 394                break;
 395        case F_GETSIG:
 396                err = filp->f_owner.signum;
 397                break;
 398        case F_SETSIG:
 399                /* arg == 0 restores default behaviour. */
 400                if (!valid_signal(arg)) {
 401                        break;
 402                }
 403                err = 0;
 404                filp->f_owner.signum = arg;
 405                break;
 406        case F_GETLEASE:
 407                err = fcntl_getlease(filp);
 408                break;
 409        case F_SETLEASE:
 410                err = fcntl_setlease(fd, filp, arg);
 411                break;
 412        case F_NOTIFY:
 413                err = fcntl_dirnotify(fd, filp, arg);
 414                break;
 415        default:
 416                break;
 417        }
 418        return err;
 419}
 420
 421SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, unsigned long, arg)
 422{       
 423        struct file *filp;
 424        long err = -EBADF;
 425
 426        filp = fget(fd);
 427        if (!filp)
 428                goto out;
 429
 430        err = security_file_fcntl(filp, cmd, arg);
 431        if (err) {
 432                fput(filp);
 433                return err;
 434        }
 435
 436        err = do_fcntl(fd, cmd, arg, filp);
 437
 438        fput(filp);
 439out:
 440        return err;
 441}
 442
 443#if BITS_PER_LONG == 32
 444SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd,
 445                unsigned long, arg)
 446{       
 447        struct file * filp;
 448        long err;
 449
 450        err = -EBADF;
 451        filp = fget(fd);
 452        if (!filp)
 453                goto out;
 454
 455        err = security_file_fcntl(filp, cmd, arg);
 456        if (err) {
 457                fput(filp);
 458                return err;
 459        }
 460        err = -EBADF;
 461        
 462        switch (cmd) {
 463                case F_GETLK64:
 464                        err = fcntl_getlk64(filp, (struct flock64 __user *) arg);
 465                        break;
 466                case F_SETLK64:
 467                case F_SETLKW64:
 468                        err = fcntl_setlk64(fd, filp, cmd,
 469                                        (struct flock64 __user *) arg);
 470                        break;
 471                default:
 472                        err = do_fcntl(fd, cmd, arg, filp);
 473                        break;
 474        }
 475        fput(filp);
 476out:
 477        return err;
 478}
 479#endif
 480
 481/* Table to convert sigio signal codes into poll band bitmaps */
 482
 483static const long band_table[NSIGPOLL] = {
 484        POLLIN | POLLRDNORM,                    /* POLL_IN */
 485        POLLOUT | POLLWRNORM | POLLWRBAND,      /* POLL_OUT */
 486        POLLIN | POLLRDNORM | POLLMSG,          /* POLL_MSG */
 487        POLLERR,                                /* POLL_ERR */
 488        POLLPRI | POLLRDBAND,                   /* POLL_PRI */
 489        POLLHUP | POLLERR                       /* POLL_HUP */
 490};
 491
 492static inline int sigio_perm(struct task_struct *p,
 493                             struct fown_struct *fown, int sig)
 494{
 495        const struct cred *cred;
 496        int ret;
 497
 498        rcu_read_lock();
 499        cred = __task_cred(p);
 500        ret = ((fown->euid == 0 ||
 501                fown->euid == cred->suid || fown->euid == cred->uid ||
 502                fown->uid  == cred->suid || fown->uid  == cred->uid) &&
 503               !security_file_send_sigiotask(p, fown, sig));
 504        rcu_read_unlock();
 505        return ret;
 506}
 507
 508static void send_sigio_to_task(struct task_struct *p,
 509                               struct fown_struct *fown,
 510                               int fd, int reason, int group)
 511{
 512        /*
 513         * F_SETSIG can change ->signum lockless in parallel, make
 514         * sure we read it once and use the same value throughout.
 515         */
 516        int signum = ACCESS_ONCE(fown->signum);
 517
 518        if (!sigio_perm(p, fown, signum))
 519                return;
 520
 521        switch (signum) {
 522                siginfo_t si;
 523                default:
 524                        /* Queue a rt signal with the appropriate fd as its
 525                           value.  We use SI_SIGIO as the source, not 
 526                           SI_KERNEL, since kernel signals always get 
 527                           delivered even if we can't queue.  Failure to
 528                           queue in this case _should_ be reported; we fall
 529                           back to SIGIO in that case. --sct */
 530                        si.si_signo = signum;
 531                        si.si_errno = 0;
 532                        si.si_code  = reason;
 533                        /* Make sure we are called with one of the POLL_*
 534                           reasons, otherwise we could leak kernel stack into
 535                           userspace.  */
 536                        BUG_ON((reason & __SI_MASK) != __SI_POLL);
 537                        if (reason - POLL_IN >= NSIGPOLL)
 538                                si.si_band  = ~0L;
 539                        else
 540                                si.si_band = band_table[reason - POLL_IN];
 541                        si.si_fd    = fd;
 542                        if (!do_send_sig_info(signum, &si, p, group))
 543                                break;
 544                /* fall-through: fall back on the old plain SIGIO signal */
 545                case 0:
 546                        do_send_sig_info(SIGIO, SEND_SIG_PRIV, p, group);
 547        }
 548}
 549
 550void send_sigio(struct fown_struct *fown, int fd, int band)
 551{
 552        struct task_struct *p;
 553        enum pid_type type;
 554        struct pid *pid;
 555        int group = 1;
 556        
 557        read_lock(&fown->lock);
 558
 559        type = fown->pid_type;
 560        if (type == PIDTYPE_MAX) {
 561                group = 0;
 562                type = PIDTYPE_PID;
 563        }
 564
 565        pid = fown->pid;
 566        if (!pid)
 567                goto out_unlock_fown;
 568        
 569        read_lock(&tasklist_lock);
 570        do_each_pid_task(pid, type, p) {
 571                send_sigio_to_task(p, fown, fd, band, group);
 572        } while_each_pid_task(pid, type, p);
 573        read_unlock(&tasklist_lock);
 574 out_unlock_fown:
 575        read_unlock(&fown->lock);
 576}
 577
 578static void send_sigurg_to_task(struct task_struct *p,
 579                                struct fown_struct *fown, int group)
 580{
 581        if (sigio_perm(p, fown, SIGURG))
 582                do_send_sig_info(SIGURG, SEND_SIG_PRIV, p, group);
 583}
 584
 585int send_sigurg(struct fown_struct *fown)
 586{
 587        struct task_struct *p;
 588        enum pid_type type;
 589        struct pid *pid;
 590        int group = 1;
 591        int ret = 0;
 592        
 593        read_lock(&fown->lock);
 594
 595        type = fown->pid_type;
 596        if (type == PIDTYPE_MAX) {
 597                group = 0;
 598                type = PIDTYPE_PID;
 599        }
 600
 601        pid = fown->pid;
 602        if (!pid)
 603                goto out_unlock_fown;
 604
 605        ret = 1;
 606        
 607        read_lock(&tasklist_lock);
 608        do_each_pid_task(pid, type, p) {
 609                send_sigurg_to_task(p, fown, group);
 610        } while_each_pid_task(pid, type, p);
 611        read_unlock(&tasklist_lock);
 612 out_unlock_fown:
 613        read_unlock(&fown->lock);
 614        return ret;
 615}
 616
 617static DEFINE_RWLOCK(fasync_lock);
 618static struct kmem_cache *fasync_cache __read_mostly;
 619
 620/*
 621 * fasync_helper() is used by almost all character device drivers
 622 * to set up the fasync queue. It returns negative on error, 0 if it did
 623 * no changes and positive if it added/deleted the entry.
 624 */
 625int fasync_helper(int fd, struct file * filp, int on, struct fasync_struct **fapp)
 626{
 627        struct fasync_struct *fa, **fp;
 628        struct fasync_struct *new = NULL;
 629        int result = 0;
 630
 631        if (on) {
 632                new = kmem_cache_alloc(fasync_cache, GFP_KERNEL);
 633                if (!new)
 634                        return -ENOMEM;
 635        }
 636
 637        /*
 638         * We need to take f_lock first since it's not an IRQ-safe
 639         * lock.
 640         */
 641        spin_lock(&filp->f_lock);
 642        write_lock_irq(&fasync_lock);
 643        for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) {
 644                if (fa->fa_file == filp) {
 645                        if(on) {
 646                                fa->fa_fd = fd;
 647                                kmem_cache_free(fasync_cache, new);
 648                        } else {
 649                                *fp = fa->fa_next;
 650                                kmem_cache_free(fasync_cache, fa);
 651                                result = 1;
 652                        }
 653                        goto out;
 654                }
 655        }
 656
 657        if (on) {
 658                new->magic = FASYNC_MAGIC;
 659                new->fa_file = filp;
 660                new->fa_fd = fd;
 661                new->fa_next = *fapp;
 662                *fapp = new;
 663                result = 1;
 664        }
 665out:
 666        if (on)
 667                filp->f_flags |= FASYNC;
 668        else
 669                filp->f_flags &= ~FASYNC;
 670        write_unlock_irq(&fasync_lock);
 671        spin_unlock(&filp->f_lock);
 672        return result;
 673}
 674
 675EXPORT_SYMBOL(fasync_helper);
 676
 677void __kill_fasync(struct fasync_struct *fa, int sig, int band)
 678{
 679        while (fa) {
 680                struct fown_struct * fown;
 681                if (fa->magic != FASYNC_MAGIC) {
 682                        printk(KERN_ERR "kill_fasync: bad magic number in "
 683                               "fasync_struct!\n");
 684                        return;
 685                }
 686                fown = &fa->fa_file->f_owner;
 687                /* Don't send SIGURG to processes which have not set a
 688                   queued signum: SIGURG has its own default signalling
 689                   mechanism. */
 690                if (!(sig == SIGURG && fown->signum == 0))
 691                        send_sigio(fown, fa->fa_fd, band);
 692                fa = fa->fa_next;
 693        }
 694}
 695
 696EXPORT_SYMBOL(__kill_fasync);
 697
 698void kill_fasync(struct fasync_struct **fp, int sig, int band)
 699{
 700        /* First a quick test without locking: usually
 701         * the list is empty.
 702         */
 703        if (*fp) {
 704                read_lock(&fasync_lock);
 705                /* reread *fp after obtaining the lock */
 706                __kill_fasync(*fp, sig, band);
 707                read_unlock(&fasync_lock);
 708        }
 709}
 710EXPORT_SYMBOL(kill_fasync);
 711
 712static int __init fasync_init(void)
 713{
 714        fasync_cache = kmem_cache_create("fasync_cache",
 715                sizeof(struct fasync_struct), 0, SLAB_PANIC, NULL);
 716        return 0;
 717}
 718
 719module_init(fasync_init)
 720