linux/fs/pipe.c
<<
>>
Prefs
   1/*
   2 *  linux/fs/pipe.c
   3 *
   4 *  Copyright (C) 1991, 1992, 1999  Linus Torvalds
   5 */
   6
   7#include <linux/mm.h>
   8#include <linux/file.h>
   9#include <linux/poll.h>
  10#include <linux/slab.h>
  11#include <linux/module.h>
  12#include <linux/init.h>
  13#include <linux/fs.h>
  14#include <linux/log2.h>
  15#include <linux/mount.h>
  16#include <linux/magic.h>
  17#include <linux/pipe_fs_i.h>
  18#include <linux/uio.h>
  19#include <linux/highmem.h>
  20#include <linux/pagemap.h>
  21#include <linux/audit.h>
  22#include <linux/syscalls.h>
  23#include <linux/fcntl.h>
  24#include <linux/aio.h>
  25
  26#include <asm/uaccess.h>
  27#include <asm/ioctls.h>
  28
  29#include "internal.h"
  30
  31/*
  32 * The max size that a non-root user is allowed to grow the pipe. Can
  33 * be set by root in /proc/sys/fs/pipe-max-size
  34 */
  35unsigned int pipe_max_size = 1048576;
  36
  37/*
  38 * Minimum pipe size, as required by POSIX. It is initialised to
     * PAGE_SIZE, i.e. the smallest pipe is one page.
  39 */
  40unsigned int pipe_min_size = PAGE_SIZE;
  41
  42/*
  43 * We use a start+len construction, which provides full use of the 
  44 * allocated memory.
  45 * -- Florian Coosmann (FGC)
  46 * 
  47 * Reads with count = 0 should always return 0.
  48 * -- Julian Bradfield 1999-06-07.
  49 *
  50 * FIFOs and Pipes now generate SIGIO for both readers and writers.
  51 * -- Jeremy Elson <jelson@circlemud.org> 2001-08-16
  52 *
  53 * pipe_read & write cleanup
  54 * -- Manfred Spraul <manfred@colorfullife.com> 2002-05-09
  55 */
  56
  57static void pipe_lock_nested(struct pipe_inode_info *pipe, int subclass)
  58{
  59        if (pipe->files)
  60                mutex_lock_nested(&pipe->mutex, subclass);
  61}
  62
  63void pipe_lock(struct pipe_inode_info *pipe)
  64{
  65        /*
  66         * pipe_lock() nests non-pipe inode locks (for writing to a file)
  67         */
  68        pipe_lock_nested(pipe, I_MUTEX_PARENT);
  69}
  70EXPORT_SYMBOL(pipe_lock);
  71
  72void pipe_unlock(struct pipe_inode_info *pipe)
  73{
  74        if (pipe->files)
  75                mutex_unlock(&pipe->mutex);
  76}
  77EXPORT_SYMBOL(pipe_unlock);
  78
  79static inline void __pipe_lock(struct pipe_inode_info *pipe)
  80{
  81        mutex_lock_nested(&pipe->mutex, I_MUTEX_PARENT);
  82}
  83
  84static inline void __pipe_unlock(struct pipe_inode_info *pipe)
  85{
  86        mutex_unlock(&pipe->mutex);
  87}
  88
  89void pipe_double_lock(struct pipe_inode_info *pipe1,
  90                      struct pipe_inode_info *pipe2)
  91{
  92        BUG_ON(pipe1 == pipe2);
  93
  94        if (pipe1 < pipe2) {
  95                pipe_lock_nested(pipe1, I_MUTEX_PARENT);
  96                pipe_lock_nested(pipe2, I_MUTEX_CHILD);
  97        } else {
  98                pipe_lock_nested(pipe2, I_MUTEX_PARENT);
  99                pipe_lock_nested(pipe1, I_MUTEX_CHILD);
 100        }
 101}
 102
 103/* Drop the inode semaphore and wait for a pipe event, atomically */
 104void pipe_wait(struct pipe_inode_info *pipe)
 105{
 106        DEFINE_WAIT(wait);
 107
 108        /*
 109         * Pipes are system-local resources, so sleeping on them
 110         * is considered a noninteractive wait:
 111         */
 112        prepare_to_wait(&pipe->wait, &wait, TASK_INTERRUPTIBLE);
 113        pipe_unlock(pipe);
 114        schedule();
 115        finish_wait(&pipe->wait, &wait);
 116        pipe_lock(pipe);
 117}
 118
 119static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
 120                                  struct pipe_buffer *buf)
 121{
 122        struct page *page = buf->page;
 123
 124        /*
 125         * If nobody else uses this page, and we don't already have a
 126         * temporary page, let's keep track of it as a one-deep
 127         * allocation cache. (Otherwise just release our reference to it)
 128         */
 129        if (page_count(page) == 1 && !pipe->tmp_page)
 130                pipe->tmp_page = page;
 131        else
 132                page_cache_release(page);
 133}
 134
 135/**
 136 * generic_pipe_buf_steal - attempt to take ownership of a &pipe_buffer
 137 * @pipe:       the pipe that the buffer belongs to
 138 * @buf:        the buffer to attempt to steal
 139 *
 140 * Description:
 141 *      This function attempts to steal the &struct page attached to
 142 *      @buf. If successful, this function returns 0 and returns with
 143 *      the page locked. The caller may then reuse the page for whatever
 144 *      he wishes; the typical use is insertion into a different file
 145 *      page cache.
 146 */
 147int generic_pipe_buf_steal(struct pipe_inode_info *pipe,
 148                           struct pipe_buffer *buf)
 149{
 150        struct page *page = buf->page;
 151
 152        /*
 153         * A reference of one is golden, that means that the owner of this
 154         * page is the only one holding a reference to it. lock the page
 155         * and return OK.
 156         */
 157        if (page_count(page) == 1) {
 158                lock_page(page);
 159                return 0;
 160        }
 161
 162        return 1;
 163}
 164EXPORT_SYMBOL(generic_pipe_buf_steal);
 165
 166/**
 167 * generic_pipe_buf_get - get a reference to a &struct pipe_buffer
 168 * @pipe:       the pipe that the buffer belongs to
 169 * @buf:        the buffer to get a reference to
 170 *
 171 * Description:
 172 *      This function grabs an extra reference to @buf. It's used in
 173 *      in the tee() system call, when we duplicate the buffers in one
 174 *      pipe into another.
 175 */
 176void generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
 177{
 178        page_cache_get(buf->page);
 179}
 180EXPORT_SYMBOL(generic_pipe_buf_get);
 181
 182/**
 183 * generic_pipe_buf_confirm - verify contents of the pipe buffer
 184 * @info:       the pipe that the buffer belongs to
 185 * @buf:        the buffer to confirm
 186 *
 187 * Description:
 188 *      This function does nothing, because the generic pipe code uses
 189 *      pages that are always good when inserted into the pipe.
 190 */
 191int generic_pipe_buf_confirm(struct pipe_inode_info *info,
 192                             struct pipe_buffer *buf)
 193{
 194        return 0;
 195}
 196EXPORT_SYMBOL(generic_pipe_buf_confirm);
 197
 198/**
 199 * generic_pipe_buf_release - put a reference to a &struct pipe_buffer
 200 * @pipe:       the pipe that the buffer belongs to
 201 * @buf:        the buffer to put a reference to
 202 *
 203 * Description:
 204 *      This function releases a reference to @buf.
 205 */
 206void generic_pipe_buf_release(struct pipe_inode_info *pipe,
 207                              struct pipe_buffer *buf)
 208{
 209        page_cache_release(buf->page);
 210}
 211EXPORT_SYMBOL(generic_pipe_buf_release);
 212
 213static const struct pipe_buf_operations anon_pipe_buf_ops = {
 214        .can_merge = 1,
 215        .confirm = generic_pipe_buf_confirm,
 216        .release = anon_pipe_buf_release,
 217        .steal = generic_pipe_buf_steal,
 218        .get = generic_pipe_buf_get,
 219};
 220
 221static const struct pipe_buf_operations packet_pipe_buf_ops = {
 222        .can_merge = 0,
 223        .confirm = generic_pipe_buf_confirm,
 224        .release = anon_pipe_buf_release,
 225        .steal = generic_pipe_buf_steal,
 226        .get = generic_pipe_buf_get,
 227};
 228
    /*
     * pipe_read - ->read_iter handler: copy buffered pipe data into @to.
     * @iocb: I/O control block; only ->ki_filp is used here
     * @to:   destination iterator; its byte count is the requested read size
     *
     * Returns the number of bytes copied. Returns 0 for a zero-length read
     * or when the pipe is empty and has no writers. A negative errno is
     * returned only when no data was copied: the ->confirm() error, -EFAULT
     * on a short copy, -EAGAIN for an empty pipe with O_NONBLOCK set and no
     * writer sleeping in the kernel, or -ERESTARTSYS when a signal is
     * pending.
     */
 229static ssize_t
 230pipe_read(struct kiocb *iocb, struct iov_iter *to)
 231{
 232        size_t total_len = iov_iter_count(to);
 233        struct file *filp = iocb->ki_filp;
 234        struct pipe_inode_info *pipe = filp->private_data;
 235        int do_wakeup;
 236        ssize_t ret;
 237
 238        /* Null read succeeds. */
 239        if (unlikely(total_len == 0))
 240                return 0;
 241
 242        do_wakeup = 0;
 243        ret = 0;
 244        __pipe_lock(pipe);
            /* Each pass consumes from the buffer at ->curbuf, if there is one. */
 245        for (;;) {
 246                int bufs = pipe->nrbufs;
 247                if (bufs) {
 248                        int curbuf = pipe->curbuf;
 249                        struct pipe_buffer *buf = pipe->bufs + curbuf;
 250                        const struct pipe_buf_operations *ops = buf->ops;
 251                        size_t chars = buf->len;
 252                        size_t written;
 253                        int error;
 254
                            /* Never copy more than the caller still wants. */
 255                        if (chars > total_len)
 256                                chars = total_len;
 257
 258                        error = ops->confirm(pipe, buf);
 259                        if (error) {
                                    /* Report the error only if nothing was read so far. */
 260                                if (!ret)
 261                                        ret = error;
 262                                break;
 263                        }
 264
 265                        written = copy_page_to_iter(buf->page, buf->offset, chars, to);
                            /*
                             * Short copy: stop here. buf->offset and buf->len have not
                             * been advanced, and -EFAULT is reported only if no earlier
                             * data was returned.
                             */
 266                        if (unlikely(written < chars)) {
 267                                if (!ret)
 268                                        ret = -EFAULT;
 269                                break;
 270                        }
 271                        ret += chars;
 272                        buf->offset += chars;
 273                        buf->len -= chars;
 274
 275                        /* Was it a packet buffer? Clean up and exit */
                            /*
                             * Zeroing buf->len discards whatever is left of the packet;
                             * total_len = chars makes the subtraction below reach zero,
                             * which ends the read.
                             */
 276                        if (buf->flags & PIPE_BUF_FLAG_PACKET) {
 277                                total_len = chars;
 278                                buf->len = 0;
 279                        }
 280
                            /*
                             * Buffer fully consumed: release it, step ->curbuf to the
                             * next slot (index wraps via the ->buffers - 1 mask) and
                             * remember that writers should be woken.
                             */
 281                        if (!buf->len) {
 282                                buf->ops = NULL;
 283                                ops->release(pipe, buf);
 284                                curbuf = (curbuf + 1) & (pipe->buffers - 1);
 285                                pipe->curbuf = curbuf;
 286                                pipe->nrbufs = --bufs;
 287                                do_wakeup = 1;
 288                        }
 289                        total_len -= chars;
 290                        if (!total_len)
 291                                break;  /* common path: read succeeded */
 292                }
 293                if (bufs)       /* More to do? */
 294                        continue;
                    /* Pipe is empty. With no writers left, return what we have. */
 295                if (!pipe->writers)
 296                        break;
 297                if (!pipe->waiting_writers) {
 298                        /* syscall merging: Usually we must not sleep
 299                         * if O_NONBLOCK is set, or if we got some data.
 300                         * But if a writer sleeps in kernel space, then
 301                         * we can wait for that data without violating POSIX.
 302                         */
 303                        if (ret)
 304                                break;
 305                        if (filp->f_flags & O_NONBLOCK) {
 306                                ret = -EAGAIN;
 307                                break;
 308                        }
 309                }
 310                if (signal_pending(current)) {
 311                        if (!ret)
 312                                ret = -ERESTARTSYS;
 313                        break;
 314                }
                    /* Let writers know about freed buffers before going to sleep. */
 315                if (do_wakeup) {
 316                        wake_up_interruptible_sync_poll(&pipe->wait, POLLOUT | POLLWRNORM);
 317                        kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
 318                }
 319                pipe_wait(pipe);
 320        }
 321        __pipe_unlock(pipe);
 322
 323        /* Signal writers asynchronously that there is more room. */
 324        if (do_wakeup) {
 325                wake_up_interruptible_sync_poll(&pipe->wait, POLLOUT | POLLWRNORM);
 326                kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
 327        }
            /* file_accessed() is called only when data was actually read. */
 328        if (ret > 0)
 329                file_accessed(filp);
 330        return ret;
 331}
 332
 333static inline int is_packetized(struct file *file)
 334{
 335        return (file->f_flags & O_DIRECT) != 0;
 336}
 337
 338static ssize_t
 339pipe_write(struct kiocb *iocb, struct iov_iter *from)
 340{
 341        struct file *filp = iocb->ki_filp;
 342        struct pipe_inode_info *pipe = filp->private_data;
 343        ssize_t ret = 0;
 344        int do_wakeup = 0;
 345        size_t total_len = iov_iter_count(from);
 346        ssize_t chars;
 347
 348        /* Null write succeeds. */
 349        if (unlikely(total_len == 0))
 350                return 0;
 351
 352        __pipe_lock(pipe);
 353
 354        if (!pipe->readers) {
 355                send_sig(SIGPIPE, current, 0);
 356                ret = -EPIPE;
 357                goto out;
 358        }
 359
 360        /* We try to merge small writes */
 361        chars = total_len & (PAGE_SIZE-1); /* size of the last buffer */
 362        if (pipe->nrbufs && chars != 0) {
 363                int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) &
 364                                                        (pipe->buffers - 1);
 365                struct pipe_buffer *buf = pipe->bufs + lastbuf;
 366                const struct pipe_buf_operations *ops = buf->ops;
 367                int offset = buf->offset + buf->len;
 368
 369                if (ops->can_merge && offset + chars <= PAGE_SIZE) {
 370                        int error = ops->confirm(pipe, buf);
 371                        if (error)
 372                                goto out;
 373
 374                        ret = copy_page_from_iter(buf->page, offset, chars, from);
 375                        if (unlikely(ret < chars)) {
 376                                error = -EFAULT;
 377                                goto out;
 378                        }
 379                        do_wakeup = 1;
 380                        buf->len += chars;
 381                        ret = chars;
 382                        if (!iov_iter_count(from))
 383                                goto out;
 384                }
 385        }
 386
 387        for (;;) {
 388                int bufs;
 389
 390                if (!pipe->readers) {
 391                        send_sig(SIGPIPE, current, 0);
 392                        if (!ret)
 393                                ret = -EPIPE;
 394                        break;
 395                }
 396                bufs = pipe->nrbufs;
 397                if (bufs < pipe->buffers) {
 398                        int newbuf = (pipe->curbuf + bufs) & (pipe->buffers-1);
 399                        struct pipe_buffer *buf = pipe->bufs + newbuf;
 400                        struct page *page = pipe->tmp_page;
 401                        int copied;
 402
 403                        if (!page) {
 404                                page = alloc_page(GFP_HIGHUSER);
 405                                if (unlikely(!page)) {
 406                                        ret = ret ? : -ENOMEM;
 407                                        break;
 408                                }
 409                                pipe->tmp_page = page;
 410                        }
 411                        /* Always wake up, even if the copy fails. Otherwise
 412                         * we lock up (O_NONBLOCK-)readers that sleep due to
 413                         * syscall merging.
 414                         * FIXME! Is this really true?
 415                         */
 416                        do_wakeup = 1;
 417                        copied = copy_page_from_iter(page, 0, PAGE_SIZE, from);
 418                        if (unlikely(copied < PAGE_SIZE && iov_iter_count(from))) {
 419                                if (!ret)
 420                                        ret = -EFAULT;
 421                                break;
 422                        }
 423                        ret += copied;
 424
 425                        /* Insert it into the buffer array */
 426                        buf->page = page;
 427                        buf->ops = &anon_pipe_buf_ops;
 428                        buf->offset = 0;
 429                        buf->len = copied;
 430                        buf->flags = 0;
 431                        if (is_packetized(filp)) {
 432                                buf->ops = &packet_pipe_buf_ops;
 433                                buf->flags = PIPE_BUF_FLAG_PACKET;
 434                        }
 435                        pipe->nrbufs = ++bufs;
 436                        pipe->tmp_page = NULL;
 437
 438                        if (!iov_iter_count(from))
 439                                break;
 440                }
 441                if (bufs < pipe->buffers)
 442                        continue;
 443                if (filp->f_flags & O_NONBLOCK) {
 444                        if (!ret)
 445                                ret = -EAGAIN;
 446                        break;
 447                }
 448                if (signal_pending(current)) {
 449                        if (!ret)
 450                                ret = -ERESTARTSYS;
 451                        break;
 452                }
 453                if (do_wakeup) {
 454                        wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM);
 455                        kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
 456                        do_wakeup = 0;
 457                }
 458                pipe->waiting_writers++;
 459                pipe_wait(pipe);
 460                pipe->waiting_writers--;
 461        }
 462out:
 463        __pipe_unlock(pipe);
 464        if (do_wakeup) {
 465                wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM);
 466                kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
 467        }
 468        if (ret > 0 && sb_start_write_trylock(file_inode(filp)->i_sb)) {
 469                int err = file_update_time(filp);
 470                if (err)
 471                        ret = err;
 472                sb_end_write(file_inode(filp)->i_sb);
 473        }
 474        return ret;
 475}
 476
 477static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 478{
 479        struct pipe_inode_info *pipe = filp->private_data;
 480        int count, buf, nrbufs;
 481
 482        switch (cmd) {
 483                case FIONREAD:
 484                        __pipe_lock(pipe);
 485                        count = 0;
 486                        buf = pipe->curbuf;
 487                        nrbufs = pipe->nrbufs;
 488                        while (--nrbufs >= 0) {
 489                                count += pipe->bufs[buf].len;
 490                                buf = (buf+1) & (pipe->buffers - 1);
 491                        }
 492                        __pipe_unlock(pipe);
 493
 494                        return put_user(count, (int __user *)arg);
 495                default:
 496                        return -ENOIOCTLCMD;
 497        }
 498}
 499
 500/* No kernel lock held - fine */
 501static unsigned int
 502pipe_poll(struct file *filp, poll_table *wait)
 503{
 504        unsigned int mask;
 505        struct pipe_inode_info *pipe = filp->private_data;
 506        int nrbufs;
 507
 508        poll_wait(filp, &pipe->wait, wait);
 509
 510        /* Reading only -- no need for acquiring the semaphore.  */
 511        nrbufs = pipe->nrbufs;
 512        mask = 0;
 513        if (filp->f_mode & FMODE_READ) {
 514                mask = (nrbufs > 0) ? POLLIN | POLLRDNORM : 0;
 515                if (!pipe->writers && filp->f_version != pipe->w_counter)
 516                        mask |= POLLHUP;
 517        }
 518
 519        if (filp->f_mode & FMODE_WRITE) {
 520                mask |= (nrbufs < pipe->buffers) ? POLLOUT | POLLWRNORM : 0;
 521                /*
 522                 * Most Unices do not set POLLERR for FIFOs but on Linux they
 523                 * behave exactly like pipes for poll().
 524                 */
 525                if (!pipe->readers)
 526                        mask |= POLLERR;
 527        }
 528
 529        return mask;
 530}
 531
 532static void put_pipe_info(struct inode *inode, struct pipe_inode_info *pipe)
 533{
 534        int kill = 0;
 535
 536        spin_lock(&inode->i_lock);
 537        if (!--pipe->files) {
 538                inode->i_pipe = NULL;
 539                kill = 1;
 540        }
 541        spin_unlock(&inode->i_lock);
 542
 543        if (kill)
 544                free_pipe_info(pipe);
 545}
 546
 547static int
 548pipe_release(struct inode *inode, struct file *file)
 549{
 550        struct pipe_inode_info *pipe = file->private_data;
 551
 552        __pipe_lock(pipe);
 553        if (file->f_mode & FMODE_READ)
 554                pipe->readers--;
 555        if (file->f_mode & FMODE_WRITE)
 556                pipe->writers--;
 557
 558        if (pipe->readers || pipe->writers) {
 559                wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM | POLLERR | POLLHUP);
 560                kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
 561                kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
 562        }
 563        __pipe_unlock(pipe);
 564
 565        put_pipe_info(inode, pipe);
 566        return 0;
 567}
 568
 569static int
 570pipe_fasync(int fd, struct file *filp, int on)
 571{
 572        struct pipe_inode_info *pipe = filp->private_data;
 573        int retval = 0;
 574
 575        __pipe_lock(pipe);
 576        if (filp->f_mode & FMODE_READ)
 577                retval = fasync_helper(fd, filp, on, &pipe->fasync_readers);
 578        if ((filp->f_mode & FMODE_WRITE) && retval >= 0) {
 579                retval = fasync_helper(fd, filp, on, &pipe->fasync_writers);
 580                if (retval < 0 && (filp->f_mode & FMODE_READ))
 581                        /* this can happen only if on == T */
 582                        fasync_helper(-1, filp, 0, &pipe->fasync_readers);
 583        }
 584        __pipe_unlock(pipe);
 585        return retval;
 586}
 587
 588struct pipe_inode_info *alloc_pipe_info(void)
 589{
 590        struct pipe_inode_info *pipe;
 591
 592        pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL);
 593        if (pipe) {
 594                pipe->bufs = kzalloc(sizeof(struct pipe_buffer) * PIPE_DEF_BUFFERS, GFP_KERNEL);
 595                if (pipe->bufs) {
 596                        init_waitqueue_head(&pipe->wait);
 597                        pipe->r_counter = pipe->w_counter = 1;
 598                        pipe->buffers = PIPE_DEF_BUFFERS;
 599                        mutex_init(&pipe->mutex);
 600                        return pipe;
 601                }
 602                kfree(pipe);
 603        }
 604
 605        return NULL;
 606}
 607
 608void free_pipe_info(struct pipe_inode_info *pipe)
 609{
 610        int i;
 611
 612        for (i = 0; i < pipe->buffers; i++) {
 613                struct pipe_buffer *buf = pipe->bufs + i;
 614                if (buf->ops)
 615                        buf->ops->release(pipe, buf);
 616        }
 617        if (pipe->tmp_page)
 618                __free_page(pipe->tmp_page);
 619        kfree(pipe->bufs);
 620        kfree(pipe);
 621}
 622
    /*
     * Mount whose superblock (->mnt_sb) is used below for new pipe inodes
     * and dentries. It is not assigned anywhere in this part of the file.
     */
 623static struct vfsmount *pipe_mnt __read_mostly;
 624
 625/*
 626 * pipefs_dname() is called from d_path().
 627 */
 628static char *pipefs_dname(struct dentry *dentry, char *buffer, int buflen)
 629{
 630        return dynamic_dname(dentry, buffer, buflen, "pipe:[%lu]",
 631                                dentry->d_inode->i_ino);
 632}
 633
    /* Dentry operations table whose only callback is ->d_dname (pipefs_dname()). */
 634static const struct dentry_operations pipefs_dentry_operations = {
 635        .d_dname        = pipefs_dname,
 636};
 637
 638static struct inode * get_pipe_inode(void)
 639{
 640        struct inode *inode = new_inode_pseudo(pipe_mnt->mnt_sb);
 641        struct pipe_inode_info *pipe;
 642
 643        if (!inode)
 644                goto fail_inode;
 645
 646        inode->i_ino = get_next_ino();
 647
 648        pipe = alloc_pipe_info();
 649        if (!pipe)
 650                goto fail_iput;
 651
 652        inode->i_pipe = pipe;
 653        pipe->files = 2;
 654        pipe->readers = pipe->writers = 1;
 655        inode->i_fop = &pipefifo_fops;
 656
 657        /*
 658         * Mark the inode dirty from the very beginning,
 659         * that way it will never be moved to the dirty
 660         * list because "mark_inode_dirty()" will think
 661         * that it already _is_ on the dirty list.
 662         */
 663        inode->i_state = I_DIRTY;
 664        inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR;
 665        inode->i_uid = current_fsuid();
 666        inode->i_gid = current_fsgid();
 667        inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 668
 669        return inode;
 670
 671fail_iput:
 672        iput(inode);
 673
 674fail_inode:
 675        return NULL;
 676}
 677
 678int create_pipe_files(struct file **res, int flags)
 679{
 680        int err;
 681        struct inode *inode = get_pipe_inode();
 682        struct file *f;
 683        struct path path;
 684        static struct qstr name = { .name = "" };
 685
 686        if (!inode)
 687                return -ENFILE;
 688
 689        err = -ENOMEM;
 690        path.dentry = d_alloc_pseudo(pipe_mnt->mnt_sb, &name);
 691        if (!path.dentry)
 692                goto err_inode;
 693        path.mnt = mntget(pipe_mnt);
 694
 695        d_instantiate(path.dentry, inode);
 696
 697        err = -ENFILE;
 698        f = alloc_file(&path, FMODE_WRITE, &pipefifo_fops);
 699        if (IS_ERR(f))
 700                goto err_dentry;
 701
 702        f->f_flags = O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT));
 703        f->private_data = inode->i_pipe;
 704
 705        res[0] = alloc_file(&path, FMODE_READ, &pipefifo_fops);
 706        if (IS_ERR(res[0]))
 707                goto err_file;
 708
 709        path_get(&path);
 710        res[0]->private_data = inode->i_pipe;
 711        res[0]->f_flags = O_RDONLY | (flags & O_NONBLOCK);
 712        res[1] = f;
 713        return 0;
 714
 715err_file:
 716        put_filp(f);
 717err_dentry:
 718        free_pipe_info(inode->i_pipe);
 719        path_put(&path);
 720        return err;
 721
 722err_inode:
 723        free_pipe_info(inode->i_pipe);
 724        iput(inode);
 725        return err;
 726}
 727
 728static int __do_pipe_flags(int *fd, struct file **files, int flags)
 729{
 730        int error;
 731        int fdw, fdr;
 732
 733        if (flags & ~(O_CLOEXEC | O_NONBLOCK | O_DIRECT))
 734                return -EINVAL;
 735
 736        error = create_pipe_files(files, flags);
 737        if (error)
 738                return error;
 739
 740        error = get_unused_fd_flags(flags);
 741        if (error < 0)
 742                goto err_read_pipe;
 743        fdr = error;
 744
 745        error = get_unused_fd_flags(flags);
 746        if (error < 0)
 747                goto err_fdr;
 748        fdw = error;
 749
 750        audit_fd_pair(fdr, fdw);
 751        fd[0] = fdr;
 752        fd[1] = fdw;
 753        return 0;
 754
 755 err_fdr:
 756        put_unused_fd(fdr);
 757 err_read_pipe:
 758        fput(files[0]);
 759        fput(files[1]);
 760        return error;
 761}
 762
 763int do_pipe_flags(int *fd, int flags)
 764{
 765        struct file *files[2];
 766        int error = __do_pipe_flags(fd, files, flags);
 767        if (!error) {
 768                fd_install(fd[0], files[0]);
 769                fd_install(fd[1], files[1]);
 770        }
 771        return error;
 772}
 773
 774/*
 775 * sys_pipe() is the normal C calling standard for creating
 776 * a pipe. It's not the way Unix traditionally does this, though.
 777 */
 778SYSCALL_DEFINE2(pipe2, int __user *, fildes, int, flags)
 779{
 780        struct file *files[2];
 781        int fd[2];
 782        int error;
 783
 784        error = __do_pipe_flags(fd, files, flags);
 785        if (!error) {
 786                if (unlikely(copy_to_user(fildes, fd, sizeof(fd)))) {
 787                        fput(files[0]);
 788                        fput(files[1]);
 789                        put_unused_fd(fd[0]);
 790                        put_unused_fd(fd[1]);
 791                        error = -EFAULT;
 792                } else {
 793                        fd_install(fd[0], files[0]);
 794                        fd_install(fd[1], files[1]);
 795                }
 796        }
 797        return error;
 798}
 799
 800SYSCALL_DEFINE1(pipe, int __user *, fildes)
 801{
 802        return sys_pipe2(fildes, 0);
 803}
 804
 805static int wait_for_partner(struct pipe_inode_info *pipe, unsigned int *cnt)
 806{
 807        int cur = *cnt; 
 808
 809        while (cur == *cnt) {
 810                pipe_wait(pipe);
 811                if (signal_pending(current))
 812                        break;
 813        }
 814        return cur == *cnt ? -ERESTARTSYS : 0;
 815}
 816
 817static void wake_up_partner(struct pipe_inode_info *pipe)
 818{
 819        wake_up_interruptible(&pipe->wait);
 820}
 821
    /*
     * fifo_open - ->open handler of pipefifo_fops.
     * @inode: inode being opened; ->i_pipe points to the shared pipe state
     * @filp:  file being opened; ->private_data is set to the pipe
     *
     * Takes a ->files reference on the inode's pipe, allocating the pipe
     * first if the inode has none, and then updates the reader and writer
     * counts according to the access mode. When the inode does not live on
     * pipefs (is_pipe is false), a read-only or write-only open may wait
     * for the other side to show up.
     *
     * Returns 0 on success, -ENOMEM if no pipe can be allocated, -ENXIO for
     * a non-blocking write-only open without readers, -EINVAL when neither
     * FMODE_READ nor FMODE_WRITE is set, or -ERESTARTSYS when waiting for
     * the partner fails. On the error paths below the ->files reference is
     * dropped again.
     */
 822static int fifo_open(struct inode *inode, struct file *filp)
 823{
 824        struct pipe_inode_info *pipe;
 825        bool is_pipe = inode->i_sb->s_magic == PIPEFS_MAGIC;
 826        int ret;
 827
 828        filp->f_version = 0;
 829
            /*
             * Find the inode's pipe under i_lock. If there is none, allocate
             * one with the lock dropped and re-check afterwards: another
             * opener may have installed a pipe in the meantime, in which
             * case that one is used and ours is freed.
             */
 830        spin_lock(&inode->i_lock);
 831        if (inode->i_pipe) {
 832                pipe = inode->i_pipe;
 833                pipe->files++;
 834                spin_unlock(&inode->i_lock);
 835        } else {
 836                spin_unlock(&inode->i_lock);
 837                pipe = alloc_pipe_info();
 838                if (!pipe)
 839                        return -ENOMEM;
 840                pipe->files = 1;
 841                spin_lock(&inode->i_lock);
 842                if (unlikely(inode->i_pipe)) {
 843                        inode->i_pipe->files++;
 844                        spin_unlock(&inode->i_lock);
 845                        free_pipe_info(pipe);
 846                        pipe = inode->i_pipe;
 847                } else {
 848                        inode->i_pipe = pipe;
 849                        spin_unlock(&inode->i_lock);
 850                }
 851        }
 852        filp->private_data = pipe;
 853        /* OK, we have a pipe and it's pinned down */
 854
 855        __pipe_lock(pipe);
 856
 857        /* We can only do regular read/write on fifos */
 858        filp->f_mode &= (FMODE_READ | FMODE_WRITE);
 859
 860        switch (filp->f_mode) {
 861        case FMODE_READ:
 862        /*
 863         *  O_RDONLY
 864         *  POSIX.1 says that O_NONBLOCK means return with the FIFO
 865         *  opened, even when there is no process writing the FIFO.
 866         */
 867                pipe->r_counter++;
                    /* First reader: wake anybody sleeping on the pipe. */
 868                if (pipe->readers++ == 0)
 869                        wake_up_partner(pipe);
 870
 871                if (!is_pipe && !pipe->writers) {
 872                        if ((filp->f_flags & O_NONBLOCK)) {
 873                                /* suppress POLLHUP until we have
 874                                 * seen a writer */
 875                                filp->f_version = pipe->w_counter;
 876                        } else {
                                    /* Blocking open: wait until w_counter changes. */
 877                                if (wait_for_partner(pipe, &pipe->w_counter))
 878                                        goto err_rd;
 879                        }
 880                }
 881                break;
 882        
 883        case FMODE_WRITE:
 884        /*
 885         *  O_WRONLY
 886         *  POSIX.1 says that O_NONBLOCK means return -1 with
 887         *  errno=ENXIO when there is no process reading the FIFO.
 888         */
 889                ret = -ENXIO;
 890                if (!is_pipe && (filp->f_flags & O_NONBLOCK) && !pipe->readers)
 891                        goto err;
 892
 893                pipe->w_counter++;
                    /* First writer: wake anybody sleeping on the pipe. */
 894                if (!pipe->writers++)
 895                        wake_up_partner(pipe);
 896
 897                if (!is_pipe && !pipe->readers) {
                            /* Wait until r_counter changes. */
 898                        if (wait_for_partner(pipe, &pipe->r_counter))
 899                                goto err_wr;
 900                }
 901                break;
 902        
 903        case FMODE_READ | FMODE_WRITE:
 904        /*
 905         *  O_RDWR
 906         *  POSIX.1 leaves this case "undefined" when O_NONBLOCK is set.
 907         *  This implementation will NEVER block on a O_RDWR open, since
 908         *  the process can at least talk to itself.
 909         */
 910
 911                pipe->readers++;
 912                pipe->writers++;
 913                pipe->r_counter++;
 914                pipe->w_counter++;
 915                if (pipe->readers == 1 || pipe->writers == 1)
 916                        wake_up_partner(pipe);
 917                break;
 918
 919        default:
 920                ret = -EINVAL;
 921                goto err;
 922        }
 923
 924        /* Ok! */
 925        __pipe_unlock(pipe);
 926        return 0;
 927
            /* Undo the reader count taken above; wake sleepers if it was the last. */
 928err_rd:
 929        if (!--pipe->readers)
 930                wake_up_interruptible(&pipe->wait);
 931        ret = -ERESTARTSYS;
 932        goto err;
 933
            /* Undo the writer count taken above; wake sleepers if it was the last. */
 934err_wr:
 935        if (!--pipe->writers)
 936                wake_up_interruptible(&pipe->wait);
 937        ret = -ERESTARTSYS;
 938        goto err;
 939
            /* Common failure exit: unlock and drop the ->files reference. */
 940err:
 941        __pipe_unlock(pipe);
 942
 943        put_pipe_info(inode, pipe);
 944        return ret;
 945}
 946
 947const struct file_operations pipefifo_fops = {
 948        .open           = fifo_open,
 949        .llseek         = no_llseek,
 950        .read           = new_sync_read,
 951        .read_iter      = pipe_read,
 952        .write          = new_sync_write,
 953        .write_iter     = pipe_write,
 954        .poll           = pipe_poll,
 955        .unlocked_ioctl = pipe_ioctl,
 956        .release        = pipe_release,
 957        .fasync         = pipe_fasync,
 958};
 959
 960/*
 961 * Allocate a new array of pipe buffers and copy the info over. Returns the
 962 * pipe size if successful, or return -ERROR on error.
 963 */
 964static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages)
 965{
 966        struct pipe_buffer *bufs;
 967
 968        /*
 969         * We can shrink the pipe, if arg >= pipe->nrbufs. Since we don't
 970         * expect a lot of shrink+grow operations, just free and allocate
 971         * again like we would do for growing. If the pipe currently
 972         * contains more buffers than arg, then return busy.
 973         */
 974        if (nr_pages < pipe->nrbufs)
 975                return -EBUSY;
 976
 977        bufs = kcalloc(nr_pages, sizeof(*bufs), GFP_KERNEL | __GFP_NOWARN);
 978        if (unlikely(!bufs))
 979                return -ENOMEM;
 980
 981        /*
 982         * The pipe array wraps around, so just start the new one at zero
 983         * and adjust the indexes.
 984         */
 985        if (pipe->nrbufs) {
 986                unsigned int tail;
 987                unsigned int head;
 988
 989                tail = pipe->curbuf + pipe->nrbufs;
 990                if (tail < pipe->buffers)
 991                        tail = 0;
 992                else
 993                        tail &= (pipe->buffers - 1);
 994
 995                head = pipe->nrbufs - tail;
 996                if (head)
 997                        memcpy(bufs, pipe->bufs + pipe->curbuf, head * sizeof(struct pipe_buffer));
 998                if (tail)
 999                        memcpy(bufs + head, pipe->bufs, tail * sizeof(struct pipe_buffer));
1000        }
1001
1002        pipe->curbuf = 0;
1003        kfree(pipe->bufs);
1004        pipe->bufs = bufs;
1005        pipe->buffers = nr_pages;
1006        return nr_pages * PAGE_SIZE;
1007}
1008
1009/*
1010 * Currently we rely on the pipe array holding a power-of-2 number
1011 * of pages.
1012 */
1013static inline unsigned int round_pipe_size(unsigned int size)
1014{
1015        unsigned long nr_pages;
1016
1017        nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
1018        return roundup_pow_of_two(nr_pages) << PAGE_SHIFT;
1019}
1020
1021/*
1022 * This should work even if CONFIG_PROC_FS isn't set, as proc_dointvec_minmax
1023 * will return an error.
1024 */
1025int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf,
1026                 size_t *lenp, loff_t *ppos)
1027{
1028        int ret;
1029
1030        ret = proc_dointvec_minmax(table, write, buf, lenp, ppos);
1031        if (ret < 0 || !write)
1032                return ret;
1033
1034        pipe_max_size = round_pipe_size(pipe_max_size);
1035        return ret;
1036}
1037
1038/*
1039 * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same
1040 * location, so checking ->i_pipe is not enough to verify that this is a
1041 * pipe.
1042 */
1043struct pipe_inode_info *get_pipe_info(struct file *file)
1044{
1045        return file->f_op == &pipefifo_fops ? file->private_data : NULL;
1046}
1047
1048long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
1049{
1050        struct pipe_inode_info *pipe;
1051        long ret;
1052
1053        pipe = get_pipe_info(file);
1054        if (!pipe)
1055                return -EBADF;
1056
1057        __pipe_lock(pipe);
1058
1059        switch (cmd) {
1060        case F_SETPIPE_SZ: {
1061                unsigned int size, nr_pages;
1062
1063                size = round_pipe_size(arg);
1064                nr_pages = size >> PAGE_SHIFT;
1065
1066                ret = -EINVAL;
1067                if (!nr_pages)
1068                        goto out;
1069
1070                if (!capable(CAP_SYS_RESOURCE) && size > pipe_max_size) {
1071                        ret = -EPERM;
1072                        goto out;
1073                }
1074                ret = pipe_set_size(pipe, nr_pages);
1075                break;
1076                }
1077        case F_GETPIPE_SZ:
1078                ret = pipe->buffers * PAGE_SIZE;
1079                break;
1080        default:
1081                ret = -EINVAL;
1082                break;
1083        }
1084
1085out:
1086        __pipe_unlock(pipe);
1087        return ret;
1088}
1089
1090static const struct super_operations pipefs_ops = {
1091        .destroy_inode = free_inode_nonrcu,
1092        .statfs = simple_statfs,
1093};
1094
1095/*
1096 * pipefs should _never_ be mounted by userland - too much of security hassle,
1097 * no real gain from having the whole whorehouse mounted. So we don't need
1098 * any operations on the root directory. However, we need a non-trivial
1099 * d_name - pipe: will go nicely and kill the special-casing in procfs.
1100 */
1101static struct dentry *pipefs_mount(struct file_system_type *fs_type,
1102                         int flags, const char *dev_name, void *data)
1103{
1104        return mount_pseudo(fs_type, "pipe:", &pipefs_ops,
1105                        &pipefs_dentry_operations, PIPEFS_MAGIC);
1106}
1107
1108static struct file_system_type pipe_fs_type = {
1109        .name           = "pipefs",
1110        .mount          = pipefs_mount,
1111        .kill_sb        = kill_anon_super,
1112};
1113
1114static int __init init_pipe_fs(void)
1115{
1116        int err = register_filesystem(&pipe_fs_type);
1117
1118        if (!err) {
1119                pipe_mnt = kern_mount(&pipe_fs_type);
1120                if (IS_ERR(pipe_mnt)) {
1121                        err = PTR_ERR(pipe_mnt);
1122                        unregister_filesystem(&pipe_fs_type);
1123                }
1124        }
1125        return err;
1126}
1127
1128fs_initcall(init_pipe_fs);
1129