/*
 *  linux/fs/pipe.c
 *
 *  Copyright (C) 1991, 1992, 1999  Linus Torvalds
 */

#include <linux/mm.h>
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/log2.h>
#include <linux/mount.h>
#include <linux/magic.h>
#include <linux/pipe_fs_i.h>
#include <linux/uio.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/audit.h>
#include <linux/syscalls.h>
#include <linux/fcntl.h>
#include <linux/aio.h>

#include <asm/uaccess.h>
#include <asm/ioctls.h>

#include "internal.h"

/*
 * The maximum size a non-root user is allowed to grow a pipe to. Can
 * be set by root in /proc/sys/fs/pipe-max-size.
 */
unsigned int pipe_max_size = 1048576;

/*
 * Minimum pipe size, as required by POSIX
 */
unsigned int pipe_min_size = PAGE_SIZE;

/*
 * We use a start+len construction, which provides full use of the
 * allocated memory.
 * -- Florian Coosmann (FGC)
 *
 * Reads with count = 0 should always return 0.
 * -- Julian Bradfield 1999-06-07.
 *
 * FIFOs and Pipes now generate SIGIO for both readers and writers.
 * -- Jeremy Elson <jelson@circlemud.org> 2001-08-16
 *
 * pipe_read & write cleanup
 * -- Manfred Spraul <manfred@colorfullife.com> 2002-05-09
 */

static void pipe_lock_nested(struct pipe_inode_info *pipe, int subclass)
{
        if (pipe->files)
                mutex_lock_nested(&pipe->mutex, subclass);
}

void pipe_lock(struct pipe_inode_info *pipe)
{
        /*
         * pipe_lock() nests non-pipe inode locks (for writing to a file)
         */
        pipe_lock_nested(pipe, I_MUTEX_PARENT);
}
EXPORT_SYMBOL(pipe_lock);

void pipe_unlock(struct pipe_inode_info *pipe)
{
        if (pipe->files)
                mutex_unlock(&pipe->mutex);
}
EXPORT_SYMBOL(pipe_unlock);

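/*
 * Unconditional variants, for callers that know pipe->files is nonzero
 * (i.e. the pipe is still reachable through an open struct file).
 */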
static inline void __pipe_lock(struct pipe_inode_info *pipe)
{
        mutex_lock_nested(&pipe->mutex, I_MUTEX_PARENT);
}

static inline void __pipe_unlock(struct pipe_inode_info *pipe)
{
        mutex_unlock(&pipe->mutex);
}

void pipe_double_lock(struct pipe_inode_info *pipe1,
                      struct pipe_inode_info *pipe2)
{
        BUG_ON(pipe1 == pipe2);

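        /*
         * Always take the locks in ascending address order, so that two
         * tasks locking the same pair of pipes cannot deadlock ABBA-style.
         */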
        if (pipe1 < pipe2) {
                pipe_lock_nested(pipe1, I_MUTEX_PARENT);
                pipe_lock_nested(pipe2, I_MUTEX_CHILD);
        } else {
                pipe_lock_nested(pipe2, I_MUTEX_PARENT);
                pipe_lock_nested(pipe1, I_MUTEX_CHILD);
        }
}

/* Drop the inode semaphore and wait for a pipe event, atomically */
void pipe_wait(struct pipe_inode_info *pipe)
{
        DEFINE_WAIT(wait);

        /*
         * Pipes are system-local resources, so sleeping on them
         * is considered a noninteractive wait:
         */
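        /*
         * Register on the wait queue before dropping the mutex: a wake-up
         * racing with the unlock then simply makes schedule() return at
         * once rather than being missed.
         */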
        prepare_to_wait(&pipe->wait, &wait, TASK_INTERRUPTIBLE);
        pipe_unlock(pipe);
        schedule();
        finish_wait(&pipe->wait, &wait);
        pipe_lock(pipe);
}

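/*
 * Copy between a pipe page and an iovec, advancing *offset and *remaining
 * as bytes are transferred. On a fault the counters reflect what was
 * actually copied, so a retry (e.g. falling back from the atomic path to
 * the sleeping one) resumes exactly where the failed attempt stopped
 * instead of re-copying data and over-running the iovec.
 */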
static int
pipe_iov_copy_from_user(void *addr, int *offset, struct iovec *iov,
                        size_t *remaining, int atomic)
{
        unsigned long copy;

        while (*remaining > 0) {
                while (!iov->iov_len)
                        iov++;
                copy = min_t(unsigned long, *remaining, iov->iov_len);

                if (atomic) {
                        if (__copy_from_user_inatomic(addr + *offset,
                                                      iov->iov_base, copy))
                                return -EFAULT;
                } else {
                        if (copy_from_user(addr + *offset,
                                           iov->iov_base, copy))
                                return -EFAULT;
                }
                *offset += copy;
                *remaining -= copy;
                iov->iov_base += copy;
                iov->iov_len -= copy;
        }
        return 0;
}

static int
pipe_iov_copy_to_user(struct iovec *iov, void *addr, int *offset,
                      size_t *remaining, int atomic)
{
        unsigned long copy;

        while (*remaining > 0) {
                while (!iov->iov_len)
                        iov++;
                copy = min_t(unsigned long, *remaining, iov->iov_len);

                if (atomic) {
                        if (__copy_to_user_inatomic(iov->iov_base,
                                                    addr + *offset, copy))
                                return -EFAULT;
                } else {
                        if (copy_to_user(iov->iov_base,
                                         addr + *offset, copy))
                                return -EFAULT;
                }
                *offset += copy;
                *remaining -= copy;
                iov->iov_base += copy;
                iov->iov_len -= copy;
        }
        return 0;
}

/*
 * Attempt to pre-fault in the user memory, so we can use atomic copies.
 * Returns the number of bytes not faulted in.
 */
static int iov_fault_in_pages_write(struct iovec *iov, unsigned long len)
{
        while (!iov->iov_len)
                iov++;

        while (len > 0) {
                unsigned long this_len;

                this_len = min_t(unsigned long, len, iov->iov_len);
                if (fault_in_pages_writeable(iov->iov_base, this_len))
                        break;

                len -= this_len;
                iov++;
        }

        return len;
}

/*
 * Pre-fault in the user memory, so we can use atomic copies.
 */
static void iov_fault_in_pages_read(struct iovec *iov, unsigned long len)
{
        while (!iov->iov_len)
                iov++;

        while (len > 0) {
                unsigned long this_len;

                this_len = min_t(unsigned long, len, iov->iov_len);
                fault_in_pages_readable(iov->iov_base, this_len);
                len -= this_len;
                iov++;
        }
}

static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
                                  struct pipe_buffer *buf)
{
        struct page *page = buf->page;

        /*
         * If nobody else uses this page, and we don't already have a
         * temporary page, let's keep track of it as a one-deep
         * allocation cache. (Otherwise just release our reference to it)
         */
        if (page_count(page) == 1 && !pipe->tmp_page)
                pipe->tmp_page = page;
        else
                page_cache_release(page);
}

/**
 * generic_pipe_buf_map - virtually map a pipe buffer
 * @pipe:       the pipe that the buffer belongs to
 * @buf:        the buffer that should be mapped
 * @atomic:     whether to use an atomic map
 *
 * Description:
 *      This function returns a kernel virtual address mapping for the
 *      pipe_buffer passed in @buf. If @atomic is set, an atomic map is provided
 *      and the caller has to be careful not to fault before calling
 *      the unmap function.
 *
 *      Note that this function calls kmap_atomic() if @atomic != 0.
 */
void *generic_pipe_buf_map(struct pipe_inode_info *pipe,
                           struct pipe_buffer *buf, int atomic)
{
        if (atomic) {
                buf->flags |= PIPE_BUF_FLAG_ATOMIC;
                return kmap_atomic(buf->page);
        }

        return kmap(buf->page);
}
EXPORT_SYMBOL(generic_pipe_buf_map);

/**
 * generic_pipe_buf_unmap - unmap a previously mapped pipe buffer
 * @pipe:       the pipe that the buffer belongs to
 * @buf:        the buffer that should be unmapped
 * @map_data:   the data that the mapping function returned
 *
 * Description:
 *      This function undoes the mapping that ->map() provided.
 */
void generic_pipe_buf_unmap(struct pipe_inode_info *pipe,
                            struct pipe_buffer *buf, void *map_data)
{
        if (buf->flags & PIPE_BUF_FLAG_ATOMIC) {
                buf->flags &= ~PIPE_BUF_FLAG_ATOMIC;
                kunmap_atomic(map_data);
        } else
                kunmap(buf->page);
}
EXPORT_SYMBOL(generic_pipe_buf_unmap);

/**
 * generic_pipe_buf_steal - attempt to take ownership of a &pipe_buffer
 * @pipe:       the pipe that the buffer belongs to
 * @buf:        the buffer to attempt to steal
 *
 * Description:
 *      This function attempts to steal the &struct page attached to
 *      @buf. If successful, this function returns 0 and returns with
 *      the page locked. The caller may then reuse the page for whatever
 *      he wishes; the typical use is insertion into a different file
 *      page cache.
 */
int generic_pipe_buf_steal(struct pipe_inode_info *pipe,
                           struct pipe_buffer *buf)
{
        struct page *page = buf->page;

        /*
         * A reference count of one is golden: it means that the owner of
         * this page is the only one holding a reference to it. Lock the
         * page and return OK.
         */
        if (page_count(page) == 1) {
                lock_page(page);
                return 0;
        }

        return 1;
}
EXPORT_SYMBOL(generic_pipe_buf_steal);

/**
 * generic_pipe_buf_get - get a reference to a &struct pipe_buffer
 * @pipe:       the pipe that the buffer belongs to
 * @buf:        the buffer to get a reference to
 *
 * Description:
 *      This function grabs an extra reference to @buf. It's used in
 *      the tee() system call, when we duplicate the buffers in one
 * pipe into another.
 */
void generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
{
        page_cache_get(buf->page);
}
EXPORT_SYMBOL(generic_pipe_buf_get);

/**
 * generic_pipe_buf_confirm - verify contents of the pipe buffer
 * @info:       the pipe that the buffer belongs to
 * @buf:        the buffer to confirm
 *
 * Description:
 *      This function does nothing, because the generic pipe code uses
 *      pages that are always good when inserted into the pipe.
 */
int generic_pipe_buf_confirm(struct pipe_inode_info *info,
                             struct pipe_buffer *buf)
{
        return 0;
}
EXPORT_SYMBOL(generic_pipe_buf_confirm);

/**
 * generic_pipe_buf_release - put a reference to a &struct pipe_buffer
 * @pipe:       the pipe that the buffer belongs to
 * @buf:        the buffer to put a reference to
 *
 * Description:
 *      This function releases a reference to @buf.
 */
void generic_pipe_buf_release(struct pipe_inode_info *pipe,
                              struct pipe_buffer *buf)
{
        page_cache_release(buf->page);
}
EXPORT_SYMBOL(generic_pipe_buf_release);

static const struct pipe_buf_operations anon_pipe_buf_ops = {
        .can_merge = 1,
        .map = generic_pipe_buf_map,
        .unmap = generic_pipe_buf_unmap,
        .confirm = generic_pipe_buf_confirm,
        .release = anon_pipe_buf_release,
        .steal = generic_pipe_buf_steal,
        .get = generic_pipe_buf_get,
};

static const struct pipe_buf_operations packet_pipe_buf_ops = {
        .can_merge = 0,
        .map = generic_pipe_buf_map,
        .unmap = generic_pipe_buf_unmap,
        .confirm = generic_pipe_buf_confirm,
        .release = anon_pipe_buf_release,
        .steal = generic_pipe_buf_steal,
        .get = generic_pipe_buf_get,
};

static ssize_t
pipe_read(struct kiocb *iocb, const struct iovec *_iov,
           unsigned long nr_segs, loff_t pos)
{
        struct file *filp = iocb->ki_filp;
        struct pipe_inode_info *pipe = filp->private_data;
        int do_wakeup;
        ssize_t ret;
        struct iovec *iov = (struct iovec *)_iov;
        size_t total_len;

        total_len = iov_length(iov, nr_segs);
        /* Null read succeeds. */
        if (unlikely(total_len == 0))
                return 0;

        do_wakeup = 0;
        ret = 0;
        __pipe_lock(pipe);
        for (;;) {
                int bufs = pipe->nrbufs;
                if (bufs) {
                        int curbuf = pipe->curbuf;
                        struct pipe_buffer *buf = pipe->bufs + curbuf;
                        const struct pipe_buf_operations *ops = buf->ops;
                        void *addr;
                        size_t chars = buf->len, remaining;
                        int error, atomic;
                        int offset;

                        if (chars > total_len)
                                chars = total_len;

                        error = ops->confirm(pipe, buf);
                        if (error) {
                                if (!ret)
                                        ret = error;
                                break;
                        }

                        atomic = !iov_fault_in_pages_write(iov, chars);
                        remaining = chars;
                        offset = buf->offset;
redo:
                        addr = ops->map(pipe, buf, atomic);
                        error = pipe_iov_copy_to_user(iov, addr, &offset,
                                                      &remaining, atomic);
                        ops->unmap(pipe, buf, addr);
                        if (unlikely(error)) {
                                /*
                                 * Just retry with the slow path if we failed.
                                 */
                                if (atomic) {
                                        atomic = 0;
                                        goto redo;
                                }
                                if (!ret)
                                        ret = error;
                                break;
                        }
                        ret += chars;
                        buf->offset += chars;
                        buf->len -= chars;

                        /* Was it a packet buffer? Clean up and exit */
                        if (buf->flags & PIPE_BUF_FLAG_PACKET) {
                                total_len = chars;
                                buf->len = 0;
                        }

                        if (!buf->len) {
                                buf->ops = NULL;
                                ops->release(pipe, buf);
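                                /*
                                 * pipe->buffers is a power of two, so
                                 * masking with (pipe->buffers - 1) wraps
                                 * the index around the ring.
                                 */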
                                curbuf = (curbuf + 1) & (pipe->buffers - 1);
                                pipe->curbuf = curbuf;
                                pipe->nrbufs = --bufs;
                                do_wakeup = 1;
                        }
                        total_len -= chars;
                        if (!total_len)
                                break;  /* common path: read succeeded */
                }
                if (bufs)       /* More to do? */
                        continue;
                if (!pipe->writers)
                        break;
                if (!pipe->waiting_writers) {
                        /* syscall merging: Usually we must not sleep
                         * if O_NONBLOCK is set, or if we got some data.
                         * But if a writer sleeps in kernel space, then
                         * we can wait for that data without violating POSIX.
                         */
                        if (ret)
                                break;
                        if (filp->f_flags & O_NONBLOCK) {
                                ret = -EAGAIN;
                                break;
                        }
                }
                if (signal_pending(current)) {
                        if (!ret)
                                ret = -ERESTARTSYS;
                        break;
                }
                if (do_wakeup) {
                        wake_up_interruptible_sync_poll(&pipe->wait, POLLOUT | POLLWRNORM);
                        kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
                }
                pipe_wait(pipe);
        }
        __pipe_unlock(pipe);

        /* Signal writers asynchronously that there is more room. */
        if (do_wakeup) {
                wake_up_interruptible_sync_poll(&pipe->wait, POLLOUT | POLLWRNORM);
                kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
        }
        if (ret > 0)
                file_accessed(filp);
        return ret;
}

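/*
 * Pipes opened with O_DIRECT work in "packet" mode: each write becomes
 * a discrete buffer and a read consumes at most one packet.
 */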
static inline int is_packetized(struct file *file)
{
        return (file->f_flags & O_DIRECT) != 0;
}

static ssize_t
pipe_write(struct kiocb *iocb, const struct iovec *_iov,
            unsigned long nr_segs, loff_t ppos)
{
        struct file *filp = iocb->ki_filp;
        struct pipe_inode_info *pipe = filp->private_data;
        ssize_t ret;
        int do_wakeup;
        struct iovec *iov = (struct iovec *)_iov;
        size_t total_len;
        ssize_t chars;

        total_len = iov_length(iov, nr_segs);
        /* Null write succeeds. */
        if (unlikely(total_len == 0))
                return 0;

        do_wakeup = 0;
        ret = 0;
        __pipe_lock(pipe);

        if (!pipe->readers) {
                send_sig(SIGPIPE, current, 0);
                ret = -EPIPE;
                goto out;
        }

        /* We try to merge small writes */
        chars = total_len & (PAGE_SIZE-1); /* size of the last buffer */
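        /*
         * Only the sub-page remainder of the write (taken from its head)
         * can be merged; the page-sized rest goes into fresh buffers in
         * the loop below.
         */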
        if (pipe->nrbufs && chars != 0) {
                int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) &
                                                        (pipe->buffers - 1);
                struct pipe_buffer *buf = pipe->bufs + lastbuf;
                const struct pipe_buf_operations *ops = buf->ops;
                int offset = buf->offset + buf->len;

                if (ops->can_merge && offset + chars <= PAGE_SIZE) {
                        int error, atomic = 1;
                        void *addr;
                        size_t remaining = chars;

                        error = ops->confirm(pipe, buf);
                        if (error)
                                goto out;

                        iov_fault_in_pages_read(iov, chars);
redo1:
                        addr = ops->map(pipe, buf, atomic);
                        error = pipe_iov_copy_from_user(addr, &offset, iov,
                                                        &remaining, atomic);
                        ops->unmap(pipe, buf, addr);
                        ret = error;
                        do_wakeup = 1;
                        if (error) {
                                if (atomic) {
                                        atomic = 0;
                                        goto redo1;
                                }
                                goto out;
                        }
                        buf->len += chars;
                        total_len -= chars;
                        ret = chars;
                        if (!total_len)
                                goto out;
                }
        }

        for (;;) {
                int bufs;

                if (!pipe->readers) {
                        send_sig(SIGPIPE, current, 0);
                        if (!ret)
                                ret = -EPIPE;
                        break;
                }
                bufs = pipe->nrbufs;
                if (bufs < pipe->buffers) {
                        int newbuf = (pipe->curbuf + bufs) & (pipe->buffers-1);
                        struct pipe_buffer *buf = pipe->bufs + newbuf;
                        struct page *page = pipe->tmp_page;
                        char *src;
                        int error, atomic = 1;
                        int offset = 0;
                        size_t remaining;

                        if (!page) {
                                page = alloc_page(GFP_HIGHUSER);
                                if (unlikely(!page)) {
                                        ret = ret ? : -ENOMEM;
                                        break;
                                }
                                pipe->tmp_page = page;
                        }
                        /* Always wake up, even if the copy fails. Otherwise
                         * we lock up (O_NONBLOCK-)readers that sleep due to
                         * syscall merging.
                         * FIXME! Is this really true?
                         */
                        do_wakeup = 1;
                        chars = PAGE_SIZE;
                        if (chars > total_len)
                                chars = total_len;
                        remaining = chars;

                        iov_fault_in_pages_read(iov, chars);
redo2:
                        if (atomic)
                                src = kmap_atomic(page);
                        else
                                src = kmap(page);

                        error = pipe_iov_copy_from_user(src, &offset, iov,
                                                        &remaining, atomic);
                        if (atomic)
                                kunmap_atomic(src);
                        else
                                kunmap(page);

                        if (unlikely(error)) {
                                if (atomic) {
                                        atomic = 0;
                                        goto redo2;
                                }
                                if (!ret)
                                        ret = error;
                                break;
                        }
                        ret += chars;

                        /* Insert it into the buffer array */
                        buf->page = page;
                        buf->ops = &anon_pipe_buf_ops;
                        buf->offset = 0;
                        buf->len = chars;
                        buf->flags = 0;
                        if (is_packetized(filp)) {
                                buf->ops = &packet_pipe_buf_ops;
                                buf->flags = PIPE_BUF_FLAG_PACKET;
                        }
                        pipe->nrbufs = ++bufs;
                        pipe->tmp_page = NULL;

                        total_len -= chars;
                        if (!total_len)
                                break;
                }
                if (bufs < pipe->buffers)
                        continue;
                if (filp->f_flags & O_NONBLOCK) {
                        if (!ret)
                                ret = -EAGAIN;
                        break;
                }
                if (signal_pending(current)) {
                        if (!ret)
                                ret = -ERESTARTSYS;
                        break;
                }
                if (do_wakeup) {
                        wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM);
                        kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
                        do_wakeup = 0;
                }
                pipe->waiting_writers++;
                pipe_wait(pipe);
                pipe->waiting_writers--;
        }
out:
        __pipe_unlock(pipe);
        if (do_wakeup) {
                wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM);
                kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
        }
        if (ret > 0) {
                int err = file_update_time(filp);
                if (err)
                        ret = err;
        }
        return ret;
}

static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
        struct pipe_inode_info *pipe = filp->private_data;
        int count, buf, nrbufs;

        switch (cmd) {
                case FIONREAD:
                        __pipe_lock(pipe);
                        count = 0;
                        buf = pipe->curbuf;
                        nrbufs = pipe->nrbufs;
                        while (--nrbufs >= 0) {
                                count += pipe->bufs[buf].len;
                                buf = (buf+1) & (pipe->buffers - 1);
                        }
                        __pipe_unlock(pipe);

                        return put_user(count, (int __user *)arg);
                default:
                        return -ENOIOCTLCMD;
        }
}

/* No kernel lock held - fine */
static unsigned int
pipe_poll(struct file *filp, poll_table *wait)
{
        unsigned int mask;
        struct pipe_inode_info *pipe = filp->private_data;
        int nrbufs;

        poll_wait(filp, &pipe->wait, wait);

        /* Reading only -- no need for acquiring the semaphore.  */
        nrbufs = pipe->nrbufs;
        mask = 0;
        if (filp->f_mode & FMODE_READ) {
                mask = (nrbufs > 0) ? POLLIN | POLLRDNORM : 0;
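                /*
                 * fifo_open() set f_version to the then-current w_counter
                 * for a non-blocking reader that found no writer, so
                 * POLLHUP stays suppressed until a writer has come and gone.
                 */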
                if (!pipe->writers && filp->f_version != pipe->w_counter)
                        mask |= POLLHUP;
        }

        if (filp->f_mode & FMODE_WRITE) {
                mask |= (nrbufs < pipe->buffers) ? POLLOUT | POLLWRNORM : 0;
                /*
                 * Most Unices do not set POLLERR for FIFOs but on Linux they
                 * behave exactly like pipes for poll().
                 */
                if (!pipe->readers)
                        mask |= POLLERR;
        }

        return mask;
}

static int
pipe_release(struct inode *inode, struct file *file)
{
        struct pipe_inode_info *pipe = inode->i_pipe;
        int kill = 0;

        __pipe_lock(pipe);
        if (file->f_mode & FMODE_READ)
                pipe->readers--;
        if (file->f_mode & FMODE_WRITE)
                pipe->writers--;

        if (pipe->readers || pipe->writers) {
                wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM | POLLERR | POLLHUP);
                kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
                kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
        }
        spin_lock(&inode->i_lock);
        if (!--pipe->files) {
                inode->i_pipe = NULL;
                kill = 1;
        }
        spin_unlock(&inode->i_lock);
        __pipe_unlock(pipe);

        if (kill)
                free_pipe_info(pipe);

        return 0;
}

static int
pipe_fasync(int fd, struct file *filp, int on)
{
        struct pipe_inode_info *pipe = filp->private_data;
        int retval = 0;

        __pipe_lock(pipe);
        if (filp->f_mode & FMODE_READ)
                retval = fasync_helper(fd, filp, on, &pipe->fasync_readers);
        if ((filp->f_mode & FMODE_WRITE) && retval >= 0) {
                retval = fasync_helper(fd, filp, on, &pipe->fasync_writers);
                if (retval < 0 && (filp->f_mode & FMODE_READ))
                        /* this can happen only if on == T */
                        fasync_helper(-1, filp, 0, &pipe->fasync_readers);
        }
        __pipe_unlock(pipe);
        return retval;
}

struct pipe_inode_info *alloc_pipe_info(void)
{
        struct pipe_inode_info *pipe;

        pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL);
        if (pipe) {
                pipe->bufs = kzalloc(sizeof(struct pipe_buffer) * PIPE_DEF_BUFFERS, GFP_KERNEL);
                if (pipe->bufs) {
                        init_waitqueue_head(&pipe->wait);
                        pipe->r_counter = pipe->w_counter = 1;
                        pipe->buffers = PIPE_DEF_BUFFERS;
                        mutex_init(&pipe->mutex);
                        return pipe;
                }
                kfree(pipe);
        }

        return NULL;
}

void free_pipe_info(struct pipe_inode_info *pipe)
{
        int i;

        for (i = 0; i < pipe->buffers; i++) {
                struct pipe_buffer *buf = pipe->bufs + i;
                if (buf->ops)
                        buf->ops->release(pipe, buf);
        }
        if (pipe->tmp_page)
                __free_page(pipe->tmp_page);
        kfree(pipe->bufs);
        kfree(pipe);
}

static struct vfsmount *pipe_mnt __read_mostly;

/*
 * pipefs_dname() is called from d_path().
 */
static char *pipefs_dname(struct dentry *dentry, char *buffer, int buflen)
{
        return dynamic_dname(dentry, buffer, buflen, "pipe:[%lu]",
                                dentry->d_inode->i_ino);
}

static const struct dentry_operations pipefs_dentry_operations = {
        .d_dname        = pipefs_dname,
};

static struct inode * get_pipe_inode(void)
{
        struct inode *inode = new_inode_pseudo(pipe_mnt->mnt_sb);
        struct pipe_inode_info *pipe;

        if (!inode)
                goto fail_inode;

        inode->i_ino = get_next_ino();

        pipe = alloc_pipe_info();
        if (!pipe)
                goto fail_iput;

        inode->i_pipe = pipe;
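        /* Account for the two struct files create_pipe_files() will attach. */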
        pipe->files = 2;
        pipe->readers = pipe->writers = 1;
        inode->i_fop = &pipefifo_fops;

        /*
         * Mark the inode dirty from the very beginning,
         * that way it will never be moved to the dirty
         * list because "mark_inode_dirty()" will think
         * that it already _is_ on the dirty list.
         */
        inode->i_state = I_DIRTY;
        inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR;
        inode->i_uid = current_fsuid();
        inode->i_gid = current_fsgid();
        inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;

        return inode;

fail_iput:
        iput(inode);

fail_inode:
        return NULL;
}

int create_pipe_files(struct file **res, int flags)
{
        int err;
        struct inode *inode = get_pipe_inode();
        struct file *f;
        struct path path;
        static struct qstr name = { .name = "" };

        if (!inode)
                return -ENFILE;

        err = -ENOMEM;
        path.dentry = d_alloc_pseudo(pipe_mnt->mnt_sb, &name);
        if (!path.dentry)
                goto err_inode;
        path.mnt = mntget(pipe_mnt);

        d_instantiate(path.dentry, inode);

        err = -ENFILE;
        f = alloc_file(&path, FMODE_WRITE, &pipefifo_fops);
        if (IS_ERR(f))
                goto err_dentry;

        f->f_flags = O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT));
        f->private_data = inode->i_pipe;

        res[0] = alloc_file(&path, FMODE_READ, &pipefifo_fops);
        if (IS_ERR(res[0]))
                goto err_file;

        path_get(&path);
        res[0]->private_data = inode->i_pipe;
        res[0]->f_flags = O_RDONLY | (flags & O_NONBLOCK);
        res[1] = f;
        return 0;

err_file:
        put_filp(f);
err_dentry:
        free_pipe_info(inode->i_pipe);
        path_put(&path);
        return err;

err_inode:
        free_pipe_info(inode->i_pipe);
        iput(inode);
        return err;
}

static int __do_pipe_flags(int *fd, struct file **files, int flags)
{
        int error;
        int fdw, fdr;

        if (flags & ~(O_CLOEXEC | O_NONBLOCK | O_DIRECT))
                return -EINVAL;

        error = create_pipe_files(files, flags);
        if (error)
                return error;

        error = get_unused_fd_flags(flags);
        if (error < 0)
                goto err_read_pipe;
        fdr = error;

        error = get_unused_fd_flags(flags);
        if (error < 0)
                goto err_fdr;
        fdw = error;

        audit_fd_pair(fdr, fdw);
        fd[0] = fdr;
        fd[1] = fdw;
        return 0;

 err_fdr:
        put_unused_fd(fdr);
 err_read_pipe:
        fput(files[0]);
        fput(files[1]);
        return error;
}

int do_pipe_flags(int *fd, int flags)
{
        struct file *files[2];
        int error = __do_pipe_flags(fd, files, flags);
        if (!error) {
                fd_install(fd[0], files[0]);
                fd_install(fd[1], files[1]);
        }
        return error;
}

/*
 * sys_pipe() is the normal C calling standard for creating
 * a pipe. It's not the way Unix traditionally does this, though.
 */
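/*
 * From userspace, e.g.:
 *
 *      int fd[2];
 *
 *      if (pipe2(fd, O_CLOEXEC) == 0) {
 *              ...             read from fd[0], write to fd[1]
 *      }
 */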
SYSCALL_DEFINE2(pipe2, int __user *, fildes, int, flags)
{
        struct file *files[2];
        int fd[2];
        int error;

        error = __do_pipe_flags(fd, files, flags);
        if (!error) {
                if (unlikely(copy_to_user(fildes, fd, sizeof(fd)))) {
                        fput(files[0]);
                        fput(files[1]);
                        put_unused_fd(fd[0]);
                        put_unused_fd(fd[1]);
                        error = -EFAULT;
                } else {
                        fd_install(fd[0], files[0]);
                        fd_install(fd[1], files[1]);
                }
        }
        return error;
}

SYSCALL_DEFINE1(pipe, int __user *, fildes)
{
        return sys_pipe2(fildes, 0);
}

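/*
 * Sleep until the open counter we are watching moves (a partner has
 * opened the other end of the FIFO) or a signal arrives.
 */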
static int wait_for_partner(struct pipe_inode_info *pipe, unsigned int *cnt)
{
        int cur = *cnt;

        while (cur == *cnt) {
                pipe_wait(pipe);
                if (signal_pending(current))
                        break;
        }
        return cur == *cnt ? -ERESTARTSYS : 0;
}

static void wake_up_partner(struct pipe_inode_info *pipe)
{
        wake_up_interruptible(&pipe->wait);
}

static int fifo_open(struct inode *inode, struct file *filp)
{
        struct pipe_inode_info *pipe;
        bool is_pipe = inode->i_sb->s_magic == PIPEFS_MAGIC;
        int kill = 0;
        int ret;

        filp->f_version = 0;

        spin_lock(&inode->i_lock);
        if (inode->i_pipe) {
                pipe = inode->i_pipe;
                pipe->files++;
                spin_unlock(&inode->i_lock);
        } else {
                spin_unlock(&inode->i_lock);
                pipe = alloc_pipe_info();
                if (!pipe)
                        return -ENOMEM;
                pipe->files = 1;
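                /*
                 * We dropped i_lock to allocate, so another opener may
                 * have installed a pipe meanwhile; if so, use theirs
                 * and free ours.
                 */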
                spin_lock(&inode->i_lock);
                if (unlikely(inode->i_pipe)) {
                        inode->i_pipe->files++;
                        spin_unlock(&inode->i_lock);
                        free_pipe_info(pipe);
                        pipe = inode->i_pipe;
                } else {
                        inode->i_pipe = pipe;
                        spin_unlock(&inode->i_lock);
                }
        }
        filp->private_data = pipe;
        /* OK, we have a pipe and it's pinned down */

        __pipe_lock(pipe);

        /* We can only do regular read/write on fifos */
        filp->f_mode &= (FMODE_READ | FMODE_WRITE);

        switch (filp->f_mode) {
        case FMODE_READ:
        /*
         *  O_RDONLY
         *  POSIX.1 says that O_NONBLOCK means return with the FIFO
         *  opened, even when there is no process writing the FIFO.
         */
                pipe->r_counter++;
                if (pipe->readers++ == 0)
                        wake_up_partner(pipe);

                if (!is_pipe && !pipe->writers) {
                        if ((filp->f_flags & O_NONBLOCK)) {
                                /* suppress POLLHUP until we have
                                 * seen a writer */
                                filp->f_version = pipe->w_counter;
                        } else {
                                if (wait_for_partner(pipe, &pipe->w_counter))
                                        goto err_rd;
                        }
                }
                break;

        case FMODE_WRITE:
        /*
         *  O_WRONLY
         *  POSIX.1 says that O_NONBLOCK means return -1 with
         *  errno=ENXIO when there is no process reading the FIFO.
         */
                ret = -ENXIO;
                if (!is_pipe && (filp->f_flags & O_NONBLOCK) && !pipe->readers)
                        goto err;

                pipe->w_counter++;
                if (!pipe->writers++)
                        wake_up_partner(pipe);

                if (!is_pipe && !pipe->readers) {
                        if (wait_for_partner(pipe, &pipe->r_counter))
                                goto err_wr;
                }
                break;

        case FMODE_READ | FMODE_WRITE:
        /*
         *  O_RDWR
         *  POSIX.1 leaves this case "undefined" when O_NONBLOCK is set.
         *  This implementation will NEVER block on a O_RDWR open, since
         *  the process can at least talk to itself.
         */

                pipe->readers++;
                pipe->writers++;
                pipe->r_counter++;
                pipe->w_counter++;
                if (pipe->readers == 1 || pipe->writers == 1)
                        wake_up_partner(pipe);
                break;

        default:
                ret = -EINVAL;
                goto err;
        }

        /* Ok! */
        __pipe_unlock(pipe);
        return 0;

err_rd:
        if (!--pipe->readers)
                wake_up_interruptible(&pipe->wait);
        ret = -ERESTARTSYS;
        goto err;

err_wr:
        if (!--pipe->writers)
                wake_up_interruptible(&pipe->wait);
        ret = -ERESTARTSYS;
        goto err;

err:
        spin_lock(&inode->i_lock);
        if (!--pipe->files) {
                inode->i_pipe = NULL;
                kill = 1;
        }
        spin_unlock(&inode->i_lock);
        __pipe_unlock(pipe);
        if (kill)
                free_pipe_info(pipe);
        return ret;
}

const struct file_operations pipefifo_fops = {
        .open           = fifo_open,
        .llseek         = no_llseek,
        .read           = do_sync_read,
        .aio_read       = pipe_read,
        .write          = do_sync_write,
        .aio_write      = pipe_write,
        .poll           = pipe_poll,
        .unlocked_ioctl = pipe_ioctl,
        .release        = pipe_release,
        .fasync         = pipe_fasync,
};

/*
 * Allocate a new array of pipe buffers and copy the info over. Returns
 * the pipe size if successful, or -ERROR on failure.
 */
static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages)
{
        struct pipe_buffer *bufs;

        /*
         * We can shrink the pipe, if arg >= pipe->nrbufs. Since we don't
         * expect a lot of shrink+grow operations, just free and allocate
         * again like we would do for growing. If the pipe currently
         * contains more buffers than arg, then return busy.
         */
        if (nr_pages < pipe->nrbufs)
                return -EBUSY;

        bufs = kcalloc(nr_pages, sizeof(*bufs), GFP_KERNEL | __GFP_NOWARN);
        if (unlikely(!bufs))
                return -ENOMEM;

        /*
         * The pipe array wraps around, so just start the new one at zero
         * and adjust the indexes.
         */
        if (pipe->nrbufs) {
                unsigned int tail;
                unsigned int head;

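                /*
                 * "tail" is the number of entries that wrapped past the
                 * end of the old ring; "head" is the rest, starting at
                 * curbuf. Copying head first makes the new array linear.
                 */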
                tail = pipe->curbuf + pipe->nrbufs;
                if (tail < pipe->buffers)
                        tail = 0;
                else
                        tail &= (pipe->buffers - 1);

                head = pipe->nrbufs - tail;
                if (head)
                        memcpy(bufs, pipe->bufs + pipe->curbuf, head * sizeof(struct pipe_buffer));
                if (tail)
                        memcpy(bufs + head, pipe->bufs, tail * sizeof(struct pipe_buffer));
        }

        pipe->curbuf = 0;
        kfree(pipe->bufs);
        pipe->bufs = bufs;
        pipe->buffers = nr_pages;
        return nr_pages * PAGE_SIZE;
}

/*
 * Currently we rely on the pipe array holding a power-of-2 number
 * of pages.
 */
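/*
 * For example, a request of 100000 bytes needs 25 pages, which rounds
 * up to 32 pages (131072 bytes with 4 KiB pages).
 */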
static inline unsigned int round_pipe_size(unsigned int size)
{
        unsigned long nr_pages;

        nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
        return roundup_pow_of_two(nr_pages) << PAGE_SHIFT;
}

/*
 * This should work even if CONFIG_PROC_FS isn't set, as proc_dointvec_minmax
 * will return an error.
 */
int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf,
                 size_t *lenp, loff_t *ppos)
{
        int ret;

        ret = proc_dointvec_minmax(table, write, buf, lenp, ppos);
        if (ret < 0 || !write)
                return ret;

        pipe_max_size = round_pipe_size(pipe_max_size);
        return ret;
}

/*
 * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same
 * location, so checking ->i_pipe is not enough to verify that this is a
 * pipe.
 */
struct pipe_inode_info *get_pipe_info(struct file *file)
{
        return file->f_op == &pipefifo_fops ? file->private_data : NULL;
}

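/*
 * fcntl(F_SETPIPE_SZ / F_GETPIPE_SZ) back end; for example,
 * fcntl(fd, F_SETPIPE_SZ, 1048576) resizes the ring to 256 pages
 * with 4 KiB pages.
 */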
long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
{
        struct pipe_inode_info *pipe;
        long ret;

        pipe = get_pipe_info(file);
        if (!pipe)
                return -EBADF;

        __pipe_lock(pipe);

        switch (cmd) {
        case F_SETPIPE_SZ: {
                unsigned int size, nr_pages;

                size = round_pipe_size(arg);
                nr_pages = size >> PAGE_SHIFT;

                ret = -EINVAL;
                if (!nr_pages)
                        goto out;

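                /*
                 * Sizes above pipe_max_size (the sysctl
                 * /proc/sys/fs/pipe-max-size) require CAP_SYS_RESOURCE.
                 */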
                if (!capable(CAP_SYS_RESOURCE) && size > pipe_max_size) {
                        ret = -EPERM;
                        goto out;
                }
                ret = pipe_set_size(pipe, nr_pages);
                break;
                }
        case F_GETPIPE_SZ:
                ret = pipe->buffers * PAGE_SIZE;
                break;
        default:
                ret = -EINVAL;
                break;
        }

out:
        __pipe_unlock(pipe);
        return ret;
}

static const struct super_operations pipefs_ops = {
        .destroy_inode = free_inode_nonrcu,
        .statfs = simple_statfs,
};

/*
 * pipefs should _never_ be mounted by userland - too much of security hassle,
 * no real gain from having the whole whorehouse mounted. So we don't need
 * any operations on the root directory. However, we need a non-trivial
 * d_name - pipe: will go nicely and kill the special-casing in procfs.
 */
static struct dentry *pipefs_mount(struct file_system_type *fs_type,
                         int flags, const char *dev_name, void *data)
{
        return mount_pseudo(fs_type, "pipe:", &pipefs_ops,
                        &pipefs_dentry_operations, PIPEFS_MAGIC);
}

static struct file_system_type pipe_fs_type = {
        .name           = "pipefs",
        .mount          = pipefs_mount,
        .kill_sb        = kill_anon_super,
};

static int __init init_pipe_fs(void)
{
        int err = register_filesystem(&pipe_fs_type);

        if (!err) {
                pipe_mnt = kern_mount(&pipe_fs_type);
                if (IS_ERR(pipe_mnt)) {
                        err = PTR_ERR(pipe_mnt);
                        unregister_filesystem(&pipe_fs_type);
                }
        }
        return err;
}

fs_initcall(init_pipe_fs);