linux/fs/pipe.c
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/pipe.c
 *
 *  Copyright (C) 1991, 1992, 1999  Linus Torvalds
 */

#include <linux/mm.h>
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/log2.h>
#include <linux/mount.h>
#include <linux/pseudo_fs.h>
#include <linux/magic.h>
#include <linux/pipe_fs_i.h>
#include <linux/uio.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/audit.h>
#include <linux/syscalls.h>
#include <linux/fcntl.h>
#include <linux/memcontrol.h>
#include <linux/watch_queue.h>

#include <linux/uaccess.h>
#include <asm/ioctls.h>

#include "internal.h"

/*
 * The max size that a non-root user is allowed to grow the pipe. Can
 * be set by root in /proc/sys/fs/pipe-max-size
 */
unsigned int pipe_max_size = 1048576;

/* Maximum allocatable pages per user. Hard limit is unset by default, soft
 * matches default values.
 */
unsigned long pipe_user_pages_hard;
unsigned long pipe_user_pages_soft = PIPE_DEF_BUFFERS * INR_OPEN_CUR;

/*
 * We use head and tail indices that aren't masked off, except at the point of
 * dereference, but rather they're allowed to wrap naturally.  This means there
 * isn't a dead spot in the buffer, but the ring has to be a power of two and
 * <= 2^31.
 * -- David Howells 2019-09-23.
 *
 * Reads with count = 0 should always return 0.
 * -- Julian Bradfield 1999-06-07.
 *
 * FIFOs and Pipes now generate SIGIO for both readers and writers.
 * -- Jeremy Elson <jelson@circlemud.org> 2001-08-16
 *
 * pipe_read & write cleanup
 * -- Manfred Spraul <manfred@colorfullife.com> 2002-05-09
 */
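
/*
 * A worked example of the unmasked-index scheme above (illustrative
 * values only): with ring_size = 8 the mask is 7, and head/tail just
 * keep incrementing:
 *
 *     head = 258, tail = 253
 *     occupancy = head - tail = 5 slots in use
 *     next slot to fill  = bufs[head & 7] = bufs[2]
 *     next slot to drain = bufs[tail & 7] = bufs[5]
 *
 * Because the indices are unsigned, head - tail stays correct even
 * after head wraps past UINT_MAX, as long as ring_size is a power of
 * two no larger than 2^31 - which is exactly the constraint stated
 * above.
 */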

static void pipe_lock_nested(struct pipe_inode_info *pipe, int subclass)
{
        if (pipe->files)
                mutex_lock_nested(&pipe->mutex, subclass);
}

void pipe_lock(struct pipe_inode_info *pipe)
{
        /*
         * pipe_lock() nests non-pipe inode locks (for writing to a file)
         */
        pipe_lock_nested(pipe, I_MUTEX_PARENT);
}
EXPORT_SYMBOL(pipe_lock);

void pipe_unlock(struct pipe_inode_info *pipe)
{
        if (pipe->files)
                mutex_unlock(&pipe->mutex);
}
EXPORT_SYMBOL(pipe_unlock);

static inline void __pipe_lock(struct pipe_inode_info *pipe)
{
        mutex_lock_nested(&pipe->mutex, I_MUTEX_PARENT);
}

static inline void __pipe_unlock(struct pipe_inode_info *pipe)
{
        mutex_unlock(&pipe->mutex);
}

void pipe_double_lock(struct pipe_inode_info *pipe1,
                      struct pipe_inode_info *pipe2)
{
        BUG_ON(pipe1 == pipe2);

        if (pipe1 < pipe2) {
                pipe_lock_nested(pipe1, I_MUTEX_PARENT);
                pipe_lock_nested(pipe2, I_MUTEX_CHILD);
        } else {
                pipe_lock_nested(pipe2, I_MUTEX_PARENT);
                pipe_lock_nested(pipe1, I_MUTEX_CHILD);
        }
}
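
/*
 * Ordering the two locks by address gives every caller the same global
 * acquisition order, which is what rules out an ABBA deadlock. A sketch
 * of the hazard this avoids (hypothetical tasks, for illustration only):
 *
 *     task A: lock(pipe1); lock(pipe2);
 *     task B: lock(pipe2); lock(pipe1);
 *
 * If A holds pipe1 and B holds pipe2, each waits forever for the other.
 * With address ordering, both tasks try the lower-addressed pipe first,
 * so one of them always wins and the other simply waits its turn.
 */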

static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
                                  struct pipe_buffer *buf)
{
        struct page *page = buf->page;

        /*
         * If nobody else uses this page, and we don't already have a
         * temporary page, let's keep track of it as a one-deep
         * allocation cache. (Otherwise just release our reference to it)
         */
        if (page_count(page) == 1 && !pipe->tmp_page)
                pipe->tmp_page = page;
        else
                put_page(page);
}

static bool anon_pipe_buf_try_steal(struct pipe_inode_info *pipe,
                struct pipe_buffer *buf)
{
        struct page *page = buf->page;

        if (page_count(page) != 1)
                return false;
        memcg_kmem_uncharge_page(page, 0);
        __SetPageLocked(page);
        return true;
}

/**
 * generic_pipe_buf_try_steal - attempt to take ownership of a &pipe_buffer
 * @pipe:       the pipe that the buffer belongs to
 * @buf:        the buffer to attempt to steal
 *
 * Description:
 *      This function attempts to steal the &struct page attached to
 *      @buf. If successful, this function returns true with the page
 *      locked. The caller may then reuse the page for whatever it
 *      wishes; the typical use is insertion into a different file
 *      page cache.
 */
bool generic_pipe_buf_try_steal(struct pipe_inode_info *pipe,
                struct pipe_buffer *buf)
{
        struct page *page = buf->page;

        /*
         * A reference count of one is golden: it means that the owner of
         * this page is the only one holding a reference to it. Lock the
         * page and return OK.
         */
        if (page_count(page) == 1) {
                lock_page(page);
                return true;
        }
        return false;
}
EXPORT_SYMBOL(generic_pipe_buf_try_steal);

/**
 * generic_pipe_buf_get - get a reference to a &struct pipe_buffer
 * @pipe:       the pipe that the buffer belongs to
 * @buf:        the buffer to get a reference to
 *
 * Description:
 *      This function grabs an extra reference to @buf. It's used in
 *      the tee() system call, when we duplicate the buffers in one
 *      pipe into another.
 */
bool generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
{
        return try_get_page(buf->page);
}
EXPORT_SYMBOL(generic_pipe_buf_get);

/**
 * generic_pipe_buf_release - put a reference to a &struct pipe_buffer
 * @pipe:       the pipe that the buffer belongs to
 * @buf:        the buffer to put a reference to
 *
 * Description:
 *      This function releases a reference to @buf.
 */
void generic_pipe_buf_release(struct pipe_inode_info *pipe,
                              struct pipe_buffer *buf)
{
        put_page(buf->page);
}
EXPORT_SYMBOL(generic_pipe_buf_release);

static const struct pipe_buf_operations anon_pipe_buf_ops = {
        .release        = anon_pipe_buf_release,
        .try_steal      = anon_pipe_buf_try_steal,
        .get            = generic_pipe_buf_get,
};

/* Done while waiting without holding the pipe lock - thus the READ_ONCE() */
static inline bool pipe_readable(const struct pipe_inode_info *pipe)
{
        unsigned int head = READ_ONCE(pipe->head);
        unsigned int tail = READ_ONCE(pipe->tail);
        unsigned int writers = READ_ONCE(pipe->writers);

        return !pipe_empty(head, tail) || !writers;
}

static ssize_t
pipe_read(struct kiocb *iocb, struct iov_iter *to)
{
        size_t total_len = iov_iter_count(to);
        struct file *filp = iocb->ki_filp;
        struct pipe_inode_info *pipe = filp->private_data;
        bool was_full, wake_next_reader = false;
        ssize_t ret;

        /* Null read succeeds. */
        if (unlikely(total_len == 0))
                return 0;

        ret = 0;
        __pipe_lock(pipe);

        /*
         * We only wake up writers if the pipe was full when we started
         * reading in order to avoid unnecessary wakeups.
         *
         * But when we do wake up writers, we do so using a sync wakeup
         * (WF_SYNC), because we want them to get going and generate more
         * data for us.
         */
        was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage);
        for (;;) {
                unsigned int head = pipe->head;
                unsigned int tail = pipe->tail;
                unsigned int mask = pipe->ring_size - 1;

#ifdef CONFIG_WATCH_QUEUE
                if (pipe->note_loss) {
                        struct watch_notification n;

                        if (total_len < 8) {
                                if (ret == 0)
                                        ret = -ENOBUFS;
                                break;
                        }

                        n.type = WATCH_TYPE_META;
                        n.subtype = WATCH_META_LOSS_NOTIFICATION;
                        n.info = watch_sizeof(n);
                        if (copy_to_iter(&n, sizeof(n), to) != sizeof(n)) {
                                if (ret == 0)
                                        ret = -EFAULT;
                                break;
                        }
                        ret += sizeof(n);
                        total_len -= sizeof(n);
                        pipe->note_loss = false;
                }
#endif

                if (!pipe_empty(head, tail)) {
                        struct pipe_buffer *buf = &pipe->bufs[tail & mask];
                        size_t chars = buf->len;
                        size_t written;
                        int error;

                        if (chars > total_len) {
                                if (buf->flags & PIPE_BUF_FLAG_WHOLE) {
                                        if (ret == 0)
                                                ret = -ENOBUFS;
                                        break;
                                }
                                chars = total_len;
                        }

                        error = pipe_buf_confirm(pipe, buf);
                        if (error) {
                                if (!ret)
                                        ret = error;
                                break;
                        }

                        written = copy_page_to_iter(buf->page, buf->offset, chars, to);
                        if (unlikely(written < chars)) {
                                if (!ret)
                                        ret = -EFAULT;
                                break;
                        }
                        ret += chars;
                        buf->offset += chars;
                        buf->len -= chars;

                        /* Was it a packet buffer? Clean up and exit */
                        if (buf->flags & PIPE_BUF_FLAG_PACKET) {
                                total_len = chars;
                                buf->len = 0;
                        }

                        if (!buf->len) {
                                pipe_buf_release(pipe, buf);
                                spin_lock_irq(&pipe->rd_wait.lock);
#ifdef CONFIG_WATCH_QUEUE
                                if (buf->flags & PIPE_BUF_FLAG_LOSS)
                                        pipe->note_loss = true;
#endif
                                tail++;
                                pipe->tail = tail;
                                spin_unlock_irq(&pipe->rd_wait.lock);
                        }
                        total_len -= chars;
                        if (!total_len)
                                break;  /* common path: read succeeded */
                        if (!pipe_empty(head, tail))    /* More to do? */
                                continue;
                }

                if (!pipe->writers)
                        break;
                if (ret)
                        break;
                if (filp->f_flags & O_NONBLOCK) {
                        ret = -EAGAIN;
                        break;
                }
                __pipe_unlock(pipe);

                /*
                 * We only get here if we didn't actually read anything.
                 *
                 * However, we could have seen (and removed) a zero-sized
                 * pipe buffer, and might have made space in the buffers
                 * that way.
                 *
                 * You can't make zero-sized pipe buffers by doing an empty
                 * write (not even in packet mode), but they can happen if
                 * the writer gets an EFAULT when trying to fill a buffer
                 * that already got allocated and inserted in the buffer
                 * array.
                 *
                 * So we still need to wake up any pending writers in the
                 * _very_ unlikely case that the pipe was full, but we got
                 * no data.
                 */
                if (unlikely(was_full)) {
                        wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
                        kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
                }

                /*
                 * But because we didn't read anything, at this point we can
                 * just return directly with -ERESTARTSYS if we're interrupted,
                 * since we've done any required wakeups and there's no need
                 * to mark anything accessed. And we've dropped the lock.
                 */
                if (wait_event_interruptible_exclusive(pipe->rd_wait, pipe_readable(pipe)) < 0)
                        return -ERESTARTSYS;

                __pipe_lock(pipe);
                was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage);
                wake_next_reader = true;
        }
        if (pipe_empty(pipe->head, pipe->tail))
                wake_next_reader = false;
        __pipe_unlock(pipe);

        if (was_full) {
                wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
                kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
        }
        if (wake_next_reader)
                wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
        if (ret > 0)
                file_accessed(filp);
        return ret;
}

static inline int is_packetized(struct file *file)
{
        return (file->f_flags & O_DIRECT) != 0;
}
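
/*
 * Packet mode is requested from userspace with pipe2(..., O_DIRECT).
 * A minimal sketch of the semantics the PIPE_BUF_FLAG_PACKET handling
 * implements (illustrative values, error handling omitted):
 *
 *     int fds[2];
 *     char buf[4];
 *
 *     pipe2(fds, O_DIRECT);
 *     write(fds[1], "abcdef", 6);     // one 6-byte packet
 *     read(fds[0], buf, 4);           // returns 4; the packet's
 *                                     // remaining 2 bytes are discarded
 *
 * Each write() becomes its own packet, and a short read consumes the
 * whole packet - see the PIPE_BUF_FLAG_PACKET branch in pipe_read().
 */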

/* Done while waiting without holding the pipe lock - thus the READ_ONCE() */
static inline bool pipe_writable(const struct pipe_inode_info *pipe)
{
        unsigned int head = READ_ONCE(pipe->head);
        unsigned int tail = READ_ONCE(pipe->tail);
        unsigned int max_usage = READ_ONCE(pipe->max_usage);

        return !pipe_full(head, tail, max_usage) ||
                !READ_ONCE(pipe->readers);
}

static ssize_t
pipe_write(struct kiocb *iocb, struct iov_iter *from)
{
        struct file *filp = iocb->ki_filp;
        struct pipe_inode_info *pipe = filp->private_data;
        unsigned int head;
        ssize_t ret = 0;
        size_t total_len = iov_iter_count(from);
        ssize_t chars;
        bool was_empty = false;
        bool wake_next_writer = false;

        /* Null write succeeds. */
        if (unlikely(total_len == 0))
                return 0;

        __pipe_lock(pipe);

        if (!pipe->readers) {
                send_sig(SIGPIPE, current, 0);
                ret = -EPIPE;
                goto out;
        }

#ifdef CONFIG_WATCH_QUEUE
        if (pipe->watch_queue) {
                ret = -EXDEV;
                goto out;
        }
#endif

        /*
         * Only wake up if the pipe started out empty, since
         * otherwise there should be no readers waiting.
         *
         * If it wasn't empty we try to merge new data into
         * the last buffer.
         *
         * That naturally merges small writes, but it also
         * page-aligns the rest of the writes for large writes
         * spanning multiple pages.
         */
        head = pipe->head;
        was_empty = pipe_empty(head, pipe->tail);
        chars = total_len & (PAGE_SIZE-1);
        if (chars && !was_empty) {
                unsigned int mask = pipe->ring_size - 1;
                struct pipe_buffer *buf = &pipe->bufs[(head - 1) & mask];
                int offset = buf->offset + buf->len;

                if ((buf->flags & PIPE_BUF_FLAG_CAN_MERGE) &&
                    offset + chars <= PAGE_SIZE) {
                        ret = pipe_buf_confirm(pipe, buf);
                        if (ret)
                                goto out;

                        ret = copy_page_from_iter(buf->page, offset, chars, from);
                        if (unlikely(ret < chars)) {
                                ret = -EFAULT;
                                goto out;
                        }

                        buf->len += ret;
                        if (!iov_iter_count(from))
                                goto out;
                }
        }

        for (;;) {
                if (!pipe->readers) {
                        send_sig(SIGPIPE, current, 0);
                        if (!ret)
                                ret = -EPIPE;
                        break;
                }

                head = pipe->head;
                if (!pipe_full(head, pipe->tail, pipe->max_usage)) {
                        unsigned int mask = pipe->ring_size - 1;
                        struct pipe_buffer *buf = &pipe->bufs[head & mask];
                        struct page *page = pipe->tmp_page;
                        int copied;

                        if (!page) {
                                page = alloc_page(GFP_HIGHUSER | __GFP_ACCOUNT);
                                if (unlikely(!page)) {
                                        ret = ret ? : -ENOMEM;
                                        break;
                                }
                                pipe->tmp_page = page;
                        }

                        /* Allocate a slot in the ring in advance and attach an
                         * empty buffer.  If we fault or otherwise fail to use
                         * it, either the reader will consume it or it'll still
                         * be there for the next write.
                         */
                        spin_lock_irq(&pipe->rd_wait.lock);

                        head = pipe->head;
                        if (pipe_full(head, pipe->tail, pipe->max_usage)) {
                                spin_unlock_irq(&pipe->rd_wait.lock);
                                continue;
                        }

                        pipe->head = head + 1;
                        spin_unlock_irq(&pipe->rd_wait.lock);

                        /* Insert it into the buffer array */
                        buf = &pipe->bufs[head & mask];
                        buf->page = page;
                        buf->ops = &anon_pipe_buf_ops;
                        buf->offset = 0;
                        buf->len = 0;
                        if (is_packetized(filp))
                                buf->flags = PIPE_BUF_FLAG_PACKET;
                        else
                                buf->flags = PIPE_BUF_FLAG_CAN_MERGE;
                        pipe->tmp_page = NULL;

                        copied = copy_page_from_iter(page, 0, PAGE_SIZE, from);
                        if (unlikely(copied < PAGE_SIZE && iov_iter_count(from))) {
                                if (!ret)
                                        ret = -EFAULT;
                                break;
                        }
                        ret += copied;
                        buf->offset = 0;
                        buf->len = copied;

                        if (!iov_iter_count(from))
                                break;
                }

                if (!pipe_full(head, pipe->tail, pipe->max_usage))
                        continue;

                /* Wait for buffer space to become available. */
                if (filp->f_flags & O_NONBLOCK) {
                        if (!ret)
                                ret = -EAGAIN;
                        break;
                }
                if (signal_pending(current)) {
                        if (!ret)
                                ret = -ERESTARTSYS;
                        break;
                }

                /*
                 * We're going to release the pipe lock and wait for more
                 * space. We wake up any readers if necessary, and then
                 * after waiting we need to re-check whether the pipe
                 * became empty while we dropped the lock.
                 */
                __pipe_unlock(pipe);
                if (was_empty) {
                        wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
                        kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
                }
                wait_event_interruptible_exclusive(pipe->wr_wait, pipe_writable(pipe));
                __pipe_lock(pipe);
                was_empty = pipe_empty(pipe->head, pipe->tail);
                wake_next_writer = true;
        }
out:
        if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
                wake_next_writer = false;
        __pipe_unlock(pipe);

        /*
         * If we do do a wakeup event, we do a 'sync' wakeup, because we
         * want the reader to start processing things asap, rather than
         * leave the data pending.
         *
         * This is particularly important for small writes, because of
         * how (for example) the GNU make jobserver uses small writes to
         * wake up pending jobs
         */
        if (was_empty) {
                wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
                kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
        }
        if (wake_next_writer)
                wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
        if (ret > 0 && sb_start_write_trylock(file_inode(filp)->i_sb)) {
                int err = file_update_time(filp);
                if (err)
                        ret = err;
                sb_end_write(file_inode(filp)->i_sb);
        }
        return ret;
}
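
/*
 * A note on atomicity: POSIX requires that writes of at most PIPE_BUF
 * bytes (PAGE_SIZE, i.e. 4096 bytes on most Linux configurations) are
 * not interleaved with writes from other processes. pipe_write() gets
 * this largely for free from the structure above: a write that fits in
 * a single page is copied into one buffer slot while pipe->mutex is
 * held, so no other writer can slip data in between, and if there is
 * no room the writer sleeps before copying anything. Larger writes
 * drop and retake the lock while waiting for space, and so may
 * interleave.
 */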

static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
        struct pipe_inode_info *pipe = filp->private_data;
        int count, head, tail, mask;

        switch (cmd) {
        case FIONREAD:
                __pipe_lock(pipe);
                count = 0;
                head = pipe->head;
                tail = pipe->tail;
                mask = pipe->ring_size - 1;

                while (tail != head) {
                        count += pipe->bufs[tail & mask].len;
                        tail++;
                }
                __pipe_unlock(pipe);

                return put_user(count, (int __user *)arg);

#ifdef CONFIG_WATCH_QUEUE
        case IOC_WATCH_QUEUE_SET_SIZE: {
                int ret;
                __pipe_lock(pipe);
                ret = watch_queue_set_size(pipe, arg);
                __pipe_unlock(pipe);
                return ret;
        }

        case IOC_WATCH_QUEUE_SET_FILTER:
                return watch_queue_set_filter(
                        pipe, (struct watch_notification_filter __user *)arg);
#endif

        default:
                return -ENOIOCTLCMD;
        }
}
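
/*
 * FIONREAD reports how many bytes are currently buffered. A minimal
 * userspace sketch of the ioctl handled above (illustrative only):
 *
 *     int avail;
 *
 *     if (ioctl(pipefd, FIONREAD, &avail) == 0)
 *             printf("%d bytes ready to read\n", avail);
 *
 * The count is computed by walking every occupied slot, so it reflects
 * partial reads too: a slot that has been half-consumed contributes
 * only its remaining buf->len bytes.
 */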

/* No kernel lock held - fine */
static __poll_t
pipe_poll(struct file *filp, poll_table *wait)
{
        __poll_t mask;
        struct pipe_inode_info *pipe = filp->private_data;
        unsigned int head, tail;

        /*
         * Reading pipe state only -- no need for acquiring the semaphore.
         *
         * But because this is racy, the code has to add the
         * entry to the poll table _first_ ..
         */
        if (filp->f_mode & FMODE_READ)
                poll_wait(filp, &pipe->rd_wait, wait);
        if (filp->f_mode & FMODE_WRITE)
                poll_wait(filp, &pipe->wr_wait, wait);

        /*
         * .. and only then can you do the racy tests. That way,
         * if something changes and you got it wrong, the poll
         * table entry will wake you up and fix it.
         */
        head = READ_ONCE(pipe->head);
        tail = READ_ONCE(pipe->tail);

        mask = 0;
        if (filp->f_mode & FMODE_READ) {
                if (!pipe_empty(head, tail))
                        mask |= EPOLLIN | EPOLLRDNORM;
                if (!pipe->writers && filp->f_version != pipe->w_counter)
                        mask |= EPOLLHUP;
        }

        if (filp->f_mode & FMODE_WRITE) {
                if (!pipe_full(head, tail, pipe->max_usage))
                        mask |= EPOLLOUT | EPOLLWRNORM;
                /*
                 * Most Unices do not set EPOLLERR for FIFOs but on Linux they
                 * behave exactly like pipes for poll().
                 */
                if (!pipe->readers)
                        mask |= EPOLLERR;
        }

        return mask;
}
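
/*
 * The f_version/w_counter comparison above deserves a remark: a FIFO
 * reader opened with O_NONBLOCK before any writer ever appeared stores
 * pipe->w_counter in filp->f_version (see fifo_open() below), which
 * suppresses EPOLLHUP until at least one writer has come and gone.
 * Otherwise a poll() on a never-connected FIFO would report hangup
 * immediately, which is not what userspace expects.
 */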

static void put_pipe_info(struct inode *inode, struct pipe_inode_info *pipe)
{
        int kill = 0;

        spin_lock(&inode->i_lock);
        if (!--pipe->files) {
                inode->i_pipe = NULL;
                kill = 1;
        }
        spin_unlock(&inode->i_lock);

        if (kill)
                free_pipe_info(pipe);
}

static int
pipe_release(struct inode *inode, struct file *file)
{
        struct pipe_inode_info *pipe = file->private_data;

        __pipe_lock(pipe);
        if (file->f_mode & FMODE_READ)
                pipe->readers--;
        if (file->f_mode & FMODE_WRITE)
                pipe->writers--;

        /* Was that the last reader or writer, but not the other side? */
        if (!pipe->readers != !pipe->writers) {
                wake_up_interruptible_all(&pipe->rd_wait);
                wake_up_interruptible_all(&pipe->wr_wait);
                kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
                kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
        }
        __pipe_unlock(pipe);

        put_pipe_info(inode, pipe);
        return 0;
}

static int
pipe_fasync(int fd, struct file *filp, int on)
{
        struct pipe_inode_info *pipe = filp->private_data;
        int retval = 0;

        __pipe_lock(pipe);
        if (filp->f_mode & FMODE_READ)
                retval = fasync_helper(fd, filp, on, &pipe->fasync_readers);
        if ((filp->f_mode & FMODE_WRITE) && retval >= 0) {
                retval = fasync_helper(fd, filp, on, &pipe->fasync_writers);
                if (retval < 0 && (filp->f_mode & FMODE_READ))
                        /* this can happen only if on == T */
                        fasync_helper(-1, filp, 0, &pipe->fasync_readers);
        }
        __pipe_unlock(pipe);
        return retval;
}

unsigned long account_pipe_buffers(struct user_struct *user,
                                   unsigned long old, unsigned long new)
{
        return atomic_long_add_return(new - old, &user->pipe_bufs);
}

bool too_many_pipe_buffers_soft(unsigned long user_bufs)
{
        unsigned long soft_limit = READ_ONCE(pipe_user_pages_soft);

        return soft_limit && user_bufs > soft_limit;
}

bool too_many_pipe_buffers_hard(unsigned long user_bufs)
{
        unsigned long hard_limit = READ_ONCE(pipe_user_pages_hard);

        return hard_limit && user_bufs > hard_limit;
}

bool pipe_is_unprivileged_user(void)
{
        return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
}
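
/*
 * Worked example of the default limits (assuming the usual 4 KiB pages
 * and the in-tree defaults PIPE_DEF_BUFFERS = 16, INR_OPEN_CUR = 1024):
 * pipe_user_pages_soft starts at 16 * 1024 = 16384 pages, i.e. 64 MiB
 * of pipe buffers per user before the soft limit shrinks new pipes to
 * a single slot. The hard limit starts at 0, which the helpers above
 * treat as "no limit".
 */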

struct pipe_inode_info *alloc_pipe_info(void)
{
        struct pipe_inode_info *pipe;
        unsigned long pipe_bufs = PIPE_DEF_BUFFERS;
        struct user_struct *user = get_current_user();
        unsigned long user_bufs;
        unsigned int max_size = READ_ONCE(pipe_max_size);

        pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL_ACCOUNT);
        if (pipe == NULL)
                goto out_free_uid;

        if (pipe_bufs * PAGE_SIZE > max_size && !capable(CAP_SYS_RESOURCE))
                pipe_bufs = max_size >> PAGE_SHIFT;

        user_bufs = account_pipe_buffers(user, 0, pipe_bufs);

        if (too_many_pipe_buffers_soft(user_bufs) && pipe_is_unprivileged_user()) {
                user_bufs = account_pipe_buffers(user, pipe_bufs, 1);
                pipe_bufs = 1;
        }

        if (too_many_pipe_buffers_hard(user_bufs) && pipe_is_unprivileged_user())
                goto out_revert_acct;

        pipe->bufs = kcalloc(pipe_bufs, sizeof(struct pipe_buffer),
                             GFP_KERNEL_ACCOUNT);

        if (pipe->bufs) {
                init_waitqueue_head(&pipe->rd_wait);
                init_waitqueue_head(&pipe->wr_wait);
                pipe->r_counter = pipe->w_counter = 1;
                pipe->max_usage = pipe_bufs;
                pipe->ring_size = pipe_bufs;
                pipe->nr_accounted = pipe_bufs;
                pipe->user = user;
                mutex_init(&pipe->mutex);
                return pipe;
        }

out_revert_acct:
        (void) account_pipe_buffers(user, pipe_bufs, 0);
        kfree(pipe);
out_free_uid:
        free_uid(user);
        return NULL;
}

void free_pipe_info(struct pipe_inode_info *pipe)
{
        int i;

#ifdef CONFIG_WATCH_QUEUE
        if (pipe->watch_queue) {
                watch_queue_clear(pipe->watch_queue);
                put_watch_queue(pipe->watch_queue);
        }
#endif

        (void) account_pipe_buffers(pipe->user, pipe->nr_accounted, 0);
        free_uid(pipe->user);
        for (i = 0; i < pipe->ring_size; i++) {
                struct pipe_buffer *buf = pipe->bufs + i;
                if (buf->ops)
                        pipe_buf_release(pipe, buf);
        }
        if (pipe->tmp_page)
                __free_page(pipe->tmp_page);
        kfree(pipe->bufs);
        kfree(pipe);
}

static struct vfsmount *pipe_mnt __read_mostly;

/*
 * pipefs_dname() is called from d_path().
 */
static char *pipefs_dname(struct dentry *dentry, char *buffer, int buflen)
{
        return dynamic_dname(dentry, buffer, buflen, "pipe:[%lu]",
                                d_inode(dentry)->i_ino);
}

static const struct dentry_operations pipefs_dentry_operations = {
        .d_dname        = pipefs_dname,
};

static struct inode * get_pipe_inode(void)
{
        struct inode *inode = new_inode_pseudo(pipe_mnt->mnt_sb);
        struct pipe_inode_info *pipe;

        if (!inode)
                goto fail_inode;

        inode->i_ino = get_next_ino();

        pipe = alloc_pipe_info();
        if (!pipe)
                goto fail_iput;

        inode->i_pipe = pipe;
        pipe->files = 2;
        pipe->readers = pipe->writers = 1;
        inode->i_fop = &pipefifo_fops;

        /*
         * Mark the inode dirty from the very beginning,
         * that way it will never be moved to the dirty
         * list because "mark_inode_dirty()" will think
         * that it already _is_ on the dirty list.
         */
        inode->i_state = I_DIRTY;
        inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR;
        inode->i_uid = current_fsuid();
        inode->i_gid = current_fsgid();
        inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);

        return inode;

fail_iput:
        iput(inode);

fail_inode:
        return NULL;
}

int create_pipe_files(struct file **res, int flags)
{
        struct inode *inode = get_pipe_inode();
        struct file *f;
        int error;

        if (!inode)
                return -ENFILE;

        if (flags & O_NOTIFICATION_PIPE) {
                error = watch_queue_init(inode->i_pipe);
                if (error) {
                        free_pipe_info(inode->i_pipe);
                        iput(inode);
                        return error;
                }
        }

        f = alloc_file_pseudo(inode, pipe_mnt, "",
                                O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT)),
                                &pipefifo_fops);
        if (IS_ERR(f)) {
                free_pipe_info(inode->i_pipe);
                iput(inode);
                return PTR_ERR(f);
        }

        f->private_data = inode->i_pipe;

        res[0] = alloc_file_clone(f, O_RDONLY | (flags & O_NONBLOCK),
                                  &pipefifo_fops);
        if (IS_ERR(res[0])) {
                put_pipe_info(inode, inode->i_pipe);
                fput(f);
                return PTR_ERR(res[0]);
        }
        res[0]->private_data = inode->i_pipe;
        res[1] = f;
        stream_open(inode, res[0]);
        stream_open(inode, res[1]);
        return 0;
}

static int __do_pipe_flags(int *fd, struct file **files, int flags)
{
        int error;
        int fdw, fdr;

        if (flags & ~(O_CLOEXEC | O_NONBLOCK | O_DIRECT | O_NOTIFICATION_PIPE))
                return -EINVAL;

        error = create_pipe_files(files, flags);
        if (error)
                return error;

        error = get_unused_fd_flags(flags);
        if (error < 0)
                goto err_read_pipe;
        fdr = error;

        error = get_unused_fd_flags(flags);
        if (error < 0)
                goto err_fdr;
        fdw = error;

        audit_fd_pair(fdr, fdw);
        fd[0] = fdr;
        fd[1] = fdw;
        return 0;

 err_fdr:
        put_unused_fd(fdr);
 err_read_pipe:
        fput(files[0]);
        fput(files[1]);
        return error;
}

int do_pipe_flags(int *fd, int flags)
{
        struct file *files[2];
        int error = __do_pipe_flags(fd, files, flags);
        if (!error) {
                fd_install(fd[0], files[0]);
                fd_install(fd[1], files[1]);
        }
        return error;
}

/*
 * sys_pipe() is the normal C calling standard for creating
 * a pipe. It's not the way Unix traditionally does this, though.
 */
static int do_pipe2(int __user *fildes, int flags)
{
        struct file *files[2];
        int fd[2];
        int error;

        error = __do_pipe_flags(fd, files, flags);
        if (!error) {
                if (unlikely(copy_to_user(fildes, fd, sizeof(fd)))) {
                        fput(files[0]);
                        fput(files[1]);
                        put_unused_fd(fd[0]);
                        put_unused_fd(fd[1]);
                        error = -EFAULT;
                } else {
                        fd_install(fd[0], files[0]);
                        fd_install(fd[1], files[1]);
                }
        }
        return error;
}

SYSCALL_DEFINE2(pipe2, int __user *, fildes, int, flags)
{
        return do_pipe2(fildes, flags);
}

SYSCALL_DEFINE1(pipe, int __user *, fildes)
{
        return do_pipe2(fildes, 0);
}
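
/*
 * A minimal userspace sketch of the syscalls above (illustrative only,
 * error handling omitted): the classic parent/child byte stream.
 *
 *     int fds[2];
 *     char c;
 *
 *     pipe2(fds, O_CLOEXEC);          // fds[0] = read end, fds[1] = write end
 *     if (fork() == 0) {
 *             write(fds[1], "x", 1);  // child writes one byte
 *             _exit(0);
 *     }
 *     read(fds[0], &c, 1);            // parent blocks until the byte arrives
 *
 * pipe() is exactly pipe2() with flags == 0, as do_pipe2() shows.
 */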

/*
 * This is the stupid "wait for pipe to be readable or writable"
 * model.
 *
 * See pipe_read/write() for the proper kind of exclusive wait,
 * but that requires that we wake up any other readers/writers
 * if we then do not end up reading everything (ie the whole
 * "wake_next_reader/writer" logic in pipe_read/write()).
 */
void pipe_wait_readable(struct pipe_inode_info *pipe)
{
        pipe_unlock(pipe);
        wait_event_interruptible(pipe->rd_wait, pipe_readable(pipe));
        pipe_lock(pipe);
}

void pipe_wait_writable(struct pipe_inode_info *pipe)
{
        pipe_unlock(pipe);
        wait_event_interruptible(pipe->wr_wait, pipe_writable(pipe));
        pipe_lock(pipe);
}

/*
 * This depends on both the wait (here) and the wakeup (wake_up_partner)
 * holding the pipe lock, so "*cnt" is stable and we know a wakeup cannot
 * race with the count check and waitqueue prep.
 *
 * Normally in order to avoid races, you'd do the prepare_to_wait() first,
 * then check the condition you're waiting for, and only then sleep. But
 * because of the pipe lock, we can check the condition before being on
 * the wait queue.
 *
 * We use the 'rd_wait' waitqueue for pipe partner waiting.
 */
static int wait_for_partner(struct pipe_inode_info *pipe, unsigned int *cnt)
{
        DEFINE_WAIT(rdwait);
        int cur = *cnt;

        while (cur == *cnt) {
                prepare_to_wait(&pipe->rd_wait, &rdwait, TASK_INTERRUPTIBLE);
                pipe_unlock(pipe);
                schedule();
                finish_wait(&pipe->rd_wait, &rdwait);
                pipe_lock(pipe);
                if (signal_pending(current))
                        break;
        }
        return cur == *cnt ? -ERESTARTSYS : 0;
}

static void wake_up_partner(struct pipe_inode_info *pipe)
{
        wake_up_interruptible_all(&pipe->rd_wait);
}

static int fifo_open(struct inode *inode, struct file *filp)
{
        struct pipe_inode_info *pipe;
        bool is_pipe = inode->i_sb->s_magic == PIPEFS_MAGIC;
        int ret;

        filp->f_version = 0;

        spin_lock(&inode->i_lock);
        if (inode->i_pipe) {
                pipe = inode->i_pipe;
                pipe->files++;
                spin_unlock(&inode->i_lock);
        } else {
                spin_unlock(&inode->i_lock);
                pipe = alloc_pipe_info();
                if (!pipe)
                        return -ENOMEM;
                pipe->files = 1;
                spin_lock(&inode->i_lock);
                if (unlikely(inode->i_pipe)) {
                        inode->i_pipe->files++;
                        spin_unlock(&inode->i_lock);
                        free_pipe_info(pipe);
                        pipe = inode->i_pipe;
                } else {
                        inode->i_pipe = pipe;
                        spin_unlock(&inode->i_lock);
                }
        }
        filp->private_data = pipe;
        /* OK, we have a pipe and it's pinned down */

        __pipe_lock(pipe);

        /* We can only do regular read/write on fifos */
        stream_open(inode, filp);

        switch (filp->f_mode & (FMODE_READ | FMODE_WRITE)) {
        case FMODE_READ:
        /*
         *  O_RDONLY
         *  POSIX.1 says that O_NONBLOCK means return with the FIFO
         *  opened, even when there is no process writing the FIFO.
         */
                pipe->r_counter++;
                if (pipe->readers++ == 0)
                        wake_up_partner(pipe);

                if (!is_pipe && !pipe->writers) {
                        if ((filp->f_flags & O_NONBLOCK)) {
                                /* suppress EPOLLHUP until we have
                                 * seen a writer */
                                filp->f_version = pipe->w_counter;
                        } else {
                                if (wait_for_partner(pipe, &pipe->w_counter))
                                        goto err_rd;
                        }
                }
                break;

        case FMODE_WRITE:
        /*
         *  O_WRONLY
         *  POSIX.1 says that O_NONBLOCK means return -1 with
         *  errno=ENXIO when there is no process reading the FIFO.
         */
                ret = -ENXIO;
                if (!is_pipe && (filp->f_flags & O_NONBLOCK) && !pipe->readers)
                        goto err;

                pipe->w_counter++;
                if (!pipe->writers++)
                        wake_up_partner(pipe);

                if (!is_pipe && !pipe->readers) {
                        if (wait_for_partner(pipe, &pipe->r_counter))
                                goto err_wr;
                }
                break;

        case FMODE_READ | FMODE_WRITE:
        /*
         *  O_RDWR
         *  POSIX.1 leaves this case "undefined" when O_NONBLOCK is set.
         *  This implementation will NEVER block on an O_RDWR open, since
         *  the process can at least talk to itself.
         */

                pipe->readers++;
                pipe->writers++;
                pipe->r_counter++;
                pipe->w_counter++;
                if (pipe->readers == 1 || pipe->writers == 1)
                        wake_up_partner(pipe);
                break;

        default:
                ret = -EINVAL;
                goto err;
        }

        /* Ok! */
        __pipe_unlock(pipe);
        return 0;

err_rd:
        if (!--pipe->readers)
                wake_up_interruptible(&pipe->wr_wait);
        ret = -ERESTARTSYS;
        goto err;

err_wr:
        if (!--pipe->writers)
                wake_up_interruptible_all(&pipe->rd_wait);
        ret = -ERESTARTSYS;
        goto err;

err:
        __pipe_unlock(pipe);

        put_pipe_info(inode, pipe);
        return ret;
}

const struct file_operations pipefifo_fops = {
        .open           = fifo_open,
        .llseek         = no_llseek,
        .read_iter      = pipe_read,
        .write_iter     = pipe_write,
        .poll           = pipe_poll,
        .unlocked_ioctl = pipe_ioctl,
        .release        = pipe_release,
        .fasync         = pipe_fasync,
        .splice_write   = iter_file_splice_write,
};

/*
 * Currently we rely on the pipe array holding a power-of-2 number
 * of pages. Returns 0 on error.
 */
unsigned int round_pipe_size(unsigned long size)
{
        if (size > (1U << 31))
                return 0;

        /* Minimum pipe size, as required by POSIX */
        if (size < PAGE_SIZE)
                return PAGE_SIZE;

        return roundup_pow_of_two(size);
}
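
/*
 * A few illustrative inputs (assuming 4 KiB pages):
 *
 *     round_pipe_size(1)         -> 4096     (clamped up to PAGE_SIZE)
 *     round_pipe_size(70000)     -> 131072   (next power of two)
 *     round_pipe_size(1 << 20)   -> 1048576  (already a power of two)
 *     round_pipe_size(3UL << 30) -> 0        (over 2^31, rejected)
 */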

/*
 * Resize the pipe ring to a number of slots.
 */
int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots)
{
        struct pipe_buffer *bufs;
        unsigned int head, tail, mask, n;

        /*
         * We can shrink the pipe if arg is at least the ring occupancy.
         * Since we don't expect a lot of shrink+grow operations, just free and
         * allocate again like we would do for growing.  If the pipe currently
         * contains more buffers than arg, then return busy.
         */
        mask = pipe->ring_size - 1;
        head = pipe->head;
        tail = pipe->tail;
        n = pipe_occupancy(pipe->head, pipe->tail);
        if (nr_slots < n)
                return -EBUSY;

        bufs = kcalloc(nr_slots, sizeof(*bufs),
                       GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
        if (unlikely(!bufs))
                return -ENOMEM;

        /*
         * The pipe array wraps around, so just start the new one at zero
         * and adjust the indices.
         */
        if (n > 0) {
                unsigned int h = head & mask;
                unsigned int t = tail & mask;
                if (h > t) {
                        memcpy(bufs, pipe->bufs + t,
                               n * sizeof(struct pipe_buffer));
                } else {
                        unsigned int tsize = pipe->ring_size - t;
                        if (h > 0)
                                memcpy(bufs + tsize, pipe->bufs,
                                       h * sizeof(struct pipe_buffer));
                        memcpy(bufs, pipe->bufs + t,
                               tsize * sizeof(struct pipe_buffer));
                }
        }

        head = n;
        tail = 0;

        kfree(pipe->bufs);
        pipe->bufs = bufs;
        pipe->ring_size = nr_slots;
        if (pipe->max_usage > nr_slots)
                pipe->max_usage = nr_slots;
        pipe->tail = tail;
        pipe->head = head;

        /* This might have made more room for writers */
        wake_up_interruptible(&pipe->wr_wait);
        return 0;
}
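
/*
 * Worked example of the re-linearisation above (illustrative values):
 * with ring_size = 8, tail = 6 and head = 10, the four occupied slots
 * live at bufs[6], bufs[7], bufs[0], bufs[1] (t = 6, h = 2, so h <= t
 * and the ring has wrapped). The copy therefore happens in two pieces:
 * tsize = 8 - 6 = 2 slots from the old tail go to bufs[0..1] of the
 * new array, then h = 2 slots from the start of the old array go to
 * bufs[2..3]. Afterwards tail = 0 and head = n = 4, i.e. the contents
 * are unchanged but now start at slot zero of the new ring.
 */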

/*
 * Allocate a new array of pipe buffers and copy the info over. Returns the
 * pipe size if successful, or -ERROR on failure.
 */
static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
{
        unsigned long user_bufs;
        unsigned int nr_slots, size;
        long ret = 0;

#ifdef CONFIG_WATCH_QUEUE
        if (pipe->watch_queue)
                return -EBUSY;
#endif

        size = round_pipe_size(arg);
        nr_slots = size >> PAGE_SHIFT;

        if (!nr_slots)
                return -EINVAL;

        /*
         * If trying to increase the pipe capacity, check that an
         * unprivileged user is not trying to exceed various limits
         * (soft limit check here, hard limit check just below).
         * Decreasing the pipe capacity is always permitted, even
         * if the user is currently over a limit.
         */
        if (nr_slots > pipe->max_usage &&
                        size > pipe_max_size && !capable(CAP_SYS_RESOURCE))
                return -EPERM;

        user_bufs = account_pipe_buffers(pipe->user, pipe->nr_accounted, nr_slots);

        if (nr_slots > pipe->max_usage &&
                        (too_many_pipe_buffers_hard(user_bufs) ||
                         too_many_pipe_buffers_soft(user_bufs)) &&
                        pipe_is_unprivileged_user()) {
                ret = -EPERM;
                goto out_revert_acct;
        }

        ret = pipe_resize_ring(pipe, nr_slots);
        if (ret < 0)
                goto out_revert_acct;

        pipe->max_usage = nr_slots;
        pipe->nr_accounted = nr_slots;
        return pipe->max_usage * PAGE_SIZE;

out_revert_acct:
        (void) account_pipe_buffers(pipe->user, nr_slots, pipe->nr_accounted);
        return ret;
}

/*
 * Note that i_pipe and i_cdev share the same location, so checking ->i_pipe is
 * not enough to verify that this is a pipe.
 */
struct pipe_inode_info *get_pipe_info(struct file *file, bool for_splice)
{
        struct pipe_inode_info *pipe = file->private_data;

        if (file->f_op != &pipefifo_fops || !pipe)
                return NULL;
#ifdef CONFIG_WATCH_QUEUE
        if (for_splice && pipe->watch_queue)
                return NULL;
#endif
        return pipe;
}

long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
{
        struct pipe_inode_info *pipe;
        long ret;

        pipe = get_pipe_info(file, false);
        if (!pipe)
                return -EBADF;

        __pipe_lock(pipe);

        switch (cmd) {
        case F_SETPIPE_SZ:
                ret = pipe_set_size(pipe, arg);
                break;
        case F_GETPIPE_SZ:
                ret = pipe->max_usage * PAGE_SIZE;
                break;
        default:
                ret = -EINVAL;
                break;
        }

        __pipe_unlock(pipe);
        return ret;
}
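
/*
 * Userspace reaches pipe_set_size() through fcntl(). A minimal sketch
 * (illustrative only, error handling omitted):
 *
 *     fcntl(pipefd, F_SETPIPE_SZ, 1024 * 1024);   // ask for a 1 MiB ring
 *     long sz = fcntl(pipefd, F_GETPIPE_SZ);      // read back actual size
 *
 * The kernel rounds the request up to a power-of-two number of pages,
 * so sz may be larger than what was asked for; unprivileged callers
 * growing a pipe are capped at /proc/sys/fs/pipe-max-size as enforced
 * above.
 */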

static const struct super_operations pipefs_ops = {
        .destroy_inode = free_inode_nonrcu,
        .statfs = simple_statfs,
};

/*
 * pipefs should _never_ be mounted by userland - too much of a security
 * hassle, no real gain from having the whole whorehouse mounted. So we don't
 * need any operations on the root directory. However, we need a non-trivial
 * d_name - pipe: will go nicely and kill the special-casing in procfs.
 */

static int pipefs_init_fs_context(struct fs_context *fc)
{
        struct pseudo_fs_context *ctx = init_pseudo(fc, PIPEFS_MAGIC);
        if (!ctx)
                return -ENOMEM;
        ctx->ops = &pipefs_ops;
        ctx->dops = &pipefs_dentry_operations;
        return 0;
}

static struct file_system_type pipe_fs_type = {
        .name           = "pipefs",
        .init_fs_context = pipefs_init_fs_context,
        .kill_sb        = kill_anon_super,
};

static int __init init_pipe_fs(void)
{
        int err = register_filesystem(&pipe_fs_type);

        if (!err) {
                pipe_mnt = kern_mount(&pipe_fs_type);
                if (IS_ERR(pipe_mnt)) {
                        err = PTR_ERR(pipe_mnt);
                        unregister_filesystem(&pipe_fs_type);
                }
        }
        return err;
}

fs_initcall(init_pipe_fs);