linux/fs/splice.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * "splice": joining two ropes together by interweaving their strands.
   4 *
   5 * This is the "extended pipe" functionality, where a pipe is used as
   6 * an arbitrary in-memory buffer. Think of a pipe as a small kernel
   7 * buffer that you can use to transfer data from one end to the other.
   8 *
   9 * The traditional unix read/write is extended with a "splice()" operation
  10 * that transfers data buffers to or from a pipe buffer.
  11 *
  12 * Named by Larry McVoy, original implementation from Linus, extended by
  13 * Jens to support splicing to files, network, direct splicing, etc and
  14 * fixing lots of bugs.
  15 *
  16 * Copyright (C) 2005-2006 Jens Axboe <axboe@kernel.dk>
  17 * Copyright (C) 2005-2006 Linus Torvalds <torvalds@osdl.org>
  18 * Copyright (C) 2006 Ingo Molnar <mingo@elte.hu>
  19 *
  20 */
  21#include <linux/bvec.h>
  22#include <linux/fs.h>
  23#include <linux/file.h>
  24#include <linux/pagemap.h>
  25#include <linux/splice.h>
  26#include <linux/memcontrol.h>
  27#include <linux/mm_inline.h>
  28#include <linux/swap.h>
  29#include <linux/writeback.h>
  30#include <linux/export.h>
  31#include <linux/syscalls.h>
  32#include <linux/uio.h>
  33#include <linux/security.h>
  34#include <linux/gfp.h>
  35#include <linux/socket.h>
  36#include <linux/compat.h>
  37#include <linux/sched/signal.h>
  38
  39#include "internal.h"
  40
  41/*
  42 * Attempt to steal a page from a pipe buffer. This should perhaps go into
  43 * a vm helper function, it's already simplified quite a bit by the
  44 * addition of remove_mapping(). If success is returned, the caller may
  45 * attempt to reuse this page for another destination.
  46 */
  47static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe,
  48                                     struct pipe_buffer *buf)
  49{
  50        struct page *page = buf->page;
  51        struct address_space *mapping;
  52
  53        lock_page(page);
  54
  55        mapping = page_mapping(page);
  56        if (mapping) {
  57                WARN_ON(!PageUptodate(page));
  58
  59                /*
  60                 * At least for ext2 with nobh option, we need to wait on
  61                 * writeback completing on this page, since we'll remove it
  62                 * from the pagecache.  Otherwise truncate wont wait on the
  63                 * page, allowing the disk blocks to be reused by someone else
  64                 * before we actually wrote our data to them. fs corruption
  65                 * ensues.
  66                 */
  67                wait_on_page_writeback(page);
  68
  69                if (page_has_private(page) &&
  70                    !try_to_release_page(page, GFP_KERNEL))
  71                        goto out_unlock;
  72
  73                /*
  74                 * If we succeeded in removing the mapping, set LRU flag
  75                 * and return good.
  76                 */
  77                if (remove_mapping(mapping, page)) {
  78                        buf->flags |= PIPE_BUF_FLAG_LRU;
  79                        return 0;
  80                }
  81        }
  82
  83        /*
  84         * Raced with truncate or failed to remove page from current
  85         * address space, unlock and return failure.
  86         */
  87out_unlock:
  88        unlock_page(page);
  89        return 1;
  90}
  91
  92static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe,
  93                                        struct pipe_buffer *buf)
  94{
  95        put_page(buf->page);
  96        buf->flags &= ~PIPE_BUF_FLAG_LRU;
  97}
  98
  99/*
 100 * Check whether the contents of buf is OK to access. Since the content
 101 * is a page cache page, IO may be in flight.
 102 */
 103static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe,
 104                                       struct pipe_buffer *buf)
 105{
 106        struct page *page = buf->page;
 107        int err;
 108
 109        if (!PageUptodate(page)) {
 110                lock_page(page);
 111
 112                /*
 113                 * Page got truncated/unhashed. This will cause a 0-byte
 114                 * splice, if this is the first page.
 115                 */
 116                if (!page->mapping) {
 117                        err = -ENODATA;
 118                        goto error;
 119                }
 120
 121                /*
 122                 * Uh oh, read-error from disk.
 123                 */
 124                if (!PageUptodate(page)) {
 125                        err = -EIO;
 126                        goto error;
 127                }
 128
 129                /*
 130                 * Page is ok afterall, we are done.
 131                 */
 132                unlock_page(page);
 133        }
 134
 135        return 0;
 136error:
 137        unlock_page(page);
 138        return err;
 139}
 140
 141const struct pipe_buf_operations page_cache_pipe_buf_ops = {
 142        .confirm = page_cache_pipe_buf_confirm,
 143        .release = page_cache_pipe_buf_release,
 144        .steal = page_cache_pipe_buf_steal,
 145        .get = generic_pipe_buf_get,
 146};
 147
 148static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe,
 149                                    struct pipe_buffer *buf)
 150{
 151        if (!(buf->flags & PIPE_BUF_FLAG_GIFT))
 152                return 1;
 153
 154        buf->flags |= PIPE_BUF_FLAG_LRU;
 155        return generic_pipe_buf_steal(pipe, buf);
 156}
 157
 158static const struct pipe_buf_operations user_page_pipe_buf_ops = {
 159        .confirm = generic_pipe_buf_confirm,
 160        .release = page_cache_pipe_buf_release,
 161        .steal = user_page_pipe_buf_steal,
 162        .get = generic_pipe_buf_get,
 163};
 164
 165static void wakeup_pipe_readers(struct pipe_inode_info *pipe)
 166{
 167        smp_mb();
 168        if (waitqueue_active(&pipe->wait))
 169                wake_up_interruptible(&pipe->wait);
 170        kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
 171}
 172
 173/**
 174 * splice_to_pipe - fill passed data into a pipe
 175 * @pipe:       pipe to fill
 176 * @spd:        data to fill
 177 *
 178 * Description:
 179 *    @spd contains a map of pages and len/offset tuples, along with
 180 *    the struct pipe_buf_operations associated with these pages. This
 181 *    function will link that data to the pipe.
 182 *
 183 */
 184ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
 185                       struct splice_pipe_desc *spd)
 186{
 187        unsigned int spd_pages = spd->nr_pages;
 188        int ret = 0, page_nr = 0;
 189
 190        if (!spd_pages)
 191                return 0;
 192
 193        if (unlikely(!pipe->readers)) {
 194                send_sig(SIGPIPE, current, 0);
 195                ret = -EPIPE;
 196                goto out;
 197        }
 198
 199        while (pipe->nrbufs < pipe->buffers) {
 200                int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1);
 201                struct pipe_buffer *buf = pipe->bufs + newbuf;
 202
 203                buf->page = spd->pages[page_nr];
 204                buf->offset = spd->partial[page_nr].offset;
 205                buf->len = spd->partial[page_nr].len;
 206                buf->private = spd->partial[page_nr].private;
 207                buf->ops = spd->ops;
 208                buf->flags = 0;
 209
 210                pipe->nrbufs++;
 211                page_nr++;
 212                ret += buf->len;
 213
 214                if (!--spd->nr_pages)
 215                        break;
 216        }
 217
 218        if (!ret)
 219                ret = -EAGAIN;
 220
 221out:
 222        while (page_nr < spd_pages)
 223                spd->spd_release(spd, page_nr++);
 224
 225        return ret;
 226}
 227EXPORT_SYMBOL_GPL(splice_to_pipe);
 228
 229ssize_t add_to_pipe(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
 230{
 231        int ret;
 232
 233        if (unlikely(!pipe->readers)) {
 234                send_sig(SIGPIPE, current, 0);
 235                ret = -EPIPE;
 236        } else if (pipe->nrbufs == pipe->buffers) {
 237                ret = -EAGAIN;
 238        } else {
 239                int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1);
 240                pipe->bufs[newbuf] = *buf;
 241                pipe->nrbufs++;
 242                return buf->len;
 243        }
 244        pipe_buf_release(pipe, buf);
 245        return ret;
 246}
 247EXPORT_SYMBOL(add_to_pipe);
 248
 249/*
 250 * Check if we need to grow the arrays holding pages and partial page
 251 * descriptions.
 252 */
 253int splice_grow_spd(const struct pipe_inode_info *pipe, struct splice_pipe_desc *spd)
 254{
 255        unsigned int buffers = READ_ONCE(pipe->buffers);
 256
 257        spd->nr_pages_max = buffers;
 258        if (buffers <= PIPE_DEF_BUFFERS)
 259                return 0;
 260
 261        spd->pages = kmalloc_array(buffers, sizeof(struct page *), GFP_KERNEL);
 262        spd->partial = kmalloc_array(buffers, sizeof(struct partial_page),
 263                                     GFP_KERNEL);
 264
 265        if (spd->pages && spd->partial)
 266                return 0;
 267
 268        kfree(spd->pages);
 269        kfree(spd->partial);
 270        return -ENOMEM;
 271}
 272
 273void splice_shrink_spd(struct splice_pipe_desc *spd)
 274{
 275        if (spd->nr_pages_max <= PIPE_DEF_BUFFERS)
 276                return;
 277
 278        kfree(spd->pages);
 279        kfree(spd->partial);
 280}
 281
 282/**
 283 * generic_file_splice_read - splice data from file to a pipe
 284 * @in:         file to splice from
 285 * @ppos:       position in @in
 286 * @pipe:       pipe to splice to
 287 * @len:        number of bytes to splice
 288 * @flags:      splice modifier flags
 289 *
 290 * Description:
 291 *    Will read pages from given file and fill them into a pipe. Can be
 292 *    used as long as it has more or less sane ->read_iter().
 293 *
 294 */
 295ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
 296                                 struct pipe_inode_info *pipe, size_t len,
 297                                 unsigned int flags)
 298{
 299        struct iov_iter to;
 300        struct kiocb kiocb;
 301        int idx, ret;
 302
 303        iov_iter_pipe(&to, READ, pipe, len);
 304        idx = to.idx;
 305        init_sync_kiocb(&kiocb, in);
 306        kiocb.ki_pos = *ppos;
 307        ret = call_read_iter(in, &kiocb, &to);
 308        if (ret > 0) {
 309                *ppos = kiocb.ki_pos;
 310                file_accessed(in);
 311        } else if (ret < 0) {
 312                to.idx = idx;
 313                to.iov_offset = 0;
 314                iov_iter_advance(&to, 0); /* to free what was emitted */
 315                /*
 316                 * callers of ->splice_read() expect -EAGAIN on
 317                 * "can't put anything in there", rather than -EFAULT.
 318                 */
 319                if (ret == -EFAULT)
 320                        ret = -EAGAIN;
 321        }
 322
 323        return ret;
 324}
 325EXPORT_SYMBOL(generic_file_splice_read);
 326
 327const struct pipe_buf_operations default_pipe_buf_ops = {
 328        .confirm = generic_pipe_buf_confirm,
 329        .release = generic_pipe_buf_release,
 330        .steal = generic_pipe_buf_steal,
 331        .get = generic_pipe_buf_get,
 332};
 333
 334int generic_pipe_buf_nosteal(struct pipe_inode_info *pipe,
 335                             struct pipe_buffer *buf)
 336{
 337        return 1;
 338}
 339
 340/* Pipe buffer operations for a socket and similar. */
 341const struct pipe_buf_operations nosteal_pipe_buf_ops = {
 342        .confirm = generic_pipe_buf_confirm,
 343        .release = generic_pipe_buf_release,
 344        .steal = generic_pipe_buf_nosteal,
 345        .get = generic_pipe_buf_get,
 346};
 347EXPORT_SYMBOL(nosteal_pipe_buf_ops);
 348
 349static ssize_t kernel_readv(struct file *file, const struct kvec *vec,
 350                            unsigned long vlen, loff_t offset)
 351{
 352        mm_segment_t old_fs;
 353        loff_t pos = offset;
 354        ssize_t res;
 355
 356        old_fs = get_fs();
 357        set_fs(KERNEL_DS);
 358        /* The cast to a user pointer is valid due to the set_fs() */
 359        res = vfs_readv(file, (const struct iovec __user *)vec, vlen, &pos, 0);
 360        set_fs(old_fs);
 361
 362        return res;
 363}
 364
 365static ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
 366                                 struct pipe_inode_info *pipe, size_t len,
 367                                 unsigned int flags)
 368{
 369        struct kvec *vec, __vec[PIPE_DEF_BUFFERS];
 370        struct iov_iter to;
 371        struct page **pages;
 372        unsigned int nr_pages;
 373        size_t offset, base, copied = 0;
 374        ssize_t res;
 375        int i;
 376
 377        if (pipe->nrbufs == pipe->buffers)
 378                return -EAGAIN;
 379
 380        /*
 381         * Try to keep page boundaries matching to source pagecache ones -
 382         * it probably won't be much help, but...
 383         */
 384        offset = *ppos & ~PAGE_MASK;
 385
 386        iov_iter_pipe(&to, READ, pipe, len + offset);
 387
 388        res = iov_iter_get_pages_alloc(&to, &pages, len + offset, &base);
 389        if (res <= 0)
 390                return -ENOMEM;
 391
 392        nr_pages = DIV_ROUND_UP(res + base, PAGE_SIZE);
 393
 394        vec = __vec;
 395        if (nr_pages > PIPE_DEF_BUFFERS) {
 396                vec = kmalloc_array(nr_pages, sizeof(struct kvec), GFP_KERNEL);
 397                if (unlikely(!vec)) {
 398                        res = -ENOMEM;
 399                        goto out;
 400                }
 401        }
 402
 403        pipe->bufs[to.idx].offset = offset;
 404        pipe->bufs[to.idx].len -= offset;
 405
 406        for (i = 0; i < nr_pages; i++) {
 407                size_t this_len = min_t(size_t, len, PAGE_SIZE - offset);
 408                vec[i].iov_base = page_address(pages[i]) + offset;
 409                vec[i].iov_len = this_len;
 410                len -= this_len;
 411                offset = 0;
 412        }
 413
 414        res = kernel_readv(in, vec, nr_pages, *ppos);
 415        if (res > 0) {
 416                copied = res;
 417                *ppos += res;
 418        }
 419
 420        if (vec != __vec)
 421                kfree(vec);
 422out:
 423        for (i = 0; i < nr_pages; i++)
 424                put_page(pages[i]);
 425        kvfree(pages);
 426        iov_iter_advance(&to, copied);  /* truncates and discards */
 427        return res;
 428}
 429
 430/*
 431 * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos'
 432 * using sendpage(). Return the number of bytes sent.
 433 */
 434static int pipe_to_sendpage(struct pipe_inode_info *pipe,
 435                            struct pipe_buffer *buf, struct splice_desc *sd)
 436{
 437        struct file *file = sd->u.file;
 438        loff_t pos = sd->pos;
 439        int more;
 440
 441        if (!likely(file->f_op->sendpage))
 442                return -EINVAL;
 443
 444        more = (sd->flags & SPLICE_F_MORE) ? MSG_MORE : 0;
 445
 446        if (sd->len < sd->total_len && pipe->nrbufs > 1)
 447                more |= MSG_SENDPAGE_NOTLAST;
 448
 449        return file->f_op->sendpage(file, buf->page, buf->offset,
 450                                    sd->len, &pos, more);
 451}
 452
 453static void wakeup_pipe_writers(struct pipe_inode_info *pipe)
 454{
 455        smp_mb();
 456        if (waitqueue_active(&pipe->wait))
 457                wake_up_interruptible(&pipe->wait);
 458        kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
 459}
 460
 461/**
 462 * splice_from_pipe_feed - feed available data from a pipe to a file
 463 * @pipe:       pipe to splice from
 464 * @sd:         information to @actor
 465 * @actor:      handler that splices the data
 466 *
 467 * Description:
 468 *    This function loops over the pipe and calls @actor to do the
 469 *    actual moving of a single struct pipe_buffer to the desired
 470 *    destination.  It returns when there's no more buffers left in
 471 *    the pipe or if the requested number of bytes (@sd->total_len)
 472 *    have been copied.  It returns a positive number (one) if the
 473 *    pipe needs to be filled with more data, zero if the required
 474 *    number of bytes have been copied and -errno on error.
 475 *
 476 *    This, together with splice_from_pipe_{begin,end,next}, may be
 477 *    used to implement the functionality of __splice_from_pipe() when
 478 *    locking is required around copying the pipe buffers to the
 479 *    destination.
 480 */
 481static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd,
 482                          splice_actor *actor)
 483{
 484        int ret;
 485
 486        while (pipe->nrbufs) {
 487                struct pipe_buffer *buf = pipe->bufs + pipe->curbuf;
 488
 489                sd->len = buf->len;
 490                if (sd->len > sd->total_len)
 491                        sd->len = sd->total_len;
 492
 493                ret = pipe_buf_confirm(pipe, buf);
 494                if (unlikely(ret)) {
 495                        if (ret == -ENODATA)
 496                                ret = 0;
 497                        return ret;
 498                }
 499
 500                ret = actor(pipe, buf, sd);
 501                if (ret <= 0)
 502                        return ret;
 503
 504                buf->offset += ret;
 505                buf->len -= ret;
 506
 507                sd->num_spliced += ret;
 508                sd->len -= ret;
 509                sd->pos += ret;
 510                sd->total_len -= ret;
 511
 512                if (!buf->len) {
 513                        pipe_buf_release(pipe, buf);
 514                        pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
 515                        pipe->nrbufs--;
 516                        if (pipe->files)
 517                                sd->need_wakeup = true;
 518                }
 519
 520                if (!sd->total_len)
 521                        return 0;
 522        }
 523
 524        return 1;
 525}
 526
 527/**
 528 * splice_from_pipe_next - wait for some data to splice from
 529 * @pipe:       pipe to splice from
 530 * @sd:         information about the splice operation
 531 *
 532 * Description:
 533 *    This function will wait for some data and return a positive
 534 *    value (one) if pipe buffers are available.  It will return zero
 535 *    or -errno if no more data needs to be spliced.
 536 */
 537static int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_desc *sd)
 538{
 539        /*
 540         * Check for signal early to make process killable when there are
 541         * always buffers available
 542         */
 543        if (signal_pending(current))
 544                return -ERESTARTSYS;
 545
 546        while (!pipe->nrbufs) {
 547                if (!pipe->writers)
 548                        return 0;
 549
 550                if (!pipe->waiting_writers && sd->num_spliced)
 551                        return 0;
 552
 553                if (sd->flags & SPLICE_F_NONBLOCK)
 554                        return -EAGAIN;
 555
 556                if (signal_pending(current))
 557                        return -ERESTARTSYS;
 558
 559                if (sd->need_wakeup) {
 560                        wakeup_pipe_writers(pipe);
 561                        sd->need_wakeup = false;
 562                }
 563
 564                pipe_wait(pipe);
 565        }
 566
 567        return 1;
 568}
 569
 570/**
 571 * splice_from_pipe_begin - start splicing from pipe
 572 * @sd:         information about the splice operation
 573 *
 574 * Description:
 575 *    This function should be called before a loop containing
 576 *    splice_from_pipe_next() and splice_from_pipe_feed() to
 577 *    initialize the necessary fields of @sd.
 578 */
 579static void splice_from_pipe_begin(struct splice_desc *sd)
 580{
 581        sd->num_spliced = 0;
 582        sd->need_wakeup = false;
 583}
 584
 585/**
 586 * splice_from_pipe_end - finish splicing from pipe
 587 * @pipe:       pipe to splice from
 588 * @sd:         information about the splice operation
 589 *
 590 * Description:
 591 *    This function will wake up pipe writers if necessary.  It should
 592 *    be called after a loop containing splice_from_pipe_next() and
 593 *    splice_from_pipe_feed().
 594 */
 595static void splice_from_pipe_end(struct pipe_inode_info *pipe, struct splice_desc *sd)
 596{
 597        if (sd->need_wakeup)
 598                wakeup_pipe_writers(pipe);
 599}
 600
 601/**
 602 * __splice_from_pipe - splice data from a pipe to given actor
 603 * @pipe:       pipe to splice from
 604 * @sd:         information to @actor
 605 * @actor:      handler that splices the data
 606 *
 607 * Description:
 608 *    This function does little more than loop over the pipe and call
 609 *    @actor to do the actual moving of a single struct pipe_buffer to
 610 *    the desired destination. See pipe_to_file, pipe_to_sendpage, or
 611 *    pipe_to_user.
 612 *
 613 */
 614ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd,
 615                           splice_actor *actor)
 616{
 617        int ret;
 618
 619        splice_from_pipe_begin(sd);
 620        do {
 621                cond_resched();
 622                ret = splice_from_pipe_next(pipe, sd);
 623                if (ret > 0)
 624                        ret = splice_from_pipe_feed(pipe, sd, actor);
 625        } while (ret > 0);
 626        splice_from_pipe_end(pipe, sd);
 627
 628        return sd->num_spliced ? sd->num_spliced : ret;
 629}
 630EXPORT_SYMBOL(__splice_from_pipe);
 631
 632/**
 633 * splice_from_pipe - splice data from a pipe to a file
 634 * @pipe:       pipe to splice from
 635 * @out:        file to splice to
 636 * @ppos:       position in @out
 637 * @len:        how many bytes to splice
 638 * @flags:      splice modifier flags
 639 * @actor:      handler that splices the data
 640 *
 641 * Description:
 642 *    See __splice_from_pipe. This function locks the pipe inode,
 643 *    otherwise it's identical to __splice_from_pipe().
 644 *
 645 */
 646ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
 647                         loff_t *ppos, size_t len, unsigned int flags,
 648                         splice_actor *actor)
 649{
 650        ssize_t ret;
 651        struct splice_desc sd = {
 652                .total_len = len,
 653                .flags = flags,
 654                .pos = *ppos,
 655                .u.file = out,
 656        };
 657
 658        pipe_lock(pipe);
 659        ret = __splice_from_pipe(pipe, &sd, actor);
 660        pipe_unlock(pipe);
 661
 662        return ret;
 663}
 664
 665/**
 666 * iter_file_splice_write - splice data from a pipe to a file
 667 * @pipe:       pipe info
 668 * @out:        file to write to
 669 * @ppos:       position in @out
 670 * @len:        number of bytes to splice
 671 * @flags:      splice modifier flags
 672 *
 673 * Description:
 674 *    Will either move or copy pages (determined by @flags options) from
 675 *    the given pipe inode to the given file.
 676 *    This one is ->write_iter-based.
 677 *
 678 */
 679ssize_t
 680iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
 681                          loff_t *ppos, size_t len, unsigned int flags)
 682{
 683        struct splice_desc sd = {
 684                .total_len = len,
 685                .flags = flags,
 686                .pos = *ppos,
 687                .u.file = out,
 688        };
 689        int nbufs = pipe->buffers;
 690        struct bio_vec *array = kcalloc(nbufs, sizeof(struct bio_vec),
 691                                        GFP_KERNEL);
 692        ssize_t ret;
 693
 694        if (unlikely(!array))
 695                return -ENOMEM;
 696
 697        pipe_lock(pipe);
 698
 699        splice_from_pipe_begin(&sd);
 700        while (sd.total_len) {
 701                struct iov_iter from;
 702                size_t left;
 703                int n, idx;
 704
 705                ret = splice_from_pipe_next(pipe, &sd);
 706                if (ret <= 0)
 707                        break;
 708
 709                if (unlikely(nbufs < pipe->buffers)) {
 710                        kfree(array);
 711                        nbufs = pipe->buffers;
 712                        array = kcalloc(nbufs, sizeof(struct bio_vec),
 713                                        GFP_KERNEL);
 714                        if (!array) {
 715                                ret = -ENOMEM;
 716                                break;
 717                        }
 718                }
 719
 720                /* build the vector */
 721                left = sd.total_len;
 722                for (n = 0, idx = pipe->curbuf; left && n < pipe->nrbufs; n++, idx++) {
 723                        struct pipe_buffer *buf = pipe->bufs + idx;
 724                        size_t this_len = buf->len;
 725
 726                        if (this_len > left)
 727                                this_len = left;
 728
 729                        if (idx == pipe->buffers - 1)
 730                                idx = -1;
 731
 732                        ret = pipe_buf_confirm(pipe, buf);
 733                        if (unlikely(ret)) {
 734                                if (ret == -ENODATA)
 735                                        ret = 0;
 736                                goto done;
 737                        }
 738
 739                        array[n].bv_page = buf->page;
 740                        array[n].bv_len = this_len;
 741                        array[n].bv_offset = buf->offset;
 742                        left -= this_len;
 743                }
 744
 745                iov_iter_bvec(&from, WRITE, array, n, sd.total_len - left);
 746                ret = vfs_iter_write(out, &from, &sd.pos, 0);
 747                if (ret <= 0)
 748                        break;
 749
 750                sd.num_spliced += ret;
 751                sd.total_len -= ret;
 752                *ppos = sd.pos;
 753
 754                /* dismiss the fully eaten buffers, adjust the partial one */
 755                while (ret) {
 756                        struct pipe_buffer *buf = pipe->bufs + pipe->curbuf;
 757                        if (ret >= buf->len) {
 758                                ret -= buf->len;
 759                                buf->len = 0;
 760                                pipe_buf_release(pipe, buf);
 761                                pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
 762                                pipe->nrbufs--;
 763                                if (pipe->files)
 764                                        sd.need_wakeup = true;
 765                        } else {
 766                                buf->offset += ret;
 767                                buf->len -= ret;
 768                                ret = 0;
 769                        }
 770                }
 771        }
 772done:
 773        kfree(array);
 774        splice_from_pipe_end(pipe, &sd);
 775
 776        pipe_unlock(pipe);
 777
 778        if (sd.num_spliced)
 779                ret = sd.num_spliced;
 780
 781        return ret;
 782}
 783
 784EXPORT_SYMBOL(iter_file_splice_write);
 785
 786static int write_pipe_buf(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
 787                          struct splice_desc *sd)
 788{
 789        int ret;
 790        void *data;
 791        loff_t tmp = sd->pos;
 792
 793        data = kmap(buf->page);
 794        ret = __kernel_write(sd->u.file, data + buf->offset, sd->len, &tmp);
 795        kunmap(buf->page);
 796
 797        return ret;
 798}
 799
 800static ssize_t default_file_splice_write(struct pipe_inode_info *pipe,
 801                                         struct file *out, loff_t *ppos,
 802                                         size_t len, unsigned int flags)
 803{
 804        ssize_t ret;
 805
 806        ret = splice_from_pipe(pipe, out, ppos, len, flags, write_pipe_buf);
 807        if (ret > 0)
 808                *ppos += ret;
 809
 810        return ret;
 811}
 812
 813/**
 814 * generic_splice_sendpage - splice data from a pipe to a socket
 815 * @pipe:       pipe to splice from
 816 * @out:        socket to write to
 817 * @ppos:       position in @out
 818 * @len:        number of bytes to splice
 819 * @flags:      splice modifier flags
 820 *
 821 * Description:
 822 *    Will send @len bytes from the pipe to a network socket. No data copying
 823 *    is involved.
 824 *
 825 */
 826ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out,
 827                                loff_t *ppos, size_t len, unsigned int flags)
 828{
 829        return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage);
 830}
 831
 832EXPORT_SYMBOL(generic_splice_sendpage);
 833
 834/*
 835 * Attempt to initiate a splice from pipe to file.
 836 */
 837static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
 838                           loff_t *ppos, size_t len, unsigned int flags)
 839{
 840        ssize_t (*splice_write)(struct pipe_inode_info *, struct file *,
 841                                loff_t *, size_t, unsigned int);
 842
 843        if (out->f_op->splice_write)
 844                splice_write = out->f_op->splice_write;
 845        else
 846                splice_write = default_file_splice_write;
 847
 848        return splice_write(pipe, out, ppos, len, flags);
 849}
 850
 851/*
 852 * Attempt to initiate a splice from a file to a pipe.
 853 */
 854static long do_splice_to(struct file *in, loff_t *ppos,
 855                         struct pipe_inode_info *pipe, size_t len,
 856                         unsigned int flags)
 857{
 858        ssize_t (*splice_read)(struct file *, loff_t *,
 859                               struct pipe_inode_info *, size_t, unsigned int);
 860        int ret;
 861
 862        if (unlikely(!(in->f_mode & FMODE_READ)))
 863                return -EBADF;
 864
 865        ret = rw_verify_area(READ, in, ppos, len);
 866        if (unlikely(ret < 0))
 867                return ret;
 868
 869        if (unlikely(len > MAX_RW_COUNT))
 870                len = MAX_RW_COUNT;
 871
 872        if (in->f_op->splice_read)
 873                splice_read = in->f_op->splice_read;
 874        else
 875                splice_read = default_file_splice_read;
 876
 877        return splice_read(in, ppos, pipe, len, flags);
 878}
 879
 880/**
 881 * splice_direct_to_actor - splices data directly between two non-pipes
 882 * @in:         file to splice from
 883 * @sd:         actor information on where to splice to
 884 * @actor:      handles the data splicing
 885 *
 886 * Description:
 887 *    This is a special case helper to splice directly between two
 888 *    points, without requiring an explicit pipe. Internally an allocated
 889 *    pipe is cached in the process, and reused during the lifetime of
 890 *    that process.
 891 *
 892 */
 893ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
 894                               splice_direct_actor *actor)
 895{
 896        struct pipe_inode_info *pipe;
 897        long ret, bytes;
 898        umode_t i_mode;
 899        size_t len;
 900        int i, flags, more;
 901
 902        /*
 903         * We require the input being a regular file, as we don't want to
 904         * randomly drop data for eg socket -> socket splicing. Use the
 905         * piped splicing for that!
 906         */
 907        i_mode = file_inode(in)->i_mode;
 908        if (unlikely(!S_ISREG(i_mode) && !S_ISBLK(i_mode)))
 909                return -EINVAL;
 910
 911        /*
 912         * neither in nor out is a pipe, setup an internal pipe attached to
 913         * 'out' and transfer the wanted data from 'in' to 'out' through that
 914         */
 915        pipe = current->splice_pipe;
 916        if (unlikely(!pipe)) {
 917                pipe = alloc_pipe_info();
 918                if (!pipe)
 919                        return -ENOMEM;
 920
 921                /*
 922                 * We don't have an immediate reader, but we'll read the stuff
 923                 * out of the pipe right after the splice_to_pipe(). So set
 924                 * PIPE_READERS appropriately.
 925                 */
 926                pipe->readers = 1;
 927
 928                current->splice_pipe = pipe;
 929        }
 930
 931        /*
 932         * Do the splice.
 933         */
 934        ret = 0;
 935        bytes = 0;
 936        len = sd->total_len;
 937        flags = sd->flags;
 938
 939        /*
 940         * Don't block on output, we have to drain the direct pipe.
 941         */
 942        sd->flags &= ~SPLICE_F_NONBLOCK;
 943        more = sd->flags & SPLICE_F_MORE;
 944
 945        WARN_ON_ONCE(pipe->nrbufs != 0);
 946
 947        while (len) {
 948                size_t read_len;
 949                loff_t pos = sd->pos, prev_pos = pos;
 950
 951                /* Don't try to read more the pipe has space for. */
 952                read_len = min_t(size_t, len,
 953                                 (pipe->buffers - pipe->nrbufs) << PAGE_SHIFT);
 954                ret = do_splice_to(in, &pos, pipe, read_len, flags);
 955                if (unlikely(ret <= 0))
 956                        goto out_release;
 957
 958                read_len = ret;
 959                sd->total_len = read_len;
 960
 961                /*
 962                 * If more data is pending, set SPLICE_F_MORE
 963                 * If this is the last data and SPLICE_F_MORE was not set
 964                 * initially, clears it.
 965                 */
 966                if (read_len < len)
 967                        sd->flags |= SPLICE_F_MORE;
 968                else if (!more)
 969                        sd->flags &= ~SPLICE_F_MORE;
 970                /*
 971                 * NOTE: nonblocking mode only applies to the input. We
 972                 * must not do the output in nonblocking mode as then we
 973                 * could get stuck data in the internal pipe:
 974                 */
 975                ret = actor(pipe, sd);
 976                if (unlikely(ret <= 0)) {
 977                        sd->pos = prev_pos;
 978                        goto out_release;
 979                }
 980
 981                bytes += ret;
 982                len -= ret;
 983                sd->pos = pos;
 984
 985                if (ret < read_len) {
 986                        sd->pos = prev_pos + ret;
 987                        goto out_release;
 988                }
 989        }
 990
 991done:
 992        pipe->nrbufs = pipe->curbuf = 0;
 993        file_accessed(in);
 994        return bytes;
 995
 996out_release:
 997        /*
 998         * If we did an incomplete transfer we must release
 999         * the pipe buffers in question:
1000         */
1001        for (i = 0; i < pipe->buffers; i++) {
1002                struct pipe_buffer *buf = pipe->bufs + i;
1003
1004                if (buf->ops)
1005                        pipe_buf_release(pipe, buf);
1006        }
1007
1008        if (!bytes)
1009                bytes = ret;
1010
1011        goto done;
1012}
1013EXPORT_SYMBOL(splice_direct_to_actor);
1014
1015static int direct_splice_actor(struct pipe_inode_info *pipe,
1016                               struct splice_desc *sd)
1017{
1018        struct file *file = sd->u.file;
1019
1020        return do_splice_from(pipe, file, sd->opos, sd->total_len,
1021                              sd->flags);
1022}
1023
1024/**
1025 * do_splice_direct - splices data directly between two files
1026 * @in:         file to splice from
1027 * @ppos:       input file offset
1028 * @out:        file to splice to
1029 * @opos:       output file offset
1030 * @len:        number of bytes to splice
1031 * @flags:      splice modifier flags
1032 *
1033 * Description:
1034 *    For use by do_sendfile(). splice can easily emulate sendfile, but
1035 *    doing it in the application would incur an extra system call
1036 *    (splice in + splice out, as compared to just sendfile()). So this helper
1037 *    can splice directly through a process-private pipe.
1038 *
1039 */
1040long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
1041                      loff_t *opos, size_t len, unsigned int flags)
1042{
1043        struct splice_desc sd = {
1044                .len            = len,
1045                .total_len      = len,
1046                .flags          = flags,
1047                .pos            = *ppos,
1048                .u.file         = out,
1049                .opos           = opos,
1050        };
1051        long ret;
1052
1053        if (unlikely(!(out->f_mode & FMODE_WRITE)))
1054                return -EBADF;
1055
1056        if (unlikely(out->f_flags & O_APPEND))
1057                return -EINVAL;
1058
1059        ret = rw_verify_area(WRITE, out, opos, len);
1060        if (unlikely(ret < 0))
1061                return ret;
1062
1063        ret = splice_direct_to_actor(in, &sd, direct_splice_actor);
1064        if (ret > 0)
1065                *ppos = sd.pos;
1066
1067        return ret;
1068}
1069EXPORT_SYMBOL(do_splice_direct);
1070
1071static int wait_for_space(struct pipe_inode_info *pipe, unsigned flags)
1072{
1073        for (;;) {
1074                if (unlikely(!pipe->readers)) {
1075                        send_sig(SIGPIPE, current, 0);
1076                        return -EPIPE;
1077                }
1078                if (pipe->nrbufs != pipe->buffers)
1079                        return 0;
1080                if (flags & SPLICE_F_NONBLOCK)
1081                        return -EAGAIN;
1082                if (signal_pending(current))
1083                        return -ERESTARTSYS;
1084                pipe->waiting_writers++;
1085                pipe_wait(pipe);
1086                pipe->waiting_writers--;
1087        }
1088}
1089
1090static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
1091                               struct pipe_inode_info *opipe,
1092                               size_t len, unsigned int flags);
1093
1094/*
1095 * Determine where to splice to/from.
1096 */
1097static long do_splice(struct file *in, loff_t __user *off_in,
1098                      struct file *out, loff_t __user *off_out,
1099                      size_t len, unsigned int flags)
1100{
1101        struct pipe_inode_info *ipipe;
1102        struct pipe_inode_info *opipe;
1103        loff_t offset;
1104        long ret;
1105
1106        ipipe = get_pipe_info(in);
1107        opipe = get_pipe_info(out);
1108
1109        if (ipipe && opipe) {
1110                if (off_in || off_out)
1111                        return -ESPIPE;
1112
1113                if (!(in->f_mode & FMODE_READ))
1114                        return -EBADF;
1115
1116                if (!(out->f_mode & FMODE_WRITE))
1117                        return -EBADF;
1118
1119                /* Splicing to self would be fun, but... */
1120                if (ipipe == opipe)
1121                        return -EINVAL;
1122
1123                if ((in->f_flags | out->f_flags) & O_NONBLOCK)
1124                        flags |= SPLICE_F_NONBLOCK;
1125
1126                return splice_pipe_to_pipe(ipipe, opipe, len, flags);
1127        }
1128
1129        if (ipipe) {
1130                if (off_in)
1131                        return -ESPIPE;
1132                if (off_out) {
1133                        if (!(out->f_mode & FMODE_PWRITE))
1134                                return -EINVAL;
1135                        if (copy_from_user(&offset, off_out, sizeof(loff_t)))
1136                                return -EFAULT;
1137                } else {
1138                        offset = out->f_pos;
1139                }
1140
1141                if (unlikely(!(out->f_mode & FMODE_WRITE)))
1142                        return -EBADF;
1143
1144                if (unlikely(out->f_flags & O_APPEND))
1145                        return -EINVAL;
1146
1147                ret = rw_verify_area(WRITE, out, &offset, len);
1148                if (unlikely(ret < 0))
1149                        return ret;
1150
1151                if (in->f_flags & O_NONBLOCK)
1152                        flags |= SPLICE_F_NONBLOCK;
1153
1154                file_start_write(out);
1155                ret = do_splice_from(ipipe, out, &offset, len, flags);
1156                file_end_write(out);
1157
1158                if (!off_out)
1159                        out->f_pos = offset;
1160                else if (copy_to_user(off_out, &offset, sizeof(loff_t)))
1161                        ret = -EFAULT;
1162
1163                return ret;
1164        }
1165
1166        if (opipe) {
1167                if (off_out)
1168                        return -ESPIPE;
1169                if (off_in) {
1170                        if (!(in->f_mode & FMODE_PREAD))
1171                                return -EINVAL;
1172                        if (copy_from_user(&offset, off_in, sizeof(loff_t)))
1173                                return -EFAULT;
1174                } else {
1175                        offset = in->f_pos;
1176                }
1177
1178                if (out->f_flags & O_NONBLOCK)
1179                        flags |= SPLICE_F_NONBLOCK;
1180
1181                pipe_lock(opipe);
1182                ret = wait_for_space(opipe, flags);
1183                if (!ret)
1184                        ret = do_splice_to(in, &offset, opipe, len, flags);
1185                pipe_unlock(opipe);
1186                if (ret > 0)
1187                        wakeup_pipe_readers(opipe);
1188                if (!off_in)
1189                        in->f_pos = offset;
1190                else if (copy_to_user(off_in, &offset, sizeof(loff_t)))
1191                        ret = -EFAULT;
1192
1193                return ret;
1194        }
1195
1196        return -EINVAL;
1197}
1198
1199static int iter_to_pipe(struct iov_iter *from,
1200                        struct pipe_inode_info *pipe,
1201                        unsigned flags)
1202{
1203        struct pipe_buffer buf = {
1204                .ops = &user_page_pipe_buf_ops,
1205                .flags = flags
1206        };
1207        size_t total = 0;
1208        int ret = 0;
1209        bool failed = false;
1210
1211        while (iov_iter_count(from) && !failed) {
1212                struct page *pages[16];
1213                ssize_t copied;
1214                size_t start;
1215                int n;
1216
1217                copied = iov_iter_get_pages(from, pages, ~0UL, 16, &start);
1218                if (copied <= 0) {
1219                        ret = copied;
1220                        break;
1221                }
1222
1223                for (n = 0; copied; n++, start = 0) {
1224                        int size = min_t(int, copied, PAGE_SIZE - start);
1225                        if (!failed) {
1226                                buf.page = pages[n];
1227                                buf.offset = start;
1228                                buf.len = size;
1229                                ret = add_to_pipe(pipe, &buf);
1230                                if (unlikely(ret < 0)) {
1231                                        failed = true;
1232                                } else {
1233                                        iov_iter_advance(from, ret);
1234                                        total += ret;
1235                                }
1236                        } else {
1237                                put_page(pages[n]);
1238                        }
1239                        copied -= size;
1240                }
1241        }
1242        return total ? total : ret;
1243}
1244
1245static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
1246                        struct splice_desc *sd)
1247{
1248        int n = copy_page_to_iter(buf->page, buf->offset, sd->len, sd->u.data);
1249        return n == sd->len ? n : -EFAULT;
1250}
1251
1252/*
1253 * For lack of a better implementation, implement vmsplice() to userspace
1254 * as a simple copy of the pipes pages to the user iov.
1255 */
1256static long vmsplice_to_user(struct file *file, struct iov_iter *iter,
1257                             unsigned int flags)
1258{
1259        struct pipe_inode_info *pipe = get_pipe_info(file);
1260        struct splice_desc sd = {
1261                .total_len = iov_iter_count(iter),
1262                .flags = flags,
1263                .u.data = iter
1264        };
1265        long ret = 0;
1266
1267        if (!pipe)
1268                return -EBADF;
1269
1270        if (sd.total_len) {
1271                pipe_lock(pipe);
1272                ret = __splice_from_pipe(pipe, &sd, pipe_to_user);
1273                pipe_unlock(pipe);
1274        }
1275
1276        return ret;
1277}
1278
1279/*
1280 * vmsplice splices a user address range into a pipe. It can be thought of
1281 * as splice-from-memory, where the regular splice is splice-from-file (or
1282 * to file). In both cases the output is a pipe, naturally.
1283 */
1284static long vmsplice_to_pipe(struct file *file, struct iov_iter *iter,
1285                             unsigned int flags)
1286{
1287        struct pipe_inode_info *pipe;
1288        long ret = 0;
1289        unsigned buf_flag = 0;
1290
1291        if (flags & SPLICE_F_GIFT)
1292                buf_flag = PIPE_BUF_FLAG_GIFT;
1293
1294        pipe = get_pipe_info(file);
1295        if (!pipe)
1296                return -EBADF;
1297
1298        pipe_lock(pipe);
1299        ret = wait_for_space(pipe, flags);
1300        if (!ret)
1301                ret = iter_to_pipe(iter, pipe, buf_flag);
1302        pipe_unlock(pipe);
1303        if (ret > 0)
1304                wakeup_pipe_readers(pipe);
1305        return ret;
1306}
1307
1308static int vmsplice_type(struct fd f, int *type)
1309{
1310        if (!f.file)
1311                return -EBADF;
1312        if (f.file->f_mode & FMODE_WRITE) {
1313                *type = WRITE;
1314        } else if (f.file->f_mode & FMODE_READ) {
1315                *type = READ;
1316        } else {
1317                fdput(f);
1318                return -EBADF;
1319        }
1320        return 0;
1321}
1322
1323/*
1324 * Note that vmsplice only really supports true splicing _from_ user memory
1325 * to a pipe, not the other way around. Splicing from user memory is a simple
1326 * operation that can be supported without any funky alignment restrictions
1327 * or nasty vm tricks. We simply map in the user memory and fill them into
1328 * a pipe. The reverse isn't quite as easy, though. There are two possible
1329 * solutions for that:
1330 *
1331 *      - memcpy() the data internally, at which point we might as well just
1332 *        do a regular read() on the buffer anyway.
1333 *      - Lots of nasty vm tricks, that are neither fast nor flexible (it
1334 *        has restriction limitations on both ends of the pipe).
1335 *
1336 * Currently we punt and implement it as a normal copy, see pipe_to_user().
1337 *
1338 */
1339static long do_vmsplice(struct file *f, struct iov_iter *iter, unsigned int flags)
1340{
1341        if (unlikely(flags & ~SPLICE_F_ALL))
1342                return -EINVAL;
1343
1344        if (!iov_iter_count(iter))
1345                return 0;
1346
1347        if (iov_iter_rw(iter) == WRITE)
1348                return vmsplice_to_pipe(f, iter, flags);
1349        else
1350                return vmsplice_to_user(f, iter, flags);
1351}
1352
1353SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, uiov,
1354                unsigned long, nr_segs, unsigned int, flags)
1355{
1356        struct iovec iovstack[UIO_FASTIOV];
1357        struct iovec *iov = iovstack;
1358        struct iov_iter iter;
1359        ssize_t error;
1360        struct fd f;
1361        int type;
1362
1363        f = fdget(fd);
1364        error = vmsplice_type(f, &type);
1365        if (error)
1366                return error;
1367
1368        error = import_iovec(type, uiov, nr_segs,
1369                             ARRAY_SIZE(iovstack), &iov, &iter);
1370        if (error >= 0) {
1371                error = do_vmsplice(f.file, &iter, flags);
1372                kfree(iov);
1373        }
1374        fdput(f);
1375        return error;
1376}
1377
#ifdef CONFIG_COMPAT
/*
 * Compat vmsplice(2) entry point: same as the native one, except that the
 * iovec array is a compat_iovec array imported with compat_import_iovec().
 */
COMPAT_SYSCALL_DEFINE4(vmsplice, int, fd, const struct compat_iovec __user *, iov32,
		    unsigned int, nr_segs, unsigned int, flags)
{
	struct iovec iovstack[UIO_FASTIOV];
	struct iovec *iov = iovstack;
	struct iov_iter iter;
	struct fd f = fdget(fd);
	ssize_t error;
	int type;

	/* On failure vmsplice_type() has already dropped the fd reference. */
	error = vmsplice_type(f, &type);
	if (error)
		return error;

	error = compat_import_iovec(type, iov32, nr_segs,
				    ARRAY_SIZE(iovstack), &iov, &iter);
	if (error < 0)
		goto out_fdput;

	error = do_vmsplice(f.file, &iter, flags);
	kfree(iov);

out_fdput:
	fdput(f);
	return error;
}
#endif
1404
1405SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in,
1406                int, fd_out, loff_t __user *, off_out,
1407                size_t, len, unsigned int, flags)
1408{
1409        struct fd in, out;
1410        long error;
1411
1412        if (unlikely(!len))
1413                return 0;
1414
1415        if (unlikely(flags & ~SPLICE_F_ALL))
1416                return -EINVAL;
1417
1418        error = -EBADF;
1419        in = fdget(fd_in);
1420        if (in.file) {
1421                if (in.file->f_mode & FMODE_READ) {
1422                        out = fdget(fd_out);
1423                        if (out.file) {
1424                                if (out.file->f_mode & FMODE_WRITE)
1425                                        error = do_splice(in.file, off_in,
1426                                                          out.file, off_out,
1427                                                          len, flags);
1428                                fdput(out);
1429                        }
1430                }
1431                fdput(in);
1432        }
1433        return error;
1434}
1435
1436/*
1437 * Make sure there's data to read. Wait for input if we can, otherwise
1438 * return an appropriate error.
1439 */
1440static int ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1441{
1442        int ret;
1443
1444        /*
1445         * Check ->nrbufs without the inode lock first. This function
1446         * is speculative anyways, so missing one is ok.
1447         */
1448        if (pipe->nrbufs)
1449                return 0;
1450
1451        ret = 0;
1452        pipe_lock(pipe);
1453
1454        while (!pipe->nrbufs) {
1455                if (signal_pending(current)) {
1456                        ret = -ERESTARTSYS;
1457                        break;
1458                }
1459                if (!pipe->writers)
1460                        break;
1461                if (!pipe->waiting_writers) {
1462                        if (flags & SPLICE_F_NONBLOCK) {
1463                                ret = -EAGAIN;
1464                                break;
1465                        }
1466                }
1467                pipe_wait(pipe);
1468        }
1469
1470        pipe_unlock(pipe);
1471        return ret;
1472}
1473
1474/*
1475 * Make sure there's writeable room. Wait for room if we can, otherwise
1476 * return an appropriate error.
1477 */
1478static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1479{
1480        int ret;
1481
1482        /*
1483         * Check ->nrbufs without the inode lock first. This function
1484         * is speculative anyways, so missing one is ok.
1485         */
1486        if (pipe->nrbufs < pipe->buffers)
1487                return 0;
1488
1489        ret = 0;
1490        pipe_lock(pipe);
1491
1492        while (pipe->nrbufs >= pipe->buffers) {
1493                if (!pipe->readers) {
1494                        send_sig(SIGPIPE, current, 0);
1495                        ret = -EPIPE;
1496                        break;
1497                }
1498                if (flags & SPLICE_F_NONBLOCK) {
1499                        ret = -EAGAIN;
1500                        break;
1501                }
1502                if (signal_pending(current)) {
1503                        ret = -ERESTARTSYS;
1504                        break;
1505                }
1506                pipe->waiting_writers++;
1507                pipe_wait(pipe);
1508                pipe->waiting_writers--;
1509        }
1510
1511        pipe_unlock(pipe);
1512        return ret;
1513}
1514
1515/*
1516 * Splice contents of ipipe to opipe.
1517 */
1518static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
1519                               struct pipe_inode_info *opipe,
1520                               size_t len, unsigned int flags)
1521{
1522        struct pipe_buffer *ibuf, *obuf;
1523        int ret = 0, nbuf;
1524        bool input_wakeup = false;
1525
1526
1527retry:
1528        ret = ipipe_prep(ipipe, flags);
1529        if (ret)
1530                return ret;
1531
1532        ret = opipe_prep(opipe, flags);
1533        if (ret)
1534                return ret;
1535
1536        /*
1537         * Potential ABBA deadlock, work around it by ordering lock
1538         * grabbing by pipe info address. Otherwise two different processes
1539         * could deadlock (one doing tee from A -> B, the other from B -> A).
1540         */
1541        pipe_double_lock(ipipe, opipe);
1542
1543        do {
1544                if (!opipe->readers) {
1545                        send_sig(SIGPIPE, current, 0);
1546                        if (!ret)
1547                                ret = -EPIPE;
1548                        break;
1549                }
1550
1551                if (!ipipe->nrbufs && !ipipe->writers)
1552                        break;
1553
1554                /*
1555                 * Cannot make any progress, because either the input
1556                 * pipe is empty or the output pipe is full.
1557                 */
1558                if (!ipipe->nrbufs || opipe->nrbufs >= opipe->buffers) {
1559                        /* Already processed some buffers, break */
1560                        if (ret)
1561                                break;
1562
1563                        if (flags & SPLICE_F_NONBLOCK) {
1564                                ret = -EAGAIN;
1565                                break;
1566                        }
1567
1568                        /*
1569                         * We raced with another reader/writer and haven't
1570                         * managed to process any buffers.  A zero return
1571                         * value means EOF, so retry instead.
1572                         */
1573                        pipe_unlock(ipipe);
1574                        pipe_unlock(opipe);
1575                        goto retry;
1576                }
1577
1578                ibuf = ipipe->bufs + ipipe->curbuf;
1579                nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1);
1580                obuf = opipe->bufs + nbuf;
1581
1582                if (len >= ibuf->len) {
1583                        /*
1584                         * Simply move the whole buffer from ipipe to opipe
1585                         */
1586                        *obuf = *ibuf;
1587                        ibuf->ops = NULL;
1588                        opipe->nrbufs++;
1589                        ipipe->curbuf = (ipipe->curbuf + 1) & (ipipe->buffers - 1);
1590                        ipipe->nrbufs--;
1591                        input_wakeup = true;
1592                } else {
1593                        /*
1594                         * Get a reference to this pipe buffer,
1595                         * so we can copy the contents over.
1596                         */
1597                        if (!pipe_buf_get(ipipe, ibuf)) {
1598                                if (ret == 0)
1599                                        ret = -EFAULT;
1600                                break;
1601                        }
1602                        *obuf = *ibuf;
1603
1604                        /*
1605                         * Don't inherit the gift flag, we need to
1606                         * prevent multiple steals of this page.
1607                         */
1608                        obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
1609
1610                        pipe_buf_mark_unmergeable(obuf);
1611
1612                        obuf->len = len;
1613                        opipe->nrbufs++;
1614                        ibuf->offset += obuf->len;
1615                        ibuf->len -= obuf->len;
1616                }
1617                ret += obuf->len;
1618                len -= obuf->len;
1619        } while (len);
1620
1621        pipe_unlock(ipipe);
1622        pipe_unlock(opipe);
1623
1624        /*
1625         * If we put data in the output pipe, wakeup any potential readers.
1626         */
1627        if (ret > 0)
1628                wakeup_pipe_readers(opipe);
1629
1630        if (input_wakeup)
1631                wakeup_pipe_writers(ipipe);
1632
1633        return ret;
1634}
1635
1636/*
1637 * Link contents of ipipe to opipe.
1638 */
1639static int link_pipe(struct pipe_inode_info *ipipe,
1640                     struct pipe_inode_info *opipe,
1641                     size_t len, unsigned int flags)
1642{
1643        struct pipe_buffer *ibuf, *obuf;
1644        int ret = 0, i = 0, nbuf;
1645
1646        /*
1647         * Potential ABBA deadlock, work around it by ordering lock
1648         * grabbing by pipe info address. Otherwise two different processes
1649         * could deadlock (one doing tee from A -> B, the other from B -> A).
1650         */
1651        pipe_double_lock(ipipe, opipe);
1652
1653        do {
1654                if (!opipe->readers) {
1655                        send_sig(SIGPIPE, current, 0);
1656                        if (!ret)
1657                                ret = -EPIPE;
1658                        break;
1659                }
1660
1661                /*
1662                 * If we have iterated all input buffers or ran out of
1663                 * output room, break.
1664                 */
1665                if (i >= ipipe->nrbufs || opipe->nrbufs >= opipe->buffers)
1666                        break;
1667
1668                ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (ipipe->buffers-1));
1669                nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1);
1670
1671                /*
1672                 * Get a reference to this pipe buffer,
1673                 * so we can copy the contents over.
1674                 */
1675                if (!pipe_buf_get(ipipe, ibuf)) {
1676                        if (ret == 0)
1677                                ret = -EFAULT;
1678                        break;
1679                }
1680
1681                obuf = opipe->bufs + nbuf;
1682                *obuf = *ibuf;
1683
1684                /*
1685                 * Don't inherit the gift flag, we need to
1686                 * prevent multiple steals of this page.
1687                 */
1688                obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
1689
1690                pipe_buf_mark_unmergeable(obuf);
1691
1692                if (obuf->len > len)
1693                        obuf->len = len;
1694
1695                opipe->nrbufs++;
1696                ret += obuf->len;
1697                len -= obuf->len;
1698                i++;
1699        } while (len);
1700
1701        /*
1702         * return EAGAIN if we have the potential of some data in the
1703         * future, otherwise just return 0
1704         */
1705        if (!ret && ipipe->waiting_writers && (flags & SPLICE_F_NONBLOCK))
1706                ret = -EAGAIN;
1707
1708        pipe_unlock(ipipe);
1709        pipe_unlock(opipe);
1710
1711        /*
1712         * If we put data in the output pipe, wakeup any potential readers.
1713         */
1714        if (ret > 0)
1715                wakeup_pipe_readers(opipe);
1716
1717        return ret;
1718}
1719
1720/*
1721 * This is a tee(1) implementation that works on pipes. It doesn't copy
1722 * any data, it simply references the 'in' pages on the 'out' pipe.
1723 * The 'flags' used are the SPLICE_F_* variants, currently the only
1724 * applicable one is SPLICE_F_NONBLOCK.
1725 */
1726static long do_tee(struct file *in, struct file *out, size_t len,
1727                   unsigned int flags)
1728{
1729        struct pipe_inode_info *ipipe = get_pipe_info(in);
1730        struct pipe_inode_info *opipe = get_pipe_info(out);
1731        int ret = -EINVAL;
1732
1733        /*
1734         * Duplicate the contents of ipipe to opipe without actually
1735         * copying the data.
1736         */
1737        if (ipipe && opipe && ipipe != opipe) {
1738                if ((in->f_flags | out->f_flags) & O_NONBLOCK)
1739                        flags |= SPLICE_F_NONBLOCK;
1740
1741                /*
1742                 * Keep going, unless we encounter an error. The ipipe/opipe
1743                 * ordering doesn't really matter.
1744                 */
1745                ret = ipipe_prep(ipipe, flags);
1746                if (!ret) {
1747                        ret = opipe_prep(opipe, flags);
1748                        if (!ret)
1749                                ret = link_pipe(ipipe, opipe, len, flags);
1750                }
1751        }
1752
1753        return ret;
1754}
1755
1756SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags)
1757{
1758        struct fd in;
1759        int error;
1760
1761        if (unlikely(flags & ~SPLICE_F_ALL))
1762                return -EINVAL;
1763
1764        if (unlikely(!len))
1765                return 0;
1766
1767        error = -EBADF;
1768        in = fdget(fdin);
1769        if (in.file) {
1770                if (in.file->f_mode & FMODE_READ) {
1771                        struct fd out = fdget(fdout);
1772                        if (out.file) {
1773                                if (out.file->f_mode & FMODE_WRITE)
1774                                        error = do_tee(in.file, out.file,
1775                                                        len, flags);
1776                                fdput(out);
1777                        }
1778                }
1779                fdput(in);
1780        }
1781
1782        return error;
1783}
1784