linux/fs/splice.c
<<
>>
Prefs
   1/*
   2 * "splice": joining two ropes together by interweaving their strands.
   3 *
   4 * This is the "extended pipe" functionality, where a pipe is used as
   5 * an arbitrary in-memory buffer. Think of a pipe as a small kernel
   6 * buffer that you can use to transfer data from one end to the other.
   7 *
   8 * The traditional unix read/write is extended with a "splice()" operation
   9 * that transfers data buffers to or from a pipe buffer.
  10 *
  11 * Named by Larry McVoy, original implementation from Linus, extended by
  12 * Jens to support splicing to files, network, direct splicing, etc and
  13 * fixing lots of bugs.
  14 *
  15 * Copyright (C) 2005-2006 Jens Axboe <axboe@kernel.dk>
  16 * Copyright (C) 2005-2006 Linus Torvalds <torvalds@osdl.org>
  17 * Copyright (C) 2006 Ingo Molnar <mingo@elte.hu>
  18 *
  19 */
  20#include <linux/bvec.h>
  21#include <linux/fs.h>
  22#include <linux/file.h>
  23#include <linux/pagemap.h>
  24#include <linux/splice.h>
  25#include <linux/memcontrol.h>
  26#include <linux/mm_inline.h>
  27#include <linux/swap.h>
  28#include <linux/writeback.h>
  29#include <linux/export.h>
  30#include <linux/syscalls.h>
  31#include <linux/uio.h>
  32#include <linux/security.h>
  33#include <linux/gfp.h>
  34#include <linux/socket.h>
  35#include <linux/compat.h>
  36#include <linux/sched/signal.h>
  37
  38#include "internal.h"
  39
  40/*
  41 * Attempt to steal a page from a pipe buffer. This should perhaps go into
  42 * a vm helper function, it's already simplified quite a bit by the
  43 * addition of remove_mapping(). If success is returned, the caller may
  44 * attempt to reuse this page for another destination.
  45 */
  46static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe,
  47                                     struct pipe_buffer *buf)
  48{
  49        struct page *page = buf->page;
  50        struct address_space *mapping;
  51
  52        lock_page(page);
  53
  54        mapping = page_mapping(page);
  55        if (mapping) {
  56                WARN_ON(!PageUptodate(page));
  57
  58                /*
  59                 * At least for ext2 with nobh option, we need to wait on
  60                 * writeback completing on this page, since we'll remove it
  61                 * from the pagecache.  Otherwise truncate wont wait on the
  62                 * page, allowing the disk blocks to be reused by someone else
  63                 * before we actually wrote our data to them. fs corruption
  64                 * ensues.
  65                 */
  66                wait_on_page_writeback(page);
  67
  68                if (page_has_private(page) &&
  69                    !try_to_release_page(page, GFP_KERNEL))
  70                        goto out_unlock;
  71
  72                /*
  73                 * If we succeeded in removing the mapping, set LRU flag
  74                 * and return good.
  75                 */
  76                if (remove_mapping(mapping, page)) {
  77                        buf->flags |= PIPE_BUF_FLAG_LRU;
  78                        return 0;
  79                }
  80        }
  81
  82        /*
  83         * Raced with truncate or failed to remove page from current
  84         * address space, unlock and return failure.
  85         */
  86out_unlock:
  87        unlock_page(page);
  88        return 1;
  89}
  90
  91static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe,
  92                                        struct pipe_buffer *buf)
  93{
  94        put_page(buf->page);
  95        buf->flags &= ~PIPE_BUF_FLAG_LRU;
  96}
  97
  98/*
  99 * Check whether the contents of buf is OK to access. Since the content
 100 * is a page cache page, IO may be in flight.
 101 */
 102static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe,
 103                                       struct pipe_buffer *buf)
 104{
 105        struct page *page = buf->page;
 106        int err;
 107
 108        if (!PageUptodate(page)) {
 109                lock_page(page);
 110
 111                /*
 112                 * Page got truncated/unhashed. This will cause a 0-byte
 113                 * splice, if this is the first page.
 114                 */
 115                if (!page->mapping) {
 116                        err = -ENODATA;
 117                        goto error;
 118                }
 119
 120                /*
 121                 * Uh oh, read-error from disk.
 122                 */
 123                if (!PageUptodate(page)) {
 124                        err = -EIO;
 125                        goto error;
 126                }
 127
 128                /*
 129                 * Page is ok afterall, we are done.
 130                 */
 131                unlock_page(page);
 132        }
 133
 134        return 0;
 135error:
 136        unlock_page(page);
 137        return err;
 138}
 139
 140const struct pipe_buf_operations page_cache_pipe_buf_ops = {
 141        .can_merge = 0,
 142        .confirm = page_cache_pipe_buf_confirm,
 143        .release = page_cache_pipe_buf_release,
 144        .steal = page_cache_pipe_buf_steal,
 145        .get = generic_pipe_buf_get,
 146};
 147
 148static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe,
 149                                    struct pipe_buffer *buf)
 150{
 151        if (!(buf->flags & PIPE_BUF_FLAG_GIFT))
 152                return 1;
 153
 154        buf->flags |= PIPE_BUF_FLAG_LRU;
 155        return generic_pipe_buf_steal(pipe, buf);
 156}
 157
 158static const struct pipe_buf_operations user_page_pipe_buf_ops = {
 159        .can_merge = 0,
 160        .confirm = generic_pipe_buf_confirm,
 161        .release = page_cache_pipe_buf_release,
 162        .steal = user_page_pipe_buf_steal,
 163        .get = generic_pipe_buf_get,
 164};
 165
 166static void wakeup_pipe_readers(struct pipe_inode_info *pipe)
 167{
 168        smp_mb();
 169        if (waitqueue_active(&pipe->wait))
 170                wake_up_interruptible(&pipe->wait);
 171        kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
 172}
 173
 174/**
 175 * splice_to_pipe - fill passed data into a pipe
 176 * @pipe:       pipe to fill
 177 * @spd:        data to fill
 178 *
 179 * Description:
 180 *    @spd contains a map of pages and len/offset tuples, along with
 181 *    the struct pipe_buf_operations associated with these pages. This
 182 *    function will link that data to the pipe.
 183 *
 184 */
 185ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
 186                       struct splice_pipe_desc *spd)
 187{
 188        unsigned int spd_pages = spd->nr_pages;
 189        int ret = 0, page_nr = 0;
 190
 191        if (!spd_pages)
 192                return 0;
 193
 194        if (unlikely(!pipe->readers)) {
 195                send_sig(SIGPIPE, current, 0);
 196                ret = -EPIPE;
 197                goto out;
 198        }
 199
 200        while (pipe->nrbufs < pipe->buffers) {
 201                int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1);
 202                struct pipe_buffer *buf = pipe->bufs + newbuf;
 203
 204                buf->page = spd->pages[page_nr];
 205                buf->offset = spd->partial[page_nr].offset;
 206                buf->len = spd->partial[page_nr].len;
 207                buf->private = spd->partial[page_nr].private;
 208                buf->ops = spd->ops;
 209                buf->flags = 0;
 210
 211                pipe->nrbufs++;
 212                page_nr++;
 213                ret += buf->len;
 214
 215                if (!--spd->nr_pages)
 216                        break;
 217        }
 218
 219        if (!ret)
 220                ret = -EAGAIN;
 221
 222out:
 223        while (page_nr < spd_pages)
 224                spd->spd_release(spd, page_nr++);
 225
 226        return ret;
 227}
 228EXPORT_SYMBOL_GPL(splice_to_pipe);
 229
 230ssize_t add_to_pipe(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
 231{
 232        int ret;
 233
 234        if (unlikely(!pipe->readers)) {
 235                send_sig(SIGPIPE, current, 0);
 236                ret = -EPIPE;
 237        } else if (pipe->nrbufs == pipe->buffers) {
 238                ret = -EAGAIN;
 239        } else {
 240                int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1);
 241                pipe->bufs[newbuf] = *buf;
 242                pipe->nrbufs++;
 243                return buf->len;
 244        }
 245        pipe_buf_release(pipe, buf);
 246        return ret;
 247}
 248EXPORT_SYMBOL(add_to_pipe);
 249
 250/*
 251 * Check if we need to grow the arrays holding pages and partial page
 252 * descriptions.
 253 */
 254int splice_grow_spd(const struct pipe_inode_info *pipe, struct splice_pipe_desc *spd)
 255{
 256        unsigned int buffers = READ_ONCE(pipe->buffers);
 257
 258        spd->nr_pages_max = buffers;
 259        if (buffers <= PIPE_DEF_BUFFERS)
 260                return 0;
 261
 262        spd->pages = kmalloc_array(buffers, sizeof(struct page *), GFP_KERNEL);
 263        spd->partial = kmalloc_array(buffers, sizeof(struct partial_page),
 264                                     GFP_KERNEL);
 265
 266        if (spd->pages && spd->partial)
 267                return 0;
 268
 269        kfree(spd->pages);
 270        kfree(spd->partial);
 271        return -ENOMEM;
 272}
 273
 274void splice_shrink_spd(struct splice_pipe_desc *spd)
 275{
 276        if (spd->nr_pages_max <= PIPE_DEF_BUFFERS)
 277                return;
 278
 279        kfree(spd->pages);
 280        kfree(spd->partial);
 281}
 282
 283/**
 284 * generic_file_splice_read - splice data from file to a pipe
 285 * @in:         file to splice from
 286 * @ppos:       position in @in
 287 * @pipe:       pipe to splice to
 288 * @len:        number of bytes to splice
 289 * @flags:      splice modifier flags
 290 *
 291 * Description:
 292 *    Will read pages from given file and fill them into a pipe. Can be
 293 *    used as long as it has more or less sane ->read_iter().
 294 *
 295 */
 296ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
 297                                 struct pipe_inode_info *pipe, size_t len,
 298                                 unsigned int flags)
 299{
 300        struct iov_iter to;
 301        struct kiocb kiocb;
 302        int idx, ret;
 303
 304        iov_iter_pipe(&to, READ, pipe, len);
 305        idx = to.idx;
 306        init_sync_kiocb(&kiocb, in);
 307        kiocb.ki_pos = *ppos;
 308        ret = call_read_iter(in, &kiocb, &to);
 309        if (ret > 0) {
 310                *ppos = kiocb.ki_pos;
 311                file_accessed(in);
 312        } else if (ret < 0) {
 313                to.idx = idx;
 314                to.iov_offset = 0;
 315                iov_iter_advance(&to, 0); /* to free what was emitted */
 316                /*
 317                 * callers of ->splice_read() expect -EAGAIN on
 318                 * "can't put anything in there", rather than -EFAULT.
 319                 */
 320                if (ret == -EFAULT)
 321                        ret = -EAGAIN;
 322        }
 323
 324        return ret;
 325}
 326EXPORT_SYMBOL(generic_file_splice_read);
 327
 328const struct pipe_buf_operations default_pipe_buf_ops = {
 329        .can_merge = 0,
 330        .confirm = generic_pipe_buf_confirm,
 331        .release = generic_pipe_buf_release,
 332        .steal = generic_pipe_buf_steal,
 333        .get = generic_pipe_buf_get,
 334};
 335
 336static int generic_pipe_buf_nosteal(struct pipe_inode_info *pipe,
 337                                    struct pipe_buffer *buf)
 338{
 339        return 1;
 340}
 341
 342/* Pipe buffer operations for a socket and similar. */
 343const struct pipe_buf_operations nosteal_pipe_buf_ops = {
 344        .can_merge = 0,
 345        .confirm = generic_pipe_buf_confirm,
 346        .release = generic_pipe_buf_release,
 347        .steal = generic_pipe_buf_nosteal,
 348        .get = generic_pipe_buf_get,
 349};
 350EXPORT_SYMBOL(nosteal_pipe_buf_ops);
 351
 352static ssize_t kernel_readv(struct file *file, const struct kvec *vec,
 353                            unsigned long vlen, loff_t offset)
 354{
 355        mm_segment_t old_fs;
 356        loff_t pos = offset;
 357        ssize_t res;
 358
 359        old_fs = get_fs();
 360        set_fs(get_ds());
 361        /* The cast to a user pointer is valid due to the set_fs() */
 362        res = vfs_readv(file, (const struct iovec __user *)vec, vlen, &pos, 0);
 363        set_fs(old_fs);
 364
 365        return res;
 366}
 367
 368static ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
 369                                 struct pipe_inode_info *pipe, size_t len,
 370                                 unsigned int flags)
 371{
 372        struct kvec *vec, __vec[PIPE_DEF_BUFFERS];
 373        struct iov_iter to;
 374        struct page **pages;
 375        unsigned int nr_pages;
 376        size_t offset, base, copied = 0;
 377        ssize_t res;
 378        int i;
 379
 380        if (pipe->nrbufs == pipe->buffers)
 381                return -EAGAIN;
 382
 383        /*
 384         * Try to keep page boundaries matching to source pagecache ones -
 385         * it probably won't be much help, but...
 386         */
 387        offset = *ppos & ~PAGE_MASK;
 388
 389        iov_iter_pipe(&to, READ, pipe, len + offset);
 390
 391        res = iov_iter_get_pages_alloc(&to, &pages, len + offset, &base);
 392        if (res <= 0)
 393                return -ENOMEM;
 394
 395        nr_pages = DIV_ROUND_UP(res + base, PAGE_SIZE);
 396
 397        vec = __vec;
 398        if (nr_pages > PIPE_DEF_BUFFERS) {
 399                vec = kmalloc_array(nr_pages, sizeof(struct kvec), GFP_KERNEL);
 400                if (unlikely(!vec)) {
 401                        res = -ENOMEM;
 402                        goto out;
 403                }
 404        }
 405
 406        pipe->bufs[to.idx].offset = offset;
 407        pipe->bufs[to.idx].len -= offset;
 408
 409        for (i = 0; i < nr_pages; i++) {
 410                size_t this_len = min_t(size_t, len, PAGE_SIZE - offset);
 411                vec[i].iov_base = page_address(pages[i]) + offset;
 412                vec[i].iov_len = this_len;
 413                len -= this_len;
 414                offset = 0;
 415        }
 416
 417        res = kernel_readv(in, vec, nr_pages, *ppos);
 418        if (res > 0) {
 419                copied = res;
 420                *ppos += res;
 421        }
 422
 423        if (vec != __vec)
 424                kfree(vec);
 425out:
 426        for (i = 0; i < nr_pages; i++)
 427                put_page(pages[i]);
 428        kvfree(pages);
 429        iov_iter_advance(&to, copied);  /* truncates and discards */
 430        return res;
 431}
 432
 433/*
 434 * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos'
 435 * using sendpage(). Return the number of bytes sent.
 436 */
 437static int pipe_to_sendpage(struct pipe_inode_info *pipe,
 438                            struct pipe_buffer *buf, struct splice_desc *sd)
 439{
 440        struct file *file = sd->u.file;
 441        loff_t pos = sd->pos;
 442        int more;
 443
 444        if (!likely(file->f_op->sendpage))
 445                return -EINVAL;
 446
 447        more = (sd->flags & SPLICE_F_MORE) ? MSG_MORE : 0;
 448
 449        if (sd->len < sd->total_len && pipe->nrbufs > 1)
 450                more |= MSG_SENDPAGE_NOTLAST;
 451
 452        return file->f_op->sendpage(file, buf->page, buf->offset,
 453                                    sd->len, &pos, more);
 454}
 455
 456static void wakeup_pipe_writers(struct pipe_inode_info *pipe)
 457{
 458        smp_mb();
 459        if (waitqueue_active(&pipe->wait))
 460                wake_up_interruptible(&pipe->wait);
 461        kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
 462}
 463
 464/**
 465 * splice_from_pipe_feed - feed available data from a pipe to a file
 466 * @pipe:       pipe to splice from
 467 * @sd:         information to @actor
 468 * @actor:      handler that splices the data
 469 *
 470 * Description:
 471 *    This function loops over the pipe and calls @actor to do the
 472 *    actual moving of a single struct pipe_buffer to the desired
 473 *    destination.  It returns when there's no more buffers left in
 474 *    the pipe or if the requested number of bytes (@sd->total_len)
 475 *    have been copied.  It returns a positive number (one) if the
 476 *    pipe needs to be filled with more data, zero if the required
 477 *    number of bytes have been copied and -errno on error.
 478 *
 479 *    This, together with splice_from_pipe_{begin,end,next}, may be
 480 *    used to implement the functionality of __splice_from_pipe() when
 481 *    locking is required around copying the pipe buffers to the
 482 *    destination.
 483 */
 484static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd,
 485                          splice_actor *actor)
 486{
 487        int ret;
 488
 489        while (pipe->nrbufs) {
 490                struct pipe_buffer *buf = pipe->bufs + pipe->curbuf;
 491
 492                sd->len = buf->len;
 493                if (sd->len > sd->total_len)
 494                        sd->len = sd->total_len;
 495
 496                ret = pipe_buf_confirm(pipe, buf);
 497                if (unlikely(ret)) {
 498                        if (ret == -ENODATA)
 499                                ret = 0;
 500                        return ret;
 501                }
 502
 503                ret = actor(pipe, buf, sd);
 504                if (ret <= 0)
 505                        return ret;
 506
 507                buf->offset += ret;
 508                buf->len -= ret;
 509
 510                sd->num_spliced += ret;
 511                sd->len -= ret;
 512                sd->pos += ret;
 513                sd->total_len -= ret;
 514
 515                if (!buf->len) {
 516                        pipe_buf_release(pipe, buf);
 517                        pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
 518                        pipe->nrbufs--;
 519                        if (pipe->files)
 520                                sd->need_wakeup = true;
 521                }
 522
 523                if (!sd->total_len)
 524                        return 0;
 525        }
 526
 527        return 1;
 528}
 529
 530/**
 531 * splice_from_pipe_next - wait for some data to splice from
 532 * @pipe:       pipe to splice from
 533 * @sd:         information about the splice operation
 534 *
 535 * Description:
 536 *    This function will wait for some data and return a positive
 537 *    value (one) if pipe buffers are available.  It will return zero
 538 *    or -errno if no more data needs to be spliced.
 539 */
 540static int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_desc *sd)
 541{
 542        /*
 543         * Check for signal early to make process killable when there are
 544         * always buffers available
 545         */
 546        if (signal_pending(current))
 547                return -ERESTARTSYS;
 548
 549        while (!pipe->nrbufs) {
 550                if (!pipe->writers)
 551                        return 0;
 552
 553                if (!pipe->waiting_writers && sd->num_spliced)
 554                        return 0;
 555
 556                if (sd->flags & SPLICE_F_NONBLOCK)
 557                        return -EAGAIN;
 558
 559                if (signal_pending(current))
 560                        return -ERESTARTSYS;
 561
 562                if (sd->need_wakeup) {
 563                        wakeup_pipe_writers(pipe);
 564                        sd->need_wakeup = false;
 565                }
 566
 567                pipe_wait(pipe);
 568        }
 569
 570        return 1;
 571}
 572
 573/**
 574 * splice_from_pipe_begin - start splicing from pipe
 575 * @sd:         information about the splice operation
 576 *
 577 * Description:
 578 *    This function should be called before a loop containing
 579 *    splice_from_pipe_next() and splice_from_pipe_feed() to
 580 *    initialize the necessary fields of @sd.
 581 */
 582static void splice_from_pipe_begin(struct splice_desc *sd)
 583{
 584        sd->num_spliced = 0;
 585        sd->need_wakeup = false;
 586}
 587
 588/**
 589 * splice_from_pipe_end - finish splicing from pipe
 590 * @pipe:       pipe to splice from
 591 * @sd:         information about the splice operation
 592 *
 593 * Description:
 594 *    This function will wake up pipe writers if necessary.  It should
 595 *    be called after a loop containing splice_from_pipe_next() and
 596 *    splice_from_pipe_feed().
 597 */
 598static void splice_from_pipe_end(struct pipe_inode_info *pipe, struct splice_desc *sd)
 599{
 600        if (sd->need_wakeup)
 601                wakeup_pipe_writers(pipe);
 602}
 603
 604/**
 605 * __splice_from_pipe - splice data from a pipe to given actor
 606 * @pipe:       pipe to splice from
 607 * @sd:         information to @actor
 608 * @actor:      handler that splices the data
 609 *
 610 * Description:
 611 *    This function does little more than loop over the pipe and call
 612 *    @actor to do the actual moving of a single struct pipe_buffer to
 613 *    the desired destination. See pipe_to_file, pipe_to_sendpage, or
 614 *    pipe_to_user.
 615 *
 616 */
 617ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd,
 618                           splice_actor *actor)
 619{
 620        int ret;
 621
 622        splice_from_pipe_begin(sd);
 623        do {
 624                cond_resched();
 625                ret = splice_from_pipe_next(pipe, sd);
 626                if (ret > 0)
 627                        ret = splice_from_pipe_feed(pipe, sd, actor);
 628        } while (ret > 0);
 629        splice_from_pipe_end(pipe, sd);
 630
 631        return sd->num_spliced ? sd->num_spliced : ret;
 632}
 633EXPORT_SYMBOL(__splice_from_pipe);
 634
 635/**
 636 * splice_from_pipe - splice data from a pipe to a file
 637 * @pipe:       pipe to splice from
 638 * @out:        file to splice to
 639 * @ppos:       position in @out
 640 * @len:        how many bytes to splice
 641 * @flags:      splice modifier flags
 642 * @actor:      handler that splices the data
 643 *
 644 * Description:
 645 *    See __splice_from_pipe. This function locks the pipe inode,
 646 *    otherwise it's identical to __splice_from_pipe().
 647 *
 648 */
 649ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
 650                         loff_t *ppos, size_t len, unsigned int flags,
 651                         splice_actor *actor)
 652{
 653        ssize_t ret;
 654        struct splice_desc sd = {
 655                .total_len = len,
 656                .flags = flags,
 657                .pos = *ppos,
 658                .u.file = out,
 659        };
 660
 661        pipe_lock(pipe);
 662        ret = __splice_from_pipe(pipe, &sd, actor);
 663        pipe_unlock(pipe);
 664
 665        return ret;
 666}
 667
 668/**
 669 * iter_file_splice_write - splice data from a pipe to a file
 670 * @pipe:       pipe info
 671 * @out:        file to write to
 672 * @ppos:       position in @out
 673 * @len:        number of bytes to splice
 674 * @flags:      splice modifier flags
 675 *
 676 * Description:
 677 *    Will either move or copy pages (determined by @flags options) from
 678 *    the given pipe inode to the given file.
 679 *    This one is ->write_iter-based.
 680 *
 681 */
 682ssize_t
 683iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
 684                          loff_t *ppos, size_t len, unsigned int flags)
 685{
 686        struct splice_desc sd = {
 687                .total_len = len,
 688                .flags = flags,
 689                .pos = *ppos,
 690                .u.file = out,
 691        };
 692        int nbufs = pipe->buffers;
 693        struct bio_vec *array = kcalloc(nbufs, sizeof(struct bio_vec),
 694                                        GFP_KERNEL);
 695        ssize_t ret;
 696
 697        if (unlikely(!array))
 698                return -ENOMEM;
 699
 700        pipe_lock(pipe);
 701
 702        splice_from_pipe_begin(&sd);
 703        while (sd.total_len) {
 704                struct iov_iter from;
 705                size_t left;
 706                int n, idx;
 707
 708                ret = splice_from_pipe_next(pipe, &sd);
 709                if (ret <= 0)
 710                        break;
 711
 712                if (unlikely(nbufs < pipe->buffers)) {
 713                        kfree(array);
 714                        nbufs = pipe->buffers;
 715                        array = kcalloc(nbufs, sizeof(struct bio_vec),
 716                                        GFP_KERNEL);
 717                        if (!array) {
 718                                ret = -ENOMEM;
 719                                break;
 720                        }
 721                }
 722
 723                /* build the vector */
 724                left = sd.total_len;
 725                for (n = 0, idx = pipe->curbuf; left && n < pipe->nrbufs; n++, idx++) {
 726                        struct pipe_buffer *buf = pipe->bufs + idx;
 727                        size_t this_len = buf->len;
 728
 729                        if (this_len > left)
 730                                this_len = left;
 731
 732                        if (idx == pipe->buffers - 1)
 733                                idx = -1;
 734
 735                        ret = pipe_buf_confirm(pipe, buf);
 736                        if (unlikely(ret)) {
 737                                if (ret == -ENODATA)
 738                                        ret = 0;
 739                                goto done;
 740                        }
 741
 742                        array[n].bv_page = buf->page;
 743                        array[n].bv_len = this_len;
 744                        array[n].bv_offset = buf->offset;
 745                        left -= this_len;
 746                }
 747
 748                iov_iter_bvec(&from, WRITE, array, n, sd.total_len - left);
 749                ret = vfs_iter_write(out, &from, &sd.pos, 0);
 750                if (ret <= 0)
 751                        break;
 752
 753                sd.num_spliced += ret;
 754                sd.total_len -= ret;
 755                *ppos = sd.pos;
 756
 757                /* dismiss the fully eaten buffers, adjust the partial one */
 758                while (ret) {
 759                        struct pipe_buffer *buf = pipe->bufs + pipe->curbuf;
 760                        if (ret >= buf->len) {
 761                                ret -= buf->len;
 762                                buf->len = 0;
 763                                pipe_buf_release(pipe, buf);
 764                                pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
 765                                pipe->nrbufs--;
 766                                if (pipe->files)
 767                                        sd.need_wakeup = true;
 768                        } else {
 769                                buf->offset += ret;
 770                                buf->len -= ret;
 771                                ret = 0;
 772                        }
 773                }
 774        }
 775done:
 776        kfree(array);
 777        splice_from_pipe_end(pipe, &sd);
 778
 779        pipe_unlock(pipe);
 780
 781        if (sd.num_spliced)
 782                ret = sd.num_spliced;
 783
 784        return ret;
 785}
 786
 787EXPORT_SYMBOL(iter_file_splice_write);
 788
 789static int write_pipe_buf(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
 790                          struct splice_desc *sd)
 791{
 792        int ret;
 793        void *data;
 794        loff_t tmp = sd->pos;
 795
 796        data = kmap(buf->page);
 797        ret = __kernel_write(sd->u.file, data + buf->offset, sd->len, &tmp);
 798        kunmap(buf->page);
 799
 800        return ret;
 801}
 802
 803static ssize_t default_file_splice_write(struct pipe_inode_info *pipe,
 804                                         struct file *out, loff_t *ppos,
 805                                         size_t len, unsigned int flags)
 806{
 807        ssize_t ret;
 808
 809        ret = splice_from_pipe(pipe, out, ppos, len, flags, write_pipe_buf);
 810        if (ret > 0)
 811                *ppos += ret;
 812
 813        return ret;
 814}
 815
 816/**
 817 * generic_splice_sendpage - splice data from a pipe to a socket
 818 * @pipe:       pipe to splice from
 819 * @out:        socket to write to
 820 * @ppos:       position in @out
 821 * @len:        number of bytes to splice
 822 * @flags:      splice modifier flags
 823 *
 824 * Description:
 825 *    Will send @len bytes from the pipe to a network socket. No data copying
 826 *    is involved.
 827 *
 828 */
 829ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out,
 830                                loff_t *ppos, size_t len, unsigned int flags)
 831{
 832        return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage);
 833}
 834
 835EXPORT_SYMBOL(generic_splice_sendpage);
 836
 837/*
 838 * Attempt to initiate a splice from pipe to file.
 839 */
 840static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
 841                           loff_t *ppos, size_t len, unsigned int flags)
 842{
 843        ssize_t (*splice_write)(struct pipe_inode_info *, struct file *,
 844                                loff_t *, size_t, unsigned int);
 845
 846        if (out->f_op->splice_write)
 847                splice_write = out->f_op->splice_write;
 848        else
 849                splice_write = default_file_splice_write;
 850
 851        return splice_write(pipe, out, ppos, len, flags);
 852}
 853
 854/*
 855 * Attempt to initiate a splice from a file to a pipe.
 856 */
 857static long do_splice_to(struct file *in, loff_t *ppos,
 858                         struct pipe_inode_info *pipe, size_t len,
 859                         unsigned int flags)
 860{
 861        ssize_t (*splice_read)(struct file *, loff_t *,
 862                               struct pipe_inode_info *, size_t, unsigned int);
 863        int ret;
 864
 865        if (unlikely(!(in->f_mode & FMODE_READ)))
 866                return -EBADF;
 867
 868        ret = rw_verify_area(READ, in, ppos, len);
 869        if (unlikely(ret < 0))
 870                return ret;
 871
 872        if (unlikely(len > MAX_RW_COUNT))
 873                len = MAX_RW_COUNT;
 874
 875        if (in->f_op->splice_read)
 876                splice_read = in->f_op->splice_read;
 877        else
 878                splice_read = default_file_splice_read;
 879
 880        return splice_read(in, ppos, pipe, len, flags);
 881}
 882
 883/**
 884 * splice_direct_to_actor - splices data directly between two non-pipes
 885 * @in:         file to splice from
 886 * @sd:         actor information on where to splice to
 887 * @actor:      handles the data splicing
 888 *
 889 * Description:
 890 *    This is a special case helper to splice directly between two
 891 *    points, without requiring an explicit pipe. Internally an allocated
 892 *    pipe is cached in the process, and reused during the lifetime of
 893 *    that process.
 894 *
 895 */
 896ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
 897                               splice_direct_actor *actor)
 898{
 899        struct pipe_inode_info *pipe;
 900        long ret, bytes;
 901        umode_t i_mode;
 902        size_t len;
 903        int i, flags, more;
 904
 905        /*
 906         * We require the input being a regular file, as we don't want to
 907         * randomly drop data for eg socket -> socket splicing. Use the
 908         * piped splicing for that!
 909         */
 910        i_mode = file_inode(in)->i_mode;
 911        if (unlikely(!S_ISREG(i_mode) && !S_ISBLK(i_mode)))
 912                return -EINVAL;
 913
 914        /*
 915         * neither in nor out is a pipe, setup an internal pipe attached to
 916         * 'out' and transfer the wanted data from 'in' to 'out' through that
 917         */
 918        pipe = current->splice_pipe;
 919        if (unlikely(!pipe)) {
 920                pipe = alloc_pipe_info();
 921                if (!pipe)
 922                        return -ENOMEM;
 923
 924                /*
 925                 * We don't have an immediate reader, but we'll read the stuff
 926                 * out of the pipe right after the splice_to_pipe(). So set
 927                 * PIPE_READERS appropriately.
 928                 */
 929                pipe->readers = 1;
 930
 931                current->splice_pipe = pipe;
 932        }
 933
 934        /*
 935         * Do the splice.
 936         */
 937        ret = 0;
 938        bytes = 0;
 939        len = sd->total_len;
 940        flags = sd->flags;
 941
 942        /*
 943         * Don't block on output, we have to drain the direct pipe.
 944         */
 945        sd->flags &= ~SPLICE_F_NONBLOCK;
 946        more = sd->flags & SPLICE_F_MORE;
 947
 948        WARN_ON_ONCE(pipe->nrbufs != 0);
 949
 950        while (len) {
 951                size_t read_len;
 952                loff_t pos = sd->pos, prev_pos = pos;
 953
 954                /* Don't try to read more the pipe has space for. */
 955                read_len = min_t(size_t, len,
 956                                 (pipe->buffers - pipe->nrbufs) << PAGE_SHIFT);
 957                ret = do_splice_to(in, &pos, pipe, read_len, flags);
 958                if (unlikely(ret <= 0))
 959                        goto out_release;
 960
 961                read_len = ret;
 962                sd->total_len = read_len;
 963
 964                /*
 965                 * If more data is pending, set SPLICE_F_MORE
 966                 * If this is the last data and SPLICE_F_MORE was not set
 967                 * initially, clears it.
 968                 */
 969                if (read_len < len)
 970                        sd->flags |= SPLICE_F_MORE;
 971                else if (!more)
 972                        sd->flags &= ~SPLICE_F_MORE;
 973                /*
 974                 * NOTE: nonblocking mode only applies to the input. We
 975                 * must not do the output in nonblocking mode as then we
 976                 * could get stuck data in the internal pipe:
 977                 */
 978                ret = actor(pipe, sd);
 979                if (unlikely(ret <= 0)) {
 980                        sd->pos = prev_pos;
 981                        goto out_release;
 982                }
 983
 984                bytes += ret;
 985                len -= ret;
 986                sd->pos = pos;
 987
 988                if (ret < read_len) {
 989                        sd->pos = prev_pos + ret;
 990                        goto out_release;
 991                }
 992        }
 993
 994done:
 995        pipe->nrbufs = pipe->curbuf = 0;
 996        file_accessed(in);
 997        return bytes;
 998
 999out_release:
1000        /*
1001         * If we did an incomplete transfer we must release
1002         * the pipe buffers in question:
1003         */
1004        for (i = 0; i < pipe->buffers; i++) {
1005                struct pipe_buffer *buf = pipe->bufs + i;
1006
1007                if (buf->ops)
1008                        pipe_buf_release(pipe, buf);
1009        }
1010
1011        if (!bytes)
1012                bytes = ret;
1013
1014        goto done;
1015}
1016EXPORT_SYMBOL(splice_direct_to_actor);
1017
1018static int direct_splice_actor(struct pipe_inode_info *pipe,
1019                               struct splice_desc *sd)
1020{
1021        struct file *file = sd->u.file;
1022
1023        return do_splice_from(pipe, file, sd->opos, sd->total_len,
1024                              sd->flags);
1025}
1026
1027/**
1028 * do_splice_direct - splices data directly between two files
1029 * @in:         file to splice from
1030 * @ppos:       input file offset
1031 * @out:        file to splice to
1032 * @opos:       output file offset
1033 * @len:        number of bytes to splice
1034 * @flags:      splice modifier flags
1035 *
1036 * Description:
1037 *    For use by do_sendfile(). splice can easily emulate sendfile, but
1038 *    doing it in the application would incur an extra system call
1039 *    (splice in + splice out, as compared to just sendfile()). So this helper
1040 *    can splice directly through a process-private pipe.
1041 *
1042 */
1043long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
1044                      loff_t *opos, size_t len, unsigned int flags)
1045{
1046        struct splice_desc sd = {
1047                .len            = len,
1048                .total_len      = len,
1049                .flags          = flags,
1050                .pos            = *ppos,
1051                .u.file         = out,
1052                .opos           = opos,
1053        };
1054        long ret;
1055
1056        if (unlikely(!(out->f_mode & FMODE_WRITE)))
1057                return -EBADF;
1058
1059        if (unlikely(out->f_flags & O_APPEND))
1060                return -EINVAL;
1061
1062        ret = rw_verify_area(WRITE, out, opos, len);
1063        if (unlikely(ret < 0))
1064                return ret;
1065
1066        ret = splice_direct_to_actor(in, &sd, direct_splice_actor);
1067        if (ret > 0)
1068                *ppos = sd.pos;
1069
1070        return ret;
1071}
1072EXPORT_SYMBOL(do_splice_direct);
1073
1074static int wait_for_space(struct pipe_inode_info *pipe, unsigned flags)
1075{
1076        for (;;) {
1077                if (unlikely(!pipe->readers)) {
1078                        send_sig(SIGPIPE, current, 0);
1079                        return -EPIPE;
1080                }
1081                if (pipe->nrbufs != pipe->buffers)
1082                        return 0;
1083                if (flags & SPLICE_F_NONBLOCK)
1084                        return -EAGAIN;
1085                if (signal_pending(current))
1086                        return -ERESTARTSYS;
1087                pipe->waiting_writers++;
1088                pipe_wait(pipe);
1089                pipe->waiting_writers--;
1090        }
1091}
1092
1093static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
1094                               struct pipe_inode_info *opipe,
1095                               size_t len, unsigned int flags);
1096
1097/*
1098 * Determine where to splice to/from.
1099 */
1100static long do_splice(struct file *in, loff_t __user *off_in,
1101                      struct file *out, loff_t __user *off_out,
1102                      size_t len, unsigned int flags)
1103{
1104        struct pipe_inode_info *ipipe;
1105        struct pipe_inode_info *opipe;
1106        loff_t offset;
1107        long ret;
1108
1109        ipipe = get_pipe_info(in);
1110        opipe = get_pipe_info(out);
1111
1112        if (ipipe && opipe) {
1113                if (off_in || off_out)
1114                        return -ESPIPE;
1115
1116                if (!(in->f_mode & FMODE_READ))
1117                        return -EBADF;
1118
1119                if (!(out->f_mode & FMODE_WRITE))
1120                        return -EBADF;
1121
1122                /* Splicing to self would be fun, but... */
1123                if (ipipe == opipe)
1124                        return -EINVAL;
1125
1126                return splice_pipe_to_pipe(ipipe, opipe, len, flags);
1127        }
1128
1129        if (ipipe) {
1130                if (off_in)
1131                        return -ESPIPE;
1132                if (off_out) {
1133                        if (!(out->f_mode & FMODE_PWRITE))
1134                                return -EINVAL;
1135                        if (copy_from_user(&offset, off_out, sizeof(loff_t)))
1136                                return -EFAULT;
1137                } else {
1138                        offset = out->f_pos;
1139                }
1140
1141                if (unlikely(!(out->f_mode & FMODE_WRITE)))
1142                        return -EBADF;
1143
1144                if (unlikely(out->f_flags & O_APPEND))
1145                        return -EINVAL;
1146
1147                ret = rw_verify_area(WRITE, out, &offset, len);
1148                if (unlikely(ret < 0))
1149                        return ret;
1150
1151                file_start_write(out);
1152                ret = do_splice_from(ipipe, out, &offset, len, flags);
1153                file_end_write(out);
1154
1155                if (!off_out)
1156                        out->f_pos = offset;
1157                else if (copy_to_user(off_out, &offset, sizeof(loff_t)))
1158                        ret = -EFAULT;
1159
1160                return ret;
1161        }
1162
1163        if (opipe) {
1164                if (off_out)
1165                        return -ESPIPE;
1166                if (off_in) {
1167                        if (!(in->f_mode & FMODE_PREAD))
1168                                return -EINVAL;
1169                        if (copy_from_user(&offset, off_in, sizeof(loff_t)))
1170                                return -EFAULT;
1171                } else {
1172                        offset = in->f_pos;
1173                }
1174
1175                pipe_lock(opipe);
1176                ret = wait_for_space(opipe, flags);
1177                if (!ret)
1178                        ret = do_splice_to(in, &offset, opipe, len, flags);
1179                pipe_unlock(opipe);
1180                if (ret > 0)
1181                        wakeup_pipe_readers(opipe);
1182                if (!off_in)
1183                        in->f_pos = offset;
1184                else if (copy_to_user(off_in, &offset, sizeof(loff_t)))
1185                        ret = -EFAULT;
1186
1187                return ret;
1188        }
1189
1190        return -EINVAL;
1191}
1192
1193static int iter_to_pipe(struct iov_iter *from,
1194                        struct pipe_inode_info *pipe,
1195                        unsigned flags)
1196{
1197        struct pipe_buffer buf = {
1198                .ops = &user_page_pipe_buf_ops,
1199                .flags = flags
1200        };
1201        size_t total = 0;
1202        int ret = 0;
1203        bool failed = false;
1204
1205        while (iov_iter_count(from) && !failed) {
1206                struct page *pages[16];
1207                ssize_t copied;
1208                size_t start;
1209                int n;
1210
1211                copied = iov_iter_get_pages(from, pages, ~0UL, 16, &start);
1212                if (copied <= 0) {
1213                        ret = copied;
1214                        break;
1215                }
1216
1217                for (n = 0; copied; n++, start = 0) {
1218                        int size = min_t(int, copied, PAGE_SIZE - start);
1219                        if (!failed) {
1220                                buf.page = pages[n];
1221                                buf.offset = start;
1222                                buf.len = size;
1223                                ret = add_to_pipe(pipe, &buf);
1224                                if (unlikely(ret < 0)) {
1225                                        failed = true;
1226                                } else {
1227                                        iov_iter_advance(from, ret);
1228                                        total += ret;
1229                                }
1230                        } else {
1231                                put_page(pages[n]);
1232                        }
1233                        copied -= size;
1234                }
1235        }
1236        return total ? total : ret;
1237}
1238
1239static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
1240                        struct splice_desc *sd)
1241{
1242        int n = copy_page_to_iter(buf->page, buf->offset, sd->len, sd->u.data);
1243        return n == sd->len ? n : -EFAULT;
1244}
1245
1246/*
1247 * For lack of a better implementation, implement vmsplice() to userspace
1248 * as a simple copy of the pipes pages to the user iov.
1249 */
1250static long vmsplice_to_user(struct file *file, struct iov_iter *iter,
1251                             unsigned int flags)
1252{
1253        struct pipe_inode_info *pipe = get_pipe_info(file);
1254        struct splice_desc sd = {
1255                .total_len = iov_iter_count(iter),
1256                .flags = flags,
1257                .u.data = iter
1258        };
1259        long ret = 0;
1260
1261        if (!pipe)
1262                return -EBADF;
1263
1264        if (sd.total_len) {
1265                pipe_lock(pipe);
1266                ret = __splice_from_pipe(pipe, &sd, pipe_to_user);
1267                pipe_unlock(pipe);
1268        }
1269
1270        return ret;
1271}
1272
1273/*
1274 * vmsplice splices a user address range into a pipe. It can be thought of
1275 * as splice-from-memory, where the regular splice is splice-from-file (or
1276 * to file). In both cases the output is a pipe, naturally.
1277 */
1278static long vmsplice_to_pipe(struct file *file, struct iov_iter *iter,
1279                             unsigned int flags)
1280{
1281        struct pipe_inode_info *pipe;
1282        long ret = 0;
1283        unsigned buf_flag = 0;
1284
1285        if (flags & SPLICE_F_GIFT)
1286                buf_flag = PIPE_BUF_FLAG_GIFT;
1287
1288        pipe = get_pipe_info(file);
1289        if (!pipe)
1290                return -EBADF;
1291
1292        pipe_lock(pipe);
1293        ret = wait_for_space(pipe, flags);
1294        if (!ret)
1295                ret = iter_to_pipe(iter, pipe, buf_flag);
1296        pipe_unlock(pipe);
1297        if (ret > 0)
1298                wakeup_pipe_readers(pipe);
1299        return ret;
1300}
1301
1302static int vmsplice_type(struct fd f, int *type)
1303{
1304        if (!f.file)
1305                return -EBADF;
1306        if (f.file->f_mode & FMODE_WRITE) {
1307                *type = WRITE;
1308        } else if (f.file->f_mode & FMODE_READ) {
1309                *type = READ;
1310        } else {
1311                fdput(f);
1312                return -EBADF;
1313        }
1314        return 0;
1315}
1316
1317/*
1318 * Note that vmsplice only really supports true splicing _from_ user memory
1319 * to a pipe, not the other way around. Splicing from user memory is a simple
1320 * operation that can be supported without any funky alignment restrictions
1321 * or nasty vm tricks. We simply map in the user memory and fill them into
1322 * a pipe. The reverse isn't quite as easy, though. There are two possible
1323 * solutions for that:
1324 *
1325 *      - memcpy() the data internally, at which point we might as well just
1326 *        do a regular read() on the buffer anyway.
1327 *      - Lots of nasty vm tricks, that are neither fast nor flexible (it
1328 *        has restriction limitations on both ends of the pipe).
1329 *
1330 * Currently we punt and implement it as a normal copy, see pipe_to_user().
1331 *
1332 */
1333static long do_vmsplice(struct file *f, struct iov_iter *iter, unsigned int flags)
1334{
1335        if (unlikely(flags & ~SPLICE_F_ALL))
1336                return -EINVAL;
1337
1338        if (!iov_iter_count(iter))
1339                return 0;
1340
1341        if (iov_iter_rw(iter) == WRITE)
1342                return vmsplice_to_pipe(f, iter, flags);
1343        else
1344                return vmsplice_to_user(f, iter, flags);
1345}
1346
1347SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, uiov,
1348                unsigned long, nr_segs, unsigned int, flags)
1349{
1350        struct iovec iovstack[UIO_FASTIOV];
1351        struct iovec *iov = iovstack;
1352        struct iov_iter iter;
1353        long error;
1354        struct fd f;
1355        int type;
1356
1357        f = fdget(fd);
1358        error = vmsplice_type(f, &type);
1359        if (error)
1360                return error;
1361
1362        error = import_iovec(type, uiov, nr_segs,
1363                             ARRAY_SIZE(iovstack), &iov, &iter);
1364        if (!error) {
1365                error = do_vmsplice(f.file, &iter, flags);
1366                kfree(iov);
1367        }
1368        fdput(f);
1369        return error;
1370}
1371
#ifdef CONFIG_COMPAT
/*
 * Compat flavour of vmsplice(2): identical to the native entry point
 * except that the iovec array is imported with compat_import_iovec().
 */
COMPAT_SYSCALL_DEFINE4(vmsplice, int, fd, const struct compat_iovec __user *, iov32,
                    unsigned int, nr_segs, unsigned int, flags)
{
        struct iovec iovstack[UIO_FASTIOV];
        struct iovec *iov = iovstack;
        struct iov_iter iter;
        struct fd f = fdget(fd);
        long error;
        int type;

        /* vmsplice_type() has already dropped the reference on failure. */
        error = vmsplice_type(f, &type);
        if (error)
                return error;

        error = compat_import_iovec(type, iov32, nr_segs,
                                    ARRAY_SIZE(iovstack), &iov, &iter);
        if (error)
                goto out_put;

        error = do_vmsplice(f.file, &iter, flags);
        kfree(iov);

out_put:
        fdput(f);
        return error;
}
#endif
1398
1399SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in,
1400                int, fd_out, loff_t __user *, off_out,
1401                size_t, len, unsigned int, flags)
1402{
1403        struct fd in, out;
1404        long error;
1405
1406        if (unlikely(!len))
1407                return 0;
1408
1409        if (unlikely(flags & ~SPLICE_F_ALL))
1410                return -EINVAL;
1411
1412        error = -EBADF;
1413        in = fdget(fd_in);
1414        if (in.file) {
1415                if (in.file->f_mode & FMODE_READ) {
1416                        out = fdget(fd_out);
1417                        if (out.file) {
1418                                if (out.file->f_mode & FMODE_WRITE)
1419                                        error = do_splice(in.file, off_in,
1420                                                          out.file, off_out,
1421                                                          len, flags);
1422                                fdput(out);
1423                        }
1424                }
1425                fdput(in);
1426        }
1427        return error;
1428}
1429
1430/*
1431 * Make sure there's data to read. Wait for input if we can, otherwise
1432 * return an appropriate error.
1433 */
1434static int ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1435{
1436        int ret;
1437
1438        /*
1439         * Check ->nrbufs without the inode lock first. This function
1440         * is speculative anyways, so missing one is ok.
1441         */
1442        if (pipe->nrbufs)
1443                return 0;
1444
1445        ret = 0;
1446        pipe_lock(pipe);
1447
1448        while (!pipe->nrbufs) {
1449                if (signal_pending(current)) {
1450                        ret = -ERESTARTSYS;
1451                        break;
1452                }
1453                if (!pipe->writers)
1454                        break;
1455                if (!pipe->waiting_writers) {
1456                        if (flags & SPLICE_F_NONBLOCK) {
1457                                ret = -EAGAIN;
1458                                break;
1459                        }
1460                }
1461                pipe_wait(pipe);
1462        }
1463
1464        pipe_unlock(pipe);
1465        return ret;
1466}
1467
1468/*
1469 * Make sure there's writeable room. Wait for room if we can, otherwise
1470 * return an appropriate error.
1471 */
1472static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1473{
1474        int ret;
1475
1476        /*
1477         * Check ->nrbufs without the inode lock first. This function
1478         * is speculative anyways, so missing one is ok.
1479         */
1480        if (pipe->nrbufs < pipe->buffers)
1481                return 0;
1482
1483        ret = 0;
1484        pipe_lock(pipe);
1485
1486        while (pipe->nrbufs >= pipe->buffers) {
1487                if (!pipe->readers) {
1488                        send_sig(SIGPIPE, current, 0);
1489                        ret = -EPIPE;
1490                        break;
1491                }
1492                if (flags & SPLICE_F_NONBLOCK) {
1493                        ret = -EAGAIN;
1494                        break;
1495                }
1496                if (signal_pending(current)) {
1497                        ret = -ERESTARTSYS;
1498                        break;
1499                }
1500                pipe->waiting_writers++;
1501                pipe_wait(pipe);
1502                pipe->waiting_writers--;
1503        }
1504
1505        pipe_unlock(pipe);
1506        return ret;
1507}
1508
1509/*
1510 * Splice contents of ipipe to opipe.
1511 */
1512static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
1513                               struct pipe_inode_info *opipe,
1514                               size_t len, unsigned int flags)
1515{
1516        struct pipe_buffer *ibuf, *obuf;
1517        int ret = 0, nbuf;
1518        bool input_wakeup = false;
1519
1520
1521retry:
1522        ret = ipipe_prep(ipipe, flags);
1523        if (ret)
1524                return ret;
1525
1526        ret = opipe_prep(opipe, flags);
1527        if (ret)
1528                return ret;
1529
1530        /*
1531         * Potential ABBA deadlock, work around it by ordering lock
1532         * grabbing by pipe info address. Otherwise two different processes
1533         * could deadlock (one doing tee from A -> B, the other from B -> A).
1534         */
1535        pipe_double_lock(ipipe, opipe);
1536
1537        do {
1538                if (!opipe->readers) {
1539                        send_sig(SIGPIPE, current, 0);
1540                        if (!ret)
1541                                ret = -EPIPE;
1542                        break;
1543                }
1544
1545                if (!ipipe->nrbufs && !ipipe->writers)
1546                        break;
1547
1548                /*
1549                 * Cannot make any progress, because either the input
1550                 * pipe is empty or the output pipe is full.
1551                 */
1552                if (!ipipe->nrbufs || opipe->nrbufs >= opipe->buffers) {
1553                        /* Already processed some buffers, break */
1554                        if (ret)
1555                                break;
1556
1557                        if (flags & SPLICE_F_NONBLOCK) {
1558                                ret = -EAGAIN;
1559                                break;
1560                        }
1561
1562                        /*
1563                         * We raced with another reader/writer and haven't
1564                         * managed to process any buffers.  A zero return
1565                         * value means EOF, so retry instead.
1566                         */
1567                        pipe_unlock(ipipe);
1568                        pipe_unlock(opipe);
1569                        goto retry;
1570                }
1571
1572                ibuf = ipipe->bufs + ipipe->curbuf;
1573                nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1);
1574                obuf = opipe->bufs + nbuf;
1575
1576                if (len >= ibuf->len) {
1577                        /*
1578                         * Simply move the whole buffer from ipipe to opipe
1579                         */
1580                        *obuf = *ibuf;
1581                        ibuf->ops = NULL;
1582                        opipe->nrbufs++;
1583                        ipipe->curbuf = (ipipe->curbuf + 1) & (ipipe->buffers - 1);
1584                        ipipe->nrbufs--;
1585                        input_wakeup = true;
1586                } else {
1587                        /*
1588                         * Get a reference to this pipe buffer,
1589                         * so we can copy the contents over.
1590                         */
1591                        pipe_buf_get(ipipe, ibuf);
1592                        *obuf = *ibuf;
1593
1594                        /*
1595                         * Don't inherit the gift flag, we need to
1596                         * prevent multiple steals of this page.
1597                         */
1598                        obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
1599
1600                        obuf->len = len;
1601                        opipe->nrbufs++;
1602                        ibuf->offset += obuf->len;
1603                        ibuf->len -= obuf->len;
1604                }
1605                ret += obuf->len;
1606                len -= obuf->len;
1607        } while (len);
1608
1609        pipe_unlock(ipipe);
1610        pipe_unlock(opipe);
1611
1612        /*
1613         * If we put data in the output pipe, wakeup any potential readers.
1614         */
1615        if (ret > 0)
1616                wakeup_pipe_readers(opipe);
1617
1618        if (input_wakeup)
1619                wakeup_pipe_writers(ipipe);
1620
1621        return ret;
1622}
1623
1624/*
1625 * Link contents of ipipe to opipe.
1626 */
1627static int link_pipe(struct pipe_inode_info *ipipe,
1628                     struct pipe_inode_info *opipe,
1629                     size_t len, unsigned int flags)
1630{
1631        struct pipe_buffer *ibuf, *obuf;
1632        int ret = 0, i = 0, nbuf;
1633
1634        /*
1635         * Potential ABBA deadlock, work around it by ordering lock
1636         * grabbing by pipe info address. Otherwise two different processes
1637         * could deadlock (one doing tee from A -> B, the other from B -> A).
1638         */
1639        pipe_double_lock(ipipe, opipe);
1640
1641        do {
1642                if (!opipe->readers) {
1643                        send_sig(SIGPIPE, current, 0);
1644                        if (!ret)
1645                                ret = -EPIPE;
1646                        break;
1647                }
1648
1649                /*
1650                 * If we have iterated all input buffers or ran out of
1651                 * output room, break.
1652                 */
1653                if (i >= ipipe->nrbufs || opipe->nrbufs >= opipe->buffers)
1654                        break;
1655
1656                ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (ipipe->buffers-1));
1657                nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1);
1658
1659                /*
1660                 * Get a reference to this pipe buffer,
1661                 * so we can copy the contents over.
1662                 */
1663                pipe_buf_get(ipipe, ibuf);
1664
1665                obuf = opipe->bufs + nbuf;
1666                *obuf = *ibuf;
1667
1668                /*
1669                 * Don't inherit the gift flag, we need to
1670                 * prevent multiple steals of this page.
1671                 */
1672                obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
1673
1674                if (obuf->len > len)
1675                        obuf->len = len;
1676
1677                opipe->nrbufs++;
1678                ret += obuf->len;
1679                len -= obuf->len;
1680                i++;
1681        } while (len);
1682
1683        /*
1684         * return EAGAIN if we have the potential of some data in the
1685         * future, otherwise just return 0
1686         */
1687        if (!ret && ipipe->waiting_writers && (flags & SPLICE_F_NONBLOCK))
1688                ret = -EAGAIN;
1689
1690        pipe_unlock(ipipe);
1691        pipe_unlock(opipe);
1692
1693        /*
1694         * If we put data in the output pipe, wakeup any potential readers.
1695         */
1696        if (ret > 0)
1697                wakeup_pipe_readers(opipe);
1698
1699        return ret;
1700}
1701
1702/*
1703 * This is a tee(1) implementation that works on pipes. It doesn't copy
1704 * any data, it simply references the 'in' pages on the 'out' pipe.
1705 * The 'flags' used are the SPLICE_F_* variants, currently the only
1706 * applicable one is SPLICE_F_NONBLOCK.
1707 */
1708static long do_tee(struct file *in, struct file *out, size_t len,
1709                   unsigned int flags)
1710{
1711        struct pipe_inode_info *ipipe = get_pipe_info(in);
1712        struct pipe_inode_info *opipe = get_pipe_info(out);
1713        int ret = -EINVAL;
1714
1715        /*
1716         * Duplicate the contents of ipipe to opipe without actually
1717         * copying the data.
1718         */
1719        if (ipipe && opipe && ipipe != opipe) {
1720                /*
1721                 * Keep going, unless we encounter an error. The ipipe/opipe
1722                 * ordering doesn't really matter.
1723                 */
1724                ret = ipipe_prep(ipipe, flags);
1725                if (!ret) {
1726                        ret = opipe_prep(opipe, flags);
1727                        if (!ret)
1728                                ret = link_pipe(ipipe, opipe, len, flags);
1729                }
1730        }
1731
1732        return ret;
1733}
1734
1735SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags)
1736{
1737        struct fd in;
1738        int error;
1739
1740        if (unlikely(flags & ~SPLICE_F_ALL))
1741                return -EINVAL;
1742
1743        if (unlikely(!len))
1744                return 0;
1745
1746        error = -EBADF;
1747        in = fdget(fdin);
1748        if (in.file) {
1749                if (in.file->f_mode & FMODE_READ) {
1750                        struct fd out = fdget(fdout);
1751                        if (out.file) {
1752                                if (out.file->f_mode & FMODE_WRITE)
1753                                        error = do_tee(in.file, out.file,
1754                                                        len, flags);
1755                                fdput(out);
1756                        }
1757                }
1758                fdput(in);
1759        }
1760
1761        return error;
1762}
1763