linux/fs/splice.c
<<
>>
Prefs
   1/*
   2 * "splice": joining two ropes together by interweaving their strands.
   3 *
   4 * This is the "extended pipe" functionality, where a pipe is used as
   5 * an arbitrary in-memory buffer. Think of a pipe as a small kernel
   6 * buffer that you can use to transfer data from one end to the other.
   7 *
   8 * The traditional unix read/write is extended with a "splice()" operation
   9 * that transfers data buffers to or from a pipe buffer.
  10 *
  11 * Named by Larry McVoy, original implementation from Linus, extended by
  12 * Jens to support splicing to files, network, direct splicing, etc and
  13 * fixing lots of bugs.
  14 *
  15 * Copyright (C) 2005-2006 Jens Axboe <axboe@kernel.dk>
  16 * Copyright (C) 2005-2006 Linus Torvalds <torvalds@osdl.org>
  17 * Copyright (C) 2006 Ingo Molnar <mingo@elte.hu>
  18 *
  19 */
  20#include <linux/bvec.h>
  21#include <linux/fs.h>
  22#include <linux/file.h>
  23#include <linux/pagemap.h>
  24#include <linux/splice.h>
  25#include <linux/memcontrol.h>
  26#include <linux/mm_inline.h>
  27#include <linux/swap.h>
  28#include <linux/writeback.h>
  29#include <linux/export.h>
  30#include <linux/syscalls.h>
  31#include <linux/uio.h>
  32#include <linux/security.h>
  33#include <linux/gfp.h>
  34#include <linux/socket.h>
  35#include <linux/compat.h>
  36#include <linux/sched/signal.h>
  37
  38#include "internal.h"
  39
  40/*
  41 * Attempt to steal a page from a pipe buffer. This should perhaps go into
  42 * a vm helper function, it's already simplified quite a bit by the
  43 * addition of remove_mapping(). If success is returned, the caller may
  44 * attempt to reuse this page for another destination.
  45 */
  46static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe,
  47                                     struct pipe_buffer *buf)
  48{
  49        struct page *page = buf->page;
  50        struct address_space *mapping;
  51
  52        lock_page(page);
  53
  54        mapping = page_mapping(page);
  55        if (mapping) {
  56                WARN_ON(!PageUptodate(page));
  57
  58                /*
  59                 * At least for ext2 with nobh option, we need to wait on
  60                 * writeback completing on this page, since we'll remove it
  61                 * from the pagecache.  Otherwise truncate wont wait on the
  62                 * page, allowing the disk blocks to be reused by someone else
  63                 * before we actually wrote our data to them. fs corruption
  64                 * ensues.
  65                 */
  66                wait_on_page_writeback(page);
  67
  68                if (page_has_private(page) &&
  69                    !try_to_release_page(page, GFP_KERNEL))
  70                        goto out_unlock;
  71
  72                /*
  73                 * If we succeeded in removing the mapping, set LRU flag
  74                 * and return good.
  75                 */
  76                if (remove_mapping(mapping, page)) {
  77                        buf->flags |= PIPE_BUF_FLAG_LRU;
  78                        return 0;
  79                }
  80        }
  81
  82        /*
  83         * Raced with truncate or failed to remove page from current
  84         * address space, unlock and return failure.
  85         */
  86out_unlock:
  87        unlock_page(page);
  88        return 1;
  89}
  90
  91static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe,
  92                                        struct pipe_buffer *buf)
  93{
  94        put_page(buf->page);
  95        buf->flags &= ~PIPE_BUF_FLAG_LRU;
  96}
  97
  98/*
  99 * Check whether the contents of buf is OK to access. Since the content
 100 * is a page cache page, IO may be in flight.
 101 */
 102static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe,
 103                                       struct pipe_buffer *buf)
 104{
 105        struct page *page = buf->page;
 106        int err;
 107
 108        if (!PageUptodate(page)) {
 109                lock_page(page);
 110
 111                /*
 112                 * Page got truncated/unhashed. This will cause a 0-byte
 113                 * splice, if this is the first page.
 114                 */
 115                if (!page->mapping) {
 116                        err = -ENODATA;
 117                        goto error;
 118                }
 119
 120                /*
 121                 * Uh oh, read-error from disk.
 122                 */
 123                if (!PageUptodate(page)) {
 124                        err = -EIO;
 125                        goto error;
 126                }
 127
 128                /*
 129                 * Page is ok afterall, we are done.
 130                 */
 131                unlock_page(page);
 132        }
 133
 134        return 0;
 135error:
 136        unlock_page(page);
 137        return err;
 138}
 139
 140const struct pipe_buf_operations page_cache_pipe_buf_ops = {
 141        .can_merge = 0,
 142        .confirm = page_cache_pipe_buf_confirm,
 143        .release = page_cache_pipe_buf_release,
 144        .steal = page_cache_pipe_buf_steal,
 145        .get = generic_pipe_buf_get,
 146};
 147
 148static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe,
 149                                    struct pipe_buffer *buf)
 150{
 151        if (!(buf->flags & PIPE_BUF_FLAG_GIFT))
 152                return 1;
 153
 154        buf->flags |= PIPE_BUF_FLAG_LRU;
 155        return generic_pipe_buf_steal(pipe, buf);
 156}
 157
 158static const struct pipe_buf_operations user_page_pipe_buf_ops = {
 159        .can_merge = 0,
 160        .confirm = generic_pipe_buf_confirm,
 161        .release = page_cache_pipe_buf_release,
 162        .steal = user_page_pipe_buf_steal,
 163        .get = generic_pipe_buf_get,
 164};
 165
 166static void wakeup_pipe_readers(struct pipe_inode_info *pipe)
 167{
 168        smp_mb();
 169        if (waitqueue_active(&pipe->wait))
 170                wake_up_interruptible(&pipe->wait);
 171        kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
 172}
 173
 174/**
 175 * splice_to_pipe - fill passed data into a pipe
 176 * @pipe:       pipe to fill
 177 * @spd:        data to fill
 178 *
 179 * Description:
 180 *    @spd contains a map of pages and len/offset tuples, along with
 181 *    the struct pipe_buf_operations associated with these pages. This
 182 *    function will link that data to the pipe.
 183 *
 184 */
 185ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
 186                       struct splice_pipe_desc *spd)
 187{
 188        unsigned int spd_pages = spd->nr_pages;
 189        int ret = 0, page_nr = 0;
 190
 191        if (!spd_pages)
 192                return 0;
 193
 194        if (unlikely(!pipe->readers)) {
 195                send_sig(SIGPIPE, current, 0);
 196                ret = -EPIPE;
 197                goto out;
 198        }
 199
 200        while (pipe->nrbufs < pipe->buffers) {
 201                int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1);
 202                struct pipe_buffer *buf = pipe->bufs + newbuf;
 203
 204                buf->page = spd->pages[page_nr];
 205                buf->offset = spd->partial[page_nr].offset;
 206                buf->len = spd->partial[page_nr].len;
 207                buf->private = spd->partial[page_nr].private;
 208                buf->ops = spd->ops;
 209                buf->flags = 0;
 210
 211                pipe->nrbufs++;
 212                page_nr++;
 213                ret += buf->len;
 214
 215                if (!--spd->nr_pages)
 216                        break;
 217        }
 218
 219        if (!ret)
 220                ret = -EAGAIN;
 221
 222out:
 223        while (page_nr < spd_pages)
 224                spd->spd_release(spd, page_nr++);
 225
 226        return ret;
 227}
 228EXPORT_SYMBOL_GPL(splice_to_pipe);
 229
 230ssize_t add_to_pipe(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
 231{
 232        int ret;
 233
 234        if (unlikely(!pipe->readers)) {
 235                send_sig(SIGPIPE, current, 0);
 236                ret = -EPIPE;
 237        } else if (pipe->nrbufs == pipe->buffers) {
 238                ret = -EAGAIN;
 239        } else {
 240                int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1);
 241                pipe->bufs[newbuf] = *buf;
 242                pipe->nrbufs++;
 243                return buf->len;
 244        }
 245        pipe_buf_release(pipe, buf);
 246        return ret;
 247}
 248EXPORT_SYMBOL(add_to_pipe);
 249
 250/*
 251 * Check if we need to grow the arrays holding pages and partial page
 252 * descriptions.
 253 */
 254int splice_grow_spd(const struct pipe_inode_info *pipe, struct splice_pipe_desc *spd)
 255{
 256        unsigned int buffers = READ_ONCE(pipe->buffers);
 257
 258        spd->nr_pages_max = buffers;
 259        if (buffers <= PIPE_DEF_BUFFERS)
 260                return 0;
 261
 262        spd->pages = kmalloc(buffers * sizeof(struct page *), GFP_KERNEL);
 263        spd->partial = kmalloc(buffers * sizeof(struct partial_page), GFP_KERNEL);
 264
 265        if (spd->pages && spd->partial)
 266                return 0;
 267
 268        kfree(spd->pages);
 269        kfree(spd->partial);
 270        return -ENOMEM;
 271}
 272
 273void splice_shrink_spd(struct splice_pipe_desc *spd)
 274{
 275        if (spd->nr_pages_max <= PIPE_DEF_BUFFERS)
 276                return;
 277
 278        kfree(spd->pages);
 279        kfree(spd->partial);
 280}
 281
 282/**
 283 * generic_file_splice_read - splice data from file to a pipe
 284 * @in:         file to splice from
 285 * @ppos:       position in @in
 286 * @pipe:       pipe to splice to
 287 * @len:        number of bytes to splice
 288 * @flags:      splice modifier flags
 289 *
 290 * Description:
 291 *    Will read pages from given file and fill them into a pipe. Can be
 292 *    used as long as it has more or less sane ->read_iter().
 293 *
 294 */
 295ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
 296                                 struct pipe_inode_info *pipe, size_t len,
 297                                 unsigned int flags)
 298{
 299        struct iov_iter to;
 300        struct kiocb kiocb;
 301        int idx, ret;
 302
 303        iov_iter_pipe(&to, ITER_PIPE | READ, pipe, len);
 304        idx = to.idx;
 305        init_sync_kiocb(&kiocb, in);
 306        kiocb.ki_pos = *ppos;
 307        ret = call_read_iter(in, &kiocb, &to);
 308        if (ret > 0) {
 309                *ppos = kiocb.ki_pos;
 310                file_accessed(in);
 311        } else if (ret < 0) {
 312                to.idx = idx;
 313                to.iov_offset = 0;
 314                iov_iter_advance(&to, 0); /* to free what was emitted */
 315                /*
 316                 * callers of ->splice_read() expect -EAGAIN on
 317                 * "can't put anything in there", rather than -EFAULT.
 318                 */
 319                if (ret == -EFAULT)
 320                        ret = -EAGAIN;
 321        }
 322
 323        return ret;
 324}
 325EXPORT_SYMBOL(generic_file_splice_read);
 326
 327const struct pipe_buf_operations default_pipe_buf_ops = {
 328        .can_merge = 0,
 329        .confirm = generic_pipe_buf_confirm,
 330        .release = generic_pipe_buf_release,
 331        .steal = generic_pipe_buf_steal,
 332        .get = generic_pipe_buf_get,
 333};
 334
 335static int generic_pipe_buf_nosteal(struct pipe_inode_info *pipe,
 336                                    struct pipe_buffer *buf)
 337{
 338        return 1;
 339}
 340
 341/* Pipe buffer operations for a socket and similar. */
 342const struct pipe_buf_operations nosteal_pipe_buf_ops = {
 343        .can_merge = 0,
 344        .confirm = generic_pipe_buf_confirm,
 345        .release = generic_pipe_buf_release,
 346        .steal = generic_pipe_buf_nosteal,
 347        .get = generic_pipe_buf_get,
 348};
 349EXPORT_SYMBOL(nosteal_pipe_buf_ops);
 350
 351static ssize_t kernel_readv(struct file *file, const struct kvec *vec,
 352                            unsigned long vlen, loff_t offset)
 353{
 354        mm_segment_t old_fs;
 355        loff_t pos = offset;
 356        ssize_t res;
 357
 358        old_fs = get_fs();
 359        set_fs(get_ds());
 360        /* The cast to a user pointer is valid due to the set_fs() */
 361        res = vfs_readv(file, (const struct iovec __user *)vec, vlen, &pos, 0);
 362        set_fs(old_fs);
 363
 364        return res;
 365}
 366
 367static ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
 368                                 struct pipe_inode_info *pipe, size_t len,
 369                                 unsigned int flags)
 370{
 371        struct kvec *vec, __vec[PIPE_DEF_BUFFERS];
 372        struct iov_iter to;
 373        struct page **pages;
 374        unsigned int nr_pages;
 375        size_t offset, base, copied = 0;
 376        ssize_t res;
 377        int i;
 378
 379        if (pipe->nrbufs == pipe->buffers)
 380                return -EAGAIN;
 381
 382        /*
 383         * Try to keep page boundaries matching to source pagecache ones -
 384         * it probably won't be much help, but...
 385         */
 386        offset = *ppos & ~PAGE_MASK;
 387
 388        iov_iter_pipe(&to, ITER_PIPE | READ, pipe, len + offset);
 389
 390        res = iov_iter_get_pages_alloc(&to, &pages, len + offset, &base);
 391        if (res <= 0)
 392                return -ENOMEM;
 393
 394        nr_pages = DIV_ROUND_UP(res + base, PAGE_SIZE);
 395
 396        vec = __vec;
 397        if (nr_pages > PIPE_DEF_BUFFERS) {
 398                vec = kmalloc(nr_pages * sizeof(struct kvec), GFP_KERNEL);
 399                if (unlikely(!vec)) {
 400                        res = -ENOMEM;
 401                        goto out;
 402                }
 403        }
 404
 405        pipe->bufs[to.idx].offset = offset;
 406        pipe->bufs[to.idx].len -= offset;
 407
 408        for (i = 0; i < nr_pages; i++) {
 409                size_t this_len = min_t(size_t, len, PAGE_SIZE - offset);
 410                vec[i].iov_base = page_address(pages[i]) + offset;
 411                vec[i].iov_len = this_len;
 412                len -= this_len;
 413                offset = 0;
 414        }
 415
 416        res = kernel_readv(in, vec, nr_pages, *ppos);
 417        if (res > 0) {
 418                copied = res;
 419                *ppos += res;
 420        }
 421
 422        if (vec != __vec)
 423                kfree(vec);
 424out:
 425        for (i = 0; i < nr_pages; i++)
 426                put_page(pages[i]);
 427        kvfree(pages);
 428        iov_iter_advance(&to, copied);  /* truncates and discards */
 429        return res;
 430}
 431
 432/*
 433 * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos'
 434 * using sendpage(). Return the number of bytes sent.
 435 */
 436static int pipe_to_sendpage(struct pipe_inode_info *pipe,
 437                            struct pipe_buffer *buf, struct splice_desc *sd)
 438{
 439        struct file *file = sd->u.file;
 440        loff_t pos = sd->pos;
 441        int more;
 442
 443        if (!likely(file->f_op->sendpage))
 444                return -EINVAL;
 445
 446        more = (sd->flags & SPLICE_F_MORE) ? MSG_MORE : 0;
 447
 448        if (sd->len < sd->total_len && pipe->nrbufs > 1)
 449                more |= MSG_SENDPAGE_NOTLAST;
 450
 451        return file->f_op->sendpage(file, buf->page, buf->offset,
 452                                    sd->len, &pos, more);
 453}
 454
 455static void wakeup_pipe_writers(struct pipe_inode_info *pipe)
 456{
 457        smp_mb();
 458        if (waitqueue_active(&pipe->wait))
 459                wake_up_interruptible(&pipe->wait);
 460        kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
 461}
 462
 463/**
 464 * splice_from_pipe_feed - feed available data from a pipe to a file
 465 * @pipe:       pipe to splice from
 466 * @sd:         information to @actor
 467 * @actor:      handler that splices the data
 468 *
 469 * Description:
 470 *    This function loops over the pipe and calls @actor to do the
 471 *    actual moving of a single struct pipe_buffer to the desired
 472 *    destination.  It returns when there's no more buffers left in
 473 *    the pipe or if the requested number of bytes (@sd->total_len)
 474 *    have been copied.  It returns a positive number (one) if the
 475 *    pipe needs to be filled with more data, zero if the required
 476 *    number of bytes have been copied and -errno on error.
 477 *
 478 *    This, together with splice_from_pipe_{begin,end,next}, may be
 479 *    used to implement the functionality of __splice_from_pipe() when
 480 *    locking is required around copying the pipe buffers to the
 481 *    destination.
 482 */
 483static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd,
 484                          splice_actor *actor)
 485{
 486        int ret;
 487
 488        while (pipe->nrbufs) {
 489                struct pipe_buffer *buf = pipe->bufs + pipe->curbuf;
 490
 491                sd->len = buf->len;
 492                if (sd->len > sd->total_len)
 493                        sd->len = sd->total_len;
 494
 495                ret = pipe_buf_confirm(pipe, buf);
 496                if (unlikely(ret)) {
 497                        if (ret == -ENODATA)
 498                                ret = 0;
 499                        return ret;
 500                }
 501
 502                ret = actor(pipe, buf, sd);
 503                if (ret <= 0)
 504                        return ret;
 505
 506                buf->offset += ret;
 507                buf->len -= ret;
 508
 509                sd->num_spliced += ret;
 510                sd->len -= ret;
 511                sd->pos += ret;
 512                sd->total_len -= ret;
 513
 514                if (!buf->len) {
 515                        pipe_buf_release(pipe, buf);
 516                        pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
 517                        pipe->nrbufs--;
 518                        if (pipe->files)
 519                                sd->need_wakeup = true;
 520                }
 521
 522                if (!sd->total_len)
 523                        return 0;
 524        }
 525
 526        return 1;
 527}
 528
 529/**
 530 * splice_from_pipe_next - wait for some data to splice from
 531 * @pipe:       pipe to splice from
 532 * @sd:         information about the splice operation
 533 *
 534 * Description:
 535 *    This function will wait for some data and return a positive
 536 *    value (one) if pipe buffers are available.  It will return zero
 537 *    or -errno if no more data needs to be spliced.
 538 */
 539static int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_desc *sd)
 540{
 541        /*
 542         * Check for signal early to make process killable when there are
 543         * always buffers available
 544         */
 545        if (signal_pending(current))
 546                return -ERESTARTSYS;
 547
 548        while (!pipe->nrbufs) {
 549                if (!pipe->writers)
 550                        return 0;
 551
 552                if (!pipe->waiting_writers && sd->num_spliced)
 553                        return 0;
 554
 555                if (sd->flags & SPLICE_F_NONBLOCK)
 556                        return -EAGAIN;
 557
 558                if (signal_pending(current))
 559                        return -ERESTARTSYS;
 560
 561                if (sd->need_wakeup) {
 562                        wakeup_pipe_writers(pipe);
 563                        sd->need_wakeup = false;
 564                }
 565
 566                pipe_wait(pipe);
 567        }
 568
 569        return 1;
 570}
 571
 572/**
 573 * splice_from_pipe_begin - start splicing from pipe
 574 * @sd:         information about the splice operation
 575 *
 576 * Description:
 577 *    This function should be called before a loop containing
 578 *    splice_from_pipe_next() and splice_from_pipe_feed() to
 579 *    initialize the necessary fields of @sd.
 580 */
 581static void splice_from_pipe_begin(struct splice_desc *sd)
 582{
 583        sd->num_spliced = 0;
 584        sd->need_wakeup = false;
 585}
 586
 587/**
 588 * splice_from_pipe_end - finish splicing from pipe
 589 * @pipe:       pipe to splice from
 590 * @sd:         information about the splice operation
 591 *
 592 * Description:
 593 *    This function will wake up pipe writers if necessary.  It should
 594 *    be called after a loop containing splice_from_pipe_next() and
 595 *    splice_from_pipe_feed().
 596 */
 597static void splice_from_pipe_end(struct pipe_inode_info *pipe, struct splice_desc *sd)
 598{
 599        if (sd->need_wakeup)
 600                wakeup_pipe_writers(pipe);
 601}
 602
 603/**
 604 * __splice_from_pipe - splice data from a pipe to given actor
 605 * @pipe:       pipe to splice from
 606 * @sd:         information to @actor
 607 * @actor:      handler that splices the data
 608 *
 609 * Description:
 610 *    This function does little more than loop over the pipe and call
 611 *    @actor to do the actual moving of a single struct pipe_buffer to
 612 *    the desired destination. See pipe_to_file, pipe_to_sendpage, or
 613 *    pipe_to_user.
 614 *
 615 */
 616ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd,
 617                           splice_actor *actor)
 618{
 619        int ret;
 620
 621        splice_from_pipe_begin(sd);
 622        do {
 623                cond_resched();
 624                ret = splice_from_pipe_next(pipe, sd);
 625                if (ret > 0)
 626                        ret = splice_from_pipe_feed(pipe, sd, actor);
 627        } while (ret > 0);
 628        splice_from_pipe_end(pipe, sd);
 629
 630        return sd->num_spliced ? sd->num_spliced : ret;
 631}
 632EXPORT_SYMBOL(__splice_from_pipe);
 633
 634/**
 635 * splice_from_pipe - splice data from a pipe to a file
 636 * @pipe:       pipe to splice from
 637 * @out:        file to splice to
 638 * @ppos:       position in @out
 639 * @len:        how many bytes to splice
 640 * @flags:      splice modifier flags
 641 * @actor:      handler that splices the data
 642 *
 643 * Description:
 644 *    See __splice_from_pipe. This function locks the pipe inode,
 645 *    otherwise it's identical to __splice_from_pipe().
 646 *
 647 */
 648ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
 649                         loff_t *ppos, size_t len, unsigned int flags,
 650                         splice_actor *actor)
 651{
 652        ssize_t ret;
 653        struct splice_desc sd = {
 654                .total_len = len,
 655                .flags = flags,
 656                .pos = *ppos,
 657                .u.file = out,
 658        };
 659
 660        pipe_lock(pipe);
 661        ret = __splice_from_pipe(pipe, &sd, actor);
 662        pipe_unlock(pipe);
 663
 664        return ret;
 665}
 666
 667/**
 668 * iter_file_splice_write - splice data from a pipe to a file
 669 * @pipe:       pipe info
 670 * @out:        file to write to
 671 * @ppos:       position in @out
 672 * @len:        number of bytes to splice
 673 * @flags:      splice modifier flags
 674 *
 675 * Description:
 676 *    Will either move or copy pages (determined by @flags options) from
 677 *    the given pipe inode to the given file.
 678 *    This one is ->write_iter-based.
 679 *
 680 */
 681ssize_t
 682iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
 683                          loff_t *ppos, size_t len, unsigned int flags)
 684{
 685        struct splice_desc sd = {
 686                .total_len = len,
 687                .flags = flags,
 688                .pos = *ppos,
 689                .u.file = out,
 690        };
 691        int nbufs = pipe->buffers;
 692        struct bio_vec *array = kcalloc(nbufs, sizeof(struct bio_vec),
 693                                        GFP_KERNEL);
 694        ssize_t ret;
 695
 696        if (unlikely(!array))
 697                return -ENOMEM;
 698
 699        pipe_lock(pipe);
 700
 701        splice_from_pipe_begin(&sd);
 702        while (sd.total_len) {
 703                struct iov_iter from;
 704                size_t left;
 705                int n, idx;
 706
 707                ret = splice_from_pipe_next(pipe, &sd);
 708                if (ret <= 0)
 709                        break;
 710
 711                if (unlikely(nbufs < pipe->buffers)) {
 712                        kfree(array);
 713                        nbufs = pipe->buffers;
 714                        array = kcalloc(nbufs, sizeof(struct bio_vec),
 715                                        GFP_KERNEL);
 716                        if (!array) {
 717                                ret = -ENOMEM;
 718                                break;
 719                        }
 720                }
 721
 722                /* build the vector */
 723                left = sd.total_len;
 724                for (n = 0, idx = pipe->curbuf; left && n < pipe->nrbufs; n++, idx++) {
 725                        struct pipe_buffer *buf = pipe->bufs + idx;
 726                        size_t this_len = buf->len;
 727
 728                        if (this_len > left)
 729                                this_len = left;
 730
 731                        if (idx == pipe->buffers - 1)
 732                                idx = -1;
 733
 734                        ret = pipe_buf_confirm(pipe, buf);
 735                        if (unlikely(ret)) {
 736                                if (ret == -ENODATA)
 737                                        ret = 0;
 738                                goto done;
 739                        }
 740
 741                        array[n].bv_page = buf->page;
 742                        array[n].bv_len = this_len;
 743                        array[n].bv_offset = buf->offset;
 744                        left -= this_len;
 745                }
 746
 747                iov_iter_bvec(&from, ITER_BVEC | WRITE, array, n,
 748                              sd.total_len - left);
 749                ret = vfs_iter_write(out, &from, &sd.pos, 0);
 750                if (ret <= 0)
 751                        break;
 752
 753                sd.num_spliced += ret;
 754                sd.total_len -= ret;
 755                *ppos = sd.pos;
 756
 757                /* dismiss the fully eaten buffers, adjust the partial one */
 758                while (ret) {
 759                        struct pipe_buffer *buf = pipe->bufs + pipe->curbuf;
 760                        if (ret >= buf->len) {
 761                                ret -= buf->len;
 762                                buf->len = 0;
 763                                pipe_buf_release(pipe, buf);
 764                                pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
 765                                pipe->nrbufs--;
 766                                if (pipe->files)
 767                                        sd.need_wakeup = true;
 768                        } else {
 769                                buf->offset += ret;
 770                                buf->len -= ret;
 771                                ret = 0;
 772                        }
 773                }
 774        }
 775done:
 776        kfree(array);
 777        splice_from_pipe_end(pipe, &sd);
 778
 779        pipe_unlock(pipe);
 780
 781        if (sd.num_spliced)
 782                ret = sd.num_spliced;
 783
 784        return ret;
 785}
 786
 787EXPORT_SYMBOL(iter_file_splice_write);
 788
 789static int write_pipe_buf(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
 790                          struct splice_desc *sd)
 791{
 792        int ret;
 793        void *data;
 794        loff_t tmp = sd->pos;
 795
 796        data = kmap(buf->page);
 797        ret = __kernel_write(sd->u.file, data + buf->offset, sd->len, &tmp);
 798        kunmap(buf->page);
 799
 800        return ret;
 801}
 802
 803static ssize_t default_file_splice_write(struct pipe_inode_info *pipe,
 804                                         struct file *out, loff_t *ppos,
 805                                         size_t len, unsigned int flags)
 806{
 807        ssize_t ret;
 808
 809        ret = splice_from_pipe(pipe, out, ppos, len, flags, write_pipe_buf);
 810        if (ret > 0)
 811                *ppos += ret;
 812
 813        return ret;
 814}
 815
 816/**
 817 * generic_splice_sendpage - splice data from a pipe to a socket
 818 * @pipe:       pipe to splice from
 819 * @out:        socket to write to
 820 * @ppos:       position in @out
 821 * @len:        number of bytes to splice
 822 * @flags:      splice modifier flags
 823 *
 824 * Description:
 825 *    Will send @len bytes from the pipe to a network socket. No data copying
 826 *    is involved.
 827 *
 828 */
 829ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out,
 830                                loff_t *ppos, size_t len, unsigned int flags)
 831{
 832        return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage);
 833}
 834
 835EXPORT_SYMBOL(generic_splice_sendpage);
 836
 837/*
 838 * Attempt to initiate a splice from pipe to file.
 839 */
 840static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
 841                           loff_t *ppos, size_t len, unsigned int flags)
 842{
 843        ssize_t (*splice_write)(struct pipe_inode_info *, struct file *,
 844                                loff_t *, size_t, unsigned int);
 845
 846        if (out->f_op->splice_write)
 847                splice_write = out->f_op->splice_write;
 848        else
 849                splice_write = default_file_splice_write;
 850
 851        return splice_write(pipe, out, ppos, len, flags);
 852}
 853
 854/*
 855 * Attempt to initiate a splice from a file to a pipe.
 856 */
 857static long do_splice_to(struct file *in, loff_t *ppos,
 858                         struct pipe_inode_info *pipe, size_t len,
 859                         unsigned int flags)
 860{
 861        ssize_t (*splice_read)(struct file *, loff_t *,
 862                               struct pipe_inode_info *, size_t, unsigned int);
 863        int ret;
 864
 865        if (unlikely(!(in->f_mode & FMODE_READ)))
 866                return -EBADF;
 867
 868        ret = rw_verify_area(READ, in, ppos, len);
 869        if (unlikely(ret < 0))
 870                return ret;
 871
 872        if (unlikely(len > MAX_RW_COUNT))
 873                len = MAX_RW_COUNT;
 874
 875        if (in->f_op->splice_read)
 876                splice_read = in->f_op->splice_read;
 877        else
 878                splice_read = default_file_splice_read;
 879
 880        return splice_read(in, ppos, pipe, len, flags);
 881}
 882
 883/**
 884 * splice_direct_to_actor - splices data directly between two non-pipes
 885 * @in:         file to splice from
 886 * @sd:         actor information on where to splice to
 887 * @actor:      handles the data splicing
 888 *
 889 * Description:
 890 *    This is a special case helper to splice directly between two
 891 *    points, without requiring an explicit pipe. Internally an allocated
 892 *    pipe is cached in the process, and reused during the lifetime of
 893 *    that process.
 894 *
 895 */
 896ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
 897                               splice_direct_actor *actor)
 898{
 899        struct pipe_inode_info *pipe;
 900        long ret, bytes;
 901        umode_t i_mode;
 902        size_t len;
 903        int i, flags, more;
 904
 905        /*
 906         * We require the input being a regular file, as we don't want to
 907         * randomly drop data for eg socket -> socket splicing. Use the
 908         * piped splicing for that!
 909         */
 910        i_mode = file_inode(in)->i_mode;
 911        if (unlikely(!S_ISREG(i_mode) && !S_ISBLK(i_mode)))
 912                return -EINVAL;
 913
 914        /*
 915         * neither in nor out is a pipe, setup an internal pipe attached to
 916         * 'out' and transfer the wanted data from 'in' to 'out' through that
 917         */
 918        pipe = current->splice_pipe;
 919        if (unlikely(!pipe)) {
 920                pipe = alloc_pipe_info();
 921                if (!pipe)
 922                        return -ENOMEM;
 923
 924                /*
 925                 * We don't have an immediate reader, but we'll read the stuff
 926                 * out of the pipe right after the splice_to_pipe(). So set
 927                 * PIPE_READERS appropriately.
 928                 */
 929                pipe->readers = 1;
 930
 931                current->splice_pipe = pipe;
 932        }
 933
 934        /*
 935         * Do the splice.
 936         */
 937        ret = 0;
 938        bytes = 0;
 939        len = sd->total_len;
 940        flags = sd->flags;
 941
 942        /*
 943         * Don't block on output, we have to drain the direct pipe.
 944         */
 945        sd->flags &= ~SPLICE_F_NONBLOCK;
 946        more = sd->flags & SPLICE_F_MORE;
 947
 948        while (len) {
 949                size_t read_len;
 950                loff_t pos = sd->pos, prev_pos = pos;
 951
 952                ret = do_splice_to(in, &pos, pipe, len, flags);
 953                if (unlikely(ret <= 0))
 954                        goto out_release;
 955
 956                read_len = ret;
 957                sd->total_len = read_len;
 958
 959                /*
 960                 * If more data is pending, set SPLICE_F_MORE
 961                 * If this is the last data and SPLICE_F_MORE was not set
 962                 * initially, clears it.
 963                 */
 964                if (read_len < len)
 965                        sd->flags |= SPLICE_F_MORE;
 966                else if (!more)
 967                        sd->flags &= ~SPLICE_F_MORE;
 968                /*
 969                 * NOTE: nonblocking mode only applies to the input. We
 970                 * must not do the output in nonblocking mode as then we
 971                 * could get stuck data in the internal pipe:
 972                 */
 973                ret = actor(pipe, sd);
 974                if (unlikely(ret <= 0)) {
 975                        sd->pos = prev_pos;
 976                        goto out_release;
 977                }
 978
 979                bytes += ret;
 980                len -= ret;
 981                sd->pos = pos;
 982
 983                if (ret < read_len) {
 984                        sd->pos = prev_pos + ret;
 985                        goto out_release;
 986                }
 987        }
 988
 989done:
 990        pipe->nrbufs = pipe->curbuf = 0;
 991        file_accessed(in);
 992        return bytes;
 993
 994out_release:
 995        /*
 996         * If we did an incomplete transfer we must release
 997         * the pipe buffers in question:
 998         */
 999        for (i = 0; i < pipe->buffers; i++) {
1000                struct pipe_buffer *buf = pipe->bufs + i;
1001
1002                if (buf->ops)
1003                        pipe_buf_release(pipe, buf);
1004        }
1005
1006        if (!bytes)
1007                bytes = ret;
1008
1009        goto done;
1010}
1011EXPORT_SYMBOL(splice_direct_to_actor);
1012
1013static int direct_splice_actor(struct pipe_inode_info *pipe,
1014                               struct splice_desc *sd)
1015{
1016        struct file *file = sd->u.file;
1017
1018        return do_splice_from(pipe, file, sd->opos, sd->total_len,
1019                              sd->flags);
1020}
1021
1022/**
1023 * do_splice_direct - splices data directly between two files
1024 * @in:         file to splice from
1025 * @ppos:       input file offset
1026 * @out:        file to splice to
1027 * @opos:       output file offset
1028 * @len:        number of bytes to splice
1029 * @flags:      splice modifier flags
1030 *
1031 * Description:
1032 *    For use by do_sendfile(). splice can easily emulate sendfile, but
1033 *    doing it in the application would incur an extra system call
1034 *    (splice in + splice out, as compared to just sendfile()). So this helper
1035 *    can splice directly through a process-private pipe.
1036 *
1037 */
1038long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
1039                      loff_t *opos, size_t len, unsigned int flags)
1040{
1041        struct splice_desc sd = {
1042                .len            = len,
1043                .total_len      = len,
1044                .flags          = flags,
1045                .pos            = *ppos,
1046                .u.file         = out,
1047                .opos           = opos,
1048        };
1049        long ret;
1050
1051        if (unlikely(!(out->f_mode & FMODE_WRITE)))
1052                return -EBADF;
1053
1054        if (unlikely(out->f_flags & O_APPEND))
1055                return -EINVAL;
1056
1057        ret = rw_verify_area(WRITE, out, opos, len);
1058        if (unlikely(ret < 0))
1059                return ret;
1060
1061        ret = splice_direct_to_actor(in, &sd, direct_splice_actor);
1062        if (ret > 0)
1063                *ppos = sd.pos;
1064
1065        return ret;
1066}
1067EXPORT_SYMBOL(do_splice_direct);
1068
1069static int wait_for_space(struct pipe_inode_info *pipe, unsigned flags)
1070{
1071        for (;;) {
1072                if (unlikely(!pipe->readers)) {
1073                        send_sig(SIGPIPE, current, 0);
1074                        return -EPIPE;
1075                }
1076                if (pipe->nrbufs != pipe->buffers)
1077                        return 0;
1078                if (flags & SPLICE_F_NONBLOCK)
1079                        return -EAGAIN;
1080                if (signal_pending(current))
1081                        return -ERESTARTSYS;
1082                pipe->waiting_writers++;
1083                pipe_wait(pipe);
1084                pipe->waiting_writers--;
1085        }
1086}
1087
1088static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
1089                               struct pipe_inode_info *opipe,
1090                               size_t len, unsigned int flags);
1091
1092/*
1093 * Determine where to splice to/from.
1094 */
1095static long do_splice(struct file *in, loff_t __user *off_in,
1096                      struct file *out, loff_t __user *off_out,
1097                      size_t len, unsigned int flags)
1098{
1099        struct pipe_inode_info *ipipe;
1100        struct pipe_inode_info *opipe;
1101        loff_t offset;
1102        long ret;
1103
1104        ipipe = get_pipe_info(in);
1105        opipe = get_pipe_info(out);
1106
1107        if (ipipe && opipe) {
1108                if (off_in || off_out)
1109                        return -ESPIPE;
1110
1111                if (!(in->f_mode & FMODE_READ))
1112                        return -EBADF;
1113
1114                if (!(out->f_mode & FMODE_WRITE))
1115                        return -EBADF;
1116
1117                /* Splicing to self would be fun, but... */
1118                if (ipipe == opipe)
1119                        return -EINVAL;
1120
1121                return splice_pipe_to_pipe(ipipe, opipe, len, flags);
1122        }
1123
1124        if (ipipe) {
1125                if (off_in)
1126                        return -ESPIPE;
1127                if (off_out) {
1128                        if (!(out->f_mode & FMODE_PWRITE))
1129                                return -EINVAL;
1130                        if (copy_from_user(&offset, off_out, sizeof(loff_t)))
1131                                return -EFAULT;
1132                } else {
1133                        offset = out->f_pos;
1134                }
1135
1136                if (unlikely(!(out->f_mode & FMODE_WRITE)))
1137                        return -EBADF;
1138
1139                if (unlikely(out->f_flags & O_APPEND))
1140                        return -EINVAL;
1141
1142                ret = rw_verify_area(WRITE, out, &offset, len);
1143                if (unlikely(ret < 0))
1144                        return ret;
1145
1146                file_start_write(out);
1147                ret = do_splice_from(ipipe, out, &offset, len, flags);
1148                file_end_write(out);
1149
1150                if (!off_out)
1151                        out->f_pos = offset;
1152                else if (copy_to_user(off_out, &offset, sizeof(loff_t)))
1153                        ret = -EFAULT;
1154
1155                return ret;
1156        }
1157
1158        if (opipe) {
1159                if (off_out)
1160                        return -ESPIPE;
1161                if (off_in) {
1162                        if (!(in->f_mode & FMODE_PREAD))
1163                                return -EINVAL;
1164                        if (copy_from_user(&offset, off_in, sizeof(loff_t)))
1165                                return -EFAULT;
1166                } else {
1167                        offset = in->f_pos;
1168                }
1169
1170                pipe_lock(opipe);
1171                ret = wait_for_space(opipe, flags);
1172                if (!ret)
1173                        ret = do_splice_to(in, &offset, opipe, len, flags);
1174                pipe_unlock(opipe);
1175                if (ret > 0)
1176                        wakeup_pipe_readers(opipe);
1177                if (!off_in)
1178                        in->f_pos = offset;
1179                else if (copy_to_user(off_in, &offset, sizeof(loff_t)))
1180                        ret = -EFAULT;
1181
1182                return ret;
1183        }
1184
1185        return -EINVAL;
1186}
1187
1188static int iter_to_pipe(struct iov_iter *from,
1189                        struct pipe_inode_info *pipe,
1190                        unsigned flags)
1191{
1192        struct pipe_buffer buf = {
1193                .ops = &user_page_pipe_buf_ops,
1194                .flags = flags
1195        };
1196        size_t total = 0;
1197        int ret = 0;
1198        bool failed = false;
1199
1200        while (iov_iter_count(from) && !failed) {
1201                struct page *pages[16];
1202                ssize_t copied;
1203                size_t start;
1204                int n;
1205
1206                copied = iov_iter_get_pages(from, pages, ~0UL, 16, &start);
1207                if (copied <= 0) {
1208                        ret = copied;
1209                        break;
1210                }
1211
1212                for (n = 0; copied; n++, start = 0) {
1213                        int size = min_t(int, copied, PAGE_SIZE - start);
1214                        if (!failed) {
1215                                buf.page = pages[n];
1216                                buf.offset = start;
1217                                buf.len = size;
1218                                ret = add_to_pipe(pipe, &buf);
1219                                if (unlikely(ret < 0)) {
1220                                        failed = true;
1221                                } else {
1222                                        iov_iter_advance(from, ret);
1223                                        total += ret;
1224                                }
1225                        } else {
1226                                put_page(pages[n]);
1227                        }
1228                        copied -= size;
1229                }
1230        }
1231        return total ? total : ret;
1232}
1233
1234static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
1235                        struct splice_desc *sd)
1236{
1237        int n = copy_page_to_iter(buf->page, buf->offset, sd->len, sd->u.data);
1238        return n == sd->len ? n : -EFAULT;
1239}
1240
1241/*
1242 * For lack of a better implementation, implement vmsplice() to userspace
1243 * as a simple copy of the pipes pages to the user iov.
1244 */
1245static long vmsplice_to_user(struct file *file, const struct iovec __user *uiov,
1246                             unsigned long nr_segs, unsigned int flags)
1247{
1248        struct pipe_inode_info *pipe;
1249        struct splice_desc sd;
1250        long ret;
1251        struct iovec iovstack[UIO_FASTIOV];
1252        struct iovec *iov = iovstack;
1253        struct iov_iter iter;
1254
1255        pipe = get_pipe_info(file);
1256        if (!pipe)
1257                return -EBADF;
1258
1259        ret = import_iovec(READ, uiov, nr_segs,
1260                           ARRAY_SIZE(iovstack), &iov, &iter);
1261        if (ret < 0)
1262                return ret;
1263
1264        sd.total_len = iov_iter_count(&iter);
1265        sd.len = 0;
1266        sd.flags = flags;
1267        sd.u.data = &iter;
1268        sd.pos = 0;
1269
1270        if (sd.total_len) {
1271                pipe_lock(pipe);
1272                ret = __splice_from_pipe(pipe, &sd, pipe_to_user);
1273                pipe_unlock(pipe);
1274        }
1275
1276        kfree(iov);
1277        return ret;
1278}
1279
1280/*
1281 * vmsplice splices a user address range into a pipe. It can be thought of
1282 * as splice-from-memory, where the regular splice is splice-from-file (or
1283 * to file). In both cases the output is a pipe, naturally.
1284 */
1285static long vmsplice_to_pipe(struct file *file, const struct iovec __user *uiov,
1286                             unsigned long nr_segs, unsigned int flags)
1287{
1288        struct pipe_inode_info *pipe;
1289        struct iovec iovstack[UIO_FASTIOV];
1290        struct iovec *iov = iovstack;
1291        struct iov_iter from;
1292        long ret;
1293        unsigned buf_flag = 0;
1294
1295        if (flags & SPLICE_F_GIFT)
1296                buf_flag = PIPE_BUF_FLAG_GIFT;
1297
1298        pipe = get_pipe_info(file);
1299        if (!pipe)
1300                return -EBADF;
1301
1302        ret = import_iovec(WRITE, uiov, nr_segs,
1303                           ARRAY_SIZE(iovstack), &iov, &from);
1304        if (ret < 0)
1305                return ret;
1306
1307        pipe_lock(pipe);
1308        ret = wait_for_space(pipe, flags);
1309        if (!ret)
1310                ret = iter_to_pipe(&from, pipe, buf_flag);
1311        pipe_unlock(pipe);
1312        if (ret > 0)
1313                wakeup_pipe_readers(pipe);
1314        kfree(iov);
1315        return ret;
1316}
1317
1318/*
1319 * Note that vmsplice only really supports true splicing _from_ user memory
1320 * to a pipe, not the other way around. Splicing from user memory is a simple
1321 * operation that can be supported without any funky alignment restrictions
1322 * or nasty vm tricks. We simply map in the user memory and fill them into
1323 * a pipe. The reverse isn't quite as easy, though. There are two possible
1324 * solutions for that:
1325 *
1326 *      - memcpy() the data internally, at which point we might as well just
1327 *        do a regular read() on the buffer anyway.
1328 *      - Lots of nasty vm tricks, that are neither fast nor flexible (it
1329 *        has restriction limitations on both ends of the pipe).
1330 *
1331 * Currently we punt and implement it as a normal copy, see pipe_to_user().
1332 *
1333 */
1334static long do_vmsplice(int fd, const struct iovec __user *iov,
1335                        unsigned long nr_segs, unsigned int flags)
1336{
1337        struct fd f;
1338        long error;
1339
1340        if (unlikely(flags & ~SPLICE_F_ALL))
1341                return -EINVAL;
1342        if (unlikely(nr_segs > UIO_MAXIOV))
1343                return -EINVAL;
1344        else if (unlikely(!nr_segs))
1345                return 0;
1346
1347        error = -EBADF;
1348        f = fdget(fd);
1349        if (f.file) {
1350                if (f.file->f_mode & FMODE_WRITE)
1351                        error = vmsplice_to_pipe(f.file, iov, nr_segs, flags);
1352                else if (f.file->f_mode & FMODE_READ)
1353                        error = vmsplice_to_user(f.file, iov, nr_segs, flags);
1354
1355                fdput(f);
1356        }
1357
1358        return error;
1359}
1360
1361SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, iov,
1362                unsigned long, nr_segs, unsigned int, flags)
1363{
1364        return do_vmsplice(fd, iov, nr_segs, flags);
1365}
1366
#ifdef CONFIG_COMPAT
/*
 * Compat vmsplice(2): rewrite each compat_iovec from @iov32 as a native
 * struct iovec in an area obtained from compat_alloc_user_space(), then
 * defer to do_vmsplice(). Any failed user access yields -EFAULT.
 */
COMPAT_SYSCALL_DEFINE4(vmsplice, int, fd, const struct compat_iovec __user *, iov32,
		    unsigned int, nr_segs, unsigned int, flags)
{
	struct iovec __user *native;
	unsigned seg;

	if (nr_segs > UIO_MAXIOV)
		return -EINVAL;

	native = compat_alloc_user_space(nr_segs * sizeof(struct iovec));

	for (seg = 0; seg < nr_segs; seg++) {
		struct compat_iovec cv;

		if (get_user(cv.iov_base, &iov32[seg].iov_base))
			return -EFAULT;
		if (get_user(cv.iov_len, &iov32[seg].iov_len))
			return -EFAULT;
		if (put_user(compat_ptr(cv.iov_base), &native[seg].iov_base))
			return -EFAULT;
		if (put_user(cv.iov_len, &native[seg].iov_len))
			return -EFAULT;
	}

	return do_vmsplice(fd, native, nr_segs, flags);
}
#endif
1387
1388SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in,
1389                int, fd_out, loff_t __user *, off_out,
1390                size_t, len, unsigned int, flags)
1391{
1392        struct fd in, out;
1393        long error;
1394
1395        if (unlikely(!len))
1396                return 0;
1397
1398        if (unlikely(flags & ~SPLICE_F_ALL))
1399                return -EINVAL;
1400
1401        error = -EBADF;
1402        in = fdget(fd_in);
1403        if (in.file) {
1404                if (in.file->f_mode & FMODE_READ) {
1405                        out = fdget(fd_out);
1406                        if (out.file) {
1407                                if (out.file->f_mode & FMODE_WRITE)
1408                                        error = do_splice(in.file, off_in,
1409                                                          out.file, off_out,
1410                                                          len, flags);
1411                                fdput(out);
1412                        }
1413                }
1414                fdput(in);
1415        }
1416        return error;
1417}
1418
1419/*
1420 * Make sure there's data to read. Wait for input if we can, otherwise
1421 * return an appropriate error.
1422 */
1423static int ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1424{
1425        int ret;
1426
1427        /*
1428         * Check ->nrbufs without the inode lock first. This function
1429         * is speculative anyways, so missing one is ok.
1430         */
1431        if (pipe->nrbufs)
1432                return 0;
1433
1434        ret = 0;
1435        pipe_lock(pipe);
1436
1437        while (!pipe->nrbufs) {
1438                if (signal_pending(current)) {
1439                        ret = -ERESTARTSYS;
1440                        break;
1441                }
1442                if (!pipe->writers)
1443                        break;
1444                if (!pipe->waiting_writers) {
1445                        if (flags & SPLICE_F_NONBLOCK) {
1446                                ret = -EAGAIN;
1447                                break;
1448                        }
1449                }
1450                pipe_wait(pipe);
1451        }
1452
1453        pipe_unlock(pipe);
1454        return ret;
1455}
1456
1457/*
1458 * Make sure there's writeable room. Wait for room if we can, otherwise
1459 * return an appropriate error.
1460 */
1461static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1462{
1463        int ret;
1464
1465        /*
1466         * Check ->nrbufs without the inode lock first. This function
1467         * is speculative anyways, so missing one is ok.
1468         */
1469        if (pipe->nrbufs < pipe->buffers)
1470                return 0;
1471
1472        ret = 0;
1473        pipe_lock(pipe);
1474
1475        while (pipe->nrbufs >= pipe->buffers) {
1476                if (!pipe->readers) {
1477                        send_sig(SIGPIPE, current, 0);
1478                        ret = -EPIPE;
1479                        break;
1480                }
1481                if (flags & SPLICE_F_NONBLOCK) {
1482                        ret = -EAGAIN;
1483                        break;
1484                }
1485                if (signal_pending(current)) {
1486                        ret = -ERESTARTSYS;
1487                        break;
1488                }
1489                pipe->waiting_writers++;
1490                pipe_wait(pipe);
1491                pipe->waiting_writers--;
1492        }
1493
1494        pipe_unlock(pipe);
1495        return ret;
1496}
1497
1498/*
1499 * Splice contents of ipipe to opipe.
1500 */
1501static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
1502                               struct pipe_inode_info *opipe,
1503                               size_t len, unsigned int flags)
1504{
1505        struct pipe_buffer *ibuf, *obuf;
1506        int ret = 0, nbuf;
1507        bool input_wakeup = false;
1508
1509
1510retry:
1511        ret = ipipe_prep(ipipe, flags);
1512        if (ret)
1513                return ret;
1514
1515        ret = opipe_prep(opipe, flags);
1516        if (ret)
1517                return ret;
1518
1519        /*
1520         * Potential ABBA deadlock, work around it by ordering lock
1521         * grabbing by pipe info address. Otherwise two different processes
1522         * could deadlock (one doing tee from A -> B, the other from B -> A).
1523         */
1524        pipe_double_lock(ipipe, opipe);
1525
1526        do {
1527                if (!opipe->readers) {
1528                        send_sig(SIGPIPE, current, 0);
1529                        if (!ret)
1530                                ret = -EPIPE;
1531                        break;
1532                }
1533
1534                if (!ipipe->nrbufs && !ipipe->writers)
1535                        break;
1536
1537                /*
1538                 * Cannot make any progress, because either the input
1539                 * pipe is empty or the output pipe is full.
1540                 */
1541                if (!ipipe->nrbufs || opipe->nrbufs >= opipe->buffers) {
1542                        /* Already processed some buffers, break */
1543                        if (ret)
1544                                break;
1545
1546                        if (flags & SPLICE_F_NONBLOCK) {
1547                                ret = -EAGAIN;
1548                                break;
1549                        }
1550
1551                        /*
1552                         * We raced with another reader/writer and haven't
1553                         * managed to process any buffers.  A zero return
1554                         * value means EOF, so retry instead.
1555                         */
1556                        pipe_unlock(ipipe);
1557                        pipe_unlock(opipe);
1558                        goto retry;
1559                }
1560
1561                ibuf = ipipe->bufs + ipipe->curbuf;
1562                nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1);
1563                obuf = opipe->bufs + nbuf;
1564
1565                if (len >= ibuf->len) {
1566                        /*
1567                         * Simply move the whole buffer from ipipe to opipe
1568                         */
1569                        *obuf = *ibuf;
1570                        ibuf->ops = NULL;
1571                        opipe->nrbufs++;
1572                        ipipe->curbuf = (ipipe->curbuf + 1) & (ipipe->buffers - 1);
1573                        ipipe->nrbufs--;
1574                        input_wakeup = true;
1575                } else {
1576                        /*
1577                         * Get a reference to this pipe buffer,
1578                         * so we can copy the contents over.
1579                         */
1580                        pipe_buf_get(ipipe, ibuf);
1581                        *obuf = *ibuf;
1582
1583                        /*
1584                         * Don't inherit the gift flag, we need to
1585                         * prevent multiple steals of this page.
1586                         */
1587                        obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
1588
1589                        obuf->len = len;
1590                        opipe->nrbufs++;
1591                        ibuf->offset += obuf->len;
1592                        ibuf->len -= obuf->len;
1593                }
1594                ret += obuf->len;
1595                len -= obuf->len;
1596        } while (len);
1597
1598        pipe_unlock(ipipe);
1599        pipe_unlock(opipe);
1600
1601        /*
1602         * If we put data in the output pipe, wakeup any potential readers.
1603         */
1604        if (ret > 0)
1605                wakeup_pipe_readers(opipe);
1606
1607        if (input_wakeup)
1608                wakeup_pipe_writers(ipipe);
1609
1610        return ret;
1611}
1612
1613/*
1614 * Link contents of ipipe to opipe.
1615 */
1616static int link_pipe(struct pipe_inode_info *ipipe,
1617                     struct pipe_inode_info *opipe,
1618                     size_t len, unsigned int flags)
1619{
1620        struct pipe_buffer *ibuf, *obuf;
1621        int ret = 0, i = 0, nbuf;
1622
1623        /*
1624         * Potential ABBA deadlock, work around it by ordering lock
1625         * grabbing by pipe info address. Otherwise two different processes
1626         * could deadlock (one doing tee from A -> B, the other from B -> A).
1627         */
1628        pipe_double_lock(ipipe, opipe);
1629
1630        do {
1631                if (!opipe->readers) {
1632                        send_sig(SIGPIPE, current, 0);
1633                        if (!ret)
1634                                ret = -EPIPE;
1635                        break;
1636                }
1637
1638                /*
1639                 * If we have iterated all input buffers or ran out of
1640                 * output room, break.
1641                 */
1642                if (i >= ipipe->nrbufs || opipe->nrbufs >= opipe->buffers)
1643                        break;
1644
1645                ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (ipipe->buffers-1));
1646                nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1);
1647
1648                /*
1649                 * Get a reference to this pipe buffer,
1650                 * so we can copy the contents over.
1651                 */
1652                pipe_buf_get(ipipe, ibuf);
1653
1654                obuf = opipe->bufs + nbuf;
1655                *obuf = *ibuf;
1656
1657                /*
1658                 * Don't inherit the gift flag, we need to
1659                 * prevent multiple steals of this page.
1660                 */
1661                obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
1662
1663                if (obuf->len > len)
1664                        obuf->len = len;
1665
1666                opipe->nrbufs++;
1667                ret += obuf->len;
1668                len -= obuf->len;
1669                i++;
1670        } while (len);
1671
1672        /*
1673         * return EAGAIN if we have the potential of some data in the
1674         * future, otherwise just return 0
1675         */
1676        if (!ret && ipipe->waiting_writers && (flags & SPLICE_F_NONBLOCK))
1677                ret = -EAGAIN;
1678
1679        pipe_unlock(ipipe);
1680        pipe_unlock(opipe);
1681
1682        /*
1683         * If we put data in the output pipe, wakeup any potential readers.
1684         */
1685        if (ret > 0)
1686                wakeup_pipe_readers(opipe);
1687
1688        return ret;
1689}
1690
1691/*
1692 * This is a tee(1) implementation that works on pipes. It doesn't copy
1693 * any data, it simply references the 'in' pages on the 'out' pipe.
1694 * The 'flags' used are the SPLICE_F_* variants, currently the only
1695 * applicable one is SPLICE_F_NONBLOCK.
1696 */
1697static long do_tee(struct file *in, struct file *out, size_t len,
1698                   unsigned int flags)
1699{
1700        struct pipe_inode_info *ipipe = get_pipe_info(in);
1701        struct pipe_inode_info *opipe = get_pipe_info(out);
1702        int ret = -EINVAL;
1703
1704        /*
1705         * Duplicate the contents of ipipe to opipe without actually
1706         * copying the data.
1707         */
1708        if (ipipe && opipe && ipipe != opipe) {
1709                /*
1710                 * Keep going, unless we encounter an error. The ipipe/opipe
1711                 * ordering doesn't really matter.
1712                 */
1713                ret = ipipe_prep(ipipe, flags);
1714                if (!ret) {
1715                        ret = opipe_prep(opipe, flags);
1716                        if (!ret)
1717                                ret = link_pipe(ipipe, opipe, len, flags);
1718                }
1719        }
1720
1721        return ret;
1722}
1723
1724SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags)
1725{
1726        struct fd in;
1727        int error;
1728
1729        if (unlikely(flags & ~SPLICE_F_ALL))
1730                return -EINVAL;
1731
1732        if (unlikely(!len))
1733                return 0;
1734
1735        error = -EBADF;
1736        in = fdget(fdin);
1737        if (in.file) {
1738                if (in.file->f_mode & FMODE_READ) {
1739                        struct fd out = fdget(fdout);
1740                        if (out.file) {
1741                                if (out.file->f_mode & FMODE_WRITE)
1742                                        error = do_tee(in.file, out.file,
1743                                                        len, flags);
1744                                fdput(out);
1745                        }
1746                }
1747                fdput(in);
1748        }
1749
1750        return error;
1751}
1752