linux/fs/splice.c
<<
>>
Prefs
   1/*
   2 * "splice": joining two ropes together by interweaving their strands.
   3 *
   4 * This is the "extended pipe" functionality, where a pipe is used as
   5 * an arbitrary in-memory buffer. Think of a pipe as a small kernel
   6 * buffer that you can use to transfer data from one end to the other.
   7 *
   8 * The traditional unix read/write is extended with a "splice()" operation
   9 * that transfers data buffers to or from a pipe buffer.
  10 *
  11 * Named by Larry McVoy, original implementation from Linus, extended by
  12 * Jens to support splicing to files, network, direct splicing, etc and
  13 * fixing lots of bugs.
  14 *
  15 * Copyright (C) 2005-2006 Jens Axboe <axboe@kernel.dk>
  16 * Copyright (C) 2005-2006 Linus Torvalds <torvalds@osdl.org>
  17 * Copyright (C) 2006 Ingo Molnar <mingo@elte.hu>
  18 *
  19 */
  20#include <linux/bvec.h>
  21#include <linux/fs.h>
  22#include <linux/file.h>
  23#include <linux/pagemap.h>
  24#include <linux/splice.h>
  25#include <linux/memcontrol.h>
  26#include <linux/mm_inline.h>
  27#include <linux/swap.h>
  28#include <linux/writeback.h>
  29#include <linux/export.h>
  30#include <linux/syscalls.h>
  31#include <linux/uio.h>
  32#include <linux/security.h>
  33#include <linux/gfp.h>
  34#include <linux/socket.h>
  35#include <linux/compat.h>
  36#include <linux/sched/signal.h>
  37
  38#include "internal.h"
  39
  40/*
  41 * Attempt to steal a page from a pipe buffer. This should perhaps go into
  42 * a vm helper function, it's already simplified quite a bit by the
  43 * addition of remove_mapping(). If success is returned, the caller may
  44 * attempt to reuse this page for another destination.
  45 */
  46static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe,
  47                                     struct pipe_buffer *buf)
  48{
  49        struct page *page = buf->page;
  50        struct address_space *mapping;
  51
  52        lock_page(page);
  53
  54        mapping = page_mapping(page);
  55        if (mapping) {
  56                WARN_ON(!PageUptodate(page));
  57
  58                /*
  59                 * At least for ext2 with nobh option, we need to wait on
  60                 * writeback completing on this page, since we'll remove it
  61                 * from the pagecache.  Otherwise truncate wont wait on the
  62                 * page, allowing the disk blocks to be reused by someone else
  63                 * before we actually wrote our data to them. fs corruption
  64                 * ensues.
  65                 */
  66                wait_on_page_writeback(page);
  67
  68                if (page_has_private(page) &&
  69                    !try_to_release_page(page, GFP_KERNEL))
  70                        goto out_unlock;
  71
  72                /*
  73                 * If we succeeded in removing the mapping, set LRU flag
  74                 * and return good.
  75                 */
  76                if (remove_mapping(mapping, page)) {
  77                        buf->flags |= PIPE_BUF_FLAG_LRU;
  78                        return 0;
  79                }
  80        }
  81
  82        /*
  83         * Raced with truncate or failed to remove page from current
  84         * address space, unlock and return failure.
  85         */
  86out_unlock:
  87        unlock_page(page);
  88        return 1;
  89}
  90
  91static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe,
  92                                        struct pipe_buffer *buf)
  93{
  94        put_page(buf->page);
  95        buf->flags &= ~PIPE_BUF_FLAG_LRU;
  96}
  97
  98/*
  99 * Check whether the contents of buf is OK to access. Since the content
 100 * is a page cache page, IO may be in flight.
 101 */
 102static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe,
 103                                       struct pipe_buffer *buf)
 104{
 105        struct page *page = buf->page;
 106        int err;
 107
 108        if (!PageUptodate(page)) {
 109                lock_page(page);
 110
 111                /*
 112                 * Page got truncated/unhashed. This will cause a 0-byte
 113                 * splice, if this is the first page.
 114                 */
 115                if (!page->mapping) {
 116                        err = -ENODATA;
 117                        goto error;
 118                }
 119
 120                /*
 121                 * Uh oh, read-error from disk.
 122                 */
 123                if (!PageUptodate(page)) {
 124                        err = -EIO;
 125                        goto error;
 126                }
 127
 128                /*
 129                 * Page is ok afterall, we are done.
 130                 */
 131                unlock_page(page);
 132        }
 133
 134        return 0;
 135error:
 136        unlock_page(page);
 137        return err;
 138}
 139
 140const struct pipe_buf_operations page_cache_pipe_buf_ops = {
 141        .can_merge = 0,
 142        .confirm = page_cache_pipe_buf_confirm,
 143        .release = page_cache_pipe_buf_release,
 144        .steal = page_cache_pipe_buf_steal,
 145        .get = generic_pipe_buf_get,
 146};
 147
 148static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe,
 149                                    struct pipe_buffer *buf)
 150{
 151        if (!(buf->flags & PIPE_BUF_FLAG_GIFT))
 152                return 1;
 153
 154        buf->flags |= PIPE_BUF_FLAG_LRU;
 155        return generic_pipe_buf_steal(pipe, buf);
 156}
 157
 158static const struct pipe_buf_operations user_page_pipe_buf_ops = {
 159        .can_merge = 0,
 160        .confirm = generic_pipe_buf_confirm,
 161        .release = page_cache_pipe_buf_release,
 162        .steal = user_page_pipe_buf_steal,
 163        .get = generic_pipe_buf_get,
 164};
 165
 166static void wakeup_pipe_readers(struct pipe_inode_info *pipe)
 167{
 168        smp_mb();
 169        if (waitqueue_active(&pipe->wait))
 170                wake_up_interruptible(&pipe->wait);
 171        kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
 172}
 173
 174/**
 175 * splice_to_pipe - fill passed data into a pipe
 176 * @pipe:       pipe to fill
 177 * @spd:        data to fill
 178 *
 179 * Description:
 180 *    @spd contains a map of pages and len/offset tuples, along with
 181 *    the struct pipe_buf_operations associated with these pages. This
 182 *    function will link that data to the pipe.
 183 *
 184 */
 185ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
 186                       struct splice_pipe_desc *spd)
 187{
 188        unsigned int spd_pages = spd->nr_pages;
 189        int ret = 0, page_nr = 0;
 190
 191        if (!spd_pages)
 192                return 0;
 193
 194        if (unlikely(!pipe->readers)) {
 195                send_sig(SIGPIPE, current, 0);
 196                ret = -EPIPE;
 197                goto out;
 198        }
 199
 200        while (pipe->nrbufs < pipe->buffers) {
 201                int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1);
 202                struct pipe_buffer *buf = pipe->bufs + newbuf;
 203
 204                buf->page = spd->pages[page_nr];
 205                buf->offset = spd->partial[page_nr].offset;
 206                buf->len = spd->partial[page_nr].len;
 207                buf->private = spd->partial[page_nr].private;
 208                buf->ops = spd->ops;
 209                buf->flags = 0;
 210
 211                pipe->nrbufs++;
 212                page_nr++;
 213                ret += buf->len;
 214
 215                if (!--spd->nr_pages)
 216                        break;
 217        }
 218
 219        if (!ret)
 220                ret = -EAGAIN;
 221
 222out:
 223        while (page_nr < spd_pages)
 224                spd->spd_release(spd, page_nr++);
 225
 226        return ret;
 227}
 228EXPORT_SYMBOL_GPL(splice_to_pipe);
 229
 230ssize_t add_to_pipe(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
 231{
 232        int ret;
 233
 234        if (unlikely(!pipe->readers)) {
 235                send_sig(SIGPIPE, current, 0);
 236                ret = -EPIPE;
 237        } else if (pipe->nrbufs == pipe->buffers) {
 238                ret = -EAGAIN;
 239        } else {
 240                int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1);
 241                pipe->bufs[newbuf] = *buf;
 242                pipe->nrbufs++;
 243                return buf->len;
 244        }
 245        pipe_buf_release(pipe, buf);
 246        return ret;
 247}
 248EXPORT_SYMBOL(add_to_pipe);
 249
 250/*
 251 * Check if we need to grow the arrays holding pages and partial page
 252 * descriptions.
 253 */
 254int splice_grow_spd(const struct pipe_inode_info *pipe, struct splice_pipe_desc *spd)
 255{
 256        unsigned int buffers = READ_ONCE(pipe->buffers);
 257
 258        spd->nr_pages_max = buffers;
 259        if (buffers <= PIPE_DEF_BUFFERS)
 260                return 0;
 261
 262        spd->pages = kmalloc_array(buffers, sizeof(struct page *), GFP_KERNEL);
 263        spd->partial = kmalloc_array(buffers, sizeof(struct partial_page),
 264                                     GFP_KERNEL);
 265
 266        if (spd->pages && spd->partial)
 267                return 0;
 268
 269        kfree(spd->pages);
 270        kfree(spd->partial);
 271        return -ENOMEM;
 272}
 273
 274void splice_shrink_spd(struct splice_pipe_desc *spd)
 275{
 276        if (spd->nr_pages_max <= PIPE_DEF_BUFFERS)
 277                return;
 278
 279        kfree(spd->pages);
 280        kfree(spd->partial);
 281}
 282
 283/**
 284 * generic_file_splice_read - splice data from file to a pipe
 285 * @in:         file to splice from
 286 * @ppos:       position in @in
 287 * @pipe:       pipe to splice to
 288 * @len:        number of bytes to splice
 289 * @flags:      splice modifier flags
 290 *
 291 * Description:
 292 *    Will read pages from given file and fill them into a pipe. Can be
 293 *    used as long as it has more or less sane ->read_iter().
 294 *
 295 */
 296ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
 297                                 struct pipe_inode_info *pipe, size_t len,
 298                                 unsigned int flags)
 299{
 300        struct iov_iter to;
 301        struct kiocb kiocb;
 302        int idx, ret;
 303
 304        iov_iter_pipe(&to, READ, pipe, len);
 305        idx = to.idx;
 306        init_sync_kiocb(&kiocb, in);
 307        kiocb.ki_pos = *ppos;
 308        ret = call_read_iter(in, &kiocb, &to);
 309        if (ret > 0) {
 310                *ppos = kiocb.ki_pos;
 311                file_accessed(in);
 312        } else if (ret < 0) {
 313                to.idx = idx;
 314                to.iov_offset = 0;
 315                iov_iter_advance(&to, 0); /* to free what was emitted */
 316                /*
 317                 * callers of ->splice_read() expect -EAGAIN on
 318                 * "can't put anything in there", rather than -EFAULT.
 319                 */
 320                if (ret == -EFAULT)
 321                        ret = -EAGAIN;
 322        }
 323
 324        return ret;
 325}
 326EXPORT_SYMBOL(generic_file_splice_read);
 327
 328const struct pipe_buf_operations default_pipe_buf_ops = {
 329        .can_merge = 0,
 330        .confirm = generic_pipe_buf_confirm,
 331        .release = generic_pipe_buf_release,
 332        .steal = generic_pipe_buf_steal,
 333        .get = generic_pipe_buf_get,
 334};
 335
 /*
  * ->steal() callback that never lets a buffer be stolen: it ignores its
  * arguments and always returns 1.
  */
 static int generic_pipe_buf_nosteal(struct pipe_inode_info *pipe,
                                     struct pipe_buffer *buf)
 {
         /* Non-zero is the "could not steal" result of a ->steal() callback. */
         return 1;
 }
 341
 342/* Pipe buffer operations for a socket and similar. */
 343const struct pipe_buf_operations nosteal_pipe_buf_ops = {
 344        .can_merge = 0,
 345        .confirm = generic_pipe_buf_confirm,
 346        .release = generic_pipe_buf_release,
 347        .steal = generic_pipe_buf_nosteal,
 348        .get = generic_pipe_buf_get,
 349};
 350EXPORT_SYMBOL(nosteal_pipe_buf_ops);
 351
 352static ssize_t kernel_readv(struct file *file, const struct kvec *vec,
 353                            unsigned long vlen, loff_t offset)
 354{
 355        mm_segment_t old_fs;
 356        loff_t pos = offset;
 357        ssize_t res;
 358
 359        old_fs = get_fs();
 360        set_fs(get_ds());
 361        /* The cast to a user pointer is valid due to the set_fs() */
 362        res = vfs_readv(file, (const struct iovec __user *)vec, vlen, &pos, 0);
 363        set_fs(old_fs);
 364
 365        return res;
 366}
 367
 368static ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
 369                                 struct pipe_inode_info *pipe, size_t len,
 370                                 unsigned int flags)
 371{
 372        struct kvec *vec, __vec[PIPE_DEF_BUFFERS];
 373        struct iov_iter to;
 374        struct page **pages;
 375        unsigned int nr_pages;
 376        size_t offset, base, copied = 0;
 377        ssize_t res;
 378        int i;
 379
 380        if (pipe->nrbufs == pipe->buffers)
 381                return -EAGAIN;
 382
 383        /*
 384         * Try to keep page boundaries matching to source pagecache ones -
 385         * it probably won't be much help, but...
 386         */
 387        offset = *ppos & ~PAGE_MASK;
 388
 389        iov_iter_pipe(&to, READ, pipe, len + offset);
 390
 391        res = iov_iter_get_pages_alloc(&to, &pages, len + offset, &base);
 392        if (res <= 0)
 393                return -ENOMEM;
 394
 395        nr_pages = DIV_ROUND_UP(res + base, PAGE_SIZE);
 396
 397        vec = __vec;
 398        if (nr_pages > PIPE_DEF_BUFFERS) {
 399                vec = kmalloc_array(nr_pages, sizeof(struct kvec), GFP_KERNEL);
 400                if (unlikely(!vec)) {
 401                        res = -ENOMEM;
 402                        goto out;
 403                }
 404        }
 405
 406        pipe->bufs[to.idx].offset = offset;
 407        pipe->bufs[to.idx].len -= offset;
 408
 409        for (i = 0; i < nr_pages; i++) {
 410                size_t this_len = min_t(size_t, len, PAGE_SIZE - offset);
 411                vec[i].iov_base = page_address(pages[i]) + offset;
 412                vec[i].iov_len = this_len;
 413                len -= this_len;
 414                offset = 0;
 415        }
 416
 417        res = kernel_readv(in, vec, nr_pages, *ppos);
 418        if (res > 0) {
 419                copied = res;
 420                *ppos += res;
 421        }
 422
 423        if (vec != __vec)
 424                kfree(vec);
 425out:
 426        for (i = 0; i < nr_pages; i++)
 427                put_page(pages[i]);
 428        kvfree(pages);
 429        iov_iter_advance(&to, copied);  /* truncates and discards */
 430        return res;
 431}
 432
 433/*
 434 * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos'
 435 * using sendpage(). Return the number of bytes sent.
 436 */
 437static int pipe_to_sendpage(struct pipe_inode_info *pipe,
 438                            struct pipe_buffer *buf, struct splice_desc *sd)
 439{
 440        struct file *file = sd->u.file;
 441        loff_t pos = sd->pos;
 442        int more;
 443
 444        if (!likely(file->f_op->sendpage))
 445                return -EINVAL;
 446
 447        more = (sd->flags & SPLICE_F_MORE) ? MSG_MORE : 0;
 448
 449        if (sd->len < sd->total_len && pipe->nrbufs > 1)
 450                more |= MSG_SENDPAGE_NOTLAST;
 451
 452        return file->f_op->sendpage(file, buf->page, buf->offset,
 453                                    sd->len, &pos, more);
 454}
 455
 456static void wakeup_pipe_writers(struct pipe_inode_info *pipe)
 457{
 458        smp_mb();
 459        if (waitqueue_active(&pipe->wait))
 460                wake_up_interruptible(&pipe->wait);
 461        kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
 462}
 463
 464/**
 465 * splice_from_pipe_feed - feed available data from a pipe to a file
 466 * @pipe:       pipe to splice from
 467 * @sd:         information to @actor
 468 * @actor:      handler that splices the data
 469 *
 470 * Description:
 471 *    This function loops over the pipe and calls @actor to do the
 472 *    actual moving of a single struct pipe_buffer to the desired
 473 *    destination.  It returns when there's no more buffers left in
 474 *    the pipe or if the requested number of bytes (@sd->total_len)
 475 *    have been copied.  It returns a positive number (one) if the
 476 *    pipe needs to be filled with more data, zero if the required
 477 *    number of bytes have been copied and -errno on error.
 478 *
 479 *    This, together with splice_from_pipe_{begin,end,next}, may be
 480 *    used to implement the functionality of __splice_from_pipe() when
 481 *    locking is required around copying the pipe buffers to the
 482 *    destination.
 483 */
 484static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd,
 485                          splice_actor *actor)
 486{
 487        int ret;
 488
 489        while (pipe->nrbufs) {
 490                struct pipe_buffer *buf = pipe->bufs + pipe->curbuf;
 491
 492                sd->len = buf->len;
 493                if (sd->len > sd->total_len)
 494                        sd->len = sd->total_len;
 495
 496                ret = pipe_buf_confirm(pipe, buf);
 497                if (unlikely(ret)) {
 498                        if (ret == -ENODATA)
 499                                ret = 0;
 500                        return ret;
 501                }
 502
 503                ret = actor(pipe, buf, sd);
 504                if (ret <= 0)
 505                        return ret;
 506
 507                buf->offset += ret;
 508                buf->len -= ret;
 509
 510                sd->num_spliced += ret;
 511                sd->len -= ret;
 512                sd->pos += ret;
 513                sd->total_len -= ret;
 514
 515                if (!buf->len) {
 516                        pipe_buf_release(pipe, buf);
 517                        pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
 518                        pipe->nrbufs--;
 519                        if (pipe->files)
 520                                sd->need_wakeup = true;
 521                }
 522
 523                if (!sd->total_len)
 524                        return 0;
 525        }
 526
 527        return 1;
 528}
 529
 530/**
 531 * splice_from_pipe_next - wait for some data to splice from
 532 * @pipe:       pipe to splice from
 533 * @sd:         information about the splice operation
 534 *
 535 * Description:
 536 *    This function will wait for some data and return a positive
 537 *    value (one) if pipe buffers are available.  It will return zero
 538 *    or -errno if no more data needs to be spliced.
 539 */
 540static int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_desc *sd)
 541{
 542        /*
 543         * Check for signal early to make process killable when there are
 544         * always buffers available
 545         */
 546        if (signal_pending(current))
 547                return -ERESTARTSYS;
 548
 549        while (!pipe->nrbufs) {
 550                if (!pipe->writers)
 551                        return 0;
 552
 553                if (!pipe->waiting_writers && sd->num_spliced)
 554                        return 0;
 555
 556                if (sd->flags & SPLICE_F_NONBLOCK)
 557                        return -EAGAIN;
 558
 559                if (signal_pending(current))
 560                        return -ERESTARTSYS;
 561
 562                if (sd->need_wakeup) {
 563                        wakeup_pipe_writers(pipe);
 564                        sd->need_wakeup = false;
 565                }
 566
 567                pipe_wait(pipe);
 568        }
 569
 570        return 1;
 571}
 572
 573/**
 574 * splice_from_pipe_begin - start splicing from pipe
 575 * @sd:         information about the splice operation
 576 *
 577 * Description:
 578 *    This function should be called before a loop containing
 579 *    splice_from_pipe_next() and splice_from_pipe_feed() to
 580 *    initialize the necessary fields of @sd.
 581 */
 582static void splice_from_pipe_begin(struct splice_desc *sd)
 583{
 584        sd->num_spliced = 0;
 585        sd->need_wakeup = false;
 586}
 587
 588/**
 589 * splice_from_pipe_end - finish splicing from pipe
 590 * @pipe:       pipe to splice from
 591 * @sd:         information about the splice operation
 592 *
 593 * Description:
 594 *    This function will wake up pipe writers if necessary.  It should
 595 *    be called after a loop containing splice_from_pipe_next() and
 596 *    splice_from_pipe_feed().
 597 */
 598static void splice_from_pipe_end(struct pipe_inode_info *pipe, struct splice_desc *sd)
 599{
 600        if (sd->need_wakeup)
 601                wakeup_pipe_writers(pipe);
 602}
 603
 604/**
 605 * __splice_from_pipe - splice data from a pipe to given actor
 606 * @pipe:       pipe to splice from
 607 * @sd:         information to @actor
 608 * @actor:      handler that splices the data
 609 *
 610 * Description:
 611 *    This function does little more than loop over the pipe and call
 612 *    @actor to do the actual moving of a single struct pipe_buffer to
 613 *    the desired destination. See pipe_to_file, pipe_to_sendpage, or
 614 *    pipe_to_user.
 615 *
 616 */
 617ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd,
 618                           splice_actor *actor)
 619{
 620        int ret;
 621
 622        splice_from_pipe_begin(sd);
 623        do {
 624                cond_resched();
 625                ret = splice_from_pipe_next(pipe, sd);
 626                if (ret > 0)
 627                        ret = splice_from_pipe_feed(pipe, sd, actor);
 628        } while (ret > 0);
 629        splice_from_pipe_end(pipe, sd);
 630
 631        return sd->num_spliced ? sd->num_spliced : ret;
 632}
 633EXPORT_SYMBOL(__splice_from_pipe);
 634
 635/**
 636 * splice_from_pipe - splice data from a pipe to a file
 637 * @pipe:       pipe to splice from
 638 * @out:        file to splice to
 639 * @ppos:       position in @out
 640 * @len:        how many bytes to splice
 641 * @flags:      splice modifier flags
 642 * @actor:      handler that splices the data
 643 *
 644 * Description:
 645 *    See __splice_from_pipe. This function locks the pipe inode,
 646 *    otherwise it's identical to __splice_from_pipe().
 647 *
 648 */
 649ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
 650                         loff_t *ppos, size_t len, unsigned int flags,
 651                         splice_actor *actor)
 652{
 653        ssize_t ret;
 654        struct splice_desc sd = {
 655                .total_len = len,
 656                .flags = flags,
 657                .pos = *ppos,
 658                .u.file = out,
 659        };
 660
 661        pipe_lock(pipe);
 662        ret = __splice_from_pipe(pipe, &sd, actor);
 663        pipe_unlock(pipe);
 664
 665        return ret;
 666}
 667
 668/**
 669 * iter_file_splice_write - splice data from a pipe to a file
 670 * @pipe:       pipe info
 671 * @out:        file to write to
 672 * @ppos:       position in @out
 673 * @len:        number of bytes to splice
 674 * @flags:      splice modifier flags
 675 *
 676 * Description:
 677 *    Will either move or copy pages (determined by @flags options) from
 678 *    the given pipe inode to the given file.
 679 *    This one is ->write_iter-based.
 680 *
 681 */
 682ssize_t
 683iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
 684                          loff_t *ppos, size_t len, unsigned int flags)
 685{
 686        struct splice_desc sd = {
 687                .total_len = len,
 688                .flags = flags,
 689                .pos = *ppos,
 690                .u.file = out,
 691        };
 692        int nbufs = pipe->buffers;
 693        struct bio_vec *array = kcalloc(nbufs, sizeof(struct bio_vec),
 694                                        GFP_KERNEL);
 695        ssize_t ret;
 696
 697        if (unlikely(!array))
 698                return -ENOMEM;
 699
 700        pipe_lock(pipe);
 701
 702        splice_from_pipe_begin(&sd);
 703        while (sd.total_len) {
 704                struct iov_iter from;
 705                size_t left;
 706                int n, idx;
 707
 708                ret = splice_from_pipe_next(pipe, &sd);
 709                if (ret <= 0)
 710                        break;
 711
 712                if (unlikely(nbufs < pipe->buffers)) {
 713                        kfree(array);
 714                        nbufs = pipe->buffers;
 715                        array = kcalloc(nbufs, sizeof(struct bio_vec),
 716                                        GFP_KERNEL);
 717                        if (!array) {
 718                                ret = -ENOMEM;
 719                                break;
 720                        }
 721                }
 722
 723                /* build the vector */
 724                left = sd.total_len;
 725                for (n = 0, idx = pipe->curbuf; left && n < pipe->nrbufs; n++, idx++) {
 726                        struct pipe_buffer *buf = pipe->bufs + idx;
 727                        size_t this_len = buf->len;
 728
 729                        if (this_len > left)
 730                                this_len = left;
 731
 732                        if (idx == pipe->buffers - 1)
 733                                idx = -1;
 734
 735                        ret = pipe_buf_confirm(pipe, buf);
 736                        if (unlikely(ret)) {
 737                                if (ret == -ENODATA)
 738                                        ret = 0;
 739                                goto done;
 740                        }
 741
 742                        array[n].bv_page = buf->page;
 743                        array[n].bv_len = this_len;
 744                        array[n].bv_offset = buf->offset;
 745                        left -= this_len;
 746                }
 747
 748                iov_iter_bvec(&from, WRITE, array, n, sd.total_len - left);
 749                ret = vfs_iter_write(out, &from, &sd.pos, 0);
 750                if (ret <= 0)
 751                        break;
 752
 753                sd.num_spliced += ret;
 754                sd.total_len -= ret;
 755                *ppos = sd.pos;
 756
 757                /* dismiss the fully eaten buffers, adjust the partial one */
 758                while (ret) {
 759                        struct pipe_buffer *buf = pipe->bufs + pipe->curbuf;
 760                        if (ret >= buf->len) {
 761                                ret -= buf->len;
 762                                buf->len = 0;
 763                                pipe_buf_release(pipe, buf);
 764                                pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
 765                                pipe->nrbufs--;
 766                                if (pipe->files)
 767                                        sd.need_wakeup = true;
 768                        } else {
 769                                buf->offset += ret;
 770                                buf->len -= ret;
 771                                ret = 0;
 772                        }
 773                }
 774        }
 775done:
 776        kfree(array);
 777        splice_from_pipe_end(pipe, &sd);
 778
 779        pipe_unlock(pipe);
 780
 781        if (sd.num_spliced)
 782                ret = sd.num_spliced;
 783
 784        return ret;
 785}
 786
 787EXPORT_SYMBOL(iter_file_splice_write);
 788
 789static int write_pipe_buf(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
 790                          struct splice_desc *sd)
 791{
 792        int ret;
 793        void *data;
 794        loff_t tmp = sd->pos;
 795
 796        data = kmap(buf->page);
 797        ret = __kernel_write(sd->u.file, data + buf->offset, sd->len, &tmp);
 798        kunmap(buf->page);
 799
 800        return ret;
 801}
 802
 803static ssize_t default_file_splice_write(struct pipe_inode_info *pipe,
 804                                         struct file *out, loff_t *ppos,
 805                                         size_t len, unsigned int flags)
 806{
 807        ssize_t ret;
 808
 809        ret = splice_from_pipe(pipe, out, ppos, len, flags, write_pipe_buf);
 810        if (ret > 0)
 811                *ppos += ret;
 812
 813        return ret;
 814}
 815
 816/**
 817 * generic_splice_sendpage - splice data from a pipe to a socket
 818 * @pipe:       pipe to splice from
 819 * @out:        socket to write to
 820 * @ppos:       position in @out
 821 * @len:        number of bytes to splice
 822 * @flags:      splice modifier flags
 823 *
 824 * Description:
 825 *    Will send @len bytes from the pipe to a network socket. No data copying
 826 *    is involved.
 827 *
 828 */
 829ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out,
 830                                loff_t *ppos, size_t len, unsigned int flags)
 831{
 832        return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage);
 833}
 834
 835EXPORT_SYMBOL(generic_splice_sendpage);
 836
 837/*
 838 * Attempt to initiate a splice from pipe to file.
 839 */
 840static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
 841                           loff_t *ppos, size_t len, unsigned int flags)
 842{
 843        ssize_t (*splice_write)(struct pipe_inode_info *, struct file *,
 844                                loff_t *, size_t, unsigned int);
 845
 846        if (out->f_op->splice_write)
 847                splice_write = out->f_op->splice_write;
 848        else
 849                splice_write = default_file_splice_write;
 850
 851        return splice_write(pipe, out, ppos, len, flags);
 852}
 853
 854/*
 855 * Attempt to initiate a splice from a file to a pipe.
 856 */
 857static long do_splice_to(struct file *in, loff_t *ppos,
 858                         struct pipe_inode_info *pipe, size_t len,
 859                         unsigned int flags)
 860{
 861        ssize_t (*splice_read)(struct file *, loff_t *,
 862                               struct pipe_inode_info *, size_t, unsigned int);
 863        int ret;
 864
 865        if (unlikely(!(in->f_mode & FMODE_READ)))
 866                return -EBADF;
 867
 868        ret = rw_verify_area(READ, in, ppos, len);
 869        if (unlikely(ret < 0))
 870                return ret;
 871
 872        if (unlikely(len > MAX_RW_COUNT))
 873                len = MAX_RW_COUNT;
 874
 875        if (in->f_op->splice_read)
 876                splice_read = in->f_op->splice_read;
 877        else
 878                splice_read = default_file_splice_read;
 879
 880        return splice_read(in, ppos, pipe, len, flags);
 881}
 882
 883/**
 884 * splice_direct_to_actor - splices data directly between two non-pipes
 885 * @in:         file to splice from
 886 * @sd:         actor information on where to splice to
 887 * @actor:      handles the data splicing
 888 *
 889 * Description:
 890 *    This is a special case helper to splice directly between two
 891 *    points, without requiring an explicit pipe. Internally an allocated
 892 *    pipe is cached in the process, and reused during the lifetime of
 893 *    that process.
 894 *
 895 */
 896ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
 897                               splice_direct_actor *actor)
 898{
 899        struct pipe_inode_info *pipe;
 900        long ret, bytes;
 901        umode_t i_mode;
 902        size_t len;
 903        int i, flags, more;
 904
 905        /*
 906         * We require the input being a regular file, as we don't want to
 907         * randomly drop data for eg socket -> socket splicing. Use the
 908         * piped splicing for that!
 909         */
 910        i_mode = file_inode(in)->i_mode;
 911        if (unlikely(!S_ISREG(i_mode) && !S_ISBLK(i_mode)))
 912                return -EINVAL;
 913
 914        /*
 915         * neither in nor out is a pipe, setup an internal pipe attached to
 916         * 'out' and transfer the wanted data from 'in' to 'out' through that
 917         */
 918        pipe = current->splice_pipe;
 919        if (unlikely(!pipe)) {
 920                pipe = alloc_pipe_info();
 921                if (!pipe)
 922                        return -ENOMEM;
 923
 924                /*
 925                 * We don't have an immediate reader, but we'll read the stuff
 926                 * out of the pipe right after the splice_to_pipe(). So set
 927                 * PIPE_READERS appropriately.
 928                 */
 929                pipe->readers = 1;
 930
 931                current->splice_pipe = pipe;
 932        }
 933
 934        /*
 935         * Do the splice.
 936         */
 937        ret = 0;
 938        bytes = 0;
 939        len = sd->total_len;
 940        flags = sd->flags;
 941
 942        /*
 943         * Don't block on output, we have to drain the direct pipe.
 944         */
 945        sd->flags &= ~SPLICE_F_NONBLOCK;
 946        more = sd->flags & SPLICE_F_MORE;
 947
 948        WARN_ON_ONCE(pipe->nrbufs != 0);
 949
 950        while (len) {
 951                unsigned int pipe_pages;
 952                size_t read_len;
 953                loff_t pos = sd->pos, prev_pos = pos;
 954
 955                /* Don't try to read more the pipe has space for. */
 956                pipe_pages = pipe->buffers - pipe->nrbufs;
 957                read_len = min(len, (size_t)pipe_pages << PAGE_SHIFT);
 958                ret = do_splice_to(in, &pos, pipe, read_len, flags);
 959                if (unlikely(ret <= 0))
 960                        goto out_release;
 961
 962                read_len = ret;
 963                sd->total_len = read_len;
 964
 965                /*
 966                 * If more data is pending, set SPLICE_F_MORE
 967                 * If this is the last data and SPLICE_F_MORE was not set
 968                 * initially, clears it.
 969                 */
 970                if (read_len < len)
 971                        sd->flags |= SPLICE_F_MORE;
 972                else if (!more)
 973                        sd->flags &= ~SPLICE_F_MORE;
 974                /*
 975                 * NOTE: nonblocking mode only applies to the input. We
 976                 * must not do the output in nonblocking mode as then we
 977                 * could get stuck data in the internal pipe:
 978                 */
 979                ret = actor(pipe, sd);
 980                if (unlikely(ret <= 0)) {
 981                        sd->pos = prev_pos;
 982                        goto out_release;
 983                }
 984
 985                bytes += ret;
 986                len -= ret;
 987                sd->pos = pos;
 988
 989                if (ret < read_len) {
 990                        sd->pos = prev_pos + ret;
 991                        goto out_release;
 992                }
 993        }
 994
 995done:
 996        pipe->nrbufs = pipe->curbuf = 0;
 997        file_accessed(in);
 998        return bytes;
 999
1000out_release:
1001        /*
1002         * If we did an incomplete transfer we must release
1003         * the pipe buffers in question:
1004         */
1005        for (i = 0; i < pipe->buffers; i++) {
1006                struct pipe_buffer *buf = pipe->bufs + i;
1007
1008                if (buf->ops)
1009                        pipe_buf_release(pipe, buf);
1010        }
1011
1012        if (!bytes)
1013                bytes = ret;
1014
1015        goto done;
1016}
1017EXPORT_SYMBOL(splice_direct_to_actor);
1018
1019static int direct_splice_actor(struct pipe_inode_info *pipe,
1020                               struct splice_desc *sd)
1021{
1022        struct file *file = sd->u.file;
1023
1024        return do_splice_from(pipe, file, sd->opos, sd->total_len,
1025                              sd->flags);
1026}
1027
1028/**
1029 * do_splice_direct - splices data directly between two files
1030 * @in:         file to splice from
1031 * @ppos:       input file offset
1032 * @out:        file to splice to
1033 * @opos:       output file offset
1034 * @len:        number of bytes to splice
1035 * @flags:      splice modifier flags
1036 *
1037 * Description:
1038 *    For use by do_sendfile(). splice can easily emulate sendfile, but
1039 *    doing it in the application would incur an extra system call
1040 *    (splice in + splice out, as compared to just sendfile()). So this helper
1041 *    can splice directly through a process-private pipe.
1042 *
1043 */
1044long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
1045                      loff_t *opos, size_t len, unsigned int flags)
1046{
1047        struct splice_desc sd = {
1048                .len            = len,
1049                .total_len      = len,
1050                .flags          = flags,
1051                .pos            = *ppos,
1052                .u.file         = out,
1053                .opos           = opos,
1054        };
1055        long ret;
1056
1057        if (unlikely(!(out->f_mode & FMODE_WRITE)))
1058                return -EBADF;
1059
1060        if (unlikely(out->f_flags & O_APPEND))
1061                return -EINVAL;
1062
1063        ret = rw_verify_area(WRITE, out, opos, len);
1064        if (unlikely(ret < 0))
1065                return ret;
1066
1067        ret = splice_direct_to_actor(in, &sd, direct_splice_actor);
1068        if (ret > 0)
1069                *ppos = sd.pos;
1070
1071        return ret;
1072}
1073EXPORT_SYMBOL(do_splice_direct);
1074
1075static int wait_for_space(struct pipe_inode_info *pipe, unsigned flags)
1076{
1077        for (;;) {
1078                if (unlikely(!pipe->readers)) {
1079                        send_sig(SIGPIPE, current, 0);
1080                        return -EPIPE;
1081                }
1082                if (pipe->nrbufs != pipe->buffers)
1083                        return 0;
1084                if (flags & SPLICE_F_NONBLOCK)
1085                        return -EAGAIN;
1086                if (signal_pending(current))
1087                        return -ERESTARTSYS;
1088                pipe->waiting_writers++;
1089                pipe_wait(pipe);
1090                pipe->waiting_writers--;
1091        }
1092}
1093
1094static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
1095                               struct pipe_inode_info *opipe,
1096                               size_t len, unsigned int flags);
1097
1098/*
1099 * Determine where to splice to/from.
1100 */
1101long do_splice(struct file *in, loff_t __user *off_in,
1102                struct file *out, loff_t __user *off_out,
1103                size_t len, unsigned int flags)
1104{
1105        struct pipe_inode_info *ipipe;
1106        struct pipe_inode_info *opipe;
1107        loff_t offset;
1108        long ret;
1109
1110        if (unlikely(!(in->f_mode & FMODE_READ) ||
1111                     !(out->f_mode & FMODE_WRITE)))
1112                return -EBADF;
1113
1114        ipipe = get_pipe_info(in);
1115        opipe = get_pipe_info(out);
1116
1117        if (ipipe && opipe) {
1118                if (off_in || off_out)
1119                        return -ESPIPE;
1120
1121                /* Splicing to self would be fun, but... */
1122                if (ipipe == opipe)
1123                        return -EINVAL;
1124
1125                return splice_pipe_to_pipe(ipipe, opipe, len, flags);
1126        }
1127
1128        if (ipipe) {
1129                if (off_in)
1130                        return -ESPIPE;
1131                if (off_out) {
1132                        if (!(out->f_mode & FMODE_PWRITE))
1133                                return -EINVAL;
1134                        if (copy_from_user(&offset, off_out, sizeof(loff_t)))
1135                                return -EFAULT;
1136                } else {
1137                        offset = out->f_pos;
1138                }
1139
1140                if (unlikely(out->f_flags & O_APPEND))
1141                        return -EINVAL;
1142
1143                ret = rw_verify_area(WRITE, out, &offset, len);
1144                if (unlikely(ret < 0))
1145                        return ret;
1146
1147                file_start_write(out);
1148                ret = do_splice_from(ipipe, out, &offset, len, flags);
1149                file_end_write(out);
1150
1151                if (!off_out)
1152                        out->f_pos = offset;
1153                else if (copy_to_user(off_out, &offset, sizeof(loff_t)))
1154                        ret = -EFAULT;
1155
1156                return ret;
1157        }
1158
1159        if (opipe) {
1160                if (off_out)
1161                        return -ESPIPE;
1162                if (off_in) {
1163                        if (!(in->f_mode & FMODE_PREAD))
1164                                return -EINVAL;
1165                        if (copy_from_user(&offset, off_in, sizeof(loff_t)))
1166                                return -EFAULT;
1167                } else {
1168                        offset = in->f_pos;
1169                }
1170
1171                pipe_lock(opipe);
1172                ret = wait_for_space(opipe, flags);
1173                if (!ret) {
1174                        unsigned int pipe_pages;
1175
1176                        /* Don't try to read more the pipe has space for. */
1177                        pipe_pages = opipe->buffers - opipe->nrbufs;
1178                        len = min(len, (size_t)pipe_pages << PAGE_SHIFT);
1179
1180                        ret = do_splice_to(in, &offset, opipe, len, flags);
1181                }
1182                pipe_unlock(opipe);
1183                if (ret > 0)
1184                        wakeup_pipe_readers(opipe);
1185                if (!off_in)
1186                        in->f_pos = offset;
1187                else if (copy_to_user(off_in, &offset, sizeof(loff_t)))
1188                        ret = -EFAULT;
1189
1190                return ret;
1191        }
1192
1193        return -EINVAL;
1194}
1195
1196static int iter_to_pipe(struct iov_iter *from,
1197                        struct pipe_inode_info *pipe,
1198                        unsigned flags)
1199{
1200        struct pipe_buffer buf = {
1201                .ops = &user_page_pipe_buf_ops,
1202                .flags = flags
1203        };
1204        size_t total = 0;
1205        int ret = 0;
1206        bool failed = false;
1207
1208        while (iov_iter_count(from) && !failed) {
1209                struct page *pages[16];
1210                ssize_t copied;
1211                size_t start;
1212                int n;
1213
1214                copied = iov_iter_get_pages(from, pages, ~0UL, 16, &start);
1215                if (copied <= 0) {
1216                        ret = copied;
1217                        break;
1218                }
1219
1220                for (n = 0; copied; n++, start = 0) {
1221                        int size = min_t(int, copied, PAGE_SIZE - start);
1222                        if (!failed) {
1223                                buf.page = pages[n];
1224                                buf.offset = start;
1225                                buf.len = size;
1226                                ret = add_to_pipe(pipe, &buf);
1227                                if (unlikely(ret < 0)) {
1228                                        failed = true;
1229                                } else {
1230                                        iov_iter_advance(from, ret);
1231                                        total += ret;
1232                                }
1233                        } else {
1234                                put_page(pages[n]);
1235                        }
1236                        copied -= size;
1237                }
1238        }
1239        return total ? total : ret;
1240}
1241
1242static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
1243                        struct splice_desc *sd)
1244{
1245        int n = copy_page_to_iter(buf->page, buf->offset, sd->len, sd->u.data);
1246        return n == sd->len ? n : -EFAULT;
1247}
1248
1249/*
1250 * For lack of a better implementation, implement vmsplice() to userspace
1251 * as a simple copy of the pipes pages to the user iov.
1252 */
1253static long vmsplice_to_user(struct file *file, struct iov_iter *iter,
1254                             unsigned int flags)
1255{
1256        struct pipe_inode_info *pipe = get_pipe_info(file);
1257        struct splice_desc sd = {
1258                .total_len = iov_iter_count(iter),
1259                .flags = flags,
1260                .u.data = iter
1261        };
1262        long ret = 0;
1263
1264        if (!pipe)
1265                return -EBADF;
1266
1267        if (sd.total_len) {
1268                pipe_lock(pipe);
1269                ret = __splice_from_pipe(pipe, &sd, pipe_to_user);
1270                pipe_unlock(pipe);
1271        }
1272
1273        return ret;
1274}
1275
1276/*
1277 * vmsplice splices a user address range into a pipe. It can be thought of
1278 * as splice-from-memory, where the regular splice is splice-from-file (or
1279 * to file). In both cases the output is a pipe, naturally.
1280 */
1281static long vmsplice_to_pipe(struct file *file, struct iov_iter *iter,
1282                             unsigned int flags)
1283{
1284        struct pipe_inode_info *pipe;
1285        long ret = 0;
1286        unsigned buf_flag = 0;
1287
1288        if (flags & SPLICE_F_GIFT)
1289                buf_flag = PIPE_BUF_FLAG_GIFT;
1290
1291        pipe = get_pipe_info(file);
1292        if (!pipe)
1293                return -EBADF;
1294
1295        pipe_lock(pipe);
1296        ret = wait_for_space(pipe, flags);
1297        if (!ret)
1298                ret = iter_to_pipe(iter, pipe, buf_flag);
1299        pipe_unlock(pipe);
1300        if (ret > 0)
1301                wakeup_pipe_readers(pipe);
1302        return ret;
1303}
1304
1305static int vmsplice_type(struct fd f, int *type)
1306{
1307        if (!f.file)
1308                return -EBADF;
1309        if (f.file->f_mode & FMODE_WRITE) {
1310                *type = WRITE;
1311        } else if (f.file->f_mode & FMODE_READ) {
1312                *type = READ;
1313        } else {
1314                fdput(f);
1315                return -EBADF;
1316        }
1317        return 0;
1318}
1319
1320/*
1321 * Note that vmsplice only really supports true splicing _from_ user memory
1322 * to a pipe, not the other way around. Splicing from user memory is a simple
1323 * operation that can be supported without any funky alignment restrictions
1324 * or nasty vm tricks. We simply map in the user memory and fill them into
1325 * a pipe. The reverse isn't quite as easy, though. There are two possible
1326 * solutions for that:
1327 *
1328 *      - memcpy() the data internally, at which point we might as well just
1329 *        do a regular read() on the buffer anyway.
1330 *      - Lots of nasty vm tricks, that are neither fast nor flexible (it
1331 *        has restriction limitations on both ends of the pipe).
1332 *
1333 * Currently we punt and implement it as a normal copy, see pipe_to_user().
1334 *
1335 */
1336static long do_vmsplice(struct file *f, struct iov_iter *iter, unsigned int flags)
1337{
1338        if (unlikely(flags & ~SPLICE_F_ALL))
1339                return -EINVAL;
1340
1341        if (!iov_iter_count(iter))
1342                return 0;
1343
1344        if (iov_iter_rw(iter) == WRITE)
1345                return vmsplice_to_pipe(f, iter, flags);
1346        else
1347                return vmsplice_to_user(f, iter, flags);
1348}
1349
1350SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, uiov,
1351                unsigned long, nr_segs, unsigned int, flags)
1352{
1353        struct iovec iovstack[UIO_FASTIOV];
1354        struct iovec *iov = iovstack;
1355        struct iov_iter iter;
1356        ssize_t error;
1357        struct fd f;
1358        int type;
1359
1360        f = fdget(fd);
1361        error = vmsplice_type(f, &type);
1362        if (error)
1363                return error;
1364
1365        error = import_iovec(type, uiov, nr_segs,
1366                             ARRAY_SIZE(iovstack), &iov, &iter);
1367        if (error >= 0) {
1368                error = do_vmsplice(f.file, &iter, flags);
1369                kfree(iov);
1370        }
1371        fdput(f);
1372        return error;
1373}
1374
#ifdef CONFIG_COMPAT
/*
 * Compat flavour of vmsplice(2): same flow as the native entry point,
 * but the iovec is imported with compat_import_iovec().
 */
COMPAT_SYSCALL_DEFINE4(vmsplice, int, fd, const struct compat_iovec __user *, iov32,
                    unsigned int, nr_segs, unsigned int, flags)
{
        struct iovec iovstack[UIO_FASTIOV];
        struct iovec *iov = iovstack;
        struct iov_iter iter;
        struct fd f = fdget(fd);
        ssize_t error;
        int type;

        /* On failure vmsplice_type() leaves nothing for us to release. */
        error = vmsplice_type(f, &type);
        if (error)
                return error;

        error = compat_import_iovec(type, iov32, nr_segs,
                             ARRAY_SIZE(iovstack), &iov, &iter);
        if (error < 0)
                goto out_fdput;

        error = do_vmsplice(f.file, &iter, flags);
        kfree(iov);

out_fdput:
        fdput(f);
        return error;
}
#endif
1401
1402SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in,
1403                int, fd_out, loff_t __user *, off_out,
1404                size_t, len, unsigned int, flags)
1405{
1406        struct fd in, out;
1407        long error;
1408
1409        if (unlikely(!len))
1410                return 0;
1411
1412        if (unlikely(flags & ~SPLICE_F_ALL))
1413                return -EINVAL;
1414
1415        error = -EBADF;
1416        in = fdget(fd_in);
1417        if (in.file) {
1418                out = fdget(fd_out);
1419                if (out.file) {
1420                        error = do_splice(in.file, off_in, out.file, off_out,
1421                                          len, flags);
1422                        fdput(out);
1423                }
1424                fdput(in);
1425        }
1426        return error;
1427}
1428
1429/*
1430 * Make sure there's data to read. Wait for input if we can, otherwise
1431 * return an appropriate error.
1432 */
1433static int ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1434{
1435        int ret;
1436
1437        /*
1438         * Check ->nrbufs without the inode lock first. This function
1439         * is speculative anyways, so missing one is ok.
1440         */
1441        if (pipe->nrbufs)
1442                return 0;
1443
1444        ret = 0;
1445        pipe_lock(pipe);
1446
1447        while (!pipe->nrbufs) {
1448                if (signal_pending(current)) {
1449                        ret = -ERESTARTSYS;
1450                        break;
1451                }
1452                if (!pipe->writers)
1453                        break;
1454                if (!pipe->waiting_writers) {
1455                        if (flags & SPLICE_F_NONBLOCK) {
1456                                ret = -EAGAIN;
1457                                break;
1458                        }
1459                }
1460                pipe_wait(pipe);
1461        }
1462
1463        pipe_unlock(pipe);
1464        return ret;
1465}
1466
1467/*
1468 * Make sure there's writeable room. Wait for room if we can, otherwise
1469 * return an appropriate error.
1470 */
1471static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1472{
1473        int ret;
1474
1475        /*
1476         * Check ->nrbufs without the inode lock first. This function
1477         * is speculative anyways, so missing one is ok.
1478         */
1479        if (pipe->nrbufs < pipe->buffers)
1480                return 0;
1481
1482        ret = 0;
1483        pipe_lock(pipe);
1484
1485        while (pipe->nrbufs >= pipe->buffers) {
1486                if (!pipe->readers) {
1487                        send_sig(SIGPIPE, current, 0);
1488                        ret = -EPIPE;
1489                        break;
1490                }
1491                if (flags & SPLICE_F_NONBLOCK) {
1492                        ret = -EAGAIN;
1493                        break;
1494                }
1495                if (signal_pending(current)) {
1496                        ret = -ERESTARTSYS;
1497                        break;
1498                }
1499                pipe->waiting_writers++;
1500                pipe_wait(pipe);
1501                pipe->waiting_writers--;
1502        }
1503
1504        pipe_unlock(pipe);
1505        return ret;
1506}
1507
1508/*
1509 * Splice contents of ipipe to opipe.
1510 */
1511static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
1512                               struct pipe_inode_info *opipe,
1513                               size_t len, unsigned int flags)
1514{
1515        struct pipe_buffer *ibuf, *obuf;
1516        int ret = 0, nbuf;
1517        bool input_wakeup = false;
1518
1519
1520retry:
1521        ret = ipipe_prep(ipipe, flags);
1522        if (ret)
1523                return ret;
1524
1525        ret = opipe_prep(opipe, flags);
1526        if (ret)
1527                return ret;
1528
1529        /*
1530         * Potential ABBA deadlock, work around it by ordering lock
1531         * grabbing by pipe info address. Otherwise two different processes
1532         * could deadlock (one doing tee from A -> B, the other from B -> A).
1533         */
1534        pipe_double_lock(ipipe, opipe);
1535
1536        do {
1537                if (!opipe->readers) {
1538                        send_sig(SIGPIPE, current, 0);
1539                        if (!ret)
1540                                ret = -EPIPE;
1541                        break;
1542                }
1543
1544                if (!ipipe->nrbufs && !ipipe->writers)
1545                        break;
1546
1547                /*
1548                 * Cannot make any progress, because either the input
1549                 * pipe is empty or the output pipe is full.
1550                 */
1551                if (!ipipe->nrbufs || opipe->nrbufs >= opipe->buffers) {
1552                        /* Already processed some buffers, break */
1553                        if (ret)
1554                                break;
1555
1556                        if (flags & SPLICE_F_NONBLOCK) {
1557                                ret = -EAGAIN;
1558                                break;
1559                        }
1560
1561                        /*
1562                         * We raced with another reader/writer and haven't
1563                         * managed to process any buffers.  A zero return
1564                         * value means EOF, so retry instead.
1565                         */
1566                        pipe_unlock(ipipe);
1567                        pipe_unlock(opipe);
1568                        goto retry;
1569                }
1570
1571                ibuf = ipipe->bufs + ipipe->curbuf;
1572                nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1);
1573                obuf = opipe->bufs + nbuf;
1574
1575                if (len >= ibuf->len) {
1576                        /*
1577                         * Simply move the whole buffer from ipipe to opipe
1578                         */
1579                        *obuf = *ibuf;
1580                        ibuf->ops = NULL;
1581                        opipe->nrbufs++;
1582                        ipipe->curbuf = (ipipe->curbuf + 1) & (ipipe->buffers - 1);
1583                        ipipe->nrbufs--;
1584                        input_wakeup = true;
1585                } else {
1586                        /*
1587                         * Get a reference to this pipe buffer,
1588                         * so we can copy the contents over.
1589                         */
1590                        if (!pipe_buf_get(ipipe, ibuf)) {
1591                                if (ret == 0)
1592                                        ret = -EFAULT;
1593                                break;
1594                        }
1595                        *obuf = *ibuf;
1596
1597                        /*
1598                         * Don't inherit the gift flag, we need to
1599                         * prevent multiple steals of this page.
1600                         */
1601                        obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
1602
1603                        obuf->len = len;
1604                        opipe->nrbufs++;
1605                        ibuf->offset += obuf->len;
1606                        ibuf->len -= obuf->len;
1607                }
1608                ret += obuf->len;
1609                len -= obuf->len;
1610        } while (len);
1611
1612        pipe_unlock(ipipe);
1613        pipe_unlock(opipe);
1614
1615        /*
1616         * If we put data in the output pipe, wakeup any potential readers.
1617         */
1618        if (ret > 0)
1619                wakeup_pipe_readers(opipe);
1620
1621        if (input_wakeup)
1622                wakeup_pipe_writers(ipipe);
1623
1624        return ret;
1625}
1626
1627/*
1628 * Link contents of ipipe to opipe.
1629 */
1630static int link_pipe(struct pipe_inode_info *ipipe,
1631                     struct pipe_inode_info *opipe,
1632                     size_t len, unsigned int flags)
1633{
1634        struct pipe_buffer *ibuf, *obuf;
1635        int ret = 0, i = 0, nbuf;
1636
1637        /*
1638         * Potential ABBA deadlock, work around it by ordering lock
1639         * grabbing by pipe info address. Otherwise two different processes
1640         * could deadlock (one doing tee from A -> B, the other from B -> A).
1641         */
1642        pipe_double_lock(ipipe, opipe);
1643
1644        do {
1645                if (!opipe->readers) {
1646                        send_sig(SIGPIPE, current, 0);
1647                        if (!ret)
1648                                ret = -EPIPE;
1649                        break;
1650                }
1651
1652                /*
1653                 * If we have iterated all input buffers or ran out of
1654                 * output room, break.
1655                 */
1656                if (i >= ipipe->nrbufs || opipe->nrbufs >= opipe->buffers)
1657                        break;
1658
1659                ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (ipipe->buffers-1));
1660                nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1);
1661
1662                /*
1663                 * Get a reference to this pipe buffer,
1664                 * so we can copy the contents over.
1665                 */
1666                if (!pipe_buf_get(ipipe, ibuf)) {
1667                        if (ret == 0)
1668                                ret = -EFAULT;
1669                        break;
1670                }
1671
1672                obuf = opipe->bufs + nbuf;
1673                *obuf = *ibuf;
1674
1675                /*
1676                 * Don't inherit the gift flag, we need to
1677                 * prevent multiple steals of this page.
1678                 */
1679                obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
1680
1681                if (obuf->len > len)
1682                        obuf->len = len;
1683
1684                opipe->nrbufs++;
1685                ret += obuf->len;
1686                len -= obuf->len;
1687                i++;
1688        } while (len);
1689
1690        /*
1691         * return EAGAIN if we have the potential of some data in the
1692         * future, otherwise just return 0
1693         */
1694        if (!ret && ipipe->waiting_writers && (flags & SPLICE_F_NONBLOCK))
1695                ret = -EAGAIN;
1696
1697        pipe_unlock(ipipe);
1698        pipe_unlock(opipe);
1699
1700        /*
1701         * If we put data in the output pipe, wakeup any potential readers.
1702         */
1703        if (ret > 0)
1704                wakeup_pipe_readers(opipe);
1705
1706        return ret;
1707}
1708
1709/*
1710 * This is a tee(1) implementation that works on pipes. It doesn't copy
1711 * any data, it simply references the 'in' pages on the 'out' pipe.
1712 * The 'flags' used are the SPLICE_F_* variants, currently the only
1713 * applicable one is SPLICE_F_NONBLOCK.
1714 */
1715static long do_tee(struct file *in, struct file *out, size_t len,
1716                   unsigned int flags)
1717{
1718        struct pipe_inode_info *ipipe = get_pipe_info(in);
1719        struct pipe_inode_info *opipe = get_pipe_info(out);
1720        int ret = -EINVAL;
1721
1722        if (unlikely(!(in->f_mode & FMODE_READ) ||
1723                     !(out->f_mode & FMODE_WRITE)))
1724                return -EBADF;
1725
1726        /*
1727         * Duplicate the contents of ipipe to opipe without actually
1728         * copying the data.
1729         */
1730        if (ipipe && opipe && ipipe != opipe) {
1731                /*
1732                 * Keep going, unless we encounter an error. The ipipe/opipe
1733                 * ordering doesn't really matter.
1734                 */
1735                ret = ipipe_prep(ipipe, flags);
1736                if (!ret) {
1737                        ret = opipe_prep(opipe, flags);
1738                        if (!ret)
1739                                ret = link_pipe(ipipe, opipe, len, flags);
1740                }
1741        }
1742
1743        return ret;
1744}
1745
1746SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags)
1747{
1748        struct fd in, out;
1749        int error;
1750
1751        if (unlikely(flags & ~SPLICE_F_ALL))
1752                return -EINVAL;
1753
1754        if (unlikely(!len))
1755                return 0;
1756
1757        error = -EBADF;
1758        in = fdget(fdin);
1759        if (in.file) {
1760                out = fdget(fdout);
1761                if (out.file) {
1762                        error = do_tee(in.file, out.file, len, flags);
1763                        fdput(out);
1764                }
1765                fdput(in);
1766        }
1767
1768        return error;
1769}
1770