linux/fs/splice.c
<<
>>
Prefs
// SPDX-License-Identifier: GPL-2.0-only
/*
 * "splice": joining two ropes together by interweaving their strands.
 *
 * This is the "extended pipe" functionality, where a pipe is used as
 * an arbitrary in-memory buffer. Think of a pipe as a small kernel
 * buffer that you can use to transfer data from one end to the other.
 *
 * The traditional unix read/write is extended with a "splice()" operation
 * that transfers data buffers to or from a pipe buffer.
 *
 * Named by Larry McVoy, original implementation from Linus, extended by
 * Jens to support splicing to files, network, direct splicing, etc and
 * fixing lots of bugs.
 *
 * Copyright (C) 2005-2006 Jens Axboe <axboe@kernel.dk>
 * Copyright (C) 2005-2006 Linus Torvalds <torvalds@osdl.org>
 * Copyright (C) 2006 Ingo Molnar <mingo@elte.hu>
 *
 */
  21#include <linux/bvec.h>
  22#include <linux/fs.h>
  23#include <linux/file.h>
  24#include <linux/pagemap.h>
  25#include <linux/splice.h>
  26#include <linux/memcontrol.h>
  27#include <linux/mm_inline.h>
  28#include <linux/swap.h>
  29#include <linux/writeback.h>
  30#include <linux/export.h>
  31#include <linux/syscalls.h>
  32#include <linux/uio.h>
  33#include <linux/security.h>
  34#include <linux/gfp.h>
  35#include <linux/socket.h>
  36#include <linux/sched/signal.h>
  37
  38#include "internal.h"
  39
  40/*
  41 * Attempt to steal a page from a pipe buffer. This should perhaps go into
  42 * a vm helper function, it's already simplified quite a bit by the
  43 * addition of remove_mapping(). If success is returned, the caller may
  44 * attempt to reuse this page for another destination.
  45 */
  46static bool page_cache_pipe_buf_try_steal(struct pipe_inode_info *pipe,
  47                struct pipe_buffer *buf)
  48{
  49        struct page *page = buf->page;
  50        struct address_space *mapping;
  51
  52        lock_page(page);
  53
  54        mapping = page_mapping(page);
  55        if (mapping) {
  56                WARN_ON(!PageUptodate(page));
  57
  58                /*
  59                 * At least for ext2 with nobh option, we need to wait on
  60                 * writeback completing on this page, since we'll remove it
  61                 * from the pagecache.  Otherwise truncate wont wait on the
  62                 * page, allowing the disk blocks to be reused by someone else
  63                 * before we actually wrote our data to them. fs corruption
  64                 * ensues.
  65                 */
  66                wait_on_page_writeback(page);
  67
  68                if (page_has_private(page) &&
  69                    !try_to_release_page(page, GFP_KERNEL))
  70                        goto out_unlock;
  71
  72                /*
  73                 * If we succeeded in removing the mapping, set LRU flag
  74                 * and return good.
  75                 */
  76                if (remove_mapping(mapping, page)) {
  77                        buf->flags |= PIPE_BUF_FLAG_LRU;
  78                        return true;
  79                }
  80        }
  81
  82        /*
  83         * Raced with truncate or failed to remove page from current
  84         * address space, unlock and return failure.
  85         */
  86out_unlock:
  87        unlock_page(page);
  88        return false;
  89}
  90
  91static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe,
  92                                        struct pipe_buffer *buf)
  93{
  94        put_page(buf->page);
  95        buf->flags &= ~PIPE_BUF_FLAG_LRU;
  96}
  97
  98/*
  99 * Check whether the contents of buf is OK to access. Since the content
 100 * is a page cache page, IO may be in flight.
 101 */
 102static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe,
 103                                       struct pipe_buffer *buf)
 104{
 105        struct page *page = buf->page;
 106        int err;
 107
 108        if (!PageUptodate(page)) {
 109                lock_page(page);
 110
 111                /*
 112                 * Page got truncated/unhashed. This will cause a 0-byte
 113                 * splice, if this is the first page.
 114                 */
 115                if (!page->mapping) {
 116                        err = -ENODATA;
 117                        goto error;
 118                }
 119
 120                /*
 121                 * Uh oh, read-error from disk.
 122                 */
 123                if (!PageUptodate(page)) {
 124                        err = -EIO;
 125                        goto error;
 126                }
 127
 128                /*
 129                 * Page is ok afterall, we are done.
 130                 */
 131                unlock_page(page);
 132        }
 133
 134        return 0;
 135error:
 136        unlock_page(page);
 137        return err;
 138}
 139
 140const struct pipe_buf_operations page_cache_pipe_buf_ops = {
 141        .confirm        = page_cache_pipe_buf_confirm,
 142        .release        = page_cache_pipe_buf_release,
 143        .try_steal      = page_cache_pipe_buf_try_steal,
 144        .get            = generic_pipe_buf_get,
 145};
 146
 147static bool user_page_pipe_buf_try_steal(struct pipe_inode_info *pipe,
 148                struct pipe_buffer *buf)
 149{
 150        if (!(buf->flags & PIPE_BUF_FLAG_GIFT))
 151                return false;
 152
 153        buf->flags |= PIPE_BUF_FLAG_LRU;
 154        return generic_pipe_buf_try_steal(pipe, buf);
 155}
 156
 157static const struct pipe_buf_operations user_page_pipe_buf_ops = {
 158        .release        = page_cache_pipe_buf_release,
 159        .try_steal      = user_page_pipe_buf_try_steal,
 160        .get            = generic_pipe_buf_get,
 161};
 162
 163static void wakeup_pipe_readers(struct pipe_inode_info *pipe)
 164{
 165        smp_mb();
 166        if (waitqueue_active(&pipe->rd_wait))
 167                wake_up_interruptible(&pipe->rd_wait);
 168        kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
 169}
 170
 171/**
 172 * splice_to_pipe - fill passed data into a pipe
 173 * @pipe:       pipe to fill
 174 * @spd:        data to fill
 175 *
 176 * Description:
 177 *    @spd contains a map of pages and len/offset tuples, along with
 178 *    the struct pipe_buf_operations associated with these pages. This
 179 *    function will link that data to the pipe.
 180 *
 181 */
 182ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
 183                       struct splice_pipe_desc *spd)
 184{
 185        unsigned int spd_pages = spd->nr_pages;
 186        unsigned int tail = pipe->tail;
 187        unsigned int head = pipe->head;
 188        unsigned int mask = pipe->ring_size - 1;
 189        int ret = 0, page_nr = 0;
 190
 191        if (!spd_pages)
 192                return 0;
 193
 194        if (unlikely(!pipe->readers)) {
 195                send_sig(SIGPIPE, current, 0);
 196                ret = -EPIPE;
 197                goto out;
 198        }
 199
 200        while (!pipe_full(head, tail, pipe->max_usage)) {
 201                struct pipe_buffer *buf = &pipe->bufs[head & mask];
 202
 203                buf->page = spd->pages[page_nr];
 204                buf->offset = spd->partial[page_nr].offset;
 205                buf->len = spd->partial[page_nr].len;
 206                buf->private = spd->partial[page_nr].private;
 207                buf->ops = spd->ops;
 208                buf->flags = 0;
 209
 210                head++;
 211                pipe->head = head;
 212                page_nr++;
 213                ret += buf->len;
 214
 215                if (!--spd->nr_pages)
 216                        break;
 217        }
 218
 219        if (!ret)
 220                ret = -EAGAIN;
 221
 222out:
 223        while (page_nr < spd_pages)
 224                spd->spd_release(spd, page_nr++);
 225
 226        return ret;
 227}
 228EXPORT_SYMBOL_GPL(splice_to_pipe);
 229
 230ssize_t add_to_pipe(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
 231{
 232        unsigned int head = pipe->head;
 233        unsigned int tail = pipe->tail;
 234        unsigned int mask = pipe->ring_size - 1;
 235        int ret;
 236
 237        if (unlikely(!pipe->readers)) {
 238                send_sig(SIGPIPE, current, 0);
 239                ret = -EPIPE;
 240        } else if (pipe_full(head, tail, pipe->max_usage)) {
 241                ret = -EAGAIN;
 242        } else {
 243                pipe->bufs[head & mask] = *buf;
 244                pipe->head = head + 1;
 245                return buf->len;
 246        }
 247        pipe_buf_release(pipe, buf);
 248        return ret;
 249}
 250EXPORT_SYMBOL(add_to_pipe);
 251
 252/*
 253 * Check if we need to grow the arrays holding pages and partial page
 254 * descriptions.
 255 */
 256int splice_grow_spd(const struct pipe_inode_info *pipe, struct splice_pipe_desc *spd)
 257{
 258        unsigned int max_usage = READ_ONCE(pipe->max_usage);
 259
 260        spd->nr_pages_max = max_usage;
 261        if (max_usage <= PIPE_DEF_BUFFERS)
 262                return 0;
 263
 264        spd->pages = kmalloc_array(max_usage, sizeof(struct page *), GFP_KERNEL);
 265        spd->partial = kmalloc_array(max_usage, sizeof(struct partial_page),
 266                                     GFP_KERNEL);
 267
 268        if (spd->pages && spd->partial)
 269                return 0;
 270
 271        kfree(spd->pages);
 272        kfree(spd->partial);
 273        return -ENOMEM;
 274}
 275
 276void splice_shrink_spd(struct splice_pipe_desc *spd)
 277{
 278        if (spd->nr_pages_max <= PIPE_DEF_BUFFERS)
 279                return;
 280
 281        kfree(spd->pages);
 282        kfree(spd->partial);
 283}
 284
 285/**
 286 * generic_file_splice_read - splice data from file to a pipe
 287 * @in:         file to splice from
 288 * @ppos:       position in @in
 289 * @pipe:       pipe to splice to
 290 * @len:        number of bytes to splice
 291 * @flags:      splice modifier flags
 292 *
 293 * Description:
 294 *    Will read pages from given file and fill them into a pipe. Can be
 295 *    used as long as it has more or less sane ->read_iter().
 296 *
 297 */
 298ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
 299                                 struct pipe_inode_info *pipe, size_t len,
 300                                 unsigned int flags)
 301{
 302        struct iov_iter to;
 303        struct kiocb kiocb;
 304        unsigned int i_head;
 305        int ret;
 306
 307        iov_iter_pipe(&to, READ, pipe, len);
 308        i_head = to.head;
 309        init_sync_kiocb(&kiocb, in);
 310        kiocb.ki_pos = *ppos;
 311        ret = call_read_iter(in, &kiocb, &to);
 312        if (ret > 0) {
 313                *ppos = kiocb.ki_pos;
 314                file_accessed(in);
 315        } else if (ret < 0) {
 316                to.head = i_head;
 317                to.iov_offset = 0;
 318                iov_iter_advance(&to, 0); /* to free what was emitted */
 319                /*
 320                 * callers of ->splice_read() expect -EAGAIN on
 321                 * "can't put anything in there", rather than -EFAULT.
 322                 */
 323                if (ret == -EFAULT)
 324                        ret = -EAGAIN;
 325        }
 326
 327        return ret;
 328}
 329EXPORT_SYMBOL(generic_file_splice_read);
 330
 331const struct pipe_buf_operations default_pipe_buf_ops = {
 332        .release        = generic_pipe_buf_release,
 333        .try_steal      = generic_pipe_buf_try_steal,
 334        .get            = generic_pipe_buf_get,
 335};
 336
 337/* Pipe buffer operations for a socket and similar. */
 338const struct pipe_buf_operations nosteal_pipe_buf_ops = {
 339        .release        = generic_pipe_buf_release,
 340        .get            = generic_pipe_buf_get,
 341};
 342EXPORT_SYMBOL(nosteal_pipe_buf_ops);
 343
 344/*
 345 * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos'
 346 * using sendpage(). Return the number of bytes sent.
 347 */
 348static int pipe_to_sendpage(struct pipe_inode_info *pipe,
 349                            struct pipe_buffer *buf, struct splice_desc *sd)
 350{
 351        struct file *file = sd->u.file;
 352        loff_t pos = sd->pos;
 353        int more;
 354
 355        if (!likely(file->f_op->sendpage))
 356                return -EINVAL;
 357
 358        more = (sd->flags & SPLICE_F_MORE) ? MSG_MORE : 0;
 359
 360        if (sd->len < sd->total_len &&
 361            pipe_occupancy(pipe->head, pipe->tail) > 1)
 362                more |= MSG_SENDPAGE_NOTLAST;
 363
 364        return file->f_op->sendpage(file, buf->page, buf->offset,
 365                                    sd->len, &pos, more);
 366}
 367
 368static void wakeup_pipe_writers(struct pipe_inode_info *pipe)
 369{
 370        smp_mb();
 371        if (waitqueue_active(&pipe->wr_wait))
 372                wake_up_interruptible(&pipe->wr_wait);
 373        kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
 374}
 375
 376/**
 377 * splice_from_pipe_feed - feed available data from a pipe to a file
 378 * @pipe:       pipe to splice from
 379 * @sd:         information to @actor
 380 * @actor:      handler that splices the data
 381 *
 382 * Description:
 383 *    This function loops over the pipe and calls @actor to do the
 384 *    actual moving of a single struct pipe_buffer to the desired
 385 *    destination.  It returns when there's no more buffers left in
 386 *    the pipe or if the requested number of bytes (@sd->total_len)
 387 *    have been copied.  It returns a positive number (one) if the
 388 *    pipe needs to be filled with more data, zero if the required
 389 *    number of bytes have been copied and -errno on error.
 390 *
 391 *    This, together with splice_from_pipe_{begin,end,next}, may be
 392 *    used to implement the functionality of __splice_from_pipe() when
 393 *    locking is required around copying the pipe buffers to the
 394 *    destination.
 395 */
 396static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd,
 397                          splice_actor *actor)
 398{
 399        unsigned int head = pipe->head;
 400        unsigned int tail = pipe->tail;
 401        unsigned int mask = pipe->ring_size - 1;
 402        int ret;
 403
 404        while (!pipe_empty(head, tail)) {
 405                struct pipe_buffer *buf = &pipe->bufs[tail & mask];
 406
 407                sd->len = buf->len;
 408                if (sd->len > sd->total_len)
 409                        sd->len = sd->total_len;
 410
 411                ret = pipe_buf_confirm(pipe, buf);
 412                if (unlikely(ret)) {
 413                        if (ret == -ENODATA)
 414                                ret = 0;
 415                        return ret;
 416                }
 417
 418                ret = actor(pipe, buf, sd);
 419                if (ret <= 0)
 420                        return ret;
 421
 422                buf->offset += ret;
 423                buf->len -= ret;
 424
 425                sd->num_spliced += ret;
 426                sd->len -= ret;
 427                sd->pos += ret;
 428                sd->total_len -= ret;
 429
 430                if (!buf->len) {
 431                        pipe_buf_release(pipe, buf);
 432                        tail++;
 433                        pipe->tail = tail;
 434                        if (pipe->files)
 435                                sd->need_wakeup = true;
 436                }
 437
 438                if (!sd->total_len)
 439                        return 0;
 440        }
 441
 442        return 1;
 443}
 444
 445/* We know we have a pipe buffer, but maybe it's empty? */
 446static inline bool eat_empty_buffer(struct pipe_inode_info *pipe)
 447{
 448        unsigned int tail = pipe->tail;
 449        unsigned int mask = pipe->ring_size - 1;
 450        struct pipe_buffer *buf = &pipe->bufs[tail & mask];
 451
 452        if (unlikely(!buf->len)) {
 453                pipe_buf_release(pipe, buf);
 454                pipe->tail = tail+1;
 455                return true;
 456        }
 457
 458        return false;
 459}
 460
 461/**
 462 * splice_from_pipe_next - wait for some data to splice from
 463 * @pipe:       pipe to splice from
 464 * @sd:         information about the splice operation
 465 *
 466 * Description:
 467 *    This function will wait for some data and return a positive
 468 *    value (one) if pipe buffers are available.  It will return zero
 469 *    or -errno if no more data needs to be spliced.
 470 */
 471static int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_desc *sd)
 472{
 473        /*
 474         * Check for signal early to make process killable when there are
 475         * always buffers available
 476         */
 477        if (signal_pending(current))
 478                return -ERESTARTSYS;
 479
 480repeat:
 481        while (pipe_empty(pipe->head, pipe->tail)) {
 482                if (!pipe->writers)
 483                        return 0;
 484
 485                if (sd->num_spliced)
 486                        return 0;
 487
 488                if (sd->flags & SPLICE_F_NONBLOCK)
 489                        return -EAGAIN;
 490
 491                if (signal_pending(current))
 492                        return -ERESTARTSYS;
 493
 494                if (sd->need_wakeup) {
 495                        wakeup_pipe_writers(pipe);
 496                        sd->need_wakeup = false;
 497                }
 498
 499                pipe_wait_readable(pipe);
 500        }
 501
 502        if (eat_empty_buffer(pipe))
 503                goto repeat;
 504
 505        return 1;
 506}
 507
 508/**
 509 * splice_from_pipe_begin - start splicing from pipe
 510 * @sd:         information about the splice operation
 511 *
 512 * Description:
 513 *    This function should be called before a loop containing
 514 *    splice_from_pipe_next() and splice_from_pipe_feed() to
 515 *    initialize the necessary fields of @sd.
 516 */
 517static void splice_from_pipe_begin(struct splice_desc *sd)
 518{
 519        sd->num_spliced = 0;
 520        sd->need_wakeup = false;
 521}
 522
 523/**
 524 * splice_from_pipe_end - finish splicing from pipe
 525 * @pipe:       pipe to splice from
 526 * @sd:         information about the splice operation
 527 *
 528 * Description:
 529 *    This function will wake up pipe writers if necessary.  It should
 530 *    be called after a loop containing splice_from_pipe_next() and
 531 *    splice_from_pipe_feed().
 532 */
 533static void splice_from_pipe_end(struct pipe_inode_info *pipe, struct splice_desc *sd)
 534{
 535        if (sd->need_wakeup)
 536                wakeup_pipe_writers(pipe);
 537}
 538
 539/**
 540 * __splice_from_pipe - splice data from a pipe to given actor
 541 * @pipe:       pipe to splice from
 542 * @sd:         information to @actor
 543 * @actor:      handler that splices the data
 544 *
 545 * Description:
 546 *    This function does little more than loop over the pipe and call
 547 *    @actor to do the actual moving of a single struct pipe_buffer to
 548 *    the desired destination. See pipe_to_file, pipe_to_sendpage, or
 549 *    pipe_to_user.
 550 *
 551 */
 552ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd,
 553                           splice_actor *actor)
 554{
 555        int ret;
 556
 557        splice_from_pipe_begin(sd);
 558        do {
 559                cond_resched();
 560                ret = splice_from_pipe_next(pipe, sd);
 561                if (ret > 0)
 562                        ret = splice_from_pipe_feed(pipe, sd, actor);
 563        } while (ret > 0);
 564        splice_from_pipe_end(pipe, sd);
 565
 566        return sd->num_spliced ? sd->num_spliced : ret;
 567}
 568EXPORT_SYMBOL(__splice_from_pipe);
 569
 570/**
 571 * splice_from_pipe - splice data from a pipe to a file
 572 * @pipe:       pipe to splice from
 573 * @out:        file to splice to
 574 * @ppos:       position in @out
 575 * @len:        how many bytes to splice
 576 * @flags:      splice modifier flags
 577 * @actor:      handler that splices the data
 578 *
 579 * Description:
 580 *    See __splice_from_pipe. This function locks the pipe inode,
 581 *    otherwise it's identical to __splice_from_pipe().
 582 *
 583 */
 584ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
 585                         loff_t *ppos, size_t len, unsigned int flags,
 586                         splice_actor *actor)
 587{
 588        ssize_t ret;
 589        struct splice_desc sd = {
 590                .total_len = len,
 591                .flags = flags,
 592                .pos = *ppos,
 593                .u.file = out,
 594        };
 595
 596        pipe_lock(pipe);
 597        ret = __splice_from_pipe(pipe, &sd, actor);
 598        pipe_unlock(pipe);
 599
 600        return ret;
 601}
 602
 603/**
 604 * iter_file_splice_write - splice data from a pipe to a file
 605 * @pipe:       pipe info
 606 * @out:        file to write to
 607 * @ppos:       position in @out
 608 * @len:        number of bytes to splice
 609 * @flags:      splice modifier flags
 610 *
 611 * Description:
 612 *    Will either move or copy pages (determined by @flags options) from
 613 *    the given pipe inode to the given file.
 614 *    This one is ->write_iter-based.
 615 *
 616 */
 617ssize_t
 618iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
 619                          loff_t *ppos, size_t len, unsigned int flags)
 620{
 621        struct splice_desc sd = {
 622                .total_len = len,
 623                .flags = flags,
 624                .pos = *ppos,
 625                .u.file = out,
 626        };
 627        int nbufs = pipe->max_usage;
 628        struct bio_vec *array = kcalloc(nbufs, sizeof(struct bio_vec),
 629                                        GFP_KERNEL);
 630        ssize_t ret;
 631
 632        if (unlikely(!array))
 633                return -ENOMEM;
 634
 635        pipe_lock(pipe);
 636
 637        splice_from_pipe_begin(&sd);
 638        while (sd.total_len) {
 639                struct iov_iter from;
 640                unsigned int head, tail, mask;
 641                size_t left;
 642                int n;
 643
 644                ret = splice_from_pipe_next(pipe, &sd);
 645                if (ret <= 0)
 646                        break;
 647
 648                if (unlikely(nbufs < pipe->max_usage)) {
 649                        kfree(array);
 650                        nbufs = pipe->max_usage;
 651                        array = kcalloc(nbufs, sizeof(struct bio_vec),
 652                                        GFP_KERNEL);
 653                        if (!array) {
 654                                ret = -ENOMEM;
 655                                break;
 656                        }
 657                }
 658
 659                head = pipe->head;
 660                tail = pipe->tail;
 661                mask = pipe->ring_size - 1;
 662
 663                /* build the vector */
 664                left = sd.total_len;
 665                for (n = 0; !pipe_empty(head, tail) && left && n < nbufs; tail++) {
 666                        struct pipe_buffer *buf = &pipe->bufs[tail & mask];
 667                        size_t this_len = buf->len;
 668
 669                        /* zero-length bvecs are not supported, skip them */
 670                        if (!this_len)
 671                                continue;
 672                        this_len = min(this_len, left);
 673
 674                        ret = pipe_buf_confirm(pipe, buf);
 675                        if (unlikely(ret)) {
 676                                if (ret == -ENODATA)
 677                                        ret = 0;
 678                                goto done;
 679                        }
 680
 681                        array[n].bv_page = buf->page;
 682                        array[n].bv_len = this_len;
 683                        array[n].bv_offset = buf->offset;
 684                        left -= this_len;
 685                        n++;
 686                }
 687
 688                iov_iter_bvec(&from, WRITE, array, n, sd.total_len - left);
 689                ret = vfs_iter_write(out, &from, &sd.pos, 0);
 690                if (ret <= 0)
 691                        break;
 692
 693                sd.num_spliced += ret;
 694                sd.total_len -= ret;
 695                *ppos = sd.pos;
 696
 697                /* dismiss the fully eaten buffers, adjust the partial one */
 698                tail = pipe->tail;
 699                while (ret) {
 700                        struct pipe_buffer *buf = &pipe->bufs[tail & mask];
 701                        if (ret >= buf->len) {
 702                                ret -= buf->len;
 703                                buf->len = 0;
 704                                pipe_buf_release(pipe, buf);
 705                                tail++;
 706                                pipe->tail = tail;
 707                                if (pipe->files)
 708                                        sd.need_wakeup = true;
 709                        } else {
 710                                buf->offset += ret;
 711                                buf->len -= ret;
 712                                ret = 0;
 713                        }
 714                }
 715        }
 716done:
 717        kfree(array);
 718        splice_from_pipe_end(pipe, &sd);
 719
 720        pipe_unlock(pipe);
 721
 722        if (sd.num_spliced)
 723                ret = sd.num_spliced;
 724
 725        return ret;
 726}
 727
 728EXPORT_SYMBOL(iter_file_splice_write);
 729
 730/**
 731 * generic_splice_sendpage - splice data from a pipe to a socket
 732 * @pipe:       pipe to splice from
 733 * @out:        socket to write to
 734 * @ppos:       position in @out
 735 * @len:        number of bytes to splice
 736 * @flags:      splice modifier flags
 737 *
 738 * Description:
 739 *    Will send @len bytes from the pipe to a network socket. No data copying
 740 *    is involved.
 741 *
 742 */
 743ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out,
 744                                loff_t *ppos, size_t len, unsigned int flags)
 745{
 746        return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage);
 747}
 748
 749EXPORT_SYMBOL(generic_splice_sendpage);
 750
 751static int warn_unsupported(struct file *file, const char *op)
 752{
 753        pr_debug_ratelimited(
 754                "splice %s not supported for file %pD4 (pid: %d comm: %.20s)\n",
 755                op, file, current->pid, current->comm);
 756        return -EINVAL;
 757}
 758
 759/*
 760 * Attempt to initiate a splice from pipe to file.
 761 */
 762static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
 763                           loff_t *ppos, size_t len, unsigned int flags)
 764{
 765        if (unlikely(!out->f_op->splice_write))
 766                return warn_unsupported(out, "write");
 767        return out->f_op->splice_write(pipe, out, ppos, len, flags);
 768}
 769
 770/*
 771 * Attempt to initiate a splice from a file to a pipe.
 772 */
 773static long do_splice_to(struct file *in, loff_t *ppos,
 774                         struct pipe_inode_info *pipe, size_t len,
 775                         unsigned int flags)
 776{
 777        unsigned int p_space;
 778        int ret;
 779
 780        if (unlikely(!(in->f_mode & FMODE_READ)))
 781                return -EBADF;
 782
 783        /* Don't try to read more the pipe has space for. */
 784        p_space = pipe->max_usage - pipe_occupancy(pipe->head, pipe->tail);
 785        len = min_t(size_t, len, p_space << PAGE_SHIFT);
 786
 787        ret = rw_verify_area(READ, in, ppos, len);
 788        if (unlikely(ret < 0))
 789                return ret;
 790
 791        if (unlikely(len > MAX_RW_COUNT))
 792                len = MAX_RW_COUNT;
 793
 794        if (unlikely(!in->f_op->splice_read))
 795                return warn_unsupported(in, "read");
 796        return in->f_op->splice_read(in, ppos, pipe, len, flags);
 797}
 798
 799/**
 800 * splice_direct_to_actor - splices data directly between two non-pipes
 801 * @in:         file to splice from
 802 * @sd:         actor information on where to splice to
 803 * @actor:      handles the data splicing
 804 *
 805 * Description:
 806 *    This is a special case helper to splice directly between two
 807 *    points, without requiring an explicit pipe. Internally an allocated
 808 *    pipe is cached in the process, and reused during the lifetime of
 809 *    that process.
 810 *
 811 */
 812ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
 813                               splice_direct_actor *actor)
 814{
 815        struct pipe_inode_info *pipe;
 816        long ret, bytes;
 817        umode_t i_mode;
 818        size_t len;
 819        int i, flags, more;
 820
 821        /*
 822         * We require the input being a regular file, as we don't want to
 823         * randomly drop data for eg socket -> socket splicing. Use the
 824         * piped splicing for that!
 825         */
 826        i_mode = file_inode(in)->i_mode;
 827        if (unlikely(!S_ISREG(i_mode) && !S_ISBLK(i_mode)))
 828                return -EINVAL;
 829
 830        /*
 831         * neither in nor out is a pipe, setup an internal pipe attached to
 832         * 'out' and transfer the wanted data from 'in' to 'out' through that
 833         */
 834        pipe = current->splice_pipe;
 835        if (unlikely(!pipe)) {
 836                pipe = alloc_pipe_info();
 837                if (!pipe)
 838                        return -ENOMEM;
 839
 840                /*
 841                 * We don't have an immediate reader, but we'll read the stuff
 842                 * out of the pipe right after the splice_to_pipe(). So set
 843                 * PIPE_READERS appropriately.
 844                 */
 845                pipe->readers = 1;
 846
 847                current->splice_pipe = pipe;
 848        }
 849
 850        /*
 851         * Do the splice.
 852         */
 853        ret = 0;
 854        bytes = 0;
 855        len = sd->total_len;
 856        flags = sd->flags;
 857
 858        /*
 859         * Don't block on output, we have to drain the direct pipe.
 860         */
 861        sd->flags &= ~SPLICE_F_NONBLOCK;
 862        more = sd->flags & SPLICE_F_MORE;
 863
 864        WARN_ON_ONCE(!pipe_empty(pipe->head, pipe->tail));
 865
 866        while (len) {
 867                size_t read_len;
 868                loff_t pos = sd->pos, prev_pos = pos;
 869
 870                ret = do_splice_to(in, &pos, pipe, len, flags);
 871                if (unlikely(ret <= 0))
 872                        goto out_release;
 873
 874                read_len = ret;
 875                sd->total_len = read_len;
 876
 877                /*
 878                 * If more data is pending, set SPLICE_F_MORE
 879                 * If this is the last data and SPLICE_F_MORE was not set
 880                 * initially, clears it.
 881                 */
 882                if (read_len < len)
 883                        sd->flags |= SPLICE_F_MORE;
 884                else if (!more)
 885                        sd->flags &= ~SPLICE_F_MORE;
 886                /*
 887                 * NOTE: nonblocking mode only applies to the input. We
 888                 * must not do the output in nonblocking mode as then we
 889                 * could get stuck data in the internal pipe:
 890                 */
 891                ret = actor(pipe, sd);
 892                if (unlikely(ret <= 0)) {
 893                        sd->pos = prev_pos;
 894                        goto out_release;
 895                }
 896
 897                bytes += ret;
 898                len -= ret;
 899                sd->pos = pos;
 900
 901                if (ret < read_len) {
 902                        sd->pos = prev_pos + ret;
 903                        goto out_release;
 904                }
 905        }
 906
 907done:
 908        pipe->tail = pipe->head = 0;
 909        file_accessed(in);
 910        return bytes;
 911
 912out_release:
 913        /*
 914         * If we did an incomplete transfer we must release
 915         * the pipe buffers in question:
 916         */
 917        for (i = 0; i < pipe->ring_size; i++) {
 918                struct pipe_buffer *buf = &pipe->bufs[i];
 919
 920                if (buf->ops)
 921                        pipe_buf_release(pipe, buf);
 922        }
 923
 924        if (!bytes)
 925                bytes = ret;
 926
 927        goto done;
 928}
 929EXPORT_SYMBOL(splice_direct_to_actor);
 930
 931static int direct_splice_actor(struct pipe_inode_info *pipe,
 932                               struct splice_desc *sd)
 933{
 934        struct file *file = sd->u.file;
 935
 936        return do_splice_from(pipe, file, sd->opos, sd->total_len,
 937                              sd->flags);
 938}
 939
 940/**
 941 * do_splice_direct - splices data directly between two files
 942 * @in:         file to splice from
 943 * @ppos:       input file offset
 944 * @out:        file to splice to
 945 * @opos:       output file offset
 946 * @len:        number of bytes to splice
 947 * @flags:      splice modifier flags
 948 *
 949 * Description:
 950 *    For use by do_sendfile(). splice can easily emulate sendfile, but
 951 *    doing it in the application would incur an extra system call
 952 *    (splice in + splice out, as compared to just sendfile()). So this helper
 953 *    can splice directly through a process-private pipe.
 954 *
 955 */
 956long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
 957                      loff_t *opos, size_t len, unsigned int flags)
 958{
 959        struct splice_desc sd = {
 960                .len            = len,
 961                .total_len      = len,
 962                .flags          = flags,
 963                .pos            = *ppos,
 964                .u.file         = out,
 965                .opos           = opos,
 966        };
 967        long ret;
 968
 969        if (unlikely(!(out->f_mode & FMODE_WRITE)))
 970                return -EBADF;
 971
 972        if (unlikely(out->f_flags & O_APPEND))
 973                return -EINVAL;
 974
 975        ret = rw_verify_area(WRITE, out, opos, len);
 976        if (unlikely(ret < 0))
 977                return ret;
 978
 979        ret = splice_direct_to_actor(in, &sd, direct_splice_actor);
 980        if (ret > 0)
 981                *ppos = sd.pos;
 982
 983        return ret;
 984}
 985EXPORT_SYMBOL(do_splice_direct);
 986
 987static int wait_for_space(struct pipe_inode_info *pipe, unsigned flags)
 988{
 989        for (;;) {
 990                if (unlikely(!pipe->readers)) {
 991                        send_sig(SIGPIPE, current, 0);
 992                        return -EPIPE;
 993                }
 994                if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage))
 995                        return 0;
 996                if (flags & SPLICE_F_NONBLOCK)
 997                        return -EAGAIN;
 998                if (signal_pending(current))
 999                        return -ERESTARTSYS;
1000                pipe_wait_writable(pipe);
1001        }
1002}
1003
1004static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
1005                               struct pipe_inode_info *opipe,
1006                               size_t len, unsigned int flags);
1007
1008long splice_file_to_pipe(struct file *in,
1009                         struct pipe_inode_info *opipe,
1010                         loff_t *offset,
1011                         size_t len, unsigned int flags)
1012{
1013        long ret;
1014
1015        pipe_lock(opipe);
1016        ret = wait_for_space(opipe, flags);
1017        if (!ret)
1018                ret = do_splice_to(in, offset, opipe, len, flags);
1019        pipe_unlock(opipe);
1020        if (ret > 0)
1021                wakeup_pipe_readers(opipe);
1022        return ret;
1023}
1024
1025/*
1026 * Determine where to splice to/from.
1027 */
1028long do_splice(struct file *in, loff_t *off_in, struct file *out,
1029               loff_t *off_out, size_t len, unsigned int flags)
1030{
1031        struct pipe_inode_info *ipipe;
1032        struct pipe_inode_info *opipe;
1033        loff_t offset;
1034        long ret;
1035
1036        if (unlikely(!(in->f_mode & FMODE_READ) ||
1037                     !(out->f_mode & FMODE_WRITE)))
1038                return -EBADF;
1039
1040        ipipe = get_pipe_info(in, true);
1041        opipe = get_pipe_info(out, true);
1042
1043        if (ipipe && opipe) {
1044                if (off_in || off_out)
1045                        return -ESPIPE;
1046
1047                /* Splicing to self would be fun, but... */
1048                if (ipipe == opipe)
1049                        return -EINVAL;
1050
1051                if ((in->f_flags | out->f_flags) & O_NONBLOCK)
1052                        flags |= SPLICE_F_NONBLOCK;
1053
1054                return splice_pipe_to_pipe(ipipe, opipe, len, flags);
1055        }
1056
1057        if (ipipe) {
1058                if (off_in)
1059                        return -ESPIPE;
1060                if (off_out) {
1061                        if (!(out->f_mode & FMODE_PWRITE))
1062                                return -EINVAL;
1063                        offset = *off_out;
1064                } else {
1065                        offset = out->f_pos;
1066                }
1067
1068                if (unlikely(out->f_flags & O_APPEND))
1069                        return -EINVAL;
1070
1071                ret = rw_verify_area(WRITE, out, &offset, len);
1072                if (unlikely(ret < 0))
1073                        return ret;
1074
1075                if (in->f_flags & O_NONBLOCK)
1076                        flags |= SPLICE_F_NONBLOCK;
1077
1078                file_start_write(out);
1079                ret = do_splice_from(ipipe, out, &offset, len, flags);
1080                file_end_write(out);
1081
1082                if (!off_out)
1083                        out->f_pos = offset;
1084                else
1085                        *off_out = offset;
1086
1087                return ret;
1088        }
1089
1090        if (opipe) {
1091                if (off_out)
1092                        return -ESPIPE;
1093                if (off_in) {
1094                        if (!(in->f_mode & FMODE_PREAD))
1095                                return -EINVAL;
1096                        offset = *off_in;
1097                } else {
1098                        offset = in->f_pos;
1099                }
1100
1101                if (out->f_flags & O_NONBLOCK)
1102                        flags |= SPLICE_F_NONBLOCK;
1103
1104                ret = splice_file_to_pipe(in, opipe, &offset, len, flags);
1105                if (!off_in)
1106                        in->f_pos = offset;
1107                else
1108                        *off_in = offset;
1109
1110                return ret;
1111        }
1112
1113        return -EINVAL;
1114}
1115
1116static long __do_splice(struct file *in, loff_t __user *off_in,
1117                        struct file *out, loff_t __user *off_out,
1118                        size_t len, unsigned int flags)
1119{
1120        struct pipe_inode_info *ipipe;
1121        struct pipe_inode_info *opipe;
1122        loff_t offset, *__off_in = NULL, *__off_out = NULL;
1123        long ret;
1124
1125        ipipe = get_pipe_info(in, true);
1126        opipe = get_pipe_info(out, true);
1127
1128        if (ipipe && off_in)
1129                return -ESPIPE;
1130        if (opipe && off_out)
1131                return -ESPIPE;
1132
1133        if (off_out) {
1134                if (copy_from_user(&offset, off_out, sizeof(loff_t)))
1135                        return -EFAULT;
1136                __off_out = &offset;
1137        }
1138        if (off_in) {
1139                if (copy_from_user(&offset, off_in, sizeof(loff_t)))
1140                        return -EFAULT;
1141                __off_in = &offset;
1142        }
1143
1144        ret = do_splice(in, __off_in, out, __off_out, len, flags);
1145        if (ret < 0)
1146                return ret;
1147
1148        if (__off_out && copy_to_user(off_out, __off_out, sizeof(loff_t)))
1149                return -EFAULT;
1150        if (__off_in && copy_to_user(off_in, __off_in, sizeof(loff_t)))
1151                return -EFAULT;
1152
1153        return ret;
1154}
1155
1156static int iter_to_pipe(struct iov_iter *from,
1157                        struct pipe_inode_info *pipe,
1158                        unsigned flags)
1159{
1160        struct pipe_buffer buf = {
1161                .ops = &user_page_pipe_buf_ops,
1162                .flags = flags
1163        };
1164        size_t total = 0;
1165        int ret = 0;
1166        bool failed = false;
1167
1168        while (iov_iter_count(from) && !failed) {
1169                struct page *pages[16];
1170                ssize_t copied;
1171                size_t start;
1172                int n;
1173
1174                copied = iov_iter_get_pages(from, pages, ~0UL, 16, &start);
1175                if (copied <= 0) {
1176                        ret = copied;
1177                        break;
1178                }
1179
1180                for (n = 0; copied; n++, start = 0) {
1181                        int size = min_t(int, copied, PAGE_SIZE - start);
1182                        if (!failed) {
1183                                buf.page = pages[n];
1184                                buf.offset = start;
1185                                buf.len = size;
1186                                ret = add_to_pipe(pipe, &buf);
1187                                if (unlikely(ret < 0)) {
1188                                        failed = true;
1189                                } else {
1190                                        iov_iter_advance(from, ret);
1191                                        total += ret;
1192                                }
1193                        } else {
1194                                put_page(pages[n]);
1195                        }
1196                        copied -= size;
1197                }
1198        }
1199        return total ? total : ret;
1200}
1201
1202static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
1203                        struct splice_desc *sd)
1204{
1205        int n = copy_page_to_iter(buf->page, buf->offset, sd->len, sd->u.data);
1206        return n == sd->len ? n : -EFAULT;
1207}
1208
1209/*
1210 * For lack of a better implementation, implement vmsplice() to userspace
1211 * as a simple copy of the pipes pages to the user iov.
1212 */
1213static long vmsplice_to_user(struct file *file, struct iov_iter *iter,
1214                             unsigned int flags)
1215{
1216        struct pipe_inode_info *pipe = get_pipe_info(file, true);
1217        struct splice_desc sd = {
1218                .total_len = iov_iter_count(iter),
1219                .flags = flags,
1220                .u.data = iter
1221        };
1222        long ret = 0;
1223
1224        if (!pipe)
1225                return -EBADF;
1226
1227        if (sd.total_len) {
1228                pipe_lock(pipe);
1229                ret = __splice_from_pipe(pipe, &sd, pipe_to_user);
1230                pipe_unlock(pipe);
1231        }
1232
1233        return ret;
1234}
1235
1236/*
1237 * vmsplice splices a user address range into a pipe. It can be thought of
1238 * as splice-from-memory, where the regular splice is splice-from-file (or
1239 * to file). In both cases the output is a pipe, naturally.
1240 */
1241static long vmsplice_to_pipe(struct file *file, struct iov_iter *iter,
1242                             unsigned int flags)
1243{
1244        struct pipe_inode_info *pipe;
1245        long ret = 0;
1246        unsigned buf_flag = 0;
1247
1248        if (flags & SPLICE_F_GIFT)
1249                buf_flag = PIPE_BUF_FLAG_GIFT;
1250
1251        pipe = get_pipe_info(file, true);
1252        if (!pipe)
1253                return -EBADF;
1254
1255        pipe_lock(pipe);
1256        ret = wait_for_space(pipe, flags);
1257        if (!ret)
1258                ret = iter_to_pipe(iter, pipe, buf_flag);
1259        pipe_unlock(pipe);
1260        if (ret > 0)
1261                wakeup_pipe_readers(pipe);
1262        return ret;
1263}
1264
1265static int vmsplice_type(struct fd f, int *type)
1266{
1267        if (!f.file)
1268                return -EBADF;
1269        if (f.file->f_mode & FMODE_WRITE) {
1270                *type = WRITE;
1271        } else if (f.file->f_mode & FMODE_READ) {
1272                *type = READ;
1273        } else {
1274                fdput(f);
1275                return -EBADF;
1276        }
1277        return 0;
1278}
1279
1280/*
1281 * Note that vmsplice only really supports true splicing _from_ user memory
1282 * to a pipe, not the other way around. Splicing from user memory is a simple
1283 * operation that can be supported without any funky alignment restrictions
1284 * or nasty vm tricks. We simply map in the user memory and fill them into
1285 * a pipe. The reverse isn't quite as easy, though. There are two possible
1286 * solutions for that:
1287 *
1288 *      - memcpy() the data internally, at which point we might as well just
1289 *        do a regular read() on the buffer anyway.
1290 *      - Lots of nasty vm tricks, that are neither fast nor flexible (it
1291 *        has restriction limitations on both ends of the pipe).
1292 *
1293 * Currently we punt and implement it as a normal copy, see pipe_to_user().
1294 *
1295 */
1296SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, uiov,
1297                unsigned long, nr_segs, unsigned int, flags)
1298{
1299        struct iovec iovstack[UIO_FASTIOV];
1300        struct iovec *iov = iovstack;
1301        struct iov_iter iter;
1302        ssize_t error;
1303        struct fd f;
1304        int type;
1305
1306        if (unlikely(flags & ~SPLICE_F_ALL))
1307                return -EINVAL;
1308
1309        f = fdget(fd);
1310        error = vmsplice_type(f, &type);
1311        if (error)
1312                return error;
1313
1314        error = import_iovec(type, uiov, nr_segs,
1315                             ARRAY_SIZE(iovstack), &iov, &iter);
1316        if (error < 0)
1317                goto out_fdput;
1318
1319        if (!iov_iter_count(&iter))
1320                error = 0;
1321        else if (iov_iter_rw(&iter) == WRITE)
1322                error = vmsplice_to_pipe(f.file, &iter, flags);
1323        else
1324                error = vmsplice_to_user(f.file, &iter, flags);
1325
1326        kfree(iov);
1327out_fdput:
1328        fdput(f);
1329        return error;
1330}
1331
1332SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in,
1333                int, fd_out, loff_t __user *, off_out,
1334                size_t, len, unsigned int, flags)
1335{
1336        struct fd in, out;
1337        long error;
1338
1339        if (unlikely(!len))
1340                return 0;
1341
1342        if (unlikely(flags & ~SPLICE_F_ALL))
1343                return -EINVAL;
1344
1345        error = -EBADF;
1346        in = fdget(fd_in);
1347        if (in.file) {
1348                out = fdget(fd_out);
1349                if (out.file) {
1350                        error = __do_splice(in.file, off_in, out.file, off_out,
1351                                                len, flags);
1352                        fdput(out);
1353                }
1354                fdput(in);
1355        }
1356        return error;
1357}
1358
1359/*
1360 * Make sure there's data to read. Wait for input if we can, otherwise
1361 * return an appropriate error.
1362 */
1363static int ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1364{
1365        int ret;
1366
1367        /*
1368         * Check the pipe occupancy without the inode lock first. This function
1369         * is speculative anyways, so missing one is ok.
1370         */
1371        if (!pipe_empty(pipe->head, pipe->tail))
1372                return 0;
1373
1374        ret = 0;
1375        pipe_lock(pipe);
1376
1377        while (pipe_empty(pipe->head, pipe->tail)) {
1378                if (signal_pending(current)) {
1379                        ret = -ERESTARTSYS;
1380                        break;
1381                }
1382                if (!pipe->writers)
1383                        break;
1384                if (flags & SPLICE_F_NONBLOCK) {
1385                        ret = -EAGAIN;
1386                        break;
1387                }
1388                pipe_wait_readable(pipe);
1389        }
1390
1391        pipe_unlock(pipe);
1392        return ret;
1393}
1394
1395/*
1396 * Make sure there's writeable room. Wait for room if we can, otherwise
1397 * return an appropriate error.
1398 */
1399static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1400{
1401        int ret;
1402
1403        /*
1404         * Check pipe occupancy without the inode lock first. This function
1405         * is speculative anyways, so missing one is ok.
1406         */
1407        if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage))
1408                return 0;
1409
1410        ret = 0;
1411        pipe_lock(pipe);
1412
1413        while (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) {
1414                if (!pipe->readers) {
1415                        send_sig(SIGPIPE, current, 0);
1416                        ret = -EPIPE;
1417                        break;
1418                }
1419                if (flags & SPLICE_F_NONBLOCK) {
1420                        ret = -EAGAIN;
1421                        break;
1422                }
1423                if (signal_pending(current)) {
1424                        ret = -ERESTARTSYS;
1425                        break;
1426                }
1427                pipe_wait_writable(pipe);
1428        }
1429
1430        pipe_unlock(pipe);
1431        return ret;
1432}
1433
1434/*
1435 * Splice contents of ipipe to opipe.
1436 */
1437static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
1438                               struct pipe_inode_info *opipe,
1439                               size_t len, unsigned int flags)
1440{
1441        struct pipe_buffer *ibuf, *obuf;
1442        unsigned int i_head, o_head;
1443        unsigned int i_tail, o_tail;
1444        unsigned int i_mask, o_mask;
1445        int ret = 0;
1446        bool input_wakeup = false;
1447
1448
1449retry:
1450        ret = ipipe_prep(ipipe, flags);
1451        if (ret)
1452                return ret;
1453
1454        ret = opipe_prep(opipe, flags);
1455        if (ret)
1456                return ret;
1457
1458        /*
1459         * Potential ABBA deadlock, work around it by ordering lock
1460         * grabbing by pipe info address. Otherwise two different processes
1461         * could deadlock (one doing tee from A -> B, the other from B -> A).
1462         */
1463        pipe_double_lock(ipipe, opipe);
1464
1465        i_tail = ipipe->tail;
1466        i_mask = ipipe->ring_size - 1;
1467        o_head = opipe->head;
1468        o_mask = opipe->ring_size - 1;
1469
1470        do {
1471                size_t o_len;
1472
1473                if (!opipe->readers) {
1474                        send_sig(SIGPIPE, current, 0);
1475                        if (!ret)
1476                                ret = -EPIPE;
1477                        break;
1478                }
1479
1480                i_head = ipipe->head;
1481                o_tail = opipe->tail;
1482
1483                if (pipe_empty(i_head, i_tail) && !ipipe->writers)
1484                        break;
1485
1486                /*
1487                 * Cannot make any progress, because either the input
1488                 * pipe is empty or the output pipe is full.
1489                 */
1490                if (pipe_empty(i_head, i_tail) ||
1491                    pipe_full(o_head, o_tail, opipe->max_usage)) {
1492                        /* Already processed some buffers, break */
1493                        if (ret)
1494                                break;
1495
1496                        if (flags & SPLICE_F_NONBLOCK) {
1497                                ret = -EAGAIN;
1498                                break;
1499                        }
1500
1501                        /*
1502                         * We raced with another reader/writer and haven't
1503                         * managed to process any buffers.  A zero return
1504                         * value means EOF, so retry instead.
1505                         */
1506                        pipe_unlock(ipipe);
1507                        pipe_unlock(opipe);
1508                        goto retry;
1509                }
1510
1511                ibuf = &ipipe->bufs[i_tail & i_mask];
1512                obuf = &opipe->bufs[o_head & o_mask];
1513
1514                if (len >= ibuf->len) {
1515                        /*
1516                         * Simply move the whole buffer from ipipe to opipe
1517                         */
1518                        *obuf = *ibuf;
1519                        ibuf->ops = NULL;
1520                        i_tail++;
1521                        ipipe->tail = i_tail;
1522                        input_wakeup = true;
1523                        o_len = obuf->len;
1524                        o_head++;
1525                        opipe->head = o_head;
1526                } else {
1527                        /*
1528                         * Get a reference to this pipe buffer,
1529                         * so we can copy the contents over.
1530                         */
1531                        if (!pipe_buf_get(ipipe, ibuf)) {
1532                                if (ret == 0)
1533                                        ret = -EFAULT;
1534                                break;
1535                        }
1536                        *obuf = *ibuf;
1537
1538                        /*
1539                         * Don't inherit the gift and merge flags, we need to
1540                         * prevent multiple steals of this page.
1541                         */
1542                        obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
1543                        obuf->flags &= ~PIPE_BUF_FLAG_CAN_MERGE;
1544
1545                        obuf->len = len;
1546                        ibuf->offset += len;
1547                        ibuf->len -= len;
1548                        o_len = len;
1549                        o_head++;
1550                        opipe->head = o_head;
1551                }
1552                ret += o_len;
1553                len -= o_len;
1554        } while (len);
1555
1556        pipe_unlock(ipipe);
1557        pipe_unlock(opipe);
1558
1559        /*
1560         * If we put data in the output pipe, wakeup any potential readers.
1561         */
1562        if (ret > 0)
1563                wakeup_pipe_readers(opipe);
1564
1565        if (input_wakeup)
1566                wakeup_pipe_writers(ipipe);
1567
1568        return ret;
1569}
1570
1571/*
1572 * Link contents of ipipe to opipe.
1573 */
1574static int link_pipe(struct pipe_inode_info *ipipe,
1575                     struct pipe_inode_info *opipe,
1576                     size_t len, unsigned int flags)
1577{
1578        struct pipe_buffer *ibuf, *obuf;
1579        unsigned int i_head, o_head;
1580        unsigned int i_tail, o_tail;
1581        unsigned int i_mask, o_mask;
1582        int ret = 0;
1583
1584        /*
1585         * Potential ABBA deadlock, work around it by ordering lock
1586         * grabbing by pipe info address. Otherwise two different processes
1587         * could deadlock (one doing tee from A -> B, the other from B -> A).
1588         */
1589        pipe_double_lock(ipipe, opipe);
1590
1591        i_tail = ipipe->tail;
1592        i_mask = ipipe->ring_size - 1;
1593        o_head = opipe->head;
1594        o_mask = opipe->ring_size - 1;
1595
1596        do {
1597                if (!opipe->readers) {
1598                        send_sig(SIGPIPE, current, 0);
1599                        if (!ret)
1600                                ret = -EPIPE;
1601                        break;
1602                }
1603
1604                i_head = ipipe->head;
1605                o_tail = opipe->tail;
1606
1607                /*
1608                 * If we have iterated all input buffers or run out of
1609                 * output room, break.
1610                 */
1611                if (pipe_empty(i_head, i_tail) ||
1612                    pipe_full(o_head, o_tail, opipe->max_usage))
1613                        break;
1614
1615                ibuf = &ipipe->bufs[i_tail & i_mask];
1616                obuf = &opipe->bufs[o_head & o_mask];
1617
1618                /*
1619                 * Get a reference to this pipe buffer,
1620                 * so we can copy the contents over.
1621                 */
1622                if (!pipe_buf_get(ipipe, ibuf)) {
1623                        if (ret == 0)
1624                                ret = -EFAULT;
1625                        break;
1626                }
1627
1628                *obuf = *ibuf;
1629
1630                /*
1631                 * Don't inherit the gift and merge flag, we need to prevent
1632                 * multiple steals of this page.
1633                 */
1634                obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
1635                obuf->flags &= ~PIPE_BUF_FLAG_CAN_MERGE;
1636
1637                if (obuf->len > len)
1638                        obuf->len = len;
1639                ret += obuf->len;
1640                len -= obuf->len;
1641
1642                o_head++;
1643                opipe->head = o_head;
1644                i_tail++;
1645        } while (len);
1646
1647        pipe_unlock(ipipe);
1648        pipe_unlock(opipe);
1649
1650        /*
1651         * If we put data in the output pipe, wakeup any potential readers.
1652         */
1653        if (ret > 0)
1654                wakeup_pipe_readers(opipe);
1655
1656        return ret;
1657}
1658
1659/*
1660 * This is a tee(1) implementation that works on pipes. It doesn't copy
1661 * any data, it simply references the 'in' pages on the 'out' pipe.
1662 * The 'flags' used are the SPLICE_F_* variants, currently the only
1663 * applicable one is SPLICE_F_NONBLOCK.
1664 */
1665long do_tee(struct file *in, struct file *out, size_t len, unsigned int flags)
1666{
1667        struct pipe_inode_info *ipipe = get_pipe_info(in, true);
1668        struct pipe_inode_info *opipe = get_pipe_info(out, true);
1669        int ret = -EINVAL;
1670
1671        if (unlikely(!(in->f_mode & FMODE_READ) ||
1672                     !(out->f_mode & FMODE_WRITE)))
1673                return -EBADF;
1674
1675        /*
1676         * Duplicate the contents of ipipe to opipe without actually
1677         * copying the data.
1678         */
1679        if (ipipe && opipe && ipipe != opipe) {
1680                if ((in->f_flags | out->f_flags) & O_NONBLOCK)
1681                        flags |= SPLICE_F_NONBLOCK;
1682
1683                /*
1684                 * Keep going, unless we encounter an error. The ipipe/opipe
1685                 * ordering doesn't really matter.
1686                 */
1687                ret = ipipe_prep(ipipe, flags);
1688                if (!ret) {
1689                        ret = opipe_prep(opipe, flags);
1690                        if (!ret)
1691                                ret = link_pipe(ipipe, opipe, len, flags);
1692                }
1693        }
1694
1695        return ret;
1696}
1697
1698SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags)
1699{
1700        struct fd in, out;
1701        int error;
1702
1703        if (unlikely(flags & ~SPLICE_F_ALL))
1704                return -EINVAL;
1705
1706        if (unlikely(!len))
1707                return 0;
1708
1709        error = -EBADF;
1710        in = fdget(fdin);
1711        if (in.file) {
1712                out = fdget(fdout);
1713                if (out.file) {
1714                        error = do_tee(in.file, out.file, len, flags);
1715                        fdput(out);
1716                }
1717                fdput(in);
1718        }
1719
1720        return error;
1721}
1722