linux/fs/splice.c
<<
>>
Prefs
   1/*
   2 * "splice": joining two ropes together by interweaving their strands.
   3 *
   4 * This is the "extended pipe" functionality, where a pipe is used as
   5 * an arbitrary in-memory buffer. Think of a pipe as a small kernel
   6 * buffer that you can use to transfer data from one end to the other.
   7 *
   8 * The traditional unix read/write is extended with a "splice()" operation
   9 * that transfers data buffers to or from a pipe buffer.
  10 *
  11 * Named by Larry McVoy, original implementation from Linus, extended by
  12 * Jens to support splicing to files, network, direct splicing, etc and
  13 * fixing lots of bugs.
  14 *
  15 * Copyright (C) 2005-2006 Jens Axboe <axboe@kernel.dk>
  16 * Copyright (C) 2005-2006 Linus Torvalds <torvalds@osdl.org>
  17 * Copyright (C) 2006 Ingo Molnar <mingo@elte.hu>
  18 *
  19 */
  20#include <linux/fs.h>
  21#include <linux/file.h>
  22#include <linux/pagemap.h>
  23#include <linux/splice.h>
  24#include <linux/memcontrol.h>
  25#include <linux/mm_inline.h>
  26#include <linux/swap.h>
  27#include <linux/writeback.h>
  28#include <linux/buffer_head.h>
  29#include <linux/module.h>
  30#include <linux/syscalls.h>
  31#include <linux/uio.h>
  32#include <linux/security.h>
  33
  34/*
  35 * Attempt to steal a page from a pipe buffer. This should perhaps go into
  36 * a vm helper function, it's already simplified quite a bit by the
  37 * addition of remove_mapping(). If success is returned, the caller may
  38 * attempt to reuse this page for another destination.
  39 */
  40static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe,
  41                                     struct pipe_buffer *buf)
  42{
  43        struct page *page = buf->page;
  44        struct address_space *mapping;
  45
  46        lock_page(page);
  47
  48        mapping = page_mapping(page);
  49        if (mapping) {
  50                WARN_ON(!PageUptodate(page));
  51
  52                /*
  53                 * At least for ext2 with nobh option, we need to wait on
  54                 * writeback completing on this page, since we'll remove it
  55                 * from the pagecache.  Otherwise truncate wont wait on the
  56                 * page, allowing the disk blocks to be reused by someone else
  57                 * before we actually wrote our data to them. fs corruption
  58                 * ensues.
  59                 */
  60                wait_on_page_writeback(page);
  61
  62                if (page_has_private(page) &&
  63                    !try_to_release_page(page, GFP_KERNEL))
  64                        goto out_unlock;
  65
  66                /*
  67                 * If we succeeded in removing the mapping, set LRU flag
  68                 * and return good.
  69                 */
  70                if (remove_mapping(mapping, page)) {
  71                        buf->flags |= PIPE_BUF_FLAG_LRU;
  72                        return 0;
  73                }
  74        }
  75
  76        /*
  77         * Raced with truncate or failed to remove page from current
  78         * address space, unlock and return failure.
  79         */
  80out_unlock:
  81        unlock_page(page);
  82        return 1;
  83}
  84
  85static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe,
  86                                        struct pipe_buffer *buf)
  87{
  88        page_cache_release(buf->page);
  89        buf->flags &= ~PIPE_BUF_FLAG_LRU;
  90}
  91
  92/*
  93 * Check whether the contents of buf is OK to access. Since the content
  94 * is a page cache page, IO may be in flight.
  95 */
  96static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe,
  97                                       struct pipe_buffer *buf)
  98{
  99        struct page *page = buf->page;
 100        int err;
 101
 102        if (!PageUptodate(page)) {
 103                lock_page(page);
 104
 105                /*
 106                 * Page got truncated/unhashed. This will cause a 0-byte
 107                 * splice, if this is the first page.
 108                 */
 109                if (!page->mapping) {
 110                        err = -ENODATA;
 111                        goto error;
 112                }
 113
 114                /*
 115                 * Uh oh, read-error from disk.
 116                 */
 117                if (!PageUptodate(page)) {
 118                        err = -EIO;
 119                        goto error;
 120                }
 121
 122                /*
 123                 * Page is ok afterall, we are done.
 124                 */
 125                unlock_page(page);
 126        }
 127
 128        return 0;
 129error:
 130        unlock_page(page);
 131        return err;
 132}
 133
 134static const struct pipe_buf_operations page_cache_pipe_buf_ops = {
 135        .can_merge = 0,
 136        .map = generic_pipe_buf_map,
 137        .unmap = generic_pipe_buf_unmap,
 138        .confirm = page_cache_pipe_buf_confirm,
 139        .release = page_cache_pipe_buf_release,
 140        .steal = page_cache_pipe_buf_steal,
 141        .get = generic_pipe_buf_get,
 142};
 143
 144static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe,
 145                                    struct pipe_buffer *buf)
 146{
 147        if (!(buf->flags & PIPE_BUF_FLAG_GIFT))
 148                return 1;
 149
 150        buf->flags |= PIPE_BUF_FLAG_LRU;
 151        return generic_pipe_buf_steal(pipe, buf);
 152}
 153
 154static const struct pipe_buf_operations user_page_pipe_buf_ops = {
 155        .can_merge = 0,
 156        .map = generic_pipe_buf_map,
 157        .unmap = generic_pipe_buf_unmap,
 158        .confirm = generic_pipe_buf_confirm,
 159        .release = page_cache_pipe_buf_release,
 160        .steal = user_page_pipe_buf_steal,
 161        .get = generic_pipe_buf_get,
 162};
 163
 164/**
 165 * splice_to_pipe - fill passed data into a pipe
 166 * @pipe:       pipe to fill
 167 * @spd:        data to fill
 168 *
 169 * Description:
 170 *    @spd contains a map of pages and len/offset tuples, along with
 171 *    the struct pipe_buf_operations associated with these pages. This
 172 *    function will link that data to the pipe.
 173 *
 174 */
 175ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
 176                       struct splice_pipe_desc *spd)
 177{
 178        unsigned int spd_pages = spd->nr_pages;
 179        int ret, do_wakeup, page_nr;
 180
 181        ret = 0;
 182        do_wakeup = 0;
 183        page_nr = 0;
 184
 185        pipe_lock(pipe);
 186
 187        for (;;) {
 188                if (!pipe->readers) {
 189                        send_sig(SIGPIPE, current, 0);
 190                        if (!ret)
 191                                ret = -EPIPE;
 192                        break;
 193                }
 194
 195                if (pipe->nrbufs < PIPE_BUFFERS) {
 196                        int newbuf = (pipe->curbuf + pipe->nrbufs) & (PIPE_BUFFERS - 1);
 197                        struct pipe_buffer *buf = pipe->bufs + newbuf;
 198
 199                        buf->page = spd->pages[page_nr];
 200                        buf->offset = spd->partial[page_nr].offset;
 201                        buf->len = spd->partial[page_nr].len;
 202                        buf->private = spd->partial[page_nr].private;
 203                        buf->ops = spd->ops;
 204                        if (spd->flags & SPLICE_F_GIFT)
 205                                buf->flags |= PIPE_BUF_FLAG_GIFT;
 206
 207                        pipe->nrbufs++;
 208                        page_nr++;
 209                        ret += buf->len;
 210
 211                        if (pipe->inode)
 212                                do_wakeup = 1;
 213
 214                        if (!--spd->nr_pages)
 215                                break;
 216                        if (pipe->nrbufs < PIPE_BUFFERS)
 217                                continue;
 218
 219                        break;
 220                }
 221
 222                if (spd->flags & SPLICE_F_NONBLOCK) {
 223                        if (!ret)
 224                                ret = -EAGAIN;
 225                        break;
 226                }
 227
 228                if (signal_pending(current)) {
 229                        if (!ret)
 230                                ret = -ERESTARTSYS;
 231                        break;
 232                }
 233
 234                if (do_wakeup) {
 235                        smp_mb();
 236                        if (waitqueue_active(&pipe->wait))
 237                                wake_up_interruptible_sync(&pipe->wait);
 238                        kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
 239                        do_wakeup = 0;
 240                }
 241
 242                pipe->waiting_writers++;
 243                pipe_wait(pipe);
 244                pipe->waiting_writers--;
 245        }
 246
 247        pipe_unlock(pipe);
 248
 249        if (do_wakeup) {
 250                smp_mb();
 251                if (waitqueue_active(&pipe->wait))
 252                        wake_up_interruptible(&pipe->wait);
 253                kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
 254        }
 255
 256        while (page_nr < spd_pages)
 257                spd->spd_release(spd, page_nr++);
 258
 259        return ret;
 260}
 261
 262static void spd_release_page(struct splice_pipe_desc *spd, unsigned int i)
 263{
 264        page_cache_release(spd->pages[i]);
 265}
 266
 267static int
 268__generic_file_splice_read(struct file *in, loff_t *ppos,
 269                           struct pipe_inode_info *pipe, size_t len,
 270                           unsigned int flags)
 271{
 272        struct address_space *mapping = in->f_mapping;
 273        unsigned int loff, nr_pages, req_pages;
 274        struct page *pages[PIPE_BUFFERS];
 275        struct partial_page partial[PIPE_BUFFERS];
 276        struct page *page;
 277        pgoff_t index, end_index;
 278        loff_t isize;
 279        int error, page_nr;
 280        struct splice_pipe_desc spd = {
 281                .pages = pages,
 282                .partial = partial,
 283                .flags = flags,
 284                .ops = &page_cache_pipe_buf_ops,
 285                .spd_release = spd_release_page,
 286        };
 287
 288        index = *ppos >> PAGE_CACHE_SHIFT;
 289        loff = *ppos & ~PAGE_CACHE_MASK;
 290        req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 291        nr_pages = min(req_pages, (unsigned)PIPE_BUFFERS);
 292
 293        /*
 294         * Lookup the (hopefully) full range of pages we need.
 295         */
 296        spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, pages);
 297        index += spd.nr_pages;
 298
 299        /*
 300         * If find_get_pages_contig() returned fewer pages than we needed,
 301         * readahead/allocate the rest and fill in the holes.
 302         */
 303        if (spd.nr_pages < nr_pages)
 304                page_cache_sync_readahead(mapping, &in->f_ra, in,
 305                                index, req_pages - spd.nr_pages);
 306
 307        error = 0;
 308        while (spd.nr_pages < nr_pages) {
 309                /*
 310                 * Page could be there, find_get_pages_contig() breaks on
 311                 * the first hole.
 312                 */
 313                page = find_get_page(mapping, index);
 314                if (!page) {
 315                        /*
 316                         * page didn't exist, allocate one.
 317                         */
 318                        page = page_cache_alloc_cold(mapping);
 319                        if (!page)
 320                                break;
 321
 322                        error = add_to_page_cache_lru(page, mapping, index,
 323                                                mapping_gfp_mask(mapping));
 324                        if (unlikely(error)) {
 325                                page_cache_release(page);
 326                                if (error == -EEXIST)
 327                                        continue;
 328                                break;
 329                        }
 330                        /*
 331                         * add_to_page_cache() locks the page, unlock it
 332                         * to avoid convoluting the logic below even more.
 333                         */
 334                        unlock_page(page);
 335                }
 336
 337                pages[spd.nr_pages++] = page;
 338                index++;
 339        }
 340
 341        /*
 342         * Now loop over the map and see if we need to start IO on any
 343         * pages, fill in the partial map, etc.
 344         */
 345        index = *ppos >> PAGE_CACHE_SHIFT;
 346        nr_pages = spd.nr_pages;
 347        spd.nr_pages = 0;
 348        for (page_nr = 0; page_nr < nr_pages; page_nr++) {
 349                unsigned int this_len;
 350
 351                if (!len)
 352                        break;
 353
 354                /*
 355                 * this_len is the max we'll use from this page
 356                 */
 357                this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff);
 358                page = pages[page_nr];
 359
 360                if (PageReadahead(page))
 361                        page_cache_async_readahead(mapping, &in->f_ra, in,
 362                                        page, index, req_pages - page_nr);
 363
 364                /*
 365                 * If the page isn't uptodate, we may need to start io on it
 366                 */
 367                if (!PageUptodate(page)) {
 368                        /*
 369                         * If in nonblock mode then dont block on waiting
 370                         * for an in-flight io page
 371                         */
 372                        if (flags & SPLICE_F_NONBLOCK) {
 373                                if (!trylock_page(page)) {
 374                                        error = -EAGAIN;
 375                                        break;
 376                                }
 377                        } else
 378                                lock_page(page);
 379
 380                        /*
 381                         * Page was truncated, or invalidated by the
 382                         * filesystem.  Redo the find/create, but this time the
 383                         * page is kept locked, so there's no chance of another
 384                         * race with truncate/invalidate.
 385                         */
 386                        if (!page->mapping) {
 387                                unlock_page(page);
 388                                page = find_or_create_page(mapping, index,
 389                                                mapping_gfp_mask(mapping));
 390
 391                                if (!page) {
 392                                        error = -ENOMEM;
 393                                        break;
 394                                }
 395                                page_cache_release(pages[page_nr]);
 396                                pages[page_nr] = page;
 397                        }
 398                        /*
 399                         * page was already under io and is now done, great
 400                         */
 401                        if (PageUptodate(page)) {
 402                                unlock_page(page);
 403                                goto fill_it;
 404                        }
 405
 406                        /*
 407                         * need to read in the page
 408                         */
 409                        error = mapping->a_ops->readpage(in, page);
 410                        if (unlikely(error)) {
 411                                /*
 412                                 * We really should re-lookup the page here,
 413                                 * but it complicates things a lot. Instead
 414                                 * lets just do what we already stored, and
 415                                 * we'll get it the next time we are called.
 416                                 */
 417                                if (error == AOP_TRUNCATED_PAGE)
 418                                        error = 0;
 419
 420                                break;
 421                        }
 422                }
 423fill_it:
 424                /*
 425                 * i_size must be checked after PageUptodate.
 426                 */
 427                isize = i_size_read(mapping->host);
 428                end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
 429                if (unlikely(!isize || index > end_index))
 430                        break;
 431
 432                /*
 433                 * if this is the last page, see if we need to shrink
 434                 * the length and stop
 435                 */
 436                if (end_index == index) {
 437                        unsigned int plen;
 438
 439                        /*
 440                         * max good bytes in this page
 441                         */
 442                        plen = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
 443                        if (plen <= loff)
 444                                break;
 445
 446                        /*
 447                         * force quit after adding this page
 448                         */
 449                        this_len = min(this_len, plen - loff);
 450                        len = this_len;
 451                }
 452
 453                partial[page_nr].offset = loff;
 454                partial[page_nr].len = this_len;
 455                len -= this_len;
 456                loff = 0;
 457                spd.nr_pages++;
 458                index++;
 459        }
 460
 461        /*
 462         * Release any pages at the end, if we quit early. 'page_nr' is how far
 463         * we got, 'nr_pages' is how many pages are in the map.
 464         */
 465        while (page_nr < nr_pages)
 466                page_cache_release(pages[page_nr++]);
 467        in->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
 468
 469        if (spd.nr_pages)
 470                return splice_to_pipe(pipe, &spd);
 471
 472        return error;
 473}
 474
 475/**
 476 * generic_file_splice_read - splice data from file to a pipe
 477 * @in:         file to splice from
 478 * @ppos:       position in @in
 479 * @pipe:       pipe to splice to
 480 * @len:        number of bytes to splice
 481 * @flags:      splice modifier flags
 482 *
 483 * Description:
 484 *    Will read pages from given file and fill them into a pipe. Can be
 485 *    used as long as the address_space operations for the source implements
 486 *    a readpage() hook.
 487 *
 488 */
 489ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
 490                                 struct pipe_inode_info *pipe, size_t len,
 491                                 unsigned int flags)
 492{
 493        loff_t isize, left;
 494        int ret;
 495
 496        isize = i_size_read(in->f_mapping->host);
 497        if (unlikely(*ppos >= isize))
 498                return 0;
 499
 500        left = isize - *ppos;
 501        if (unlikely(left < len))
 502                len = left;
 503
 504        ret = __generic_file_splice_read(in, ppos, pipe, len, flags);
 505        if (ret > 0) {
 506                *ppos += ret;
 507                file_accessed(in);
 508        }
 509
 510        return ret;
 511}
 512EXPORT_SYMBOL(generic_file_splice_read);
 513
 514static const struct pipe_buf_operations default_pipe_buf_ops = {
 515        .can_merge = 0,
 516        .map = generic_pipe_buf_map,
 517        .unmap = generic_pipe_buf_unmap,
 518        .confirm = generic_pipe_buf_confirm,
 519        .release = generic_pipe_buf_release,
 520        .steal = generic_pipe_buf_steal,
 521        .get = generic_pipe_buf_get,
 522};
 523
 524static ssize_t kernel_readv(struct file *file, const struct iovec *vec,
 525                            unsigned long vlen, loff_t offset)
 526{
 527        mm_segment_t old_fs;
 528        loff_t pos = offset;
 529        ssize_t res;
 530
 531        old_fs = get_fs();
 532        set_fs(get_ds());
 533        /* The cast to a user pointer is valid due to the set_fs() */
 534        res = vfs_readv(file, (const struct iovec __user *)vec, vlen, &pos);
 535        set_fs(old_fs);
 536
 537        return res;
 538}
 539
 540static ssize_t kernel_write(struct file *file, const char *buf, size_t count,
 541                            loff_t pos)
 542{
 543        mm_segment_t old_fs;
 544        ssize_t res;
 545
 546        old_fs = get_fs();
 547        set_fs(get_ds());
 548        /* The cast to a user pointer is valid due to the set_fs() */
 549        res = vfs_write(file, (const char __user *)buf, count, &pos);
 550        set_fs(old_fs);
 551
 552        return res;
 553}
 554
 555ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
 556                                 struct pipe_inode_info *pipe, size_t len,
 557                                 unsigned int flags)
 558{
 559        unsigned int nr_pages;
 560        unsigned int nr_freed;
 561        size_t offset;
 562        struct page *pages[PIPE_BUFFERS];
 563        struct partial_page partial[PIPE_BUFFERS];
 564        struct iovec vec[PIPE_BUFFERS];
 565        pgoff_t index;
 566        ssize_t res;
 567        size_t this_len;
 568        int error;
 569        int i;
 570        struct splice_pipe_desc spd = {
 571                .pages = pages,
 572                .partial = partial,
 573                .flags = flags,
 574                .ops = &default_pipe_buf_ops,
 575                .spd_release = spd_release_page,
 576        };
 577
 578        index = *ppos >> PAGE_CACHE_SHIFT;
 579        offset = *ppos & ~PAGE_CACHE_MASK;
 580        nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 581
 582        for (i = 0; i < nr_pages && i < PIPE_BUFFERS && len; i++) {
 583                struct page *page;
 584
 585                page = alloc_page(GFP_USER);
 586                error = -ENOMEM;
 587                if (!page)
 588                        goto err;
 589
 590                this_len = min_t(size_t, len, PAGE_CACHE_SIZE - offset);
 591                vec[i].iov_base = (void __user *) page_address(page);
 592                vec[i].iov_len = this_len;
 593                pages[i] = page;
 594                spd.nr_pages++;
 595                len -= this_len;
 596                offset = 0;
 597        }
 598
 599        res = kernel_readv(in, vec, spd.nr_pages, *ppos);
 600        if (res < 0) {
 601                error = res;
 602                goto err;
 603        }
 604
 605        error = 0;
 606        if (!res)
 607                goto err;
 608
 609        nr_freed = 0;
 610        for (i = 0; i < spd.nr_pages; i++) {
 611                this_len = min_t(size_t, vec[i].iov_len, res);
 612                partial[i].offset = 0;
 613                partial[i].len = this_len;
 614                if (!this_len) {
 615                        __free_page(pages[i]);
 616                        pages[i] = NULL;
 617                        nr_freed++;
 618                }
 619                res -= this_len;
 620        }
 621        spd.nr_pages -= nr_freed;
 622
 623        res = splice_to_pipe(pipe, &spd);
 624        if (res > 0)
 625                *ppos += res;
 626
 627        return res;
 628
 629err:
 630        for (i = 0; i < spd.nr_pages; i++)
 631                __free_page(pages[i]);
 632
 633        return error;
 634}
 635EXPORT_SYMBOL(default_file_splice_read);
 636
 637/*
 638 * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos'
 639 * using sendpage(). Return the number of bytes sent.
 640 */
 641static int pipe_to_sendpage(struct pipe_inode_info *pipe,
 642                            struct pipe_buffer *buf, struct splice_desc *sd)
 643{
 644        struct file *file = sd->u.file;
 645        loff_t pos = sd->pos;
 646        int ret, more;
 647
 648        ret = buf->ops->confirm(pipe, buf);
 649        if (!ret) {
 650                more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len;
 651
 652                ret = file->f_op->sendpage(file, buf->page, buf->offset,
 653                                           sd->len, &pos, more);
 654        }
 655
 656        return ret;
 657}
 658
 659/*
 660 * This is a little more tricky than the file -> pipe splicing. There are
 661 * basically three cases:
 662 *
 663 *      - Destination page already exists in the address space and there
 664 *        are users of it. For that case we have no other option that
 665 *        copying the data. Tough luck.
 666 *      - Destination page already exists in the address space, but there
 667 *        are no users of it. Make sure it's uptodate, then drop it. Fall
 668 *        through to last case.
 669 *      - Destination page does not exist, we can add the pipe page to
 670 *        the page cache and avoid the copy.
 671 *
 672 * If asked to move pages to the output file (SPLICE_F_MOVE is set in
 673 * sd->flags), we attempt to migrate pages from the pipe to the output
 674 * file address space page cache. This is possible if no one else has
 675 * the pipe page referenced outside of the pipe and page cache. If
 676 * SPLICE_F_MOVE isn't set, or we cannot move the page, we simply create
 677 * a new page in the output file page cache and fill/dirty that.
 678 */
 679int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
 680                 struct splice_desc *sd)
 681{
 682        struct file *file = sd->u.file;
 683        struct address_space *mapping = file->f_mapping;
 684        unsigned int offset, this_len;
 685        struct page *page;
 686        void *fsdata;
 687        int ret;
 688
 689        /*
 690         * make sure the data in this buffer is uptodate
 691         */
 692        ret = buf->ops->confirm(pipe, buf);
 693        if (unlikely(ret))
 694                return ret;
 695
 696        offset = sd->pos & ~PAGE_CACHE_MASK;
 697
 698        this_len = sd->len;
 699        if (this_len + offset > PAGE_CACHE_SIZE)
 700                this_len = PAGE_CACHE_SIZE - offset;
 701
 702        ret = pagecache_write_begin(file, mapping, sd->pos, this_len,
 703                                AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata);
 704        if (unlikely(ret))
 705                goto out;
 706
 707        if (buf->page != page) {
 708                /*
 709                 * Careful, ->map() uses KM_USER0!
 710                 */
 711                char *src = buf->ops->map(pipe, buf, 1);
 712                char *dst = kmap_atomic(page, KM_USER1);
 713
 714                memcpy(dst + offset, src + buf->offset, this_len);
 715                flush_dcache_page(page);
 716                kunmap_atomic(dst, KM_USER1);
 717                buf->ops->unmap(pipe, buf, src);
 718        }
 719        ret = pagecache_write_end(file, mapping, sd->pos, this_len, this_len,
 720                                page, fsdata);
 721out:
 722        return ret;
 723}
 724EXPORT_SYMBOL(pipe_to_file);
 725
 726static void wakeup_pipe_writers(struct pipe_inode_info *pipe)
 727{
 728        smp_mb();
 729        if (waitqueue_active(&pipe->wait))
 730                wake_up_interruptible(&pipe->wait);
 731        kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
 732}
 733
 734/**
 735 * splice_from_pipe_feed - feed available data from a pipe to a file
 736 * @pipe:       pipe to splice from
 737 * @sd:         information to @actor
 738 * @actor:      handler that splices the data
 739 *
 740 * Description:
 741 *    This function loops over the pipe and calls @actor to do the
 742 *    actual moving of a single struct pipe_buffer to the desired
 743 *    destination.  It returns when there's no more buffers left in
 744 *    the pipe or if the requested number of bytes (@sd->total_len)
 745 *    have been copied.  It returns a positive number (one) if the
 746 *    pipe needs to be filled with more data, zero if the required
 747 *    number of bytes have been copied and -errno on error.
 748 *
 749 *    This, together with splice_from_pipe_{begin,end,next}, may be
 750 *    used to implement the functionality of __splice_from_pipe() when
 751 *    locking is required around copying the pipe buffers to the
 752 *    destination.
 753 */
 754int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd,
 755                          splice_actor *actor)
 756{
 757        int ret;
 758
 759        while (pipe->nrbufs) {
 760                struct pipe_buffer *buf = pipe->bufs + pipe->curbuf;
 761                const struct pipe_buf_operations *ops = buf->ops;
 762
 763                sd->len = buf->len;
 764                if (sd->len > sd->total_len)
 765                        sd->len = sd->total_len;
 766
 767                ret = actor(pipe, buf, sd);
 768                if (ret <= 0) {
 769                        if (ret == -ENODATA)
 770                                ret = 0;
 771                        return ret;
 772                }
 773                buf->offset += ret;
 774                buf->len -= ret;
 775
 776                sd->num_spliced += ret;
 777                sd->len -= ret;
 778                sd->pos += ret;
 779                sd->total_len -= ret;
 780
 781                if (!buf->len) {
 782                        buf->ops = NULL;
 783                        ops->release(pipe, buf);
 784                        pipe->curbuf = (pipe->curbuf + 1) & (PIPE_BUFFERS - 1);
 785                        pipe->nrbufs--;
 786                        if (pipe->inode)
 787                                sd->need_wakeup = true;
 788                }
 789
 790                if (!sd->total_len)
 791                        return 0;
 792        }
 793
 794        return 1;
 795}
 796EXPORT_SYMBOL(splice_from_pipe_feed);
 797
 798/**
 799 * splice_from_pipe_next - wait for some data to splice from
 800 * @pipe:       pipe to splice from
 801 * @sd:         information about the splice operation
 802 *
 803 * Description:
 804 *    This function will wait for some data and return a positive
 805 *    value (one) if pipe buffers are available.  It will return zero
 806 *    or -errno if no more data needs to be spliced.
 807 */
 808int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_desc *sd)
 809{
 810        while (!pipe->nrbufs) {
 811                if (!pipe->writers)
 812                        return 0;
 813
 814                if (!pipe->waiting_writers && sd->num_spliced)
 815                        return 0;
 816
 817                if (sd->flags & SPLICE_F_NONBLOCK)
 818                        return -EAGAIN;
 819
 820                if (signal_pending(current))
 821                        return -ERESTARTSYS;
 822
 823                if (sd->need_wakeup) {
 824                        wakeup_pipe_writers(pipe);
 825                        sd->need_wakeup = false;
 826                }
 827
 828                pipe_wait(pipe);
 829        }
 830
 831        return 1;
 832}
 833EXPORT_SYMBOL(splice_from_pipe_next);
 834
 835/**
 836 * splice_from_pipe_begin - start splicing from pipe
 837 * @sd:         information about the splice operation
 838 *
 839 * Description:
 840 *    This function should be called before a loop containing
 841 *    splice_from_pipe_next() and splice_from_pipe_feed() to
 842 *    initialize the necessary fields of @sd.
 843 */
 844void splice_from_pipe_begin(struct splice_desc *sd)
 845{
 846        sd->num_spliced = 0;
 847        sd->need_wakeup = false;
 848}
 849EXPORT_SYMBOL(splice_from_pipe_begin);
 850
 851/**
 852 * splice_from_pipe_end - finish splicing from pipe
 853 * @pipe:       pipe to splice from
 854 * @sd:         information about the splice operation
 855 *
 856 * Description:
 857 *    This function will wake up pipe writers if necessary.  It should
 858 *    be called after a loop containing splice_from_pipe_next() and
 859 *    splice_from_pipe_feed().
 860 */
 861void splice_from_pipe_end(struct pipe_inode_info *pipe, struct splice_desc *sd)
 862{
 863        if (sd->need_wakeup)
 864                wakeup_pipe_writers(pipe);
 865}
 866EXPORT_SYMBOL(splice_from_pipe_end);
 867
 868/**
 869 * __splice_from_pipe - splice data from a pipe to given actor
 870 * @pipe:       pipe to splice from
 871 * @sd:         information to @actor
 872 * @actor:      handler that splices the data
 873 *
 874 * Description:
 875 *    This function does little more than loop over the pipe and call
 876 *    @actor to do the actual moving of a single struct pipe_buffer to
 877 *    the desired destination. See pipe_to_file, pipe_to_sendpage, or
 878 *    pipe_to_user.
 879 *
 880 */
 881ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd,
 882                           splice_actor *actor)
 883{
 884        int ret;
 885
 886        splice_from_pipe_begin(sd);
 887        do {
 888                ret = splice_from_pipe_next(pipe, sd);
 889                if (ret > 0)
 890                        ret = splice_from_pipe_feed(pipe, sd, actor);
 891        } while (ret > 0);
 892        splice_from_pipe_end(pipe, sd);
 893
 894        return sd->num_spliced ? sd->num_spliced : ret;
 895}
 896EXPORT_SYMBOL(__splice_from_pipe);
 897
 898/**
 899 * splice_from_pipe - splice data from a pipe to a file
 900 * @pipe:       pipe to splice from
 901 * @out:        file to splice to
 902 * @ppos:       position in @out
 903 * @len:        how many bytes to splice
 904 * @flags:      splice modifier flags
 905 * @actor:      handler that splices the data
 906 *
 907 * Description:
 908 *    See __splice_from_pipe. This function locks the pipe inode,
 909 *    otherwise it's identical to __splice_from_pipe().
 910 *
 911 */
 912ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
 913                         loff_t *ppos, size_t len, unsigned int flags,
 914                         splice_actor *actor)
 915{
 916        ssize_t ret;
 917        struct splice_desc sd = {
 918                .total_len = len,
 919                .flags = flags,
 920                .pos = *ppos,
 921                .u.file = out,
 922        };
 923
 924        pipe_lock(pipe);
 925        ret = __splice_from_pipe(pipe, &sd, actor);
 926        pipe_unlock(pipe);
 927
 928        return ret;
 929}
 930
 931/**
 932 * generic_file_splice_write - splice data from a pipe to a file
 933 * @pipe:       pipe info
 934 * @out:        file to write to
 935 * @ppos:       position in @out
 936 * @len:        number of bytes to splice
 937 * @flags:      splice modifier flags
 938 *
 939 * Description:
 940 *    Will either move or copy pages (determined by @flags options) from
 941 *    the given pipe inode to the given file.
 942 *
 943 */
 944ssize_t
 945generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
 946                          loff_t *ppos, size_t len, unsigned int flags)
 947{
 948        struct address_space *mapping = out->f_mapping;
 949        struct inode *inode = mapping->host;
 950        struct splice_desc sd = {
 951                .total_len = len,
 952                .flags = flags,
 953                .pos = *ppos,
 954                .u.file = out,
 955        };
 956        ssize_t ret;
 957
 958        pipe_lock(pipe);
 959
 960        splice_from_pipe_begin(&sd);
 961        do {
 962                ret = splice_from_pipe_next(pipe, &sd);
 963                if (ret <= 0)
 964                        break;
 965
 966                mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
 967                ret = file_remove_suid(out);
 968                if (!ret) {
 969                        file_update_time(out);
 970                        ret = splice_from_pipe_feed(pipe, &sd, pipe_to_file);
 971                }
 972                mutex_unlock(&inode->i_mutex);
 973        } while (ret > 0);
 974        splice_from_pipe_end(pipe, &sd);
 975
 976        pipe_unlock(pipe);
 977
 978        if (sd.num_spliced)
 979                ret = sd.num_spliced;
 980
 981        if (ret > 0) {
 982                unsigned long nr_pages;
 983                int err;
 984
 985                nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 986
 987                err = generic_write_sync(out, *ppos, ret);
 988                if (err)
 989                        ret = err;
 990                else
 991                        *ppos += ret;
 992                balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
 993        }
 994
 995        return ret;
 996}
 997
 998EXPORT_SYMBOL(generic_file_splice_write);
 999
1000static int write_pipe_buf(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
1001                          struct splice_desc *sd)
1002{
1003        int ret;
1004        void *data;
1005
1006        ret = buf->ops->confirm(pipe, buf);
1007        if (ret)
1008                return ret;
1009
1010        data = buf->ops->map(pipe, buf, 0);
1011        ret = kernel_write(sd->u.file, data + buf->offset, sd->len, sd->pos);
1012        buf->ops->unmap(pipe, buf, data);
1013
1014        return ret;
1015}
1016
1017static ssize_t default_file_splice_write(struct pipe_inode_info *pipe,
1018                                         struct file *out, loff_t *ppos,
1019                                         size_t len, unsigned int flags)
1020{
1021        ssize_t ret;
1022
1023        ret = splice_from_pipe(pipe, out, ppos, len, flags, write_pipe_buf);
1024        if (ret > 0)
1025                *ppos += ret;
1026
1027        return ret;
1028}
1029
1030/**
1031 * generic_splice_sendpage - splice data from a pipe to a socket
1032 * @pipe:       pipe to splice from
1033 * @out:        socket to write to
1034 * @ppos:       position in @out
1035 * @len:        number of bytes to splice
1036 * @flags:      splice modifier flags
1037 *
1038 * Description:
1039 *    Will send @len bytes from the pipe to a network socket. No data copying
1040 *    is involved.
1041 *
1042 */
1043ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out,
1044                                loff_t *ppos, size_t len, unsigned int flags)
1045{
1046        return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage);
1047}
1048
1049EXPORT_SYMBOL(generic_splice_sendpage);
1050
1051/*
1052 * Attempt to initiate a splice from pipe to file.
1053 */
1054static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
1055                           loff_t *ppos, size_t len, unsigned int flags)
1056{
1057        ssize_t (*splice_write)(struct pipe_inode_info *, struct file *,
1058                                loff_t *, size_t, unsigned int);
1059        int ret;
1060
1061        if (unlikely(!(out->f_mode & FMODE_WRITE)))
1062                return -EBADF;
1063
1064        if (unlikely(out->f_flags & O_APPEND))
1065                return -EINVAL;
1066
1067        ret = rw_verify_area(WRITE, out, ppos, len);
1068        if (unlikely(ret < 0))
1069                return ret;
1070
1071        splice_write = out->f_op->splice_write;
1072        if (!splice_write)
1073                splice_write = default_file_splice_write;
1074
1075        return splice_write(pipe, out, ppos, len, flags);
1076}
1077
1078/*
1079 * Attempt to initiate a splice from a file to a pipe.
1080 */
1081static long do_splice_to(struct file *in, loff_t *ppos,
1082                         struct pipe_inode_info *pipe, size_t len,
1083                         unsigned int flags)
1084{
1085        ssize_t (*splice_read)(struct file *, loff_t *,
1086                               struct pipe_inode_info *, size_t, unsigned int);
1087        int ret;
1088
1089        if (unlikely(!(in->f_mode & FMODE_READ)))
1090                return -EBADF;
1091
1092        ret = rw_verify_area(READ, in, ppos, len);
1093        if (unlikely(ret < 0))
1094                return ret;
1095
1096        splice_read = in->f_op->splice_read;
1097        if (!splice_read)
1098                splice_read = default_file_splice_read;
1099
1100        return splice_read(in, ppos, pipe, len, flags);
1101}
1102
1103/**
1104 * splice_direct_to_actor - splices data directly between two non-pipes
1105 * @in:         file to splice from
1106 * @sd:         actor information on where to splice to
1107 * @actor:      handles the data splicing
1108 *
1109 * Description:
1110 *    This is a special case helper to splice directly between two
1111 *    points, without requiring an explicit pipe. Internally an allocated
1112 *    pipe is cached in the process, and reused during the lifetime of
1113 *    that process.
1114 *
1115 */
1116ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
1117                               splice_direct_actor *actor)
1118{
1119        struct pipe_inode_info *pipe;
1120        long ret, bytes;
1121        umode_t i_mode;
1122        size_t len;
1123        int i, flags;
1124
1125        /*
1126         * We require the input being a regular file, as we don't want to
1127         * randomly drop data for eg socket -> socket splicing. Use the
1128         * piped splicing for that!
1129         */
1130        i_mode = in->f_path.dentry->d_inode->i_mode;
1131        if (unlikely(!S_ISREG(i_mode) && !S_ISBLK(i_mode)))
1132                return -EINVAL;
1133
1134        /*
1135         * neither in nor out is a pipe, setup an internal pipe attached to
1136         * 'out' and transfer the wanted data from 'in' to 'out' through that
1137         */
1138        pipe = current->splice_pipe;
1139        if (unlikely(!pipe)) {
1140                pipe = alloc_pipe_info(NULL);
1141                if (!pipe)
1142                        return -ENOMEM;
1143
1144                /*
1145                 * We don't have an immediate reader, but we'll read the stuff
1146                 * out of the pipe right after the splice_to_pipe(). So set
1147                 * PIPE_READERS appropriately.
1148                 */
1149                pipe->readers = 1;
1150
1151                current->splice_pipe = pipe;
1152        }
1153
1154        /*
1155         * Do the splice.
1156         */
1157        ret = 0;
1158        bytes = 0;
1159        len = sd->total_len;
1160        flags = sd->flags;
1161
1162        /*
1163         * Don't block on output, we have to drain the direct pipe.
1164         */
1165        sd->flags &= ~SPLICE_F_NONBLOCK;
1166
1167        while (len) {
1168                size_t read_len;
1169                loff_t pos = sd->pos, prev_pos = pos;
1170
1171                ret = do_splice_to(in, &pos, pipe, len, flags);
1172                if (unlikely(ret <= 0))
1173                        goto out_release;
1174
1175                read_len = ret;
1176                sd->total_len = read_len;
1177
1178                /*
1179                 * NOTE: nonblocking mode only applies to the input. We
1180                 * must not do the output in nonblocking mode as then we
1181                 * could get stuck data in the internal pipe:
1182                 */
1183                ret = actor(pipe, sd);
1184                if (unlikely(ret <= 0)) {
1185                        sd->pos = prev_pos;
1186                        goto out_release;
1187                }
1188
1189                bytes += ret;
1190                len -= ret;
1191                sd->pos = pos;
1192
1193                if (ret < read_len) {
1194                        sd->pos = prev_pos + ret;
1195                        goto out_release;
1196                }
1197        }
1198
1199done:
1200        pipe->nrbufs = pipe->curbuf = 0;
1201        file_accessed(in);
1202        return bytes;
1203
1204out_release:
1205        /*
1206         * If we did an incomplete transfer we must release
1207         * the pipe buffers in question:
1208         */
1209        for (i = 0; i < PIPE_BUFFERS; i++) {
1210                struct pipe_buffer *buf = pipe->bufs + i;
1211
1212                if (buf->ops) {
1213                        buf->ops->release(pipe, buf);
1214                        buf->ops = NULL;
1215                }
1216        }
1217
1218        if (!bytes)
1219                bytes = ret;
1220
1221        goto done;
1222}
1223EXPORT_SYMBOL(splice_direct_to_actor);
1224
1225static int direct_splice_actor(struct pipe_inode_info *pipe,
1226                               struct splice_desc *sd)
1227{
1228        struct file *file = sd->u.file;
1229
1230        return do_splice_from(pipe, file, &sd->pos, sd->total_len, sd->flags);
1231}
1232
1233/**
1234 * do_splice_direct - splices data directly between two files
1235 * @in:         file to splice from
1236 * @ppos:       input file offset
1237 * @out:        file to splice to
1238 * @len:        number of bytes to splice
1239 * @flags:      splice modifier flags
1240 *
1241 * Description:
1242 *    For use by do_sendfile(). splice can easily emulate sendfile, but
1243 *    doing it in the application would incur an extra system call
1244 *    (splice in + splice out, as compared to just sendfile()). So this helper
1245 *    can splice directly through a process-private pipe.
1246 *
1247 */
1248long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
1249                      size_t len, unsigned int flags)
1250{
1251        struct splice_desc sd = {
1252                .len            = len,
1253                .total_len      = len,
1254                .flags          = flags,
1255                .pos            = *ppos,
1256                .u.file         = out,
1257        };
1258        long ret;
1259
1260        ret = splice_direct_to_actor(in, &sd, direct_splice_actor);
1261        if (ret > 0)
1262                *ppos = sd.pos;
1263
1264        return ret;
1265}
1266
1267static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
1268                               struct pipe_inode_info *opipe,
1269                               size_t len, unsigned int flags);
1270/*
1271 * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same
1272 * location, so checking ->i_pipe is not enough to verify that this is a
1273 * pipe.
1274 */
1275static inline struct pipe_inode_info *pipe_info(struct inode *inode)
1276{
1277        if (S_ISFIFO(inode->i_mode))
1278                return inode->i_pipe;
1279
1280        return NULL;
1281}
1282
1283/*
1284 * Determine where to splice to/from.
1285 */
1286static long do_splice(struct file *in, loff_t __user *off_in,
1287                      struct file *out, loff_t __user *off_out,
1288                      size_t len, unsigned int flags)
1289{
1290        struct pipe_inode_info *ipipe;
1291        struct pipe_inode_info *opipe;
1292        loff_t offset, *off;
1293        long ret;
1294
1295        ipipe = pipe_info(in->f_path.dentry->d_inode);
1296        opipe = pipe_info(out->f_path.dentry->d_inode);
1297
1298        if (ipipe && opipe) {
1299                if (off_in || off_out)
1300                        return -ESPIPE;
1301
1302                if (!(in->f_mode & FMODE_READ))
1303                        return -EBADF;
1304
1305                if (!(out->f_mode & FMODE_WRITE))
1306                        return -EBADF;
1307
1308                /* Splicing to self would be fun, but... */
1309                if (ipipe == opipe)
1310                        return -EINVAL;
1311
1312                return splice_pipe_to_pipe(ipipe, opipe, len, flags);
1313        }
1314
1315        if (ipipe) {
1316                if (off_in)
1317                        return -ESPIPE;
1318                if (off_out) {
1319                        if (out->f_op->llseek == no_llseek)
1320                                return -EINVAL;
1321                        if (copy_from_user(&offset, off_out, sizeof(loff_t)))
1322                                return -EFAULT;
1323                        off = &offset;
1324                } else
1325                        off = &out->f_pos;
1326
1327                ret = do_splice_from(ipipe, out, off, len, flags);
1328
1329                if (off_out && copy_to_user(off_out, off, sizeof(loff_t)))
1330                        ret = -EFAULT;
1331
1332                return ret;
1333        }
1334
1335        if (opipe) {
1336                if (off_out)
1337                        return -ESPIPE;
1338                if (off_in) {
1339                        if (in->f_op->llseek == no_llseek)
1340                                return -EINVAL;
1341                        if (copy_from_user(&offset, off_in, sizeof(loff_t)))
1342                                return -EFAULT;
1343                        off = &offset;
1344                } else
1345                        off = &in->f_pos;
1346
1347                ret = do_splice_to(in, off, opipe, len, flags);
1348
1349                if (off_in && copy_to_user(off_in, off, sizeof(loff_t)))
1350                        ret = -EFAULT;
1351
1352                return ret;
1353        }
1354
1355        return -EINVAL;
1356}
1357
1358/*
1359 * Map an iov into an array of pages and offset/length tupples. With the
1360 * partial_page structure, we can map several non-contiguous ranges into
1361 * our ones pages[] map instead of splitting that operation into pieces.
1362 * Could easily be exported as a generic helper for other users, in which
1363 * case one would probably want to add a 'max_nr_pages' parameter as well.
1364 */
1365static int get_iovec_page_array(const struct iovec __user *iov,
1366                                unsigned int nr_vecs, struct page **pages,
1367                                struct partial_page *partial, int aligned)
1368{
1369        int buffers = 0, error = 0;
1370
1371        while (nr_vecs) {
1372                unsigned long off, npages;
1373                struct iovec entry;
1374                void __user *base;
1375                size_t len;
1376                int i;
1377
1378                error = -EFAULT;
1379                if (copy_from_user(&entry, iov, sizeof(entry)))
1380                        break;
1381
1382                base = entry.iov_base;
1383                len = entry.iov_len;
1384
1385                /*
1386                 * Sanity check this iovec. 0 read succeeds.
1387                 */
1388                error = 0;
1389                if (unlikely(!len))
1390                        break;
1391                error = -EFAULT;
1392                if (!access_ok(VERIFY_READ, base, len))
1393                        break;
1394
1395                /*
1396                 * Get this base offset and number of pages, then map
1397                 * in the user pages.
1398                 */
1399                off = (unsigned long) base & ~PAGE_MASK;
1400
1401                /*
1402                 * If asked for alignment, the offset must be zero and the
1403                 * length a multiple of the PAGE_SIZE.
1404                 */
1405                error = -EINVAL;
1406                if (aligned && (off || len & ~PAGE_MASK))
1407                        break;
1408
1409                npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
1410                if (npages > PIPE_BUFFERS - buffers)
1411                        npages = PIPE_BUFFERS - buffers;
1412
1413                error = get_user_pages_fast((unsigned long)base, npages,
1414                                        0, &pages[buffers]);
1415
1416                if (unlikely(error <= 0))
1417                        break;
1418
1419                /*
1420                 * Fill this contiguous range into the partial page map.
1421                 */
1422                for (i = 0; i < error; i++) {
1423                        const int plen = min_t(size_t, len, PAGE_SIZE - off);
1424
1425                        partial[buffers].offset = off;
1426                        partial[buffers].len = plen;
1427
1428                        off = 0;
1429                        len -= plen;
1430                        buffers++;
1431                }
1432
1433                /*
1434                 * We didn't complete this iov, stop here since it probably
1435                 * means we have to move some of this into a pipe to
1436                 * be able to continue.
1437                 */
1438                if (len)
1439                        break;
1440
1441                /*
1442                 * Don't continue if we mapped fewer pages than we asked for,
1443                 * or if we mapped the max number of pages that we have
1444                 * room for.
1445                 */
1446                if (error < npages || buffers == PIPE_BUFFERS)
1447                        break;
1448
1449                nr_vecs--;
1450                iov++;
1451        }
1452
1453        if (buffers)
1454                return buffers;
1455
1456        return error;
1457}
1458
1459static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
1460                        struct splice_desc *sd)
1461{
1462        char *src;
1463        int ret;
1464
1465        ret = buf->ops->confirm(pipe, buf);
1466        if (unlikely(ret))
1467                return ret;
1468
1469        /*
1470         * See if we can use the atomic maps, by prefaulting in the
1471         * pages and doing an atomic copy
1472         */
1473        if (!fault_in_pages_writeable(sd->u.userptr, sd->len)) {
1474                src = buf->ops->map(pipe, buf, 1);
1475                ret = __copy_to_user_inatomic(sd->u.userptr, src + buf->offset,
1476                                                        sd->len);
1477                buf->ops->unmap(pipe, buf, src);
1478                if (!ret) {
1479                        ret = sd->len;
1480                        goto out;
1481                }
1482        }
1483
1484        /*
1485         * No dice, use slow non-atomic map and copy
1486         */
1487        src = buf->ops->map(pipe, buf, 0);
1488
1489        ret = sd->len;
1490        if (copy_to_user(sd->u.userptr, src + buf->offset, sd->len))
1491                ret = -EFAULT;
1492
1493        buf->ops->unmap(pipe, buf, src);
1494out:
1495        if (ret > 0)
1496                sd->u.userptr += ret;
1497        return ret;
1498}
1499
1500/*
1501 * For lack of a better implementation, implement vmsplice() to userspace
1502 * as a simple copy of the pipes pages to the user iov.
1503 */
1504static long vmsplice_to_user(struct file *file, const struct iovec __user *iov,
1505                             unsigned long nr_segs, unsigned int flags)
1506{
1507        struct pipe_inode_info *pipe;
1508        struct splice_desc sd;
1509        ssize_t size;
1510        int error;
1511        long ret;
1512
1513        pipe = pipe_info(file->f_path.dentry->d_inode);
1514        if (!pipe)
1515                return -EBADF;
1516
1517        pipe_lock(pipe);
1518
1519        error = ret = 0;
1520        while (nr_segs) {
1521                void __user *base;
1522                size_t len;
1523
1524                /*
1525                 * Get user address base and length for this iovec.
1526                 */
1527                error = get_user(base, &iov->iov_base);
1528                if (unlikely(error))
1529                        break;
1530                error = get_user(len, &iov->iov_len);
1531                if (unlikely(error))
1532                        break;
1533
1534                /*
1535                 * Sanity check this iovec. 0 read succeeds.
1536                 */
1537                if (unlikely(!len))
1538                        break;
1539                if (unlikely(!base)) {
1540                        error = -EFAULT;
1541                        break;
1542                }
1543
1544                if (unlikely(!access_ok(VERIFY_WRITE, base, len))) {
1545                        error = -EFAULT;
1546                        break;
1547                }
1548
1549                sd.len = 0;
1550                sd.total_len = len;
1551                sd.flags = flags;
1552                sd.u.userptr = base;
1553                sd.pos = 0;
1554
1555                size = __splice_from_pipe(pipe, &sd, pipe_to_user);
1556                if (size < 0) {
1557                        if (!ret)
1558                                ret = size;
1559
1560                        break;
1561                }
1562
1563                ret += size;
1564
1565                if (size < len)
1566                        break;
1567
1568                nr_segs--;
1569                iov++;
1570        }
1571
1572        pipe_unlock(pipe);
1573
1574        if (!ret)
1575                ret = error;
1576
1577        return ret;
1578}
1579
1580/*
1581 * vmsplice splices a user address range into a pipe. It can be thought of
1582 * as splice-from-memory, where the regular splice is splice-from-file (or
1583 * to file). In both cases the output is a pipe, naturally.
1584 */
1585static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov,
1586                             unsigned long nr_segs, unsigned int flags)
1587{
1588        struct pipe_inode_info *pipe;
1589        struct page *pages[PIPE_BUFFERS];
1590        struct partial_page partial[PIPE_BUFFERS];
1591        struct splice_pipe_desc spd = {
1592                .pages = pages,
1593                .partial = partial,
1594                .flags = flags,
1595                .ops = &user_page_pipe_buf_ops,
1596                .spd_release = spd_release_page,
1597        };
1598
1599        pipe = pipe_info(file->f_path.dentry->d_inode);
1600        if (!pipe)
1601                return -EBADF;
1602
1603        spd.nr_pages = get_iovec_page_array(iov, nr_segs, pages, partial,
1604                                            flags & SPLICE_F_GIFT);
1605        if (spd.nr_pages <= 0)
1606                return spd.nr_pages;
1607
1608        return splice_to_pipe(pipe, &spd);
1609}
1610
1611/*
1612 * Note that vmsplice only really supports true splicing _from_ user memory
1613 * to a pipe, not the other way around. Splicing from user memory is a simple
1614 * operation that can be supported without any funky alignment restrictions
1615 * or nasty vm tricks. We simply map in the user memory and fill them into
1616 * a pipe. The reverse isn't quite as easy, though. There are two possible
1617 * solutions for that:
1618 *
1619 *      - memcpy() the data internally, at which point we might as well just
1620 *        do a regular read() on the buffer anyway.
1621 *      - Lots of nasty vm tricks, that are neither fast nor flexible (it
1622 *        has restriction limitations on both ends of the pipe).
1623 *
1624 * Currently we punt and implement it as a normal copy, see pipe_to_user().
1625 *
1626 */
1627SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, iov,
1628                unsigned long, nr_segs, unsigned int, flags)
1629{
1630        struct file *file;
1631        long error;
1632        int fput;
1633
1634        if (unlikely(nr_segs > UIO_MAXIOV))
1635                return -EINVAL;
1636        else if (unlikely(!nr_segs))
1637                return 0;
1638
1639        error = -EBADF;
1640        file = fget_light(fd, &fput);
1641        if (file) {
1642                if (file->f_mode & FMODE_WRITE)
1643                        error = vmsplice_to_pipe(file, iov, nr_segs, flags);
1644                else if (file->f_mode & FMODE_READ)
1645                        error = vmsplice_to_user(file, iov, nr_segs, flags);
1646
1647                fput_light(file, fput);
1648        }
1649
1650        return error;
1651}
1652
1653SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in,
1654                int, fd_out, loff_t __user *, off_out,
1655                size_t, len, unsigned int, flags)
1656{
1657        long error;
1658        struct file *in, *out;
1659        int fput_in, fput_out;
1660
1661        if (unlikely(!len))
1662                return 0;
1663
1664        error = -EBADF;
1665        in = fget_light(fd_in, &fput_in);
1666        if (in) {
1667                if (in->f_mode & FMODE_READ) {
1668                        out = fget_light(fd_out, &fput_out);
1669                        if (out) {
1670                                if (out->f_mode & FMODE_WRITE)
1671                                        error = do_splice(in, off_in,
1672                                                          out, off_out,
1673                                                          len, flags);
1674                                fput_light(out, fput_out);
1675                        }
1676                }
1677
1678                fput_light(in, fput_in);
1679        }
1680
1681        return error;
1682}
1683
1684/*
1685 * Make sure there's data to read. Wait for input if we can, otherwise
1686 * return an appropriate error.
1687 */
1688static int ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1689{
1690        int ret;
1691
1692        /*
1693         * Check ->nrbufs without the inode lock first. This function
1694         * is speculative anyways, so missing one is ok.
1695         */
1696        if (pipe->nrbufs)
1697                return 0;
1698
1699        ret = 0;
1700        pipe_lock(pipe);
1701
1702        while (!pipe->nrbufs) {
1703                if (signal_pending(current)) {
1704                        ret = -ERESTARTSYS;
1705                        break;
1706                }
1707                if (!pipe->writers)
1708                        break;
1709                if (!pipe->waiting_writers) {
1710                        if (flags & SPLICE_F_NONBLOCK) {
1711                                ret = -EAGAIN;
1712                                break;
1713                        }
1714                }
1715                pipe_wait(pipe);
1716        }
1717
1718        pipe_unlock(pipe);
1719        return ret;
1720}
1721
1722/*
1723 * Make sure there's writeable room. Wait for room if we can, otherwise
1724 * return an appropriate error.
1725 */
1726static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1727{
1728        int ret;
1729
1730        /*
1731         * Check ->nrbufs without the inode lock first. This function
1732         * is speculative anyways, so missing one is ok.
1733         */
1734        if (pipe->nrbufs < PIPE_BUFFERS)
1735                return 0;
1736
1737        ret = 0;
1738        pipe_lock(pipe);
1739
1740        while (pipe->nrbufs >= PIPE_BUFFERS) {
1741                if (!pipe->readers) {
1742                        send_sig(SIGPIPE, current, 0);
1743                        ret = -EPIPE;
1744                        break;
1745                }
1746                if (flags & SPLICE_F_NONBLOCK) {
1747                        ret = -EAGAIN;
1748                        break;
1749                }
1750                if (signal_pending(current)) {
1751                        ret = -ERESTARTSYS;
1752                        break;
1753                }
1754                pipe->waiting_writers++;
1755                pipe_wait(pipe);
1756                pipe->waiting_writers--;
1757        }
1758
1759        pipe_unlock(pipe);
1760        return ret;
1761}
1762
1763/*
1764 * Splice contents of ipipe to opipe.
1765 */
1766static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
1767                               struct pipe_inode_info *opipe,
1768                               size_t len, unsigned int flags)
1769{
1770        struct pipe_buffer *ibuf, *obuf;
1771        int ret = 0, nbuf;
1772        bool input_wakeup = false;
1773
1774
1775retry:
1776        ret = ipipe_prep(ipipe, flags);
1777        if (ret)
1778                return ret;
1779
1780        ret = opipe_prep(opipe, flags);
1781        if (ret)
1782                return ret;
1783
1784        /*
1785         * Potential ABBA deadlock, work around it by ordering lock
1786         * grabbing by pipe info address. Otherwise two different processes
1787         * could deadlock (one doing tee from A -> B, the other from B -> A).
1788         */
1789        pipe_double_lock(ipipe, opipe);
1790
1791        do {
1792                if (!opipe->readers) {
1793                        send_sig(SIGPIPE, current, 0);
1794                        if (!ret)
1795                                ret = -EPIPE;
1796                        break;
1797                }
1798
1799                if (!ipipe->nrbufs && !ipipe->writers)
1800                        break;
1801
1802                /*
1803                 * Cannot make any progress, because either the input
1804                 * pipe is empty or the output pipe is full.
1805                 */
1806                if (!ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS) {
1807                        /* Already processed some buffers, break */
1808                        if (ret)
1809                                break;
1810
1811                        if (flags & SPLICE_F_NONBLOCK) {
1812                                ret = -EAGAIN;
1813                                break;
1814                        }
1815
1816                        /*
1817                         * We raced with another reader/writer and haven't
1818                         * managed to process any buffers.  A zero return
1819                         * value means EOF, so retry instead.
1820                         */
1821                        pipe_unlock(ipipe);
1822                        pipe_unlock(opipe);
1823                        goto retry;
1824                }
1825
1826                ibuf = ipipe->bufs + ipipe->curbuf;
1827                nbuf = (opipe->curbuf + opipe->nrbufs) % PIPE_BUFFERS;
1828                obuf = opipe->bufs + nbuf;
1829
1830                if (len >= ibuf->len) {
1831                        /*
1832                         * Simply move the whole buffer from ipipe to opipe
1833                         */
1834                        *obuf = *ibuf;
1835                        ibuf->ops = NULL;
1836                        opipe->nrbufs++;
1837                        ipipe->curbuf = (ipipe->curbuf + 1) % PIPE_BUFFERS;
1838                        ipipe->nrbufs--;
1839                        input_wakeup = true;
1840                } else {
1841                        /*
1842                         * Get a reference to this pipe buffer,
1843                         * so we can copy the contents over.
1844                         */
1845                        ibuf->ops->get(ipipe, ibuf);
1846                        *obuf = *ibuf;
1847
1848                        /*
1849                         * Don't inherit the gift flag, we need to
1850                         * prevent multiple steals of this page.
1851                         */
1852                        obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
1853
1854                        obuf->len = len;
1855                        opipe->nrbufs++;
1856                        ibuf->offset += obuf->len;
1857                        ibuf->len -= obuf->len;
1858                }
1859                ret += obuf->len;
1860                len -= obuf->len;
1861        } while (len);
1862
1863        pipe_unlock(ipipe);
1864        pipe_unlock(opipe);
1865
1866        /*
1867         * If we put data in the output pipe, wakeup any potential readers.
1868         */
1869        if (ret > 0) {
1870                smp_mb();
1871                if (waitqueue_active(&opipe->wait))
1872                        wake_up_interruptible(&opipe->wait);
1873                kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN);
1874        }
1875        if (input_wakeup)
1876                wakeup_pipe_writers(ipipe);
1877
1878        return ret;
1879}
1880
1881/*
1882 * Link contents of ipipe to opipe.
1883 */
1884static int link_pipe(struct pipe_inode_info *ipipe,
1885                     struct pipe_inode_info *opipe,
1886                     size_t len, unsigned int flags)
1887{
1888        struct pipe_buffer *ibuf, *obuf;
1889        int ret = 0, i = 0, nbuf;
1890
1891        /*
1892         * Potential ABBA deadlock, work around it by ordering lock
1893         * grabbing by pipe info address. Otherwise two different processes
1894         * could deadlock (one doing tee from A -> B, the other from B -> A).
1895         */
1896        pipe_double_lock(ipipe, opipe);
1897
1898        do {
1899                if (!opipe->readers) {
1900                        send_sig(SIGPIPE, current, 0);
1901                        if (!ret)
1902                                ret = -EPIPE;
1903                        break;
1904                }
1905
1906                /*
1907                 * If we have iterated all input buffers or ran out of
1908                 * output room, break.
1909                 */
1910                if (i >= ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS)
1911                        break;
1912
1913                ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (PIPE_BUFFERS - 1));
1914                nbuf = (opipe->curbuf + opipe->nrbufs) & (PIPE_BUFFERS - 1);
1915
1916                /*
1917                 * Get a reference to this pipe buffer,
1918                 * so we can copy the contents over.
1919                 */
1920                ibuf->ops->get(ipipe, ibuf);
1921
1922                obuf = opipe->bufs + nbuf;
1923                *obuf = *ibuf;
1924
1925                /*
1926                 * Don't inherit the gift flag, we need to
1927                 * prevent multiple steals of this page.
1928                 */
1929                obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
1930
1931                if (obuf->len > len)
1932                        obuf->len = len;
1933
1934                opipe->nrbufs++;
1935                ret += obuf->len;
1936                len -= obuf->len;
1937                i++;
1938        } while (len);
1939
1940        /*
1941         * return EAGAIN if we have the potential of some data in the
1942         * future, otherwise just return 0
1943         */
1944        if (!ret && ipipe->waiting_writers && (flags & SPLICE_F_NONBLOCK))
1945                ret = -EAGAIN;
1946
1947        pipe_unlock(ipipe);
1948        pipe_unlock(opipe);
1949
1950        /*
1951         * If we put data in the output pipe, wakeup any potential readers.
1952         */
1953        if (ret > 0) {
1954                smp_mb();
1955                if (waitqueue_active(&opipe->wait))
1956                        wake_up_interruptible(&opipe->wait);
1957                kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN);
1958        }
1959
1960        return ret;
1961}
1962
1963/*
1964 * This is a tee(1) implementation that works on pipes. It doesn't copy
1965 * any data, it simply references the 'in' pages on the 'out' pipe.
1966 * The 'flags' used are the SPLICE_F_* variants, currently the only
1967 * applicable one is SPLICE_F_NONBLOCK.
1968 */
1969static long do_tee(struct file *in, struct file *out, size_t len,
1970                   unsigned int flags)
1971{
1972        struct pipe_inode_info *ipipe = pipe_info(in->f_path.dentry->d_inode);
1973        struct pipe_inode_info *opipe = pipe_info(out->f_path.dentry->d_inode);
1974        int ret = -EINVAL;
1975
1976        /*
1977         * Duplicate the contents of ipipe to opipe without actually
1978         * copying the data.
1979         */
1980        if (ipipe && opipe && ipipe != opipe) {
1981                /*
1982                 * Keep going, unless we encounter an error. The ipipe/opipe
1983                 * ordering doesn't really matter.
1984                 */
1985                ret = ipipe_prep(ipipe, flags);
1986                if (!ret) {
1987                        ret = opipe_prep(opipe, flags);
1988                        if (!ret)
1989                                ret = link_pipe(ipipe, opipe, len, flags);
1990                }
1991        }
1992
1993        return ret;
1994}
1995
1996SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags)
1997{
1998        struct file *in;
1999        int error, fput_in;
2000
2001        if (unlikely(!len))
2002                return 0;
2003
2004        error = -EBADF;
2005        in = fget_light(fdin, &fput_in);
2006        if (in) {
2007                if (in->f_mode & FMODE_READ) {
2008                        int fput_out;
2009                        struct file *out = fget_light(fdout, &fput_out);
2010
2011                        if (out) {
2012                                if (out->f_mode & FMODE_WRITE)
2013                                        error = do_tee(in, out, len, flags);
2014                                fput_light(out, fput_out);
2015                        }
2016                }
2017                fput_light(in, fput_in);
2018        }
2019
2020        return error;
2021}
2022