linux/lib/iov_iter.c
   1// SPDX-License-Identifier: GPL-2.0-only
   2#include <crypto/hash.h>
   3#include <linux/export.h>
   4#include <linux/bvec.h>
   5#include <linux/fault-inject-usercopy.h>
   6#include <linux/uio.h>
   7#include <linux/pagemap.h>
   8#include <linux/highmem.h>
   9#include <linux/slab.h>
  10#include <linux/vmalloc.h>
  11#include <linux/splice.h>
  12#include <linux/compat.h>
  13#include <net/checksum.h>
  14#include <linux/scatterlist.h>
  15#include <linux/instrumented.h>
  16
  17#define PIPE_PARANOIA /* for now */
  18
  19/* covers iovec and kvec alike */
  20#define iterate_iovec(i, n, base, len, off, __p, STEP) {        \
  21        size_t off = 0;                                         \
  22        size_t skip = i->iov_offset;                            \
  23        do {                                                    \
  24                len = min(n, __p->iov_len - skip);              \
  25                if (likely(len)) {                              \
  26                        base = __p->iov_base + skip;            \
  27                        len -= (STEP);                          \
  28                        off += len;                             \
  29                        skip += len;                            \
  30                        n -= len;                               \
  31                        if (skip < __p->iov_len)                \
  32                                break;                          \
  33                }                                               \
  34                __p++;                                          \
  35                skip = 0;                                       \
  36        } while (n);                                            \
  37        i->iov_offset = skip;                                   \
  38        n = off;                                                \
  39}
  40
  41#define iterate_bvec(i, n, base, len, off, p, STEP) {           \
  42        size_t off = 0;                                         \
  43        unsigned skip = i->iov_offset;                          \
  44        while (n) {                                             \
  45                unsigned offset = p->bv_offset + skip;          \
  46                unsigned left;                                  \
  47                void *kaddr = kmap_local_page(p->bv_page +      \
  48                                        offset / PAGE_SIZE);    \
  49                base = kaddr + offset % PAGE_SIZE;              \
  50                len = min(min(n, (size_t)(p->bv_len - skip)),   \
  51                     (size_t)(PAGE_SIZE - offset % PAGE_SIZE)); \
  52                left = (STEP);                                  \
  53                kunmap_local(kaddr);                            \
  54                len -= left;                                    \
  55                off += len;                                     \
  56                skip += len;                                    \
  57                if (skip == p->bv_len) {                        \
  58                        skip = 0;                               \
  59                        p++;                                    \
  60                }                                               \
  61                n -= len;                                       \
  62                if (left)                                       \
  63                        break;                                  \
  64        }                                                       \
  65        i->iov_offset = skip;                                   \
  66        n = off;                                                \
  67}
  68
  69#define iterate_xarray(i, n, base, len, __off, STEP) {          \
  70        __label__ __out;                                        \
  71        size_t __off = 0;                                       \
  72        struct page *head = NULL;                               \
  73        loff_t start = i->xarray_start + i->iov_offset;         \
  74        unsigned offset = start % PAGE_SIZE;                    \
  75        pgoff_t index = start / PAGE_SIZE;                      \
  76        int j;                                                  \
  77                                                                \
  78        XA_STATE(xas, i->xarray, index);                        \
  79                                                                \
  80        rcu_read_lock();                                        \
  81        xas_for_each(&xas, head, ULONG_MAX) {                   \
  82                unsigned left;                                  \
  83                if (xas_retry(&xas, head))                      \
  84                        continue;                               \
  85                if (WARN_ON(xa_is_value(head)))                 \
  86                        break;                                  \
  87                if (WARN_ON(PageHuge(head)))                    \
  88                        break;                                  \
  89                for (j = (head->index < index) ? index - head->index : 0; \
  90                     j < thp_nr_pages(head); j++) {             \
  91                        void *kaddr = kmap_local_page(head + j);        \
  92                        base = kaddr + offset;                  \
  93                        len = PAGE_SIZE - offset;               \
  94                        len = min(n, len);                      \
  95                        left = (STEP);                          \
  96                        kunmap_local(kaddr);                    \
  97                        len -= left;                            \
  98                        __off += len;                           \
  99                        n -= len;                               \
 100                        if (left || n == 0)                     \
 101                                goto __out;                     \
 102                        offset = 0;                             \
 103                }                                               \
 104        }                                                       \
 105__out:                                                          \
 106        rcu_read_unlock();                                      \
 107        i->iov_offset += __off;                                         \
 108        n = __off;                                              \
 109}
 110
 111#define __iterate_and_advance(i, n, base, len, off, I, K) {     \
 112        if (unlikely(i->count < n))                             \
 113                n = i->count;                                   \
 114        if (likely(n)) {                                        \
 115                if (likely(iter_is_iovec(i))) {                 \
 116                        const struct iovec *iov = i->iov;       \
 117                        void __user *base;                      \
 118                        size_t len;                             \
 119                        iterate_iovec(i, n, base, len, off,     \
 120                                                iov, (I))       \
 121                        i->nr_segs -= iov - i->iov;             \
 122                        i->iov = iov;                           \
 123                } else if (iov_iter_is_bvec(i)) {               \
 124                        const struct bio_vec *bvec = i->bvec;   \
 125                        void *base;                             \
 126                        size_t len;                             \
 127                        iterate_bvec(i, n, base, len, off,      \
 128                                                bvec, (K))      \
 129                        i->nr_segs -= bvec - i->bvec;           \
 130                        i->bvec = bvec;                         \
 131                } else if (iov_iter_is_kvec(i)) {               \
 132                        const struct kvec *kvec = i->kvec;      \
 133                        void *base;                             \
 134                        size_t len;                             \
 135                        iterate_iovec(i, n, base, len, off,     \
 136                                                kvec, (K))      \
 137                        i->nr_segs -= kvec - i->kvec;           \
 138                        i->kvec = kvec;                         \
 139                } else if (iov_iter_is_xarray(i)) {             \
 140                        void *base;                             \
 141                        size_t len;                             \
 142                        iterate_xarray(i, n, base, len, off,    \
 143                                                        (K))    \
 144                }                                               \
 145                i->count -= n;                                  \
 146        }                                                       \
 147}
 148#define iterate_and_advance(i, n, base, len, off, I, K) \
 149        __iterate_and_advance(i, n, base, len, off, I, ((void)(K),0))
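
/*
 * Note on the STEP contract used by the iterators above (an illustrative
 * summary derived from the macros themselves): base, len and off are bound
 * by the macro, and the STEP expression must return how many of the len
 * bytes it could *not* transfer (0 on full success), exactly like
 * copyout()/copyin() below.  iterate_and_advance() forces the kernel step
 * K to 0 via ((void)(K),0), so memcpy()-style steps are treated as always
 * complete; __iterate_and_advance() is used directly when the kernel step
 * itself may fall short (see _copy_mc_to_iter()).
 *
 * A minimal sketch of a caller, mirroring _copy_to_iter() further down:
 *
 *	iterate_and_advance(i, bytes, base, len, off,
 *		copyout(base, addr + off, len),		// user-space step
 *		memcpy(base, addr + off, len)		// kernel step
 *	)
 */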
 150
 151static int copyout(void __user *to, const void *from, size_t n)
 152{
 153        if (should_fail_usercopy())
 154                return n;
 155        if (access_ok(to, n)) {
 156                instrument_copy_to_user(to, from, n);
 157                n = raw_copy_to_user(to, from, n);
 158        }
 159        return n;
 160}
 161
 162static int copyin(void *to, const void __user *from, size_t n)
 163{
 164        if (should_fail_usercopy())
 165                return n;
 166        if (access_ok(from, n)) {
 167                instrument_copy_from_user(to, from, n);
 168                n = raw_copy_from_user(to, from, n);
 169        }
 170        return n;
 171}
 172
 173static size_t copy_page_to_iter_iovec(struct page *page, size_t offset, size_t bytes,
 174                         struct iov_iter *i)
 175{
 176        size_t skip, copy, left, wanted;
 177        const struct iovec *iov;
 178        char __user *buf;
 179        void *kaddr, *from;
 180
 181        if (unlikely(bytes > i->count))
 182                bytes = i->count;
 183
 184        if (unlikely(!bytes))
 185                return 0;
 186
 187        might_fault();
 188        wanted = bytes;
 189        iov = i->iov;
 190        skip = i->iov_offset;
 191        buf = iov->iov_base + skip;
 192        copy = min(bytes, iov->iov_len - skip);
 193
 194        if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_pages_writeable(buf, copy)) {
 195                kaddr = kmap_atomic(page);
 196                from = kaddr + offset;
 197
 198                /* first chunk, usually the only one */
 199                left = copyout(buf, from, copy);
 200                copy -= left;
 201                skip += copy;
 202                from += copy;
 203                bytes -= copy;
 204
 205                while (unlikely(!left && bytes)) {
 206                        iov++;
 207                        buf = iov->iov_base;
 208                        copy = min(bytes, iov->iov_len);
 209                        left = copyout(buf, from, copy);
 210                        copy -= left;
 211                        skip = copy;
 212                        from += copy;
 213                        bytes -= copy;
 214                }
 215                if (likely(!bytes)) {
 216                        kunmap_atomic(kaddr);
 217                        goto done;
 218                }
 219                offset = from - kaddr;
 220                buf += copy;
 221                kunmap_atomic(kaddr);
 222                copy = min(bytes, iov->iov_len - skip);
 223        }
 224        /* Too bad - revert to non-atomic kmap */
 225
 226        kaddr = kmap(page);
 227        from = kaddr + offset;
 228        left = copyout(buf, from, copy);
 229        copy -= left;
 230        skip += copy;
 231        from += copy;
 232        bytes -= copy;
 233        while (unlikely(!left && bytes)) {
 234                iov++;
 235                buf = iov->iov_base;
 236                copy = min(bytes, iov->iov_len);
 237                left = copyout(buf, from, copy);
 238                copy -= left;
 239                skip = copy;
 240                from += copy;
 241                bytes -= copy;
 242        }
 243        kunmap(page);
 244
 245done:
 246        if (skip == iov->iov_len) {
 247                iov++;
 248                skip = 0;
 249        }
 250        i->count -= wanted - bytes;
 251        i->nr_segs -= iov - i->iov;
 252        i->iov = iov;
 253        i->iov_offset = skip;
 254        return wanted - bytes;
 255}
 256
 257static size_t copy_page_from_iter_iovec(struct page *page, size_t offset, size_t bytes,
 258                         struct iov_iter *i)
 259{
 260        size_t skip, copy, left, wanted;
 261        const struct iovec *iov;
 262        char __user *buf;
 263        void *kaddr, *to;
 264
 265        if (unlikely(bytes > i->count))
 266                bytes = i->count;
 267
 268        if (unlikely(!bytes))
 269                return 0;
 270
 271        might_fault();
 272        wanted = bytes;
 273        iov = i->iov;
 274        skip = i->iov_offset;
 275        buf = iov->iov_base + skip;
 276        copy = min(bytes, iov->iov_len - skip);
 277
 278        if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_pages_readable(buf, copy)) {
 279                kaddr = kmap_atomic(page);
 280                to = kaddr + offset;
 281
 282                /* first chunk, usually the only one */
 283                left = copyin(to, buf, copy);
 284                copy -= left;
 285                skip += copy;
 286                to += copy;
 287                bytes -= copy;
 288
 289                while (unlikely(!left && bytes)) {
 290                        iov++;
 291                        buf = iov->iov_base;
 292                        copy = min(bytes, iov->iov_len);
 293                        left = copyin(to, buf, copy);
 294                        copy -= left;
 295                        skip = copy;
 296                        to += copy;
 297                        bytes -= copy;
 298                }
 299                if (likely(!bytes)) {
 300                        kunmap_atomic(kaddr);
 301                        goto done;
 302                }
 303                offset = to - kaddr;
 304                buf += copy;
 305                kunmap_atomic(kaddr);
 306                copy = min(bytes, iov->iov_len - skip);
 307        }
 308        /* Too bad - revert to non-atomic kmap */
 309
 310        kaddr = kmap(page);
 311        to = kaddr + offset;
 312        left = copyin(to, buf, copy);
 313        copy -= left;
 314        skip += copy;
 315        to += copy;
 316        bytes -= copy;
 317        while (unlikely(!left && bytes)) {
 318                iov++;
 319                buf = iov->iov_base;
 320                copy = min(bytes, iov->iov_len);
 321                left = copyin(to, buf, copy);
 322                copy -= left;
 323                skip = copy;
 324                to += copy;
 325                bytes -= copy;
 326        }
 327        kunmap(page);
 328
 329done:
 330        if (skip == iov->iov_len) {
 331                iov++;
 332                skip = 0;
 333        }
 334        i->count -= wanted - bytes;
 335        i->nr_segs -= iov - i->iov;
 336        i->iov = iov;
 337        i->iov_offset = skip;
 338        return wanted - bytes;
 339}
 340
 341#ifdef PIPE_PARANOIA
 342static bool sanity(const struct iov_iter *i)
 343{
 344        struct pipe_inode_info *pipe = i->pipe;
 345        unsigned int p_head = pipe->head;
 346        unsigned int p_tail = pipe->tail;
 347        unsigned int p_mask = pipe->ring_size - 1;
 348        unsigned int p_occupancy = pipe_occupancy(p_head, p_tail);
 349        unsigned int i_head = i->head;
 350        unsigned int idx;
 351
 352        if (i->iov_offset) {
 353                struct pipe_buffer *p;
 354                if (unlikely(p_occupancy == 0))
 355                        goto Bad;       // pipe must be non-empty
 356                if (unlikely(i_head != p_head - 1))
 357                        goto Bad;       // must be at the last buffer...
 358
 359                p = &pipe->bufs[i_head & p_mask];
 360                if (unlikely(p->offset + p->len != i->iov_offset))
 361                        goto Bad;       // ... at the end of segment
 362        } else {
 363                if (i_head != p_head)
 364                        goto Bad;       // must be right after the last buffer
 365        }
 366        return true;
 367Bad:
 368        printk(KERN_ERR "idx = %d, offset = %zd\n", i_head, i->iov_offset);
 369        printk(KERN_ERR "head = %d, tail = %d, buffers = %d\n",
 370                        p_head, p_tail, pipe->ring_size);
 371        for (idx = 0; idx < pipe->ring_size; idx++)
 372                printk(KERN_ERR "[%p %p %d %d]\n",
 373                        pipe->bufs[idx].ops,
 374                        pipe->bufs[idx].page,
 375                        pipe->bufs[idx].offset,
 376                        pipe->bufs[idx].len);
 377        WARN_ON(1);
 378        return false;
 379}
 380#else
 381#define sanity(i) true
 382#endif
 383
 384static size_t copy_page_to_iter_pipe(struct page *page, size_t offset, size_t bytes,
 385                         struct iov_iter *i)
 386{
 387        struct pipe_inode_info *pipe = i->pipe;
 388        struct pipe_buffer *buf;
 389        unsigned int p_tail = pipe->tail;
 390        unsigned int p_mask = pipe->ring_size - 1;
 391        unsigned int i_head = i->head;
 392        size_t off;
 393
 394        if (unlikely(bytes > i->count))
 395                bytes = i->count;
 396
 397        if (unlikely(!bytes))
 398                return 0;
 399
 400        if (!sanity(i))
 401                return 0;
 402
 403        off = i->iov_offset;
 404        buf = &pipe->bufs[i_head & p_mask];
 405        if (off) {
 406                if (offset == off && buf->page == page) {
 407                        /* merge with the last one */
 408                        buf->len += bytes;
 409                        i->iov_offset += bytes;
 410                        goto out;
 411                }
 412                i_head++;
 413                buf = &pipe->bufs[i_head & p_mask];
 414        }
 415        if (pipe_full(i_head, p_tail, pipe->max_usage))
 416                return 0;
 417
 418        buf->ops = &page_cache_pipe_buf_ops;
 419        buf->flags = 0;
 420        get_page(page);
 421        buf->page = page;
 422        buf->offset = offset;
 423        buf->len = bytes;
 424
 425        pipe->head = i_head + 1;
 426        i->iov_offset = offset + bytes;
 427        i->head = i_head;
 428out:
 429        i->count -= bytes;
 430        return bytes;
 431}
 432
 433/*
  434 * Fault in one or more iovecs of the given iov_iter, to a maximum length of
  435 * @bytes.  For each iovec, fault in each page that constitutes the iovec.
 436 *
 437 * Return 0 on success, or non-zero if the memory could not be accessed (i.e.
 438 * because it is an invalid address).
 439 */
 440int iov_iter_fault_in_readable(const struct iov_iter *i, size_t bytes)
 441{
 442        if (iter_is_iovec(i)) {
 443                const struct iovec *p;
 444                size_t skip;
 445
 446                if (bytes > i->count)
 447                        bytes = i->count;
 448                for (p = i->iov, skip = i->iov_offset; bytes; p++, skip = 0) {
 449                        size_t len = min(bytes, p->iov_len - skip);
 450                        int err;
 451
 452                        if (unlikely(!len))
 453                                continue;
 454                        err = fault_in_pages_readable(p->iov_base + skip, len);
 455                        if (unlikely(err))
 456                                return err;
 457                        bytes -= len;
 458                }
 459        }
 460        return 0;
 461}
 462EXPORT_SYMBOL(iov_iter_fault_in_readable);
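
/*
 * A minimal sketch of the buffered-write pattern this helper exists for
 * (the page handling around it is elided and hypothetical; see
 * generic_perform_write() for the real thing): fault the user pages in up
 * front, because the copy itself runs in an atomic context that must not
 * take page faults, then go around again if the copy still comes up short
 * (the real code also shrinks the chunk to avoid livelock).
 *
 *	while (iov_iter_count(i)) {
 *		size_t bytes = min_t(size_t, iov_iter_count(i), PAGE_SIZE);
 *		size_t copied;
 *
 *		if (unlikely(iov_iter_fault_in_readable(i, bytes)))
 *			return -EFAULT;
 *		copied = copy_page_from_iter_atomic(page, 0, bytes, i);
 *		if (unlikely(!copied))
 *			continue;	// raced with a fault; fault in again
 *	}
 */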
 463
 464void iov_iter_init(struct iov_iter *i, unsigned int direction,
 465                        const struct iovec *iov, unsigned long nr_segs,
 466                        size_t count)
 467{
 468        WARN_ON(direction & ~(READ | WRITE));
 469        *i = (struct iov_iter) {
 470                .iter_type = ITER_IOVEC,
 471                .data_source = direction,
 472                .iov = iov,
 473                .nr_segs = nr_segs,
 474                .iov_offset = 0,
 475                .count = count
 476        };
 477}
 478EXPORT_SYMBOL(iov_iter_init);
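
/*
 * Minimal illustrative sketch (ubuf, kbuf and len are hypothetical): wrap a
 * single user buffer in an ITER_IOVEC and hand it to the copy helpers.
 * READ means the iterator is the destination of the data, WRITE means it
 * is the source.
 *
 *	struct iovec iov = { .iov_base = ubuf, .iov_len = len };
 *	struct iov_iter iter;
 *
 *	iov_iter_init(&iter, READ, &iov, 1, len);
 *	copied = copy_to_iter(kbuf, len, &iter);
 */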
 479
 480static inline bool allocated(struct pipe_buffer *buf)
 481{
 482        return buf->ops == &default_pipe_buf_ops;
 483}
 484
 485static inline void data_start(const struct iov_iter *i,
 486                              unsigned int *iter_headp, size_t *offp)
 487{
 488        unsigned int p_mask = i->pipe->ring_size - 1;
 489        unsigned int iter_head = i->head;
 490        size_t off = i->iov_offset;
 491
 492        if (off && (!allocated(&i->pipe->bufs[iter_head & p_mask]) ||
 493                    off == PAGE_SIZE)) {
 494                iter_head++;
 495                off = 0;
 496        }
 497        *iter_headp = iter_head;
 498        *offp = off;
 499}
 500
 501static size_t push_pipe(struct iov_iter *i, size_t size,
 502                        int *iter_headp, size_t *offp)
 503{
 504        struct pipe_inode_info *pipe = i->pipe;
 505        unsigned int p_tail = pipe->tail;
 506        unsigned int p_mask = pipe->ring_size - 1;
 507        unsigned int iter_head;
 508        size_t off;
 509        ssize_t left;
 510
 511        if (unlikely(size > i->count))
 512                size = i->count;
 513        if (unlikely(!size))
 514                return 0;
 515
 516        left = size;
 517        data_start(i, &iter_head, &off);
 518        *iter_headp = iter_head;
 519        *offp = off;
 520        if (off) {
 521                left -= PAGE_SIZE - off;
 522                if (left <= 0) {
 523                        pipe->bufs[iter_head & p_mask].len += size;
 524                        return size;
 525                }
 526                pipe->bufs[iter_head & p_mask].len = PAGE_SIZE;
 527                iter_head++;
 528        }
 529        while (!pipe_full(iter_head, p_tail, pipe->max_usage)) {
 530                struct pipe_buffer *buf = &pipe->bufs[iter_head & p_mask];
 531                struct page *page = alloc_page(GFP_USER);
 532                if (!page)
 533                        break;
 534
 535                buf->ops = &default_pipe_buf_ops;
 536                buf->flags = 0;
 537                buf->page = page;
 538                buf->offset = 0;
 539                buf->len = min_t(ssize_t, left, PAGE_SIZE);
 540                left -= buf->len;
 541                iter_head++;
 542                pipe->head = iter_head;
 543
 544                if (left == 0)
 545                        return size;
 546        }
 547        return size - left;
 548}
 549
 550static size_t copy_pipe_to_iter(const void *addr, size_t bytes,
 551                                struct iov_iter *i)
 552{
 553        struct pipe_inode_info *pipe = i->pipe;
 554        unsigned int p_mask = pipe->ring_size - 1;
 555        unsigned int i_head;
 556        size_t n, off;
 557
 558        if (!sanity(i))
 559                return 0;
 560
 561        bytes = n = push_pipe(i, bytes, &i_head, &off);
 562        if (unlikely(!n))
 563                return 0;
 564        do {
 565                size_t chunk = min_t(size_t, n, PAGE_SIZE - off);
 566                memcpy_to_page(pipe->bufs[i_head & p_mask].page, off, addr, chunk);
 567                i->head = i_head;
 568                i->iov_offset = off + chunk;
 569                n -= chunk;
 570                addr += chunk;
 571                off = 0;
 572                i_head++;
 573        } while (n);
 574        i->count -= bytes;
 575        return bytes;
 576}
 577
 578static __wsum csum_and_memcpy(void *to, const void *from, size_t len,
 579                              __wsum sum, size_t off)
 580{
 581        __wsum next = csum_partial_copy_nocheck(from, to, len);
 582        return csum_block_add(sum, next, off);
 583}
 584
 585static size_t csum_and_copy_to_pipe_iter(const void *addr, size_t bytes,
 586                                         struct iov_iter *i, __wsum *sump)
 587{
 588        struct pipe_inode_info *pipe = i->pipe;
 589        unsigned int p_mask = pipe->ring_size - 1;
 590        __wsum sum = *sump;
 591        size_t off = 0;
 592        unsigned int i_head;
 593        size_t r;
 594
 595        if (!sanity(i))
 596                return 0;
 597
 598        bytes = push_pipe(i, bytes, &i_head, &r);
 599        while (bytes) {
 600                size_t chunk = min_t(size_t, bytes, PAGE_SIZE - r);
 601                char *p = kmap_local_page(pipe->bufs[i_head & p_mask].page);
 602                sum = csum_and_memcpy(p + r, addr + off, chunk, sum, off);
 603                kunmap_local(p);
 604                i->head = i_head;
 605                i->iov_offset = r + chunk;
 606                bytes -= chunk;
 607                off += chunk;
 608                r = 0;
 609                i_head++;
 610        }
 611        *sump = sum;
 612        i->count -= off;
 613        return off;
 614}
 615
 616size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
 617{
 618        if (unlikely(iov_iter_is_pipe(i)))
 619                return copy_pipe_to_iter(addr, bytes, i);
 620        if (iter_is_iovec(i))
 621                might_fault();
 622        iterate_and_advance(i, bytes, base, len, off,
 623                copyout(base, addr + off, len),
 624                memcpy(base, addr + off, len)
 625        )
 626
 627        return bytes;
 628}
 629EXPORT_SYMBOL(_copy_to_iter);
 630
 631#ifdef CONFIG_ARCH_HAS_COPY_MC
 632static int copyout_mc(void __user *to, const void *from, size_t n)
 633{
 634        if (access_ok(to, n)) {
 635                instrument_copy_to_user(to, from, n);
 636                n = copy_mc_to_user((__force void *) to, from, n);
 637        }
 638        return n;
 639}
 640
 641static size_t copy_mc_pipe_to_iter(const void *addr, size_t bytes,
 642                                struct iov_iter *i)
 643{
 644        struct pipe_inode_info *pipe = i->pipe;
 645        unsigned int p_mask = pipe->ring_size - 1;
 646        unsigned int i_head;
 647        size_t n, off, xfer = 0;
 648
 649        if (!sanity(i))
 650                return 0;
 651
 652        n = push_pipe(i, bytes, &i_head, &off);
 653        while (n) {
 654                size_t chunk = min_t(size_t, n, PAGE_SIZE - off);
 655                char *p = kmap_local_page(pipe->bufs[i_head & p_mask].page);
 656                unsigned long rem;
 657                rem = copy_mc_to_kernel(p + off, addr + xfer, chunk);
 658                chunk -= rem;
 659                kunmap_local(p);
 660                i->head = i_head;
 661                i->iov_offset = off + chunk;
 662                xfer += chunk;
 663                if (rem)
 664                        break;
 665                n -= chunk;
 666                off = 0;
 667                i_head++;
 668        }
 669        i->count -= xfer;
 670        return xfer;
 671}
 672
 673/**
 674 * _copy_mc_to_iter - copy to iter with source memory error exception handling
 675 * @addr: source kernel address
 676 * @bytes: total transfer length
 677 * @i: destination iterator
 678 *
 679 * The pmem driver deploys this for the dax operation
 680 * (dax_copy_to_iter()) for dax reads (bypass page-cache and the
  681 * block-layer). Upon #MC, read(2) aborts and returns EIO or the number
  682 * of bytes successfully copied.
 683 *
  684 * The main differences between this and typical _copy_to_iter() are:
 685 *
 686 * * Typical tail/residue handling after a fault retries the copy
 687 *   byte-by-byte until the fault happens again. Re-triggering machine
 688 *   checks is potentially fatal so the implementation uses source
 689 *   alignment and poison alignment assumptions to avoid re-triggering
 690 *   hardware exceptions.
 691 *
 692 * * ITER_KVEC, ITER_PIPE, and ITER_BVEC can return short copies.
 693 *   Compare to copy_to_iter() where only ITER_IOVEC attempts might return
 694 *   a short copy.
 695 *
 696 * Return: number of bytes copied (may be %0)
 697 */
 698size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
 699{
 700        if (unlikely(iov_iter_is_pipe(i)))
 701                return copy_mc_pipe_to_iter(addr, bytes, i);
 702        if (iter_is_iovec(i))
 703                might_fault();
 704        __iterate_and_advance(i, bytes, base, len, off,
 705                copyout_mc(base, addr + off, len),
 706                copy_mc_to_kernel(base, addr + off, len)
 707        )
 708
 709        return bytes;
 710}
 711EXPORT_SYMBOL_GPL(_copy_mc_to_iter);
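
/*
 * Illustrative sketch of the dax-read style caller described above (kaddr
 * and len are hypothetical): a short return means a machine check was hit
 * part-way through, so report what was copied or -EIO.
 *
 *	size_t done = _copy_mc_to_iter(kaddr, len, iter);
 *
 *	if (done != len)
 *		return done ? done : -EIO;
 */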
 712#endif /* CONFIG_ARCH_HAS_COPY_MC */
 713
 714size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
 715{
 716        if (unlikely(iov_iter_is_pipe(i))) {
 717                WARN_ON(1);
 718                return 0;
 719        }
 720        if (iter_is_iovec(i))
 721                might_fault();
 722        iterate_and_advance(i, bytes, base, len, off,
 723                copyin(addr + off, base, len),
 724                memcpy(addr + off, base, len)
 725        )
 726
 727        return bytes;
 728}
 729EXPORT_SYMBOL(_copy_from_iter);
 730
 731size_t _copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i)
 732{
 733        if (unlikely(iov_iter_is_pipe(i))) {
 734                WARN_ON(1);
 735                return 0;
 736        }
 737        iterate_and_advance(i, bytes, base, len, off,
 738                __copy_from_user_inatomic_nocache(addr + off, base, len),
 739                memcpy(addr + off, base, len)
 740        )
 741
 742        return bytes;
 743}
 744EXPORT_SYMBOL(_copy_from_iter_nocache);
 745
 746#ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
 747/**
 748 * _copy_from_iter_flushcache - write destination through cpu cache
 749 * @addr: destination kernel address
 750 * @bytes: total transfer length
 751 * @i: source iterator
 752 *
 753 * The pmem driver arranges for filesystem-dax to use this facility via
 754 * dax_copy_from_iter() for ensuring that writes to persistent memory
 755 * are flushed through the CPU cache. It is differentiated from
  756 * _copy_from_iter_nocache() in that it guarantees all data is flushed
  757 * for all iterator types. _copy_from_iter_nocache() only attempts to
  758 * bypass the cache for the ITER_IOVEC case, and on some archs may use
  759 * instructions that strand dirty data in the cache.
 760 *
 761 * Return: number of bytes copied (may be %0)
 762 */
 763size_t _copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i)
 764{
 765        if (unlikely(iov_iter_is_pipe(i))) {
 766                WARN_ON(1);
 767                return 0;
 768        }
 769        iterate_and_advance(i, bytes, base, len, off,
 770                __copy_from_user_flushcache(addr + off, base, len),
 771                memcpy_flushcache(addr + off, base, len)
 772        )
 773
 774        return bytes;
 775}
 776EXPORT_SYMBOL_GPL(_copy_from_iter_flushcache);
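
/*
 * Illustrative sketch: a trivial wrapper of the kind a pmem-style driver
 * might use for its copy-from-iterator callback (the wrapper name and
 * arguments here are hypothetical); every byte written to persistent
 * memory is pushed past the CPU cache.
 *
 *	static size_t pmem_copy_from_iter(void *pmem_addr, size_t bytes,
 *					  struct iov_iter *i)
 *	{
 *		return _copy_from_iter_flushcache(pmem_addr, bytes, i);
 *	}
 */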
 777#endif
 778
 779static inline bool page_copy_sane(struct page *page, size_t offset, size_t n)
 780{
 781        struct page *head;
 782        size_t v = n + offset;
 783
 784        /*
 785         * The general case needs to access the page order in order
 786         * to compute the page size.
 787         * However, we mostly deal with order-0 pages and thus can
 788         * avoid a possible cache line miss for requests that fit all
 789         * page orders.
 790         */
 791        if (n <= v && v <= PAGE_SIZE)
 792                return true;
 793
 794        head = compound_head(page);
 795        v += (page - head) << PAGE_SHIFT;
 796
 797        if (likely(n <= v && v <= (page_size(head))))
 798                return true;
 799        WARN_ON(1);
 800        return false;
 801}
 802
 803static size_t __copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
 804                         struct iov_iter *i)
 805{
 806        if (likely(iter_is_iovec(i)))
 807                return copy_page_to_iter_iovec(page, offset, bytes, i);
 808        if (iov_iter_is_bvec(i) || iov_iter_is_kvec(i) || iov_iter_is_xarray(i)) {
 809                void *kaddr = kmap_local_page(page);
 810                size_t wanted = _copy_to_iter(kaddr + offset, bytes, i);
 811                kunmap_local(kaddr);
 812                return wanted;
 813        }
 814        if (iov_iter_is_pipe(i))
 815                return copy_page_to_iter_pipe(page, offset, bytes, i);
 816        if (unlikely(iov_iter_is_discard(i))) {
 817                if (unlikely(i->count < bytes))
 818                        bytes = i->count;
 819                i->count -= bytes;
 820                return bytes;
 821        }
 822        WARN_ON(1);
 823        return 0;
 824}
 825
 826size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
 827                         struct iov_iter *i)
 828{
 829        size_t res = 0;
 830        if (unlikely(!page_copy_sane(page, offset, bytes)))
 831                return 0;
 832        page += offset / PAGE_SIZE; // first subpage
 833        offset %= PAGE_SIZE;
 834        while (1) {
 835                size_t n = __copy_page_to_iter(page, offset,
 836                                min(bytes, (size_t)PAGE_SIZE - offset), i);
 837                res += n;
 838                bytes -= n;
 839                if (!bytes || !n)
 840                        break;
 841                offset += n;
 842                if (offset == PAGE_SIZE) {
 843                        page++;
 844                        offset = 0;
 845                }
 846        }
 847        return res;
 848}
 849EXPORT_SYMBOL(copy_page_to_iter);
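
/*
 * Illustrative sketch of the classic read-side caller (page, offset and
 * count are hypothetical): copy part of a page-cache page into whatever
 * the iterator describes, and treat a short copy with space left over as
 * a fault.
 *
 *	size_t n = copy_page_to_iter(page, offset, count, iter);
 *
 *	if (n < count && iov_iter_count(iter))
 *		return -EFAULT;
 *	return n;
 */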
 850
 851size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes,
 852                         struct iov_iter *i)
 853{
 854        if (unlikely(!page_copy_sane(page, offset, bytes)))
 855                return 0;
 856        if (likely(iter_is_iovec(i)))
 857                return copy_page_from_iter_iovec(page, offset, bytes, i);
 858        if (iov_iter_is_bvec(i) || iov_iter_is_kvec(i) || iov_iter_is_xarray(i)) {
 859                void *kaddr = kmap_local_page(page);
 860                size_t wanted = _copy_from_iter(kaddr + offset, bytes, i);
 861                kunmap_local(kaddr);
 862                return wanted;
 863        }
 864        WARN_ON(1);
 865        return 0;
 866}
 867EXPORT_SYMBOL(copy_page_from_iter);
 868
 869static size_t pipe_zero(size_t bytes, struct iov_iter *i)
 870{
 871        struct pipe_inode_info *pipe = i->pipe;
 872        unsigned int p_mask = pipe->ring_size - 1;
 873        unsigned int i_head;
 874        size_t n, off;
 875
 876        if (!sanity(i))
 877                return 0;
 878
 879        bytes = n = push_pipe(i, bytes, &i_head, &off);
 880        if (unlikely(!n))
 881                return 0;
 882
 883        do {
 884                size_t chunk = min_t(size_t, n, PAGE_SIZE - off);
 885                char *p = kmap_local_page(pipe->bufs[i_head & p_mask].page);
 886                memset(p + off, 0, chunk);
 887                kunmap_local(p);
 888                i->head = i_head;
 889                i->iov_offset = off + chunk;
 890                n -= chunk;
 891                off = 0;
 892                i_head++;
 893        } while (n);
 894        i->count -= bytes;
 895        return bytes;
 896}
 897
 898size_t iov_iter_zero(size_t bytes, struct iov_iter *i)
 899{
 900        if (unlikely(iov_iter_is_pipe(i)))
 901                return pipe_zero(bytes, i);
 902        iterate_and_advance(i, bytes, base, len, count,
 903                clear_user(base, len),
 904                memset(base, 0, len)
 905        )
 906
 907        return bytes;
 908}
 909EXPORT_SYMBOL(iov_iter_zero);
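
/*
 * Illustrative sketch (hole_len is hypothetical): a read that hits a hole
 * can satisfy the iterator with zeroes instead of copying from a page.
 *
 *	size_t n = iov_iter_zero(min_t(size_t, iov_iter_count(iter), hole_len),
 *				 iter);
 */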
 910
 911size_t copy_page_from_iter_atomic(struct page *page, unsigned offset, size_t bytes,
 912                                  struct iov_iter *i)
 913{
 914        char *kaddr = kmap_atomic(page), *p = kaddr + offset;
 915        if (unlikely(!page_copy_sane(page, offset, bytes))) {
 916                kunmap_atomic(kaddr);
 917                return 0;
 918        }
 919        if (unlikely(iov_iter_is_pipe(i) || iov_iter_is_discard(i))) {
 920                kunmap_atomic(kaddr);
 921                WARN_ON(1);
 922                return 0;
 923        }
 924        iterate_and_advance(i, bytes, base, len, off,
 925                copyin(p + off, base, len),
 926                memcpy(p + off, base, len)
 927        )
 928        kunmap_atomic(kaddr);
 929        return bytes;
 930}
 931EXPORT_SYMBOL(copy_page_from_iter_atomic);
 932
 933static inline void pipe_truncate(struct iov_iter *i)
 934{
 935        struct pipe_inode_info *pipe = i->pipe;
 936        unsigned int p_tail = pipe->tail;
 937        unsigned int p_head = pipe->head;
 938        unsigned int p_mask = pipe->ring_size - 1;
 939
 940        if (!pipe_empty(p_head, p_tail)) {
 941                struct pipe_buffer *buf;
 942                unsigned int i_head = i->head;
 943                size_t off = i->iov_offset;
 944
 945                if (off) {
 946                        buf = &pipe->bufs[i_head & p_mask];
 947                        buf->len = off - buf->offset;
 948                        i_head++;
 949                }
 950                while (p_head != i_head) {
 951                        p_head--;
 952                        pipe_buf_release(pipe, &pipe->bufs[p_head & p_mask]);
 953                }
 954
 955                pipe->head = p_head;
 956        }
 957}
 958
 959static void pipe_advance(struct iov_iter *i, size_t size)
 960{
 961        struct pipe_inode_info *pipe = i->pipe;
 962        if (size) {
 963                struct pipe_buffer *buf;
 964                unsigned int p_mask = pipe->ring_size - 1;
 965                unsigned int i_head = i->head;
 966                size_t off = i->iov_offset, left = size;
 967
 968                if (off) /* make it relative to the beginning of buffer */
 969                        left += off - pipe->bufs[i_head & p_mask].offset;
 970                while (1) {
 971                        buf = &pipe->bufs[i_head & p_mask];
 972                        if (left <= buf->len)
 973                                break;
 974                        left -= buf->len;
 975                        i_head++;
 976                }
 977                i->head = i_head;
 978                i->iov_offset = buf->offset + left;
 979        }
 980        i->count -= size;
 981        /* ... and discard everything past that point */
 982        pipe_truncate(i);
 983}
 984
 985static void iov_iter_bvec_advance(struct iov_iter *i, size_t size)
 986{
 987        struct bvec_iter bi;
 988
 989        bi.bi_size = i->count;
 990        bi.bi_bvec_done = i->iov_offset;
 991        bi.bi_idx = 0;
 992        bvec_iter_advance(i->bvec, &bi, size);
 993
 994        i->bvec += bi.bi_idx;
 995        i->nr_segs -= bi.bi_idx;
 996        i->count = bi.bi_size;
 997        i->iov_offset = bi.bi_bvec_done;
 998}
 999
1000static void iov_iter_iovec_advance(struct iov_iter *i, size_t size)
1001{
1002        const struct iovec *iov, *end;
1003
1004        if (!i->count)
1005                return;
1006        i->count -= size;
1007
1008        size += i->iov_offset; // from beginning of current segment
1009        for (iov = i->iov, end = iov + i->nr_segs; iov < end; iov++) {
1010                if (likely(size < iov->iov_len))
1011                        break;
1012                size -= iov->iov_len;
1013        }
1014        i->iov_offset = size;
1015        i->nr_segs -= iov - i->iov;
1016        i->iov = iov;
1017}
1018
1019void iov_iter_advance(struct iov_iter *i, size_t size)
1020{
1021        if (unlikely(i->count < size))
1022                size = i->count;
1023        if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) {
1024                /* iovec and kvec have identical layouts */
1025                iov_iter_iovec_advance(i, size);
1026        } else if (iov_iter_is_bvec(i)) {
1027                iov_iter_bvec_advance(i, size);
1028        } else if (iov_iter_is_pipe(i)) {
1029                pipe_advance(i, size);
1030        } else if (unlikely(iov_iter_is_xarray(i))) {
1031                i->iov_offset += size;
1032                i->count -= size;
1033        } else if (iov_iter_is_discard(i)) {
1034                i->count -= size;
1035        }
1036}
1037EXPORT_SYMBOL(iov_iter_advance);
1038
1039void iov_iter_revert(struct iov_iter *i, size_t unroll)
1040{
1041        if (!unroll)
1042                return;
1043        if (WARN_ON(unroll > MAX_RW_COUNT))
1044                return;
1045        i->count += unroll;
1046        if (unlikely(iov_iter_is_pipe(i))) {
1047                struct pipe_inode_info *pipe = i->pipe;
1048                unsigned int p_mask = pipe->ring_size - 1;
1049                unsigned int i_head = i->head;
1050                size_t off = i->iov_offset;
1051                while (1) {
1052                        struct pipe_buffer *b = &pipe->bufs[i_head & p_mask];
1053                        size_t n = off - b->offset;
1054                        if (unroll < n) {
1055                                off -= unroll;
1056                                break;
1057                        }
1058                        unroll -= n;
1059                        if (!unroll && i_head == i->start_head) {
1060                                off = 0;
1061                                break;
1062                        }
1063                        i_head--;
1064                        b = &pipe->bufs[i_head & p_mask];
1065                        off = b->offset + b->len;
1066                }
1067                i->iov_offset = off;
1068                i->head = i_head;
1069                pipe_truncate(i);
1070                return;
1071        }
1072        if (unlikely(iov_iter_is_discard(i)))
1073                return;
1074        if (unroll <= i->iov_offset) {
1075                i->iov_offset -= unroll;
1076                return;
1077        }
1078        unroll -= i->iov_offset;
1079        if (iov_iter_is_xarray(i)) {
1080                BUG(); /* We should never go beyond the start of the specified
1081                        * range since we might then be straying into pages that
1082                        * aren't pinned.
1083                        */
1084        } else if (iov_iter_is_bvec(i)) {
1085                const struct bio_vec *bvec = i->bvec;
1086                while (1) {
1087                        size_t n = (--bvec)->bv_len;
1088                        i->nr_segs++;
1089                        if (unroll <= n) {
1090                                i->bvec = bvec;
1091                                i->iov_offset = n - unroll;
1092                                return;
1093                        }
1094                        unroll -= n;
1095                }
1096        } else { /* same logic for iovec and kvec */
1097                const struct iovec *iov = i->iov;
1098                while (1) {
1099                        size_t n = (--iov)->iov_len;
1100                        i->nr_segs++;
1101                        if (unroll <= n) {
1102                                i->iov = iov;
1103                                i->iov_offset = n - unroll;
1104                                return;
1105                        }
1106                        unroll -= n;
1107                }
1108        }
1109}
1110EXPORT_SYMBOL(iov_iter_revert);
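
/*
 * Illustrative pairing with iov_iter_advance() (do_op() is hypothetical):
 * if a callee consumed part of the iterator before failing, wind it back
 * by exactly the amount it consumed so the caller can retry or fall back.
 *
 *	size_t before = iov_iter_count(iter);
 *	int err = do_op(iter);
 *
 *	if (err)
 *		iov_iter_revert(iter, before - iov_iter_count(iter));
 */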
1111
1112/*
1113 * Return the count of just the current iov_iter segment.
1114 */
1115size_t iov_iter_single_seg_count(const struct iov_iter *i)
1116{
1117        if (i->nr_segs > 1) {
1118                if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
1119                        return min(i->count, i->iov->iov_len - i->iov_offset);
1120                if (iov_iter_is_bvec(i))
1121                        return min(i->count, i->bvec->bv_len - i->iov_offset);
1122        }
1123        return i->count;
1124}
1125EXPORT_SYMBOL(iov_iter_single_seg_count);
1126
1127void iov_iter_kvec(struct iov_iter *i, unsigned int direction,
1128                        const struct kvec *kvec, unsigned long nr_segs,
1129                        size_t count)
1130{
1131        WARN_ON(direction & ~(READ | WRITE));
1132        *i = (struct iov_iter){
1133                .iter_type = ITER_KVEC,
1134                .data_source = direction,
1135                .kvec = kvec,
1136                .nr_segs = nr_segs,
1137                .iov_offset = 0,
1138                .count = count
1139        };
1140}
1141EXPORT_SYMBOL(iov_iter_kvec);
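
/*
 * Illustrative sketch (buf and len are hypothetical): the kvec flavour
 * lets kernel buffers go through the same read/write paths as user
 * iovecs, e.g. for kernel_read()-style helpers.
 *
 *	struct kvec kv = { .iov_base = buf, .iov_len = len };
 *	struct iov_iter iter;
 *
 *	iov_iter_kvec(&iter, READ, &kv, 1, len);
 */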
1142
1143void iov_iter_bvec(struct iov_iter *i, unsigned int direction,
1144                        const struct bio_vec *bvec, unsigned long nr_segs,
1145                        size_t count)
1146{
1147        WARN_ON(direction & ~(READ | WRITE));
1148        *i = (struct iov_iter){
1149                .iter_type = ITER_BVEC,
1150                .data_source = direction,
1151                .bvec = bvec,
1152                .nr_segs = nr_segs,
1153                .iov_offset = 0,
1154                .count = count
1155        };
1156}
1157EXPORT_SYMBOL(iov_iter_bvec);
1158
1159void iov_iter_pipe(struct iov_iter *i, unsigned int direction,
1160                        struct pipe_inode_info *pipe,
1161                        size_t count)
1162{
1163        BUG_ON(direction != READ);
1164        WARN_ON(pipe_full(pipe->head, pipe->tail, pipe->ring_size));
1165        *i = (struct iov_iter){
1166                .iter_type = ITER_PIPE,
1167                .data_source = false,
1168                .pipe = pipe,
1169                .head = pipe->head,
1170                .start_head = pipe->head,
1171                .iov_offset = 0,
1172                .count = count
1173        };
1174}
1175EXPORT_SYMBOL(iov_iter_pipe);
1176
1177/**
1178 * iov_iter_xarray - Initialise an I/O iterator to use the pages in an xarray
1179 * @i: The iterator to initialise.
1180 * @direction: The direction of the transfer.
1181 * @xarray: The xarray to access.
1182 * @start: The start file position.
1183 * @count: The size of the I/O buffer in bytes.
1184 *
1185 * Set up an I/O iterator to either draw data out of the pages attached to an
1186 * inode or to inject data into those pages.  The caller *must* prevent the
1187 * pages from evaporating, either by taking a ref on them or by locking
1188 * them.
1189 */
1190void iov_iter_xarray(struct iov_iter *i, unsigned int direction,
1191                     struct xarray *xarray, loff_t start, size_t count)
1192{
1193        BUG_ON(direction & ~1);
1194        *i = (struct iov_iter) {
1195                .iter_type = ITER_XARRAY,
1196                .data_source = direction,
1197                .xarray = xarray,
1198                .xarray_start = start,
1199                .count = count,
1200                .iov_offset = 0
1201        };
1202}
1203EXPORT_SYMBOL(iov_iter_xarray);
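
/*
 * Illustrative sketch (mapping, pos and len are hypothetical): point the
 * iterator at the page cache of an inode, the way network-filesystem
 * helpers feed cached pages to an I/O request.
 *
 *	iov_iter_xarray(&iter, READ, &mapping->i_pages, pos, len);
 */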
1204
1205/**
1206 * iov_iter_discard - Initialise an I/O iterator that discards data
1207 * @i: The iterator to initialise.
1208 * @direction: The direction of the transfer.
1209 * @count: The size of the I/O buffer in bytes.
1210 *
1211 * Set up an I/O iterator that just discards everything that's written to it.
1212 * It's only available as a READ iterator.
1213 */
1214void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count)
1215{
1216        BUG_ON(direction != READ);
1217        *i = (struct iov_iter){
1218                .iter_type = ITER_DISCARD,
1219                .data_source = false,
1220                .count = count,
1221                .iov_offset = 0
1222        };
1223}
1224EXPORT_SYMBOL(iov_iter_discard);
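
/*
 * Illustrative sketch (size is hypothetical): a discard iterator lets a
 * caller consume and throw away data, e.g. to skip over part of a stream
 * without supplying a real buffer.
 *
 *	iov_iter_discard(&iter, READ, size);
 *	// then hand &iter to the normal read path; everything "copied"
 *	// into it is simply dropped.
 */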
1225
1226static unsigned long iov_iter_alignment_iovec(const struct iov_iter *i)
1227{
1228        unsigned long res = 0;
1229        size_t size = i->count;
1230        size_t skip = i->iov_offset;
1231        unsigned k;
1232
1233        for (k = 0; k < i->nr_segs; k++, skip = 0) {
1234                size_t len = i->iov[k].iov_len - skip;
1235                if (len) {
1236                        res |= (unsigned long)i->iov[k].iov_base + skip;
1237                        if (len > size)
1238                                len = size;
1239                        res |= len;
1240                        size -= len;
1241                        if (!size)
1242                                break;
1243                }
1244        }
1245        return res;
1246}
1247
1248static unsigned long iov_iter_alignment_bvec(const struct iov_iter *i)
1249{
1250        unsigned res = 0;
1251        size_t size = i->count;
1252        unsigned skip = i->iov_offset;
1253        unsigned k;
1254
1255        for (k = 0; k < i->nr_segs; k++, skip = 0) {
1256                size_t len = i->bvec[k].bv_len - skip;
1257                res |= (unsigned long)i->bvec[k].bv_offset + skip;
1258                if (len > size)
1259                        len = size;
1260                res |= len;
1261                size -= len;
1262                if (!size)
1263                        break;
1264        }
1265        return res;
1266}
1267
1268unsigned long iov_iter_alignment(const struct iov_iter *i)
1269{
1270        /* iovec and kvec have identical layouts */
1271        if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
1272                return iov_iter_alignment_iovec(i);
1273
1274        if (iov_iter_is_bvec(i))
1275                return iov_iter_alignment_bvec(i);
1276
1277        if (iov_iter_is_pipe(i)) {
1278                unsigned int p_mask = i->pipe->ring_size - 1;
1279                size_t size = i->count;
1280
1281                if (size && i->iov_offset && allocated(&i->pipe->bufs[i->head & p_mask]))
1282                        return size | i->iov_offset;
1283                return size;
1284        }
1285
1286        if (iov_iter_is_xarray(i))
1287                return (i->xarray_start + i->iov_offset) | i->count;
1288
1289        return 0;
1290}
1291EXPORT_SYMBOL(iov_iter_alignment);
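
/*
 * Illustrative sketch (blocksize is hypothetical): direct-I/O paths use
 * the combined address/length alignment to reject requests the device
 * cannot handle.
 *
 *	if (iov_iter_alignment(iter) & (blocksize - 1))
 *		return -EINVAL;
 */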
1292
1293unsigned long iov_iter_gap_alignment(const struct iov_iter *i)
1294{
1295        unsigned long res = 0;
1296        unsigned long v = 0;
1297        size_t size = i->count;
1298        unsigned k;
1299
1300        if (WARN_ON(!iter_is_iovec(i)))
1301                return ~0U;
1302
1303        for (k = 0; k < i->nr_segs; k++) {
1304                if (i->iov[k].iov_len) {
1305                        unsigned long base = (unsigned long)i->iov[k].iov_base;
1306                        if (v) // if not the first one
1307                                res |= base | v; // this start | previous end
1308                        v = base + i->iov[k].iov_len;
1309                        if (size <= i->iov[k].iov_len)
1310                                break;
1311                        size -= i->iov[k].iov_len;
1312                }
1313        }
1314        return res;
1315}
1316EXPORT_SYMBOL(iov_iter_gap_alignment);
1317
1318static inline ssize_t __pipe_get_pages(struct iov_iter *i,
1319                                size_t maxsize,
1320                                struct page **pages,
1321                                int iter_head,
1322                                size_t *start)
1323{
1324        struct pipe_inode_info *pipe = i->pipe;
1325        unsigned int p_mask = pipe->ring_size - 1;
1326        ssize_t n = push_pipe(i, maxsize, &iter_head, start);
1327        if (!n)
1328                return -EFAULT;
1329
1330        maxsize = n;
1331        n += *start;
1332        while (n > 0) {
1333                get_page(*pages++ = pipe->bufs[iter_head & p_mask].page);
1334                iter_head++;
1335                n -= PAGE_SIZE;
1336        }
1337
1338        return maxsize;
1339}
1340
1341static ssize_t pipe_get_pages(struct iov_iter *i,
1342                   struct page **pages, size_t maxsize, unsigned maxpages,
1343                   size_t *start)
1344{
1345        unsigned int iter_head, npages;
1346        size_t capacity;
1347
1348        if (!sanity(i))
1349                return -EFAULT;
1350
1351        data_start(i, &iter_head, start);
1352        /* Amount of free space: some of this one + all after this one */
1353        npages = pipe_space_for_user(iter_head, i->pipe->tail, i->pipe);
1354        capacity = min(npages, maxpages) * PAGE_SIZE - *start;
1355
1356        return __pipe_get_pages(i, min(maxsize, capacity), pages, iter_head, start);
1357}
1358
1359static ssize_t iter_xarray_populate_pages(struct page **pages, struct xarray *xa,
1360                                          pgoff_t index, unsigned int nr_pages)
1361{
1362        XA_STATE(xas, xa, index);
1363        struct page *page;
1364        unsigned int ret = 0;
1365
1366        rcu_read_lock();
1367        for (page = xas_load(&xas); page; page = xas_next(&xas)) {
1368                if (xas_retry(&xas, page))
1369                        continue;
1370
1371                /* Has the page moved or been split? */
1372                if (unlikely(page != xas_reload(&xas))) {
1373                        xas_reset(&xas);
1374                        continue;
1375                }
1376
1377                pages[ret] = find_subpage(page, xas.xa_index);
1378                get_page(pages[ret]);
1379                if (++ret == nr_pages)
1380                        break;
1381        }
1382        rcu_read_unlock();
1383        return ret;
1384}
1385
1386static ssize_t iter_xarray_get_pages(struct iov_iter *i,
1387                                     struct page **pages, size_t maxsize,
1388                                     unsigned maxpages, size_t *_start_offset)
1389{
1390        unsigned nr, offset;
1391        pgoff_t index, count;
1392        size_t size = maxsize, actual;
1393        loff_t pos;
1394
1395        if (!size || !maxpages)
1396                return 0;
1397
1398        pos = i->xarray_start + i->iov_offset;
1399        index = pos >> PAGE_SHIFT;
1400        offset = pos & ~PAGE_MASK;
1401        *_start_offset = offset;
1402
1403        count = 1;
1404        if (size > PAGE_SIZE - offset) {
1405                size -= PAGE_SIZE - offset;
1406                count += size >> PAGE_SHIFT;
1407                size &= ~PAGE_MASK;
1408                if (size)
1409                        count++;
1410        }
1411
1412        if (count > maxpages)
1413                count = maxpages;
1414
1415        nr = iter_xarray_populate_pages(pages, i->xarray, index, count);
1416        if (nr == 0)
1417                return 0;
1418
1419        actual = PAGE_SIZE * nr;
1420        actual -= offset;
1421        if (nr == count && size > 0) {
1422                unsigned last_offset = (nr > 1) ? 0 : offset;
1423                actual -= PAGE_SIZE - (last_offset + size);
1424        }
1425        return actual;
1426}
1427
1428/* must be called on a non-empty ITER_IOVEC iterator */
1429static unsigned long first_iovec_segment(const struct iov_iter *i,
1430                                         size_t *size, size_t *start,
1431                                         size_t maxsize, unsigned maxpages)
1432{
1433        size_t skip;
1434        long k;
1435
1436        for (k = 0, skip = i->iov_offset; k < i->nr_segs; k++, skip = 0) {
1437                unsigned long addr = (unsigned long)i->iov[k].iov_base + skip;
1438                size_t len = i->iov[k].iov_len - skip;
1439
1440                if (unlikely(!len))
1441                        continue;
1442                if (len > maxsize)
1443                        len = maxsize;
1444                len += (*start = addr % PAGE_SIZE);
1445                if (len > maxpages * PAGE_SIZE)
1446                        len = maxpages * PAGE_SIZE;
1447                *size = len;
1448                return addr & PAGE_MASK;
1449        }
1450        BUG(); // if it had been empty, we wouldn't get called
1451}
1452
1453/* must be called on a non-empty ITER_BVEC iterator */
1454static struct page *first_bvec_segment(const struct iov_iter *i,
1455                                       size_t *size, size_t *start,
1456                                       size_t maxsize, unsigned maxpages)
1457{
1458        struct page *page;
1459        size_t skip = i->iov_offset, len;
1460
1461        len = i->bvec->bv_len - skip;
1462        if (len > maxsize)
1463                len = maxsize;
1464        skip += i->bvec->bv_offset;
1465        page = i->bvec->bv_page + skip / PAGE_SIZE;
1466        len += (*start = skip % PAGE_SIZE);
1467        if (len > maxpages * PAGE_SIZE)
1468                len = maxpages * PAGE_SIZE;
1469        *size = len;
1470        return page;
1471}
1472
1473ssize_t iov_iter_get_pages(struct iov_iter *i,
1474                   struct page **pages, size_t maxsize, unsigned maxpages,
1475                   size_t *start)
1476{
1477        size_t len;
1478        int n, res;
1479
1480        if (maxsize > i->count)
1481                maxsize = i->count;
1482        if (!maxsize)
1483                return 0;
1484
1485        if (likely(iter_is_iovec(i))) {
1486                unsigned long addr;
1487
1488                addr = first_iovec_segment(i, &len, start, maxsize, maxpages);
1489                n = DIV_ROUND_UP(len, PAGE_SIZE);
1490                res = get_user_pages_fast(addr, n,
1491                                iov_iter_rw(i) != WRITE ?  FOLL_WRITE : 0,
1492                                pages);
1493                if (unlikely(res < 0))
1494                        return res;
1495                return (res == n ? len : res * PAGE_SIZE) - *start;
1496        }
1497        if (iov_iter_is_bvec(i)) {
1498                struct page *page;
1499
1500                page = first_bvec_segment(i, &len, start, maxsize, maxpages);
1501                n = DIV_ROUND_UP(len, PAGE_SIZE);
1502                while (n--)
1503                        get_page(*pages++ = page++);
1504                return len - *start;
1505        }
1506        if (iov_iter_is_pipe(i))
1507                return pipe_get_pages(i, pages, maxsize, maxpages, start);
1508        if (iov_iter_is_xarray(i))
1509                return iter_xarray_get_pages(i, pages, maxsize, maxpages, start);
1510        return -EFAULT;
1511}
1512EXPORT_SYMBOL(iov_iter_get_pages);
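/*
 * Illustrative sketch, not part of the build: one plausible way a caller
 * could use iov_iter_get_pages() to pin the pages backing the front of a
 * user-backed iterator before handing them to hardware.  The function
 * name and the fixed 16-entry array are hypothetical; real callers often
 * size the array with iov_iter_npages() first.
 */
#if 0
static ssize_t example_pin_front_pages(struct iov_iter *iter)
{
        struct page *pages[16];
        size_t offset;          /* byte offset into pages[0] */
        ssize_t bytes;
        int n, npages;

        bytes = iov_iter_get_pages(iter, pages, iov_iter_count(iter),
                                   ARRAY_SIZE(pages), &offset);
        if (bytes <= 0)
                return bytes;

        /* ... submit (pages, offset, bytes) and wait for completion ... */

        /* drop the references taken above and consume the iterator */
        npages = DIV_ROUND_UP(offset + bytes, PAGE_SIZE);
        for (n = 0; n < npages; n++)
                put_page(pages[n]);
        iov_iter_advance(iter, bytes);
        return bytes;
}
#endif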
1513
1514static struct page **get_pages_array(size_t n)
1515{
1516        return kvmalloc_array(n, sizeof(struct page *), GFP_KERNEL);
1517}
1518
1519static ssize_t pipe_get_pages_alloc(struct iov_iter *i,
1520                   struct page ***pages, size_t maxsize,
1521                   size_t *start)
1522{
1523        struct page **p;
1524        unsigned int iter_head, npages;
1525        ssize_t n;
1526
1527        if (!sanity(i))
1528                return -EFAULT;
1529
1530        data_start(i, &iter_head, start);
1531        /* Amount of free space: some of this one + all after this one */
1532        npages = pipe_space_for_user(iter_head, i->pipe->tail, i->pipe);
1533        n = npages * PAGE_SIZE - *start;
1534        if (maxsize > n)
1535                maxsize = n;
1536        else
1537                npages = DIV_ROUND_UP(maxsize + *start, PAGE_SIZE);
1538        p = get_pages_array(npages);
1539        if (!p)
1540                return -ENOMEM;
1541        n = __pipe_get_pages(i, maxsize, p, iter_head, start);
1542        if (n > 0)
1543                *pages = p;
1544        else
1545                kvfree(p);
1546        return n;
1547}
1548
1549static ssize_t iter_xarray_get_pages_alloc(struct iov_iter *i,
1550                                           struct page ***pages, size_t maxsize,
1551                                           size_t *_start_offset)
1552{
1553        struct page **p;
1554        unsigned nr, offset;
1555        pgoff_t index, count;
1556        size_t size = maxsize, actual;
1557        loff_t pos;
1558
1559        if (!size)
1560                return 0;
1561
1562        pos = i->xarray_start + i->iov_offset;
1563        index = pos >> PAGE_SHIFT;
1564        offset = pos & ~PAGE_MASK;
1565        *_start_offset = offset;
1566
1567        count = 1;
1568        if (size > PAGE_SIZE - offset) {
1569                size -= PAGE_SIZE - offset;
1570                count += size >> PAGE_SHIFT;
1571                size &= ~PAGE_MASK;
1572                if (size)
1573                        count++;
1574        }
1575
1576        p = get_pages_array(count);
1577        if (!p)
1578                return -ENOMEM;
1579        *pages = p;
1580
1581        nr = iter_xarray_populate_pages(p, i->xarray, index, count);
1582        if (nr == 0)
1583                return 0;
1584
1585        actual = PAGE_SIZE * nr;
1586        actual -= offset;
1587        if (nr == count && size > 0) {
1588                unsigned last_offset = (nr > 1) ? 0 : offset;
1589                actual -= PAGE_SIZE - (last_offset + size);
1590        }
1591        return actual;
1592}
1593
1594ssize_t iov_iter_get_pages_alloc(struct iov_iter *i,
1595                   struct page ***pages, size_t maxsize,
1596                   size_t *start)
1597{
1598        struct page **p;
1599        size_t len;
1600        int n, res;
1601
1602        if (maxsize > i->count)
1603                maxsize = i->count;
1604        if (!maxsize)
1605                return 0;
1606
1607        if (likely(iter_is_iovec(i))) {
1608                unsigned long addr;
1609
1610                addr = first_iovec_segment(i, &len, start, maxsize, ~0U);
1611                n = DIV_ROUND_UP(len, PAGE_SIZE);
1612                p = get_pages_array(n);
1613                if (!p)
1614                        return -ENOMEM;
1615                res = get_user_pages_fast(addr, n,
1616                                iov_iter_rw(i) != WRITE ?  FOLL_WRITE : 0, p);
1617                if (unlikely(res < 0)) {
1618                        kvfree(p);
1619                        return res;
1620                }
1621                *pages = p;
1622                return (res == n ? len : res * PAGE_SIZE) - *start;
1623        }
1624        if (iov_iter_is_bvec(i)) {
1625                struct page *page;
1626
1627                page = first_bvec_segment(i, &len, start, maxsize, ~0U);
1628                n = DIV_ROUND_UP(len, PAGE_SIZE);
1629                *pages = p = get_pages_array(n);
1630                if (!p)
1631                        return -ENOMEM;
1632                while (n--)
1633                        get_page(*p++ = page++);
1634                return len - *start;
1635        }
1636        if (iov_iter_is_pipe(i))
1637                return pipe_get_pages_alloc(i, pages, maxsize, start);
1638        if (iov_iter_is_xarray(i))
1639                return iter_xarray_get_pages_alloc(i, pages, maxsize, start);
1640        return -EFAULT;
1641}
1642EXPORT_SYMBOL(iov_iter_get_pages_alloc);
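/*
 * Illustrative sketch, not part of the build: the _alloc variant sizes and
 * allocates the page array itself, so the caller only has to kvfree() it
 * and put_page() each entry when done.  The helper name is hypothetical.
 */
#if 0
static ssize_t example_pin_all(struct iov_iter *iter, struct page ***pagesp,
                               size_t *offsetp)
{
        ssize_t bytes;

        bytes = iov_iter_get_pages_alloc(iter, pagesp,
                                         iov_iter_count(iter), offsetp);
        if (bytes > 0)
                iov_iter_advance(iter, bytes);
        /* caller put_page()s the entries and kvfree()s *pagesp afterwards */
        return bytes;
}
#endif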
1643
1644size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum,
1645                               struct iov_iter *i)
1646{
1647        __wsum sum, next;
1648        sum = *csum;
1649        if (unlikely(iov_iter_is_pipe(i) || iov_iter_is_discard(i))) {
1650                WARN_ON(1);
1651                return 0;
1652        }
1653        iterate_and_advance(i, bytes, base, len, off, ({
1654                next = csum_and_copy_from_user(base, addr + off, len);
1655                sum = csum_block_add(sum, next, off);
1656                next ? 0 : len;
1657        }), ({
1658                sum = csum_and_memcpy(addr + off, base, len, sum, off);
1659        })
1660        )
1661        *csum = sum;
1662        return bytes;
1663}
1664EXPORT_SYMBOL(csum_and_copy_from_iter);
1665
1666size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *_csstate,
1667                             struct iov_iter *i)
1668{
1669        struct csum_state *csstate = _csstate;
1670        __wsum sum, next;
1671
1672        if (unlikely(iov_iter_is_discard(i))) {
1673                WARN_ON(1);     /* for now */
1674                return 0;
1675        }
1676
1677        sum = csum_shift(csstate->csum, csstate->off);
1678        if (unlikely(iov_iter_is_pipe(i)))
1679                bytes = csum_and_copy_to_pipe_iter(addr, bytes, i, &sum);
1680        else iterate_and_advance(i, bytes, base, len, off, ({
1681                next = csum_and_copy_to_user(addr + off, base, len);
1682                sum = csum_block_add(sum, next, off);
1683                next ? 0 : len;
1684        }), ({
1685                sum = csum_and_memcpy(base, addr + off, len, sum, off);
1686        })
1687        )
1688        csstate->csum = csum_shift(sum, csstate->off);
1689        csstate->off += bytes;
1690        return bytes;
1691}
1692EXPORT_SYMBOL(csum_and_copy_to_iter);
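/*
 * Illustrative sketch, not part of the build: csum_and_copy_to_iter()
 * keeps a running checksum in struct csum_state, offset by how many bytes
 * have already been emitted, so it can be called repeatedly while one
 * datagram is assembled.  The function below is hypothetical; the real
 * users of this interface live in the networking core.
 */
#if 0
static size_t example_copy_with_csum(const void *hdr, size_t hdr_len,
                                     const void *body, size_t body_len,
                                     struct iov_iter *to, __wsum *csum_out)
{
        struct csum_state csstate = { .csum = 0, .off = 0 };
        size_t copied;

        copied  = csum_and_copy_to_iter(hdr, hdr_len, &csstate, to);
        copied += csum_and_copy_to_iter(body, body_len, &csstate, to);
        *csum_out = csstate.csum;       /* checksum of the bytes copied */
        return copied;
}
#endif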
1693
1694size_t hash_and_copy_to_iter(const void *addr, size_t bytes, void *hashp,
1695                struct iov_iter *i)
1696{
1697#ifdef CONFIG_CRYPTO_HASH
1698        struct ahash_request *hash = hashp;
1699        struct scatterlist sg;
1700        size_t copied;
1701
1702        copied = copy_to_iter(addr, bytes, i);
1703        sg_init_one(&sg, addr, copied);
1704        ahash_request_set_crypt(hash, &sg, NULL, copied);
1705        crypto_ahash_update(hash);
1706        return copied;
1707#else
1708        return 0;
1709#endif
1710}
1711EXPORT_SYMBOL(hash_and_copy_to_iter);
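/*
 * Illustrative sketch, not part of the build: hash_and_copy_to_iter()
 * expects @hashp to be an already-initialised struct ahash_request (set up
 * via crypto_alloc_ahash()/ahash_request_alloc()/crypto_ahash_init()); it
 * folds whatever was actually copied into the running digest, which the
 * caller finalises later with crypto_ahash_final().  The wrapper name is
 * hypothetical.
 */
#if 0
static size_t example_copy_and_hash(const void *buf, size_t len,
                                    struct ahash_request *req,
                                    struct iov_iter *to)
{
        return hash_and_copy_to_iter(buf, len, req, to);
}
#endif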
1712
1713static int iov_npages(const struct iov_iter *i, int maxpages)
1714{
1715        size_t skip = i->iov_offset, size = i->count;
1716        const struct iovec *p;
1717        int npages = 0;
1718
1719        for (p = i->iov; size; skip = 0, p++) {
1720                unsigned offs = offset_in_page(p->iov_base + skip);
1721                size_t len = min(p->iov_len - skip, size);
1722
1723                if (len) {
1724                        size -= len;
1725                        npages += DIV_ROUND_UP(offs + len, PAGE_SIZE);
1726                        if (unlikely(npages > maxpages))
1727                                return maxpages;
1728                }
1729        }
1730        return npages;
1731}
1732
1733static int bvec_npages(const struct iov_iter *i, int maxpages)
1734{
1735        size_t skip = i->iov_offset, size = i->count;
1736        const struct bio_vec *p;
1737        int npages = 0;
1738
1739        for (p = i->bvec; size; skip = 0, p++) {
1740                unsigned offs = (p->bv_offset + skip) % PAGE_SIZE;
1741                size_t len = min(p->bv_len - skip, size);
1742
1743                size -= len;
1744                npages += DIV_ROUND_UP(offs + len, PAGE_SIZE);
1745                if (unlikely(npages > maxpages))
1746                        return maxpages;
1747        }
1748        return npages;
1749}
1750
1751int iov_iter_npages(const struct iov_iter *i, int maxpages)
1752{
1753        if (unlikely(!i->count))
1754                return 0;
1755        /* iovec and kvec have identical layouts */
1756        if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
1757                return iov_npages(i, maxpages);
1758        if (iov_iter_is_bvec(i))
1759                return bvec_npages(i, maxpages);
1760        if (iov_iter_is_pipe(i)) {
1761                unsigned int iter_head;
1762                int npages;
1763                size_t off;
1764
1765                if (!sanity(i))
1766                        return 0;
1767
1768                data_start(i, &iter_head, &off);
1769                /* some of this one + all after this one */
1770                npages = pipe_space_for_user(iter_head, i->pipe->tail, i->pipe);
1771                return min(npages, maxpages);
1772        }
1773        if (iov_iter_is_xarray(i)) {
1774                unsigned offset = (i->xarray_start + i->iov_offset) % PAGE_SIZE;
1775                int npages = DIV_ROUND_UP(offset + i->count, PAGE_SIZE);
1776                return min(npages, maxpages);
1777        }
1778        return 0;
1779}
1780EXPORT_SYMBOL(iov_iter_npages);
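/*
 * Illustrative sketch, not part of the build: iov_iter_npages() is
 * typically used to size a page array (or a bio) before pinning pages,
 * capped at whatever the caller can handle.  The helper name is
 * hypothetical.
 */
#if 0
static struct page **example_alloc_page_array(const struct iov_iter *iter,
                                              int maxpages, int *nr)
{
        *nr = iov_iter_npages(iter, maxpages);
        if (!*nr)
                return NULL;
        return kvmalloc_array(*nr, sizeof(struct page *), GFP_KERNEL);
}
#endif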
1781
1782const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags)
1783{
1784        *new = *old;
1785        if (unlikely(iov_iter_is_pipe(new))) {
1786                WARN_ON(1);
1787                return NULL;
1788        }
1789        if (unlikely(iov_iter_is_discard(new) || iov_iter_is_xarray(new)))
1790                return NULL;
1791        if (iov_iter_is_bvec(new))
1792                return new->bvec = kmemdup(new->bvec,
1793                                    new->nr_segs * sizeof(struct bio_vec),
1794                                    flags);
1795        else
1796                /* iovec and kvec have identical layout */
1797                return new->iov = kmemdup(new->iov,
1798                                   new->nr_segs * sizeof(struct iovec),
1799                                   flags);
1800}
1801EXPORT_SYMBOL(dup_iter);
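/*
 * Illustrative sketch, not part of the build: dup_iter() deep-copies the
 * iovec/kvec/bvec array so the duplicate stays usable after the caller's
 * (possibly on-stack) original goes away, e.g. when queueing asynchronous
 * work.  For those iterator types a NULL return means the kmemdup()
 * failed, and the returned pointer must be kfree()d once the copy is no
 * longer needed.  The struct and function names are hypothetical.
 */
#if 0
struct example_async_req {
        struct iov_iter iter;
        const void *vec_copy;   /* what dup_iter() returned */
};

static int example_queue_async(struct example_async_req *req,
                               struct iov_iter *src)
{
        req->vec_copy = dup_iter(&req->iter, src, GFP_KERNEL);
        if (!req->vec_copy)
                return -ENOMEM;
        /* ... submit req; the completion path kfree()s req->vec_copy ... */
        return 0;
}
#endif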
1802
1803static int copy_compat_iovec_from_user(struct iovec *iov,
1804                const struct iovec __user *uvec, unsigned long nr_segs)
1805{
1806        const struct compat_iovec __user *uiov =
1807                (const struct compat_iovec __user *)uvec;
1808        int ret = -EFAULT, i;
1809
1810        if (!user_access_begin(uiov, nr_segs * sizeof(*uiov)))
1811                return -EFAULT;
1812
1813        for (i = 0; i < nr_segs; i++) {
1814                compat_uptr_t buf;
1815                compat_ssize_t len;
1816
1817                unsafe_get_user(len, &uiov[i].iov_len, uaccess_end);
1818                unsafe_get_user(buf, &uiov[i].iov_base, uaccess_end);
1819
1820                /* check for compat_size_t not fitting in compat_ssize_t .. */
1821                if (len < 0) {
1822                        ret = -EINVAL;
1823                        goto uaccess_end;
1824                }
1825                iov[i].iov_base = compat_ptr(buf);
1826                iov[i].iov_len = len;
1827        }
1828
1829        ret = 0;
1830uaccess_end:
1831        user_access_end();
1832        return ret;
1833}
1834
1835static int copy_iovec_from_user(struct iovec *iov,
1836                const struct iovec __user *uvec, unsigned long nr_segs)
1837{
1838        unsigned long seg;
1839
1840        if (copy_from_user(iov, uvec, nr_segs * sizeof(*uvec)))
1841                return -EFAULT;
1842        for (seg = 0; seg < nr_segs; seg++) {
1843                if ((ssize_t)iov[seg].iov_len < 0)
1844                        return -EINVAL;
1845        }
1846
1847        return 0;
1848}
1849
1850struct iovec *iovec_from_user(const struct iovec __user *uvec,
1851                unsigned long nr_segs, unsigned long fast_segs,
1852                struct iovec *fast_iov, bool compat)
1853{
1854        struct iovec *iov = fast_iov;
1855        int ret;
1856
1857        /*
1858         * SuS says "The readv() function *may* fail if the iovcnt argument was
1859         * less than or equal to 0, or greater than {IOV_MAX}."  Linux has
1860         * traditionally returned zero for zero segments, so...
1861         */
1862        if (nr_segs == 0)
1863                return iov;
1864        if (nr_segs > UIO_MAXIOV)
1865                return ERR_PTR(-EINVAL);
1866        if (nr_segs > fast_segs) {
1867                iov = kmalloc_array(nr_segs, sizeof(struct iovec), GFP_KERNEL);
1868                if (!iov)
1869                        return ERR_PTR(-ENOMEM);
1870        }
1871
1872        if (compat)
1873                ret = copy_compat_iovec_from_user(iov, uvec, nr_segs);
1874        else
1875                ret = copy_iovec_from_user(iov, uvec, nr_segs);
1876        if (ret) {
1877                if (iov != fast_iov)
1878                        kfree(iov);
1879                return ERR_PTR(ret);
1880        }
1881
1882        return iov;
1883}
1884
1885ssize_t __import_iovec(int type, const struct iovec __user *uvec,
1886                 unsigned nr_segs, unsigned fast_segs, struct iovec **iovp,
1887                 struct iov_iter *i, bool compat)
1888{
1889        ssize_t total_len = 0;
1890        unsigned long seg;
1891        struct iovec *iov;
1892
1893        iov = iovec_from_user(uvec, nr_segs, fast_segs, *iovp, compat);
1894        if (IS_ERR(iov)) {
1895                *iovp = NULL;
1896                return PTR_ERR(iov);
1897        }
1898
1899        /*
1900         * According to the Single Unix Specification we should return EINVAL if
1901         * an element length is < 0 when cast to ssize_t or if the total length
1902         * would overflow the ssize_t return value of the system call.
1903         *
1904         * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
1905         * overflow case.
1906         */
1907        for (seg = 0; seg < nr_segs; seg++) {
1908                ssize_t len = (ssize_t)iov[seg].iov_len;
1909
1910                if (!access_ok(iov[seg].iov_base, len)) {
1911                        if (iov != *iovp)
1912                                kfree(iov);
1913                        *iovp = NULL;
1914                        return -EFAULT;
1915                }
1916
1917                if (len > MAX_RW_COUNT - total_len) {
1918                        len = MAX_RW_COUNT - total_len;
1919                        iov[seg].iov_len = len;
1920                }
1921                total_len += len;
1922        }
1923
1924        iov_iter_init(i, type, iov, nr_segs, total_len);
1925        if (iov == *iovp)
1926                *iovp = NULL;
1927        else
1928                *iovp = iov;
1929        return total_len;
1930}
1931
1932/**
1933 * import_iovec() - Copy an array of &struct iovec from userspace
1934 *     into the kernel, check that it is valid, and initialize a new
1935 *     &struct iov_iter iterator to access it.
1936 *
1937 * @type: One of %READ or %WRITE.
1938 * @uvec: Pointer to the userspace array.
1939 * @nr_segs: Number of elements in userspace array.
1940 * @fast_segs: Number of elements in *@iovp.
1941 * @iovp: (input and output parameter) Pointer to pointer to (usually small
1942 *     on-stack) kernel array.
1943 * @i: Pointer to iterator that will be initialized on success.
1944 *
1945 * If the array pointed to by *@iovp is large enough to hold all @nr_segs,
1946 * then this function places %NULL in *@iovp on return. Otherwise, a new
1947 * array will be allocated and the result placed in *@iovp. This means that
1948 * the caller may call kfree() on *@iovp regardless of whether the small
1949 * on-stack array was used or not (and regardless of whether this function
1950 * returns an error or not).
1951 *
1952 * Return: Negative error code on error, bytes imported on success
1953 */
1954ssize_t import_iovec(int type, const struct iovec __user *uvec,
1955                 unsigned nr_segs, unsigned fast_segs,
1956                 struct iovec **iovp, struct iov_iter *i)
1957{
1958        return __import_iovec(type, uvec, nr_segs, fast_segs, iovp, i,
1959                              in_compat_syscall());
1960}
1961EXPORT_SYMBOL(import_iovec);
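/*
 * Illustrative sketch, not part of the build: the usual readv()-style
 * pattern.  A small on-stack array covers the common case; import_iovec()
 * reports via the iovec pointer whether it had to allocate a larger one,
 * and kfree(NULL) makes the cleanup unconditional.  The function name is
 * hypothetical.
 */
#if 0
static ssize_t example_do_readv(struct file *file,
                                const struct iovec __user *uvec,
                                unsigned long nr_segs)
{
        struct iovec iovstack[UIO_FASTIOV], *iov = iovstack;
        struct iov_iter iter;
        ssize_t ret;

        ret = import_iovec(READ, uvec, nr_segs, ARRAY_SIZE(iovstack),
                           &iov, &iter);
        if (ret < 0)
                return ret;

        /* ... e.g. ret = vfs_iter_read(file, &iter, &file->f_pos, 0); ... */

        kfree(iov);     /* NULL if the on-stack array was used */
        return ret;
}
#endif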
1962
1963int import_single_range(int rw, void __user *buf, size_t len,
1964                 struct iovec *iov, struct iov_iter *i)
1965{
1966        if (len > MAX_RW_COUNT)
1967                len = MAX_RW_COUNT;
1968        if (unlikely(!access_ok(buf, len)))
1969                return -EFAULT;
1970
1971        iov->iov_base = buf;
1972        iov->iov_len = len;
1973        iov_iter_init(i, rw, iov, 1, len);
1974        return 0;
1975}
1976EXPORT_SYMBOL(import_single_range);
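/*
 * Illustrative sketch, not part of the build: import_single_range() is the
 * plain read()/write() counterpart of import_iovec(): one user buffer, one
 * caller-provided iovec, no allocation and nothing to free.  The function
 * name is hypothetical.
 */
#if 0
static ssize_t example_do_read(struct file *file, char __user *buf,
                               size_t len)
{
        struct iovec iov;
        struct iov_iter iter;
        int ret;

        ret = import_single_range(READ, buf, len, &iov, &iter);
        if (ret)
                return ret;
        /* ... e.g. return vfs_iter_read(file, &iter, &file->f_pos, 0); ... */
        return 0;
}
#endif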
1977
1978/**
1979 * iov_iter_restore() - Restore a &struct iov_iter to the same state as when
1980 *     iov_iter_save_state() was called.
1981 *
1982 * @i: &struct iov_iter to restore
1983 * @state: state to restore from
1984 *
1985 * Used after iov_iter_save_state() to restore @i, if operations may
1986 * have advanced it.
1987 *
1988 * Note: only works on ITER_IOVEC, ITER_BVEC, and ITER_KVEC
1989 */
1990void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state)
1991{
1992        if (WARN_ON_ONCE(!iov_iter_is_bvec(i) && !iter_is_iovec(i) &&
1993                         !iov_iter_is_kvec(i)))
1994                return;
1995        i->iov_offset = state->iov_offset;
1996        i->count = state->count;
1997        /*
1998         * For the *vec iters, nr_segs + iov is constant - if we increment
1999         * the vec, then we also decrement the nr_segs count. Hence we don't
2000         * need to track both of these, just one is enough and we can deduce
2001         * the other from that. ITER_KVEC and ITER_IOVEC are the same struct
2002         * size, so we can just adjust the iov pointer as they are unionized.
2003         * ITER_BVEC _may_ be the same size on some archs, but on others it is
2004         * not. Be safe and handle it separately.
2005         */
2006        BUILD_BUG_ON(sizeof(struct iovec) != sizeof(struct kvec));
2007        if (iov_iter_is_bvec(i))
2008                i->bvec -= state->nr_segs - i->nr_segs;
2009        else
2010                i->iov -= state->nr_segs - i->nr_segs;
2011        i->nr_segs = state->nr_segs;
2012}
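/*
 * Illustrative sketch, not part of the build: iov_iter_save_state() and
 * iov_iter_restore() bracket an operation that may partially advance the
 * iterator, so the caller can rewind and resubmit (this is how io_uring
 * retries requests).  The wrapper name is hypothetical.
 */
#if 0
static ssize_t example_copy_or_rewind(void *dst, size_t len,
                                      struct iov_iter *src)
{
        struct iov_iter_state state;
        size_t copied;

        iov_iter_save_state(src, &state);
        copied = copy_from_iter(dst, len, src);
        if (copied != len) {
                /* roll the iterator back so the caller can retry */
                iov_iter_restore(src, &state);
                return -EAGAIN;
        }
        return copied;
}
#endif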
2013