linux/lib/iov_iter.c
   1// SPDX-License-Identifier: GPL-2.0-only
   2#include <crypto/hash.h>
   3#include <linux/export.h>
   4#include <linux/bvec.h>
   5#include <linux/fault-inject-usercopy.h>
   6#include <linux/uio.h>
   7#include <linux/pagemap.h>
   8#include <linux/highmem.h>
   9#include <linux/slab.h>
  10#include <linux/vmalloc.h>
  11#include <linux/splice.h>
  12#include <linux/compat.h>
  13#include <net/checksum.h>
  14#include <linux/scatterlist.h>
  15#include <linux/instrumented.h>
  16
  17#define PIPE_PARANOIA /* for now */
  18
  19/* covers iovec and kvec alike */
  20#define iterate_iovec(i, n, base, len, off, __p, STEP) {        \
  21        size_t off = 0;                                         \
  22        size_t skip = i->iov_offset;                            \
  23        do {                                                    \
  24                len = min(n, __p->iov_len - skip);              \
  25                if (likely(len)) {                              \
  26                        base = __p->iov_base + skip;            \
  27                        len -= (STEP);                          \
  28                        off += len;                             \
  29                        skip += len;                            \
  30                        n -= len;                               \
  31                        if (skip < __p->iov_len)                \
  32                                break;                          \
  33                }                                               \
  34                __p++;                                          \
  35                skip = 0;                                       \
  36        } while (n);                                            \
  37        i->iov_offset = skip;                                   \
  38        n = off;                                                \
  39}
  40
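/* bvec flavour: kmap each page segment so STEP always sees a kernel pointer */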
  41#define iterate_bvec(i, n, base, len, off, p, STEP) {           \
  42        size_t off = 0;                                         \
  43        unsigned skip = i->iov_offset;                          \
  44        while (n) {                                             \
  45                unsigned offset = p->bv_offset + skip;          \
  46                unsigned left;                                  \
  47                void *kaddr = kmap_local_page(p->bv_page +      \
  48                                        offset / PAGE_SIZE);    \
  49                base = kaddr + offset % PAGE_SIZE;              \
  50                len = min(min(n, (size_t)(p->bv_len - skip)),   \
  51                     (size_t)(PAGE_SIZE - offset % PAGE_SIZE)); \
  52                left = (STEP);                                  \
  53                kunmap_local(kaddr);                            \
  54                len -= left;                                    \
  55                off += len;                                     \
  56                skip += len;                                    \
  57                if (skip == p->bv_len) {                        \
  58                        skip = 0;                               \
  59                        p++;                                    \
  60                }                                               \
  61                n -= len;                                       \
  62                if (left)                                       \
  63                        break;                                  \
  64        }                                                       \
  65        i->iov_offset = skip;                                   \
  66        n = off;                                                \
  67}
  68
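/* xarray flavour: walk the pages under RCU, starting at xarray_start + iov_offset */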
  69#define iterate_xarray(i, n, base, len, __off, STEP) {          \
  70        __label__ __out;                                        \
  71        size_t __off = 0;                                       \
  72        struct page *head = NULL;                               \
  73        loff_t start = i->xarray_start + i->iov_offset;         \
  74        unsigned offset = start % PAGE_SIZE;                    \
  75        pgoff_t index = start / PAGE_SIZE;                      \
  76        int j;                                                  \
  77                                                                \
  78        XA_STATE(xas, i->xarray, index);                        \
  79                                                                \
  80        rcu_read_lock();                                        \
  81        xas_for_each(&xas, head, ULONG_MAX) {                   \
  82                unsigned left;                                  \
  83                if (xas_retry(&xas, head))                      \
  84                        continue;                               \
  85                if (WARN_ON(xa_is_value(head)))                 \
  86                        break;                                  \
  87                if (WARN_ON(PageHuge(head)))                    \
  88                        break;                                  \
  89                for (j = (head->index < index) ? index - head->index : 0; \
  90                     j < thp_nr_pages(head); j++) {             \
  91                        void *kaddr = kmap_local_page(head + j);        \
  92                        base = kaddr + offset;                  \
  93                        len = PAGE_SIZE - offset;               \
  94                        len = min(n, len);                      \
  95                        left = (STEP);                          \
  96                        kunmap_local(kaddr);                    \
  97                        len -= left;                            \
  98                        __off += len;                           \
  99                        n -= len;                               \
 100                        if (left || n == 0)                     \
 101                                goto __out;                     \
 102                        offset = 0;                             \
 103                }                                               \
 104        }                                                       \
 105__out:                                                          \
 106        rcu_read_unlock();                                      \
 107        i->iov_offset += __off;                                         \
 108        n = __off;                                              \
 109}
 110
 111#define __iterate_and_advance(i, n, base, len, off, I, K) {     \
 112        if (unlikely(i->count < n))                             \
 113                n = i->count;                                   \
 114        if (likely(n)) {                                        \
 115                if (likely(iter_is_iovec(i))) {                 \
 116                        const struct iovec *iov = i->iov;       \
 117                        void __user *base;                      \
 118                        size_t len;                             \
 119                        iterate_iovec(i, n, base, len, off,     \
 120                                                iov, (I))       \
 121                        i->nr_segs -= iov - i->iov;             \
 122                        i->iov = iov;                           \
 123                } else if (iov_iter_is_bvec(i)) {               \
 124                        const struct bio_vec *bvec = i->bvec;   \
 125                        void *base;                             \
 126                        size_t len;                             \
 127                        iterate_bvec(i, n, base, len, off,      \
 128                                                bvec, (K))      \
 129                        i->nr_segs -= bvec - i->bvec;           \
 130                        i->bvec = bvec;                         \
 131                } else if (iov_iter_is_kvec(i)) {               \
 132                        const struct kvec *kvec = i->kvec;      \
 133                        void *base;                             \
 134                        size_t len;                             \
 135                        iterate_iovec(i, n, base, len, off,     \
 136                                                kvec, (K))      \
 137                        i->nr_segs -= kvec - i->kvec;           \
 138                        i->kvec = kvec;                         \
 139                } else if (iov_iter_is_xarray(i)) {             \
 140                        void *base;                             \
 141                        size_t len;                             \
 142                        iterate_xarray(i, n, base, len, off,    \
 143                                                        (K))    \
 144                }                                               \
 145                i->count -= n;                                  \
 146        }                                                       \
 147}
 148#define iterate_and_advance(i, n, base, len, off, I, K) \
 149        __iterate_and_advance(i, n, base, len, off, I, ((void)(K),0))
 150
 151static int copyout(void __user *to, const void *from, size_t n)
 152{
 153        if (should_fail_usercopy())
 154                return n;
 155        if (access_ok(to, n)) {
 156                instrument_copy_to_user(to, from, n);
 157                n = raw_copy_to_user(to, from, n);
 158        }
 159        return n;
 160}
 161
 162static int copyin(void *to, const void __user *from, size_t n)
 163{
 164        if (should_fail_usercopy())
 165                return n;
 166        if (access_ok(from, n)) {
 167                instrument_copy_from_user(to, from, n);
 168                n = raw_copy_from_user(to, from, n);
 169        }
 170        return n;
 171}
 172
 173static size_t copy_page_to_iter_iovec(struct page *page, size_t offset, size_t bytes,
 174                         struct iov_iter *i)
 175{
 176        size_t skip, copy, left, wanted;
 177        const struct iovec *iov;
 178        char __user *buf;
 179        void *kaddr, *from;
 180
 181        if (unlikely(bytes > i->count))
 182                bytes = i->count;
 183
 184        if (unlikely(!bytes))
 185                return 0;
 186
 187        might_fault();
 188        wanted = bytes;
 189        iov = i->iov;
 190        skip = i->iov_offset;
 191        buf = iov->iov_base + skip;
 192        copy = min(bytes, iov->iov_len - skip);
 193
 194        if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_pages_writeable(buf, copy)) {
 195                kaddr = kmap_atomic(page);
 196                from = kaddr + offset;
 197
 198                /* first chunk, usually the only one */
 199                left = copyout(buf, from, copy);
 200                copy -= left;
 201                skip += copy;
 202                from += copy;
 203                bytes -= copy;
 204
 205                while (unlikely(!left && bytes)) {
 206                        iov++;
 207                        buf = iov->iov_base;
 208                        copy = min(bytes, iov->iov_len);
 209                        left = copyout(buf, from, copy);
 210                        copy -= left;
 211                        skip = copy;
 212                        from += copy;
 213                        bytes -= copy;
 214                }
 215                if (likely(!bytes)) {
 216                        kunmap_atomic(kaddr);
 217                        goto done;
 218                }
 219                offset = from - kaddr;
 220                buf += copy;
 221                kunmap_atomic(kaddr);
 222                copy = min(bytes, iov->iov_len - skip);
 223        }
 224        /* Too bad - revert to non-atomic kmap */
 225
 226        kaddr = kmap(page);
 227        from = kaddr + offset;
 228        left = copyout(buf, from, copy);
 229        copy -= left;
 230        skip += copy;
 231        from += copy;
 232        bytes -= copy;
 233        while (unlikely(!left && bytes)) {
 234                iov++;
 235                buf = iov->iov_base;
 236                copy = min(bytes, iov->iov_len);
 237                left = copyout(buf, from, copy);
 238                copy -= left;
 239                skip = copy;
 240                from += copy;
 241                bytes -= copy;
 242        }
 243        kunmap(page);
 244
 245done:
 246        if (skip == iov->iov_len) {
 247                iov++;
 248                skip = 0;
 249        }
 250        i->count -= wanted - bytes;
 251        i->nr_segs -= iov - i->iov;
 252        i->iov = iov;
 253        i->iov_offset = skip;
 254        return wanted - bytes;
 255}
 256
 257static size_t copy_page_from_iter_iovec(struct page *page, size_t offset, size_t bytes,
 258                         struct iov_iter *i)
 259{
 260        size_t skip, copy, left, wanted;
 261        const struct iovec *iov;
 262        char __user *buf;
 263        void *kaddr, *to;
 264
 265        if (unlikely(bytes > i->count))
 266                bytes = i->count;
 267
 268        if (unlikely(!bytes))
 269                return 0;
 270
 271        might_fault();
 272        wanted = bytes;
 273        iov = i->iov;
 274        skip = i->iov_offset;
 275        buf = iov->iov_base + skip;
 276        copy = min(bytes, iov->iov_len - skip);
 277
 278        if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_pages_readable(buf, copy)) {
 279                kaddr = kmap_atomic(page);
 280                to = kaddr + offset;
 281
 282                /* first chunk, usually the only one */
 283                left = copyin(to, buf, copy);
 284                copy -= left;
 285                skip += copy;
 286                to += copy;
 287                bytes -= copy;
 288
 289                while (unlikely(!left && bytes)) {
 290                        iov++;
 291                        buf = iov->iov_base;
 292                        copy = min(bytes, iov->iov_len);
 293                        left = copyin(to, buf, copy);
 294                        copy -= left;
 295                        skip = copy;
 296                        to += copy;
 297                        bytes -= copy;
 298                }
 299                if (likely(!bytes)) {
 300                        kunmap_atomic(kaddr);
 301                        goto done;
 302                }
 303                offset = to - kaddr;
 304                buf += copy;
 305                kunmap_atomic(kaddr);
 306                copy = min(bytes, iov->iov_len - skip);
 307        }
 308        /* Too bad - revert to non-atomic kmap */
 309
 310        kaddr = kmap(page);
 311        to = kaddr + offset;
 312        left = copyin(to, buf, copy);
 313        copy -= left;
 314        skip += copy;
 315        to += copy;
 316        bytes -= copy;
 317        while (unlikely(!left && bytes)) {
 318                iov++;
 319                buf = iov->iov_base;
 320                copy = min(bytes, iov->iov_len);
 321                left = copyin(to, buf, copy);
 322                copy -= left;
 323                skip = copy;
 324                to += copy;
 325                bytes -= copy;
 326        }
 327        kunmap(page);
 328
 329done:
 330        if (skip == iov->iov_len) {
 331                iov++;
 332                skip = 0;
 333        }
 334        i->count -= wanted - bytes;
 335        i->nr_segs -= iov - i->iov;
 336        i->iov = iov;
 337        i->iov_offset = skip;
 338        return wanted - bytes;
 339}
 340
 341#ifdef PIPE_PARANOIA
 342static bool sanity(const struct iov_iter *i)
 343{
 344        struct pipe_inode_info *pipe = i->pipe;
 345        unsigned int p_head = pipe->head;
 346        unsigned int p_tail = pipe->tail;
 347        unsigned int p_mask = pipe->ring_size - 1;
 348        unsigned int p_occupancy = pipe_occupancy(p_head, p_tail);
 349        unsigned int i_head = i->head;
 350        unsigned int idx;
 351
 352        if (i->iov_offset) {
 353                struct pipe_buffer *p;
 354                if (unlikely(p_occupancy == 0))
 355                        goto Bad;       // pipe must be non-empty
 356                if (unlikely(i_head != p_head - 1))
 357                        goto Bad;       // must be at the last buffer...
 358
 359                p = &pipe->bufs[i_head & p_mask];
 360                if (unlikely(p->offset + p->len != i->iov_offset))
 361                        goto Bad;       // ... at the end of segment
 362        } else {
 363                if (i_head != p_head)
 364                        goto Bad;       // must be right after the last buffer
 365        }
 366        return true;
 367Bad:
 368        printk(KERN_ERR "idx = %d, offset = %zd\n", i_head, i->iov_offset);
 369        printk(KERN_ERR "head = %d, tail = %d, buffers = %d\n",
 370                        p_head, p_tail, pipe->ring_size);
 371        for (idx = 0; idx < pipe->ring_size; idx++)
 372                printk(KERN_ERR "[%p %p %d %d]\n",
 373                        pipe->bufs[idx].ops,
 374                        pipe->bufs[idx].page,
 375                        pipe->bufs[idx].offset,
 376                        pipe->bufs[idx].len);
 377        WARN_ON(1);
 378        return false;
 379}
 380#else
 381#define sanity(i) true
 382#endif
 383
 384static size_t copy_page_to_iter_pipe(struct page *page, size_t offset, size_t bytes,
 385                         struct iov_iter *i)
 386{
 387        struct pipe_inode_info *pipe = i->pipe;
 388        struct pipe_buffer *buf;
 389        unsigned int p_tail = pipe->tail;
 390        unsigned int p_mask = pipe->ring_size - 1;
 391        unsigned int i_head = i->head;
 392        size_t off;
 393
 394        if (unlikely(bytes > i->count))
 395                bytes = i->count;
 396
 397        if (unlikely(!bytes))
 398                return 0;
 399
 400        if (!sanity(i))
 401                return 0;
 402
 403        off = i->iov_offset;
 404        buf = &pipe->bufs[i_head & p_mask];
 405        if (off) {
 406                if (offset == off && buf->page == page) {
 407                        /* merge with the last one */
 408                        buf->len += bytes;
 409                        i->iov_offset += bytes;
 410                        goto out;
 411                }
 412                i_head++;
 413                buf = &pipe->bufs[i_head & p_mask];
 414        }
 415        if (pipe_full(i_head, p_tail, pipe->max_usage))
 416                return 0;
 417
 418        buf->ops = &page_cache_pipe_buf_ops;
 419        get_page(page);
 420        buf->page = page;
 421        buf->offset = offset;
 422        buf->len = bytes;
 423
 424        pipe->head = i_head + 1;
 425        i->iov_offset = offset + bytes;
 426        i->head = i_head;
 427out:
 428        i->count -= bytes;
 429        return bytes;
 430}
 431
 432/*
 433 * Fault in one or more iovecs of the given iov_iter, to a maximum length of
 434 * bytes.  For each iovec, fault in each page that constitutes the iovec.
 435 *
  436 * Return 0 on success, or non-zero if the memory could not be accessed (e.g.
  437 * because the address is invalid).
 438 */
 439int iov_iter_fault_in_readable(const struct iov_iter *i, size_t bytes)
 440{
 441        if (iter_is_iovec(i)) {
 442                const struct iovec *p;
 443                size_t skip;
 444
 445                if (bytes > i->count)
 446                        bytes = i->count;
 447                for (p = i->iov, skip = i->iov_offset; bytes; p++, skip = 0) {
 448                        size_t len = min(bytes, p->iov_len - skip);
 449                        int err;
 450
 451                        if (unlikely(!len))
 452                                continue;
 453                        err = fault_in_pages_readable(p->iov_base + skip, len);
 454                        if (unlikely(err))
 455                                return err;
 456                        bytes -= len;
 457                }
 458        }
 459        return 0;
 460}
 461EXPORT_SYMBOL(iov_iter_fault_in_readable);
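/*
 * Usage sketch, in the spirit of a buffered-write path (the helper name is
 * hypothetical; only the iov_iter calls come from this file): pre-fault the
 * user pages while no locks are held, then copy with the atomic,
 * iterator-advancing variant.  A short copy despite the pre-fault just means
 * the caller should drop its locks and retry the chunk.
 */
static ssize_t example_write_chunk(struct page *page, unsigned int offset,
				   size_t bytes, struct iov_iter *from)
{
	if (unlikely(iov_iter_fault_in_readable(from, bytes)))
		return -EFAULT;

	/* a real write path would lock @page and bracket this with write_begin/write_end */
	return copy_page_from_iter_atomic(page, offset, bytes, from);
}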
 462
 463void iov_iter_init(struct iov_iter *i, unsigned int direction,
 464                        const struct iovec *iov, unsigned long nr_segs,
 465                        size_t count)
 466{
 467        WARN_ON(direction & ~(READ | WRITE));
 468        *i = (struct iov_iter) {
 469                .iter_type = ITER_IOVEC,
 470                .data_source = direction,
 471                .iov = iov,
 472                .nr_segs = nr_segs,
 473                .iov_offset = 0,
 474                .count = count
 475        };
 476}
 477EXPORT_SYMBOL(iov_iter_init);
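/*
 * Usage sketch (hypothetical read helper): wrap a caller-supplied iovec array
 * in a READ (destination) iterator and fill it from a kernel buffer via
 * copy_to_iter(), the <linux/uio.h> wrapper around _copy_to_iter() below.
 */
static ssize_t example_fill_user_iovecs(const struct iovec *uvec,
					unsigned long nr_segs, size_t count,
					const void *kbuf)
{
	struct iov_iter iter;

	iov_iter_init(&iter, READ, uvec, nr_segs, count);
	/* may be short if the user memory faults part-way through */
	return copy_to_iter(kbuf, count, &iter);
}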
 478
 479static inline bool allocated(struct pipe_buffer *buf)
 480{
 481        return buf->ops == &default_pipe_buf_ops;
 482}
 483
 484static inline void data_start(const struct iov_iter *i,
 485                              unsigned int *iter_headp, size_t *offp)
 486{
 487        unsigned int p_mask = i->pipe->ring_size - 1;
 488        unsigned int iter_head = i->head;
 489        size_t off = i->iov_offset;
 490
 491        if (off && (!allocated(&i->pipe->bufs[iter_head & p_mask]) ||
 492                    off == PAGE_SIZE)) {
 493                iter_head++;
 494                off = 0;
 495        }
 496        *iter_headp = iter_head;
 497        *offp = off;
 498}
 499
 500static size_t push_pipe(struct iov_iter *i, size_t size,
 501                        int *iter_headp, size_t *offp)
 502{
 503        struct pipe_inode_info *pipe = i->pipe;
 504        unsigned int p_tail = pipe->tail;
 505        unsigned int p_mask = pipe->ring_size - 1;
 506        unsigned int iter_head;
 507        size_t off;
 508        ssize_t left;
 509
 510        if (unlikely(size > i->count))
 511                size = i->count;
 512        if (unlikely(!size))
 513                return 0;
 514
 515        left = size;
 516        data_start(i, &iter_head, &off);
 517        *iter_headp = iter_head;
 518        *offp = off;
 519        if (off) {
 520                left -= PAGE_SIZE - off;
 521                if (left <= 0) {
 522                        pipe->bufs[iter_head & p_mask].len += size;
 523                        return size;
 524                }
 525                pipe->bufs[iter_head & p_mask].len = PAGE_SIZE;
 526                iter_head++;
 527        }
 528        while (!pipe_full(iter_head, p_tail, pipe->max_usage)) {
 529                struct pipe_buffer *buf = &pipe->bufs[iter_head & p_mask];
 530                struct page *page = alloc_page(GFP_USER);
 531                if (!page)
 532                        break;
 533
 534                buf->ops = &default_pipe_buf_ops;
 535                buf->page = page;
 536                buf->offset = 0;
 537                buf->len = min_t(ssize_t, left, PAGE_SIZE);
 538                left -= buf->len;
 539                iter_head++;
 540                pipe->head = iter_head;
 541
 542                if (left == 0)
 543                        return size;
 544        }
 545        return size - left;
 546}
 547
 548static size_t copy_pipe_to_iter(const void *addr, size_t bytes,
 549                                struct iov_iter *i)
 550{
 551        struct pipe_inode_info *pipe = i->pipe;
 552        unsigned int p_mask = pipe->ring_size - 1;
 553        unsigned int i_head;
 554        size_t n, off;
 555
 556        if (!sanity(i))
 557                return 0;
 558
 559        bytes = n = push_pipe(i, bytes, &i_head, &off);
 560        if (unlikely(!n))
 561                return 0;
 562        do {
 563                size_t chunk = min_t(size_t, n, PAGE_SIZE - off);
 564                memcpy_to_page(pipe->bufs[i_head & p_mask].page, off, addr, chunk);
 565                i->head = i_head;
 566                i->iov_offset = off + chunk;
 567                n -= chunk;
 568                addr += chunk;
 569                off = 0;
 570                i_head++;
 571        } while (n);
 572        i->count -= bytes;
 573        return bytes;
 574}
 575
 576static __wsum csum_and_memcpy(void *to, const void *from, size_t len,
 577                              __wsum sum, size_t off)
 578{
 579        __wsum next = csum_partial_copy_nocheck(from, to, len);
 580        return csum_block_add(sum, next, off);
 581}
 582
 583static size_t csum_and_copy_to_pipe_iter(const void *addr, size_t bytes,
 584                                         struct iov_iter *i, __wsum *sump)
 585{
 586        struct pipe_inode_info *pipe = i->pipe;
 587        unsigned int p_mask = pipe->ring_size - 1;
 588        __wsum sum = *sump;
 589        size_t off = 0;
 590        unsigned int i_head;
 591        size_t r;
 592
 593        if (!sanity(i))
 594                return 0;
 595
 596        bytes = push_pipe(i, bytes, &i_head, &r);
 597        while (bytes) {
 598                size_t chunk = min_t(size_t, bytes, PAGE_SIZE - r);
 599                char *p = kmap_local_page(pipe->bufs[i_head & p_mask].page);
 600                sum = csum_and_memcpy(p + r, addr + off, chunk, sum, off);
 601                kunmap_local(p);
 602                i->head = i_head;
 603                i->iov_offset = r + chunk;
 604                bytes -= chunk;
 605                off += chunk;
 606                r = 0;
 607                i_head++;
 608        }
 609        *sump = sum;
 610        i->count -= off;
 611        return off;
 612}
 613
 614size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
 615{
 616        if (unlikely(iov_iter_is_pipe(i)))
 617                return copy_pipe_to_iter(addr, bytes, i);
 618        if (iter_is_iovec(i))
 619                might_fault();
 620        iterate_and_advance(i, bytes, base, len, off,
 621                copyout(base, addr + off, len),
 622                memcpy(base, addr + off, len)
 623        )
 624
 625        return bytes;
 626}
 627EXPORT_SYMBOL(_copy_to_iter);
 628
 629#ifdef CONFIG_ARCH_HAS_COPY_MC
 630static int copyout_mc(void __user *to, const void *from, size_t n)
 631{
 632        if (access_ok(to, n)) {
 633                instrument_copy_to_user(to, from, n);
 634                n = copy_mc_to_user((__force void *) to, from, n);
 635        }
 636        return n;
 637}
 638
 639static size_t copy_mc_pipe_to_iter(const void *addr, size_t bytes,
 640                                struct iov_iter *i)
 641{
 642        struct pipe_inode_info *pipe = i->pipe;
 643        unsigned int p_mask = pipe->ring_size - 1;
 644        unsigned int i_head;
 645        size_t n, off, xfer = 0;
 646
 647        if (!sanity(i))
 648                return 0;
 649
 650        n = push_pipe(i, bytes, &i_head, &off);
 651        while (n) {
 652                size_t chunk = min_t(size_t, n, PAGE_SIZE - off);
 653                char *p = kmap_local_page(pipe->bufs[i_head & p_mask].page);
 654                unsigned long rem;
 655                rem = copy_mc_to_kernel(p + off, addr + xfer, chunk);
 656                chunk -= rem;
 657                kunmap_local(p);
 658                i->head = i_head;
 659                i->iov_offset = off + chunk;
 660                xfer += chunk;
 661                if (rem)
 662                        break;
 663                n -= chunk;
 664                off = 0;
 665                i_head++;
 666        }
 667        i->count -= xfer;
 668        return xfer;
 669}
 670
 671/**
 672 * _copy_mc_to_iter - copy to iter with source memory error exception handling
 673 * @addr: source kernel address
 674 * @bytes: total transfer length
 675 * @i: destination iterator
 676 *
  677 * The pmem driver deploys this for the dax operation
  678 * (dax_copy_to_iter()) for dax reads (bypassing the page cache and the
  679 * block layer). Upon #MC, read(2) aborts and returns either EIO or the
  680 * number of bytes successfully copied.
  681 *
  682 * The main differences between this and a typical _copy_to_iter() are:
  683 *
 684 * * Typical tail/residue handling after a fault retries the copy
 685 *   byte-by-byte until the fault happens again. Re-triggering machine
 686 *   checks is potentially fatal so the implementation uses source
 687 *   alignment and poison alignment assumptions to avoid re-triggering
 688 *   hardware exceptions.
 689 *
 690 * * ITER_KVEC, ITER_PIPE, and ITER_BVEC can return short copies.
 691 *   Compare to copy_to_iter() where only ITER_IOVEC attempts might return
 692 *   a short copy.
 693 *
 694 * Return: number of bytes copied (may be %0)
 695 */
 696size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
 697{
 698        if (unlikely(iov_iter_is_pipe(i)))
 699                return copy_mc_pipe_to_iter(addr, bytes, i);
 700        if (iter_is_iovec(i))
 701                might_fault();
 702        __iterate_and_advance(i, bytes, base, len, off,
 703                copyout_mc(base, addr + off, len),
 704                copy_mc_to_kernel(base, addr + off, len)
 705        )
 706
 707        return bytes;
 708}
 709EXPORT_SYMBOL_GPL(_copy_mc_to_iter);
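/*
 * Usage sketch (hypothetical helper, roughly the pmem/dax read side that the
 * kernel-doc above describes): copy from a kernel mapping of persistent
 * memory; a short return means poisoned source memory or a faulting
 * destination, and the caller turns that into -EIO or a partial read.
 */
static size_t example_dax_style_read(void *kaddr, size_t bytes,
				     struct iov_iter *to)
{
	return _copy_mc_to_iter(kaddr, bytes, to);
}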
 710#endif /* CONFIG_ARCH_HAS_COPY_MC */
 711
 712size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
 713{
 714        if (unlikely(iov_iter_is_pipe(i))) {
 715                WARN_ON(1);
 716                return 0;
 717        }
 718        if (iter_is_iovec(i))
 719                might_fault();
 720        iterate_and_advance(i, bytes, base, len, off,
 721                copyin(addr + off, base, len),
 722                memcpy(addr + off, base, len)
 723        )
 724
 725        return bytes;
 726}
 727EXPORT_SYMBOL(_copy_from_iter);
 728
 729size_t _copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i)
 730{
 731        if (unlikely(iov_iter_is_pipe(i))) {
 732                WARN_ON(1);
 733                return 0;
 734        }
 735        iterate_and_advance(i, bytes, base, len, off,
 736                __copy_from_user_inatomic_nocache(addr + off, base, len),
 737                memcpy(addr + off, base, len)
 738        )
 739
 740        return bytes;
 741}
 742EXPORT_SYMBOL(_copy_from_iter_nocache);
 743
 744#ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
 745/**
 746 * _copy_from_iter_flushcache - write destination through cpu cache
 747 * @addr: destination kernel address
 748 * @bytes: total transfer length
 749 * @i: source iterator
 750 *
 751 * The pmem driver arranges for filesystem-dax to use this facility via
 752 * dax_copy_from_iter() for ensuring that writes to persistent memory
  753 * are flushed through the CPU cache. It differs from
  754 * _copy_from_iter_nocache() in that it guarantees all data is flushed for
  755 * all iterator types, whereas _copy_from_iter_nocache() only attempts to
  756 * bypass the cache for the ITER_IOVEC case, and on some archs may use
  757 * instructions that strand dirty data in the cache.
 758 *
 759 * Return: number of bytes copied (may be %0)
 760 */
 761size_t _copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i)
 762{
 763        if (unlikely(iov_iter_is_pipe(i))) {
 764                WARN_ON(1);
 765                return 0;
 766        }
 767        iterate_and_advance(i, bytes, base, len, off,
 768                __copy_from_user_flushcache(addr + off, base, len),
 769                memcpy_flushcache(addr + off, base, len)
 770        )
 771
 772        return bytes;
 773}
 774EXPORT_SYMBOL_GPL(_copy_from_iter_flushcache);
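/*
 * Usage sketch (hypothetical): a pmem-style write path copies user data
 * straight into persistent memory with the flushcache variant so no dirty
 * cache lines are left behind; the driver would still issue its own write
 * barrier before reporting completion.
 */
static size_t example_pmem_style_write(void *pmem_addr, size_t bytes,
				       struct iov_iter *from)
{
	return _copy_from_iter_flushcache(pmem_addr, bytes, from);
}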
 775#endif
 776
 777static inline bool page_copy_sane(struct page *page, size_t offset, size_t n)
 778{
 779        struct page *head;
 780        size_t v = n + offset;
 781
 782        /*
 783         * The general case needs to access the page order in order
 784         * to compute the page size.
 785         * However, we mostly deal with order-0 pages and thus can
 786         * avoid a possible cache line miss for requests that fit all
 787         * page orders.
 788         */
 789        if (n <= v && v <= PAGE_SIZE)
 790                return true;
 791
 792        head = compound_head(page);
 793        v += (page - head) << PAGE_SHIFT;
 794
 795        if (likely(n <= v && v <= (page_size(head))))
 796                return true;
 797        WARN_ON(1);
 798        return false;
 799}
 800
 801static size_t __copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
 802                         struct iov_iter *i)
 803{
 804        if (likely(iter_is_iovec(i)))
 805                return copy_page_to_iter_iovec(page, offset, bytes, i);
 806        if (iov_iter_is_bvec(i) || iov_iter_is_kvec(i) || iov_iter_is_xarray(i)) {
 807                void *kaddr = kmap_local_page(page);
 808                size_t wanted = _copy_to_iter(kaddr + offset, bytes, i);
 809                kunmap_local(kaddr);
 810                return wanted;
 811        }
 812        if (iov_iter_is_pipe(i))
 813                return copy_page_to_iter_pipe(page, offset, bytes, i);
 814        if (unlikely(iov_iter_is_discard(i))) {
 815                if (unlikely(i->count < bytes))
 816                        bytes = i->count;
 817                i->count -= bytes;
 818                return bytes;
 819        }
 820        WARN_ON(1);
 821        return 0;
 822}
 823
 824size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
 825                         struct iov_iter *i)
 826{
 827        size_t res = 0;
 828        if (unlikely(!page_copy_sane(page, offset, bytes)))
 829                return 0;
 830        page += offset / PAGE_SIZE; // first subpage
 831        offset %= PAGE_SIZE;
 832        while (1) {
 833                size_t n = __copy_page_to_iter(page, offset,
 834                                min(bytes, (size_t)PAGE_SIZE - offset), i);
 835                res += n;
 836                bytes -= n;
 837                if (!bytes || !n)
 838                        break;
 839                offset += n;
 840                if (offset == PAGE_SIZE) {
 841                        page++;
 842                        offset = 0;
 843                }
 844        }
 845        return res;
 846}
 847EXPORT_SYMBOL(copy_page_to_iter);
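/*
 * Usage sketch (hypothetical read-side helper): push a run of pages out to
 * whatever the iterator describes, stopping on the first short copy
 * (faulted user memory, a full pipe, or an exhausted iterator).
 */
static size_t example_send_pages(struct page **pages, unsigned int npages,
				 size_t offset, struct iov_iter *to)
{
	size_t done = 0;

	while (npages-- && iov_iter_count(to)) {
		size_t want = PAGE_SIZE - offset;
		size_t n = copy_page_to_iter(*pages++, offset, want, to);

		done += n;
		if (n < want)
			break;
		offset = 0;
	}
	return done;
}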
 848
 849size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes,
 850                         struct iov_iter *i)
 851{
 852        if (unlikely(!page_copy_sane(page, offset, bytes)))
 853                return 0;
 854        if (likely(iter_is_iovec(i)))
 855                return copy_page_from_iter_iovec(page, offset, bytes, i);
 856        if (iov_iter_is_bvec(i) || iov_iter_is_kvec(i) || iov_iter_is_xarray(i)) {
 857                void *kaddr = kmap_local_page(page);
 858                size_t wanted = _copy_from_iter(kaddr + offset, bytes, i);
 859                kunmap_local(kaddr);
 860                return wanted;
 861        }
 862        WARN_ON(1);
 863        return 0;
 864}
 865EXPORT_SYMBOL(copy_page_from_iter);
 866
 867static size_t pipe_zero(size_t bytes, struct iov_iter *i)
 868{
 869        struct pipe_inode_info *pipe = i->pipe;
 870        unsigned int p_mask = pipe->ring_size - 1;
 871        unsigned int i_head;
 872        size_t n, off;
 873
 874        if (!sanity(i))
 875                return 0;
 876
 877        bytes = n = push_pipe(i, bytes, &i_head, &off);
 878        if (unlikely(!n))
 879                return 0;
 880
 881        do {
 882                size_t chunk = min_t(size_t, n, PAGE_SIZE - off);
 883                char *p = kmap_local_page(pipe->bufs[i_head & p_mask].page);
 884                memset(p + off, 0, chunk);
 885                kunmap_local(p);
 886                i->head = i_head;
 887                i->iov_offset = off + chunk;
 888                n -= chunk;
 889                off = 0;
 890                i_head++;
 891        } while (n);
 892        i->count -= bytes;
 893        return bytes;
 894}
 895
 896size_t iov_iter_zero(size_t bytes, struct iov_iter *i)
 897{
 898        if (unlikely(iov_iter_is_pipe(i)))
 899                return pipe_zero(bytes, i);
 900        iterate_and_advance(i, bytes, base, len, count,
 901                clear_user(base, len),
 902                memset(base, 0, len)
 903        )
 904
 905        return bytes;
 906}
 907EXPORT_SYMBOL(iov_iter_zero);
 908
 909size_t copy_page_from_iter_atomic(struct page *page, unsigned offset, size_t bytes,
 910                                  struct iov_iter *i)
 911{
 912        char *kaddr = kmap_atomic(page), *p = kaddr + offset;
 913        if (unlikely(!page_copy_sane(page, offset, bytes))) {
 914                kunmap_atomic(kaddr);
 915                return 0;
 916        }
 917        if (unlikely(iov_iter_is_pipe(i) || iov_iter_is_discard(i))) {
 918                kunmap_atomic(kaddr);
 919                WARN_ON(1);
 920                return 0;
 921        }
 922        iterate_and_advance(i, bytes, base, len, off,
 923                copyin(p + off, base, len),
 924                memcpy(p + off, base, len)
 925        )
 926        kunmap_atomic(kaddr);
 927        return bytes;
 928}
 929EXPORT_SYMBOL(copy_page_from_iter_atomic);
 930
 931static inline void pipe_truncate(struct iov_iter *i)
 932{
 933        struct pipe_inode_info *pipe = i->pipe;
 934        unsigned int p_tail = pipe->tail;
 935        unsigned int p_head = pipe->head;
 936        unsigned int p_mask = pipe->ring_size - 1;
 937
 938        if (!pipe_empty(p_head, p_tail)) {
 939                struct pipe_buffer *buf;
 940                unsigned int i_head = i->head;
 941                size_t off = i->iov_offset;
 942
 943                if (off) {
 944                        buf = &pipe->bufs[i_head & p_mask];
 945                        buf->len = off - buf->offset;
 946                        i_head++;
 947                }
 948                while (p_head != i_head) {
 949                        p_head--;
 950                        pipe_buf_release(pipe, &pipe->bufs[p_head & p_mask]);
 951                }
 952
 953                pipe->head = p_head;
 954        }
 955}
 956
 957static void pipe_advance(struct iov_iter *i, size_t size)
 958{
 959        struct pipe_inode_info *pipe = i->pipe;
 960        if (size) {
 961                struct pipe_buffer *buf;
 962                unsigned int p_mask = pipe->ring_size - 1;
 963                unsigned int i_head = i->head;
 964                size_t off = i->iov_offset, left = size;
 965
 966                if (off) /* make it relative to the beginning of buffer */
 967                        left += off - pipe->bufs[i_head & p_mask].offset;
 968                while (1) {
 969                        buf = &pipe->bufs[i_head & p_mask];
 970                        if (left <= buf->len)
 971                                break;
 972                        left -= buf->len;
 973                        i_head++;
 974                }
 975                i->head = i_head;
 976                i->iov_offset = buf->offset + left;
 977        }
 978        i->count -= size;
 979        /* ... and discard everything past that point */
 980        pipe_truncate(i);
 981}
 982
 983static void iov_iter_bvec_advance(struct iov_iter *i, size_t size)
 984{
 985        struct bvec_iter bi;
 986
 987        bi.bi_size = i->count;
 988        bi.bi_bvec_done = i->iov_offset;
 989        bi.bi_idx = 0;
 990        bvec_iter_advance(i->bvec, &bi, size);
 991
 992        i->bvec += bi.bi_idx;
 993        i->nr_segs -= bi.bi_idx;
 994        i->count = bi.bi_size;
 995        i->iov_offset = bi.bi_bvec_done;
 996}
 997
 998static void iov_iter_iovec_advance(struct iov_iter *i, size_t size)
 999{
1000        const struct iovec *iov, *end;
1001
1002        if (!i->count)
1003                return;
1004        i->count -= size;
1005
1006        size += i->iov_offset; // from beginning of current segment
1007        for (iov = i->iov, end = iov + i->nr_segs; iov < end; iov++) {
1008                if (likely(size < iov->iov_len))
1009                        break;
1010                size -= iov->iov_len;
1011        }
1012        i->iov_offset = size;
1013        i->nr_segs -= iov - i->iov;
1014        i->iov = iov;
1015}
1016
1017void iov_iter_advance(struct iov_iter *i, size_t size)
1018{
1019        if (unlikely(i->count < size))
1020                size = i->count;
1021        if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) {
1022                /* iovec and kvec have identical layouts */
1023                iov_iter_iovec_advance(i, size);
1024        } else if (iov_iter_is_bvec(i)) {
1025                iov_iter_bvec_advance(i, size);
1026        } else if (iov_iter_is_pipe(i)) {
1027                pipe_advance(i, size);
1028        } else if (unlikely(iov_iter_is_xarray(i))) {
1029                i->iov_offset += size;
1030                i->count -= size;
1031        } else if (iov_iter_is_discard(i)) {
1032                i->count -= size;
1033        }
1034}
1035EXPORT_SYMBOL(iov_iter_advance);
1036
1037void iov_iter_revert(struct iov_iter *i, size_t unroll)
1038{
1039        if (!unroll)
1040                return;
1041        if (WARN_ON(unroll > MAX_RW_COUNT))
1042                return;
1043        i->count += unroll;
1044        if (unlikely(iov_iter_is_pipe(i))) {
1045                struct pipe_inode_info *pipe = i->pipe;
1046                unsigned int p_mask = pipe->ring_size - 1;
1047                unsigned int i_head = i->head;
1048                size_t off = i->iov_offset;
1049                while (1) {
1050                        struct pipe_buffer *b = &pipe->bufs[i_head & p_mask];
1051                        size_t n = off - b->offset;
1052                        if (unroll < n) {
1053                                off -= unroll;
1054                                break;
1055                        }
1056                        unroll -= n;
1057                        if (!unroll && i_head == i->start_head) {
1058                                off = 0;
1059                                break;
1060                        }
1061                        i_head--;
1062                        b = &pipe->bufs[i_head & p_mask];
1063                        off = b->offset + b->len;
1064                }
1065                i->iov_offset = off;
1066                i->head = i_head;
1067                pipe_truncate(i);
1068                return;
1069        }
1070        if (unlikely(iov_iter_is_discard(i)))
1071                return;
1072        if (unroll <= i->iov_offset) {
1073                i->iov_offset -= unroll;
1074                return;
1075        }
1076        unroll -= i->iov_offset;
1077        if (iov_iter_is_xarray(i)) {
1078                BUG(); /* We should never go beyond the start of the specified
1079                        * range since we might then be straying into pages that
1080                        * aren't pinned.
1081                        */
1082        } else if (iov_iter_is_bvec(i)) {
1083                const struct bio_vec *bvec = i->bvec;
1084                while (1) {
1085                        size_t n = (--bvec)->bv_len;
1086                        i->nr_segs++;
1087                        if (unroll <= n) {
1088                                i->bvec = bvec;
1089                                i->iov_offset = n - unroll;
1090                                return;
1091                        }
1092                        unroll -= n;
1093                }
1094        } else { /* same logic for iovec and kvec */
1095                const struct iovec *iov = i->iov;
1096                while (1) {
1097                        size_t n = (--iov)->iov_len;
1098                        i->nr_segs++;
1099                        if (unroll <= n) {
1100                                i->iov = iov;
1101                                i->iov_offset = n - unroll;
1102                                return;
1103                        }
1104                        unroll -= n;
1105                }
1106        }
1107}
1108EXPORT_SYMBOL(iov_iter_revert);
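/*
 * Usage sketch: the "consume, then hand back what wasn't used" pattern.
 * A caller advances by what it submitted and reverts the part the consumer
 * did not accept, leaving the iterator positioned for a retry.
 */
static void example_advance_then_revert(struct iov_iter *i, size_t submitted,
					size_t accepted)
{
	iov_iter_advance(i, submitted);
	if (accepted < submitted)
		iov_iter_revert(i, submitted - accepted);
}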
1109
1110/*
1111 * Return the count of just the current iov_iter segment.
1112 */
1113size_t iov_iter_single_seg_count(const struct iov_iter *i)
1114{
1115        if (i->nr_segs > 1) {
1116                if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
1117                        return min(i->count, i->iov->iov_len - i->iov_offset);
1118                if (iov_iter_is_bvec(i))
1119                        return min(i->count, i->bvec->bv_len - i->iov_offset);
1120        }
1121        return i->count;
1122}
1123EXPORT_SYMBOL(iov_iter_single_seg_count);
1124
1125void iov_iter_kvec(struct iov_iter *i, unsigned int direction,
1126                        const struct kvec *kvec, unsigned long nr_segs,
1127                        size_t count)
1128{
1129        WARN_ON(direction & ~(READ | WRITE));
1130        *i = (struct iov_iter){
1131                .iter_type = ITER_KVEC,
1132                .data_source = direction,
1133                .kvec = kvec,
1134                .nr_segs = nr_segs,
1135                .iov_offset = 0,
1136                .count = count
1137        };
1138}
1139EXPORT_SYMBOL(iov_iter_kvec);
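/*
 * Usage sketch (hypothetical): describe a header + body pair of kernel
 * buffers as a single WRITE (source) iterator, e.g. for an API that only
 * accepts an iov_iter.  kv[] must stay live for as long as @iter is used.
 */
static void example_kvec_source(struct iov_iter *iter, struct kvec kv[2],
				void *hdr, size_t hdr_len,
				void *body, size_t body_len)
{
	kv[0].iov_base = hdr;
	kv[0].iov_len  = hdr_len;
	kv[1].iov_base = body;
	kv[1].iov_len  = body_len;
	iov_iter_kvec(iter, WRITE, kv, 2, hdr_len + body_len);
}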
1140
1141void iov_iter_bvec(struct iov_iter *i, unsigned int direction,
1142                        const struct bio_vec *bvec, unsigned long nr_segs,
1143                        size_t count)
1144{
1145        WARN_ON(direction & ~(READ | WRITE));
1146        *i = (struct iov_iter){
1147                .iter_type = ITER_BVEC,
1148                .data_source = direction,
1149                .bvec = bvec,
1150                .nr_segs = nr_segs,
1151                .iov_offset = 0,
1152                .count = count
1153        };
1154}
1155EXPORT_SYMBOL(iov_iter_bvec);
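/*
 * Usage sketch (hypothetical): wrap one page fragment in a bio_vec and
 * expose it as a READ (destination) iterator.  The bio_vec must outlive
 * the iterator's use.
 */
static void example_bvec_dest(struct iov_iter *iter, struct bio_vec *bv,
			      struct page *page, unsigned int offset,
			      unsigned int len)
{
	bv->bv_page   = page;
	bv->bv_offset = offset;
	bv->bv_len    = len;
	iov_iter_bvec(iter, READ, bv, 1, len);
}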
1156
1157void iov_iter_pipe(struct iov_iter *i, unsigned int direction,
1158                        struct pipe_inode_info *pipe,
1159                        size_t count)
1160{
1161        BUG_ON(direction != READ);
1162        WARN_ON(pipe_full(pipe->head, pipe->tail, pipe->ring_size));
1163        *i = (struct iov_iter){
1164                .iter_type = ITER_PIPE,
1165                .data_source = false,
1166                .pipe = pipe,
1167                .head = pipe->head,
1168                .start_head = pipe->head,
1169                .iov_offset = 0,
1170                .count = count
1171        };
1172}
1173EXPORT_SYMBOL(iov_iter_pipe);
1174
1175/**
1176 * iov_iter_xarray - Initialise an I/O iterator to use the pages in an xarray
1177 * @i: The iterator to initialise.
1178 * @direction: The direction of the transfer.
1179 * @xarray: The xarray to access.
1180 * @start: The start file position.
1181 * @count: The size of the I/O buffer in bytes.
1182 *
1183 * Set up an I/O iterator to either draw data out of the pages attached to an
1184 * inode or to inject data into those pages.  The caller *must* keep the
1185 * pages from being evicted or freed, either by taking a reference on them
1186 * or by locking them.
1187 */
1188void iov_iter_xarray(struct iov_iter *i, unsigned int direction,
1189                     struct xarray *xarray, loff_t start, size_t count)
1190{
1191        BUG_ON(direction & ~1);
1192        *i = (struct iov_iter) {
1193                .iter_type = ITER_XARRAY,
1194                .data_source = direction,
1195                .xarray = xarray,
1196                .xarray_start = start,
1197                .count = count,
1198                .iov_offset = 0
1199        };
1200}
1201EXPORT_SYMBOL(iov_iter_xarray);
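/*
 * Usage sketch (hypothetical caller, in the spirit of the comment above):
 * expose a range of an inode's page cache as the destination of a transfer
 * (direction READ), assuming the caller has already taken refs on or locked
 * the pages in that range.
 */
static void example_map_pagecache_range(struct iov_iter *iter,
					struct address_space *mapping,
					loff_t pos, size_t len)
{
	iov_iter_xarray(iter, READ, &mapping->i_pages, pos, len);
}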
1202
1203/**
1204 * iov_iter_discard - Initialise an I/O iterator that discards data
1205 * @i: The iterator to initialise.
1206 * @direction: The direction of the transfer.
1207 * @count: The size of the I/O buffer in bytes.
1208 *
1209 * Set up an I/O iterator that just discards everything that's written to it.
1210 * It's only available as a READ iterator.
1211 */
1212void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count)
1213{
1214        BUG_ON(direction != READ);
1215        *i = (struct iov_iter){
1216                .iter_type = ITER_DISCARD,
1217                .data_source = false,
1218                .count = count,
1219                .iov_offset = 0
1220        };
1221}
1222EXPORT_SYMBOL(iov_iter_discard);
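/*
 * Usage sketch (hypothetical): throw away @len bytes from a producer that
 * insists on an iov_iter destination, e.g. to skip unwanted payload.
 */
static void example_skip_payload(struct iov_iter *iter, size_t len)
{
	iov_iter_discard(iter, READ, len);
	/* anything now copied "to" this iterator is simply dropped */
}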
1223
1224static unsigned long iov_iter_alignment_iovec(const struct iov_iter *i)
1225{
1226        unsigned long res = 0;
1227        size_t size = i->count;
1228        size_t skip = i->iov_offset;
1229        unsigned k;
1230
1231        for (k = 0; k < i->nr_segs; k++, skip = 0) {
1232                size_t len = i->iov[k].iov_len - skip;
1233                if (len) {
1234                        res |= (unsigned long)i->iov[k].iov_base + skip;
1235                        if (len > size)
1236                                len = size;
1237                        res |= len;
1238                        size -= len;
1239                        if (!size)
1240                                break;
1241                }
1242        }
1243        return res;
1244}
1245
1246static unsigned long iov_iter_alignment_bvec(const struct iov_iter *i)
1247{
1248        unsigned res = 0;
1249        size_t size = i->count;
1250        unsigned skip = i->iov_offset;
1251        unsigned k;
1252
1253        for (k = 0; k < i->nr_segs; k++, skip = 0) {
1254                size_t len = i->bvec[k].bv_len - skip;
1255                res |= (unsigned long)i->bvec[k].bv_offset + skip;
1256                if (len > size)
1257                        len = size;
1258                res |= len;
1259                size -= len;
1260                if (!size)
1261                        break;
1262        }
1263        return res;
1264}
1265
1266unsigned long iov_iter_alignment(const struct iov_iter *i)
1267{
1268        /* iovec and kvec have identical layouts */
1269        if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
1270                return iov_iter_alignment_iovec(i);
1271
1272        if (iov_iter_is_bvec(i))
1273                return iov_iter_alignment_bvec(i);
1274
1275        if (iov_iter_is_pipe(i)) {
1276                unsigned int p_mask = i->pipe->ring_size - 1;
1277                size_t size = i->count;
1278
1279                if (size && i->iov_offset && allocated(&i->pipe->bufs[i->head & p_mask]))
1280                        return size | i->iov_offset;
1281                return size;
1282        }
1283
1284        if (iov_iter_is_xarray(i))
1285                return (i->xarray_start + i->iov_offset) | i->count;
1286
1287        return 0;
1288}
1289EXPORT_SYMBOL(iov_iter_alignment);
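/*
 * Usage sketch (hypothetical direct-I/O gate): reject an iterator whose
 * addresses or segment lengths are not aligned to the device's logical
 * block size ("lbs", assumed to be a power of two).
 */
static bool example_dio_aligned(const struct iov_iter *i, unsigned int lbs)
{
	return !(iov_iter_alignment(i) & (unsigned long)(lbs - 1));
}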
1290
1291unsigned long iov_iter_gap_alignment(const struct iov_iter *i)
1292{
1293        unsigned long res = 0;
1294        unsigned long v = 0;
1295        size_t size = i->count;
1296        unsigned k;
1297
1298        if (WARN_ON(!iter_is_iovec(i)))
1299                return ~0U;
1300
1301        for (k = 0; k < i->nr_segs; k++) {
1302                if (i->iov[k].iov_len) {
1303                        unsigned long base = (unsigned long)i->iov[k].iov_base;
1304                        if (v) // if not the first one
1305                                res |= base | v; // this start | previous end
1306                        v = base + i->iov[k].iov_len;
1307                        if (size <= i->iov[k].iov_len)
1308                                break;
1309                        size -= i->iov[k].iov_len;
1310                }
1311        }
1312        return res;
1313}
1314EXPORT_SYMBOL(iov_iter_gap_alignment);
1315
1316static inline ssize_t __pipe_get_pages(struct iov_iter *i,
1317                                size_t maxsize,
1318                                struct page **pages,
1319                                int iter_head,
1320                                size_t *start)
1321{
1322        struct pipe_inode_info *pipe = i->pipe;
1323        unsigned int p_mask = pipe->ring_size - 1;
1324        ssize_t n = push_pipe(i, maxsize, &iter_head, start);
1325        if (!n)
1326                return -EFAULT;
1327
1328        maxsize = n;
1329        n += *start;
1330        while (n > 0) {
1331                get_page(*pages++ = pipe->bufs[iter_head & p_mask].page);
1332                iter_head++;
1333                n -= PAGE_SIZE;
1334        }
1335
1336        return maxsize;
1337}
1338
1339static ssize_t pipe_get_pages(struct iov_iter *i,
1340                   struct page **pages, size_t maxsize, unsigned maxpages,
1341                   size_t *start)
1342{
1343        unsigned int iter_head, npages;
1344        size_t capacity;
1345
1346        if (!sanity(i))
1347                return -EFAULT;
1348
1349        data_start(i, &iter_head, start);
1350        /* Amount of free space: some of this one + all after this one */
1351        npages = pipe_space_for_user(iter_head, i->pipe->tail, i->pipe);
1352        capacity = min(npages, maxpages) * PAGE_SIZE - *start;
1353
1354        return __pipe_get_pages(i, min(maxsize, capacity), pages, iter_head, start);
1355}
1356
1357static ssize_t iter_xarray_populate_pages(struct page **pages, struct xarray *xa,
1358                                          pgoff_t index, unsigned int nr_pages)
1359{
1360        XA_STATE(xas, xa, index);
1361        struct page *page;
1362        unsigned int ret = 0;
1363
1364        rcu_read_lock();
1365        for (page = xas_load(&xas); page; page = xas_next(&xas)) {
1366                if (xas_retry(&xas, page))
1367                        continue;
1368
1369                /* Has the page moved or been split? */
1370                if (unlikely(page != xas_reload(&xas))) {
1371                        xas_reset(&xas);
1372                        continue;
1373                }
1374
1375                pages[ret] = find_subpage(page, xas.xa_index);
1376                get_page(pages[ret]);
1377                if (++ret == nr_pages)
1378                        break;
1379        }
1380        rcu_read_unlock();
1381        return ret;
1382}
1383
1384static ssize_t iter_xarray_get_pages(struct iov_iter *i,
1385                                     struct page **pages, size_t maxsize,
1386                                     unsigned maxpages, size_t *_start_offset)
1387{
1388        unsigned nr, offset;
1389        pgoff_t index, count;
1390        size_t size = maxsize, actual;
1391        loff_t pos;
1392
1393        if (!size || !maxpages)
1394                return 0;
1395
1396        pos = i->xarray_start + i->iov_offset;
1397        index = pos >> PAGE_SHIFT;
1398        offset = pos & ~PAGE_MASK;
1399        *_start_offset = offset;
1400
1401        count = 1;
1402        if (size > PAGE_SIZE - offset) {
1403                size -= PAGE_SIZE - offset;
1404                count += size >> PAGE_SHIFT;
1405                size &= ~PAGE_MASK;
1406                if (size)
1407                        count++;
1408        }
1409
1410        if (count > maxpages)
1411                count = maxpages;
1412
1413        nr = iter_xarray_populate_pages(pages, i->xarray, index, count);
1414        if (nr == 0)
1415                return 0;
1416
1417        actual = PAGE_SIZE * nr;
1418        actual -= offset;
1419        if (nr == count && size > 0) {
1420                unsigned last_offset = (nr > 1) ? 0 : offset;
1421                actual -= PAGE_SIZE - (last_offset + size);
1422        }
1423        return actual;
1424}
1425
1426/* must only be called on a non-empty ITER_IOVEC iterator */
1427static unsigned long first_iovec_segment(const struct iov_iter *i,
1428                                         size_t *size, size_t *start,
1429                                         size_t maxsize, unsigned maxpages)
1430{
1431        size_t skip;
1432        long k;
1433
1434        for (k = 0, skip = i->iov_offset; k < i->nr_segs; k++, skip = 0) {
1435                unsigned long addr = (unsigned long)i->iov[k].iov_base + skip;
1436                size_t len = i->iov[k].iov_len - skip;
1437
1438                if (unlikely(!len))
1439                        continue;
1440                if (len > maxsize)
1441                        len = maxsize;
1442                len += (*start = addr % PAGE_SIZE);
1443                if (len > maxpages * PAGE_SIZE)
1444                        len = maxpages * PAGE_SIZE;
1445                *size = len;
1446                return addr & PAGE_MASK;
1447        }
1448        BUG(); // if it had been empty, we wouldn't get called
1449}
1450
1451/* must only be called on a non-empty ITER_BVEC iterator */
1452static struct page *first_bvec_segment(const struct iov_iter *i,
1453                                       size_t *size, size_t *start,
1454                                       size_t maxsize, unsigned maxpages)
1455{
1456        struct page *page;
1457        size_t skip = i->iov_offset, len;
1458
1459        len = i->bvec->bv_len - skip;
1460        if (len > maxsize)
1461                len = maxsize;
1462        skip += i->bvec->bv_offset;
1463        page = i->bvec->bv_page + skip / PAGE_SIZE;
1464        len += (*start = skip % PAGE_SIZE);
1465        if (len > maxpages * PAGE_SIZE)
1466                len = maxpages * PAGE_SIZE;
1467        *size = len;
1468        return page;
1469}
1470
1471ssize_t iov_iter_get_pages(struct iov_iter *i,
1472                   struct page **pages, size_t maxsize, unsigned maxpages,
1473                   size_t *start)
1474{
1475        size_t len;
1476        int n, res;
1477
1478        if (maxsize > i->count)
1479                maxsize = i->count;
1480        if (!maxsize)
1481                return 0;
1482
1483        if (likely(iter_is_iovec(i))) {
1484                unsigned long addr;
1485
1486                addr = first_iovec_segment(i, &len, start, maxsize, maxpages);
1487                n = DIV_ROUND_UP(len, PAGE_SIZE);
1488                res = get_user_pages_fast(addr, n,
1489                                iov_iter_rw(i) != WRITE ? FOLL_WRITE : 0,
1490                                pages);
1491                if (unlikely(res < 0))
1492                        return res;
1493                return (res == n ? len : res * PAGE_SIZE) - *start;
1494        }
1495        if (iov_iter_is_bvec(i)) {
1496                struct page *page;
1497
1498                page = first_bvec_segment(i, &len, start, maxsize, maxpages);
1499                n = DIV_ROUND_UP(len, PAGE_SIZE);
1500                while (n--)
1501                        get_page(*pages++ = page++);
1502                return len - *start;
1503        }
1504        if (iov_iter_is_pipe(i))
1505                return pipe_get_pages(i, pages, maxsize, maxpages, start);
1506        if (iov_iter_is_xarray(i))
1507                return iter_xarray_get_pages(i, pages, maxsize, maxpages, start);
1508        return -EFAULT;
1509}
1510EXPORT_SYMBOL(iov_iter_get_pages);
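    /*
     * Illustrative usage sketch (not part of the original file; the helper
     * name example_pin_pages() is hypothetical): a direct-I/O style caller
     * pinning at most 16 pages' worth of data from an iterator.  Note that
     * iov_iter_get_pages() does not advance the iterator, so the caller
     * consumes the bytes explicitly once the pages are in hand.
     *
     *      static ssize_t example_pin_pages(struct iov_iter *iter)
     *      {
     *              struct page *pages[16];
     *              size_t offset;
     *              ssize_t got;
     *              int n;
     *
     *              got = iov_iter_get_pages(iter, pages, 16 * PAGE_SIZE,
     *                                       ARRAY_SIZE(pages), &offset);
     *              if (got <= 0)
     *                      return got;
     *              iov_iter_advance(iter, got);
     *              ... do I/O against the pinned pages; data starts at
     *                  @offset into pages[0] ...
     *              for (n = 0; n < DIV_ROUND_UP(offset + got, PAGE_SIZE); n++)
     *                      put_page(pages[n]);
     *              return got;
     *      }
     */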
1511
1512static struct page **get_pages_array(size_t n)
1513{
1514        return kvmalloc_array(n, sizeof(struct page *), GFP_KERNEL);
1515}
1516
1517static ssize_t pipe_get_pages_alloc(struct iov_iter *i,
1518                   struct page ***pages, size_t maxsize,
1519                   size_t *start)
1520{
1521        struct page **p;
1522        unsigned int iter_head, npages;
1523        ssize_t n;
1524
1525        if (!sanity(i))
1526                return -EFAULT;
1527
1528        data_start(i, &iter_head, start);
1529        /* Amount of free space: some of this one + all after this one */
1530        npages = pipe_space_for_user(iter_head, i->pipe->tail, i->pipe);
1531        n = npages * PAGE_SIZE - *start;
1532        if (maxsize > n)
1533                maxsize = n;
1534        else
1535                npages = DIV_ROUND_UP(maxsize + *start, PAGE_SIZE);
1536        p = get_pages_array(npages);
1537        if (!p)
1538                return -ENOMEM;
1539        n = __pipe_get_pages(i, maxsize, p, iter_head, start);
1540        if (n > 0)
1541                *pages = p;
1542        else
1543                kvfree(p);
1544        return n;
1545}
1546
1547static ssize_t iter_xarray_get_pages_alloc(struct iov_iter *i,
1548                                           struct page ***pages, size_t maxsize,
1549                                           size_t *_start_offset)
1550{
1551        struct page **p;
1552        unsigned nr, offset;
1553        pgoff_t index, count;
1554        size_t size = maxsize, actual;
1555        loff_t pos;
1556
1557        if (!size)
1558                return 0;
1559
1560        pos = i->xarray_start + i->iov_offset;
1561        index = pos >> PAGE_SHIFT;
1562        offset = pos & ~PAGE_MASK;
1563        *_start_offset = offset;
1564
1565        count = 1;
1566        if (size > PAGE_SIZE - offset) {
1567                size -= PAGE_SIZE - offset;
1568                count += size >> PAGE_SHIFT;
1569                size &= ~PAGE_MASK;
1570                if (size)
1571                        count++;
1572        }
1573
1574        p = get_pages_array(count);
1575        if (!p)
1576                return -ENOMEM;
1577        *pages = p;
1578
1579        nr = iter_xarray_populate_pages(p, i->xarray, index, count);
1580        if (nr == 0)
1581                return 0;
1582
1583        actual = PAGE_SIZE * nr;
1584        actual -= offset;
1585        if (nr == count && size > 0) {
1586                unsigned last_offset = (nr > 1) ? 0 : offset;
1587                actual -= PAGE_SIZE - (last_offset + size);
1588        }
1589        return actual;
1590}
1591
1592ssize_t iov_iter_get_pages_alloc(struct iov_iter *i,
1593                   struct page ***pages, size_t maxsize,
1594                   size_t *start)
1595{
1596        struct page **p;
1597        size_t len;
1598        int n, res;
1599
1600        if (maxsize > i->count)
1601                maxsize = i->count;
1602        if (!maxsize)
1603                return 0;
1604
1605        if (likely(iter_is_iovec(i))) {
1606                unsigned long addr;
1607
1608                addr = first_iovec_segment(i, &len, start, maxsize, ~0U);
1609                n = DIV_ROUND_UP(len, PAGE_SIZE);
1610                p = get_pages_array(n);
1611                if (!p)
1612                        return -ENOMEM;
1613                res = get_user_pages_fast(addr, n,
1614                                iov_iter_rw(i) != WRITE ? FOLL_WRITE : 0, p);
1615                if (unlikely(res < 0)) {
1616                        kvfree(p);
1617                        return res;
1618                }
1619                *pages = p;
1620                return (res == n ? len : res * PAGE_SIZE) - *start;
1621        }
1622        if (iov_iter_is_bvec(i)) {
1623                struct page *page;
1624
1625                page = first_bvec_segment(i, &len, start, maxsize, ~0U);
1626                n = DIV_ROUND_UP(len, PAGE_SIZE);
1627                *pages = p = get_pages_array(n);
1628                if (!p)
1629                        return -ENOMEM;
1630                while (n--)
1631                        get_page(*p++ = page++);
1632                return len - *start;
1633        }
1634        if (iov_iter_is_pipe(i))
1635                return pipe_get_pages_alloc(i, pages, maxsize, start);
1636        if (iov_iter_is_xarray(i))
1637                return iter_xarray_get_pages_alloc(i, pages, maxsize, start);
1638        return -EFAULT;
1639}
1640EXPORT_SYMBOL(iov_iter_get_pages_alloc);
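    /*
     * Illustrative sketch (assumes a user-backed ITER_IOVEC source; names
     * are hypothetical): the _alloc variant hands back a kvmalloc()ed page
     * array, so the caller drops the page references and then kvfree()s the
     * array itself.
     *
     *      struct page **pages;
     *      size_t offset;
     *      ssize_t got;
     *      int n;
     *
     *      got = iov_iter_get_pages_alloc(iter, &pages, SIZE_MAX, &offset);
     *      if (got > 0) {
     *              ... use the data at @offset into pages[0] ...
     *              for (n = 0; n < DIV_ROUND_UP(offset + got, PAGE_SIZE); n++)
     *                      put_page(pages[n]);
     *              kvfree(pages);
     *      }
     */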
1641
1642size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum,
1643                               struct iov_iter *i)
1644{
1645        __wsum sum, next;
1646        sum = *csum;
1647        if (unlikely(iov_iter_is_pipe(i) || iov_iter_is_discard(i))) {
1648                WARN_ON(1);
1649                return 0;
1650        }
1651        iterate_and_advance(i, bytes, base, len, off, ({
1652                next = csum_and_copy_from_user(base, addr + off, len);
1653                sum = csum_block_add(sum, next, off);
1654                next ? 0 : len;
1655        }), ({
1656                sum = csum_and_memcpy(addr + off, base, len, sum, off);
1657        })
1658        )
1659        *csum = sum;
1660        return bytes;
1661}
1662EXPORT_SYMBOL(csum_and_copy_from_iter);
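    /*
     * Illustrative sketch (hypothetical caller): folding received data into
     * a running checksum while copying it out of the iterator; a short
     * return means a fault was hit part-way through.
     *
     *      __wsum csum = 0;
     *
     *      if (csum_and_copy_from_iter(kbuf, len, &csum, from) != len)
     *              return -EFAULT;
     *      ... csum now covers the @len bytes landed in kbuf ...
     */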
1663
1664size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *_csstate,
1665                             struct iov_iter *i)
1666{
1667        struct csum_state *csstate = _csstate;
1668        __wsum sum, next;
1669
1670        if (unlikely(iov_iter_is_discard(i))) {
1671                WARN_ON(1);     /* for now */
1672                return 0;
1673        }
1674
1675        sum = csum_shift(csstate->csum, csstate->off);
1676        if (unlikely(iov_iter_is_pipe(i)))
1677                bytes = csum_and_copy_to_pipe_iter(addr, bytes, i, &sum);
1678        else iterate_and_advance(i, bytes, base, len, off, ({
1679                next = csum_and_copy_to_user(addr + off, base, len);
1680                sum = csum_block_add(sum, next, off);
1681                next ? 0 : len;
1682        }), ({
1683                sum = csum_and_memcpy(base, addr + off, len, sum, off);
1684        })
1685        )
1686        csstate->csum = csum_shift(sum, csstate->off);
1687        csstate->off += bytes;
1688        return bytes;
1689}
1690EXPORT_SYMBOL(csum_and_copy_to_iter);
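    /*
     * Illustrative sketch (hypothetical helper more_data()): a struct
     * csum_state is threaded through successive calls so the running
     * checksum stays correctly aligned as the output offset grows across
     * chunks.
     *
     *      struct csum_state csstate = { .csum = 0, .off = 0 };
     *
     *      while (more_data(&buf, &len)) {
     *              if (csum_and_copy_to_iter(buf, len, &csstate, to) != len)
     *                      return -EFAULT;
     *      }
     *      ... csstate.csum now covers everything copied,
     *          csstate.off its total length ...
     */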
1691
1692size_t hash_and_copy_to_iter(const void *addr, size_t bytes, void *hashp,
1693                struct iov_iter *i)
1694{
1695#ifdef CONFIG_CRYPTO_HASH
1696        struct ahash_request *hash = hashp;
1697        struct scatterlist sg;
1698        size_t copied;
1699
1700        copied = copy_to_iter(addr, bytes, i);
1701        sg_init_one(&sg, addr, copied);
1702        ahash_request_set_crypt(hash, &sg, NULL, copied);
1703        crypto_ahash_update(hash);
1704        return copied;
1705#else
1706        return 0;
1707#endif
1708}
1709EXPORT_SYMBOL(hash_and_copy_to_iter);
1710
1711static int iov_npages(const struct iov_iter *i, int maxpages)
1712{
1713        size_t skip = i->iov_offset, size = i->count;
1714        const struct iovec *p;
1715        int npages = 0;
1716
1717        for (p = i->iov; size; skip = 0, p++) {
1718                unsigned offs = offset_in_page(p->iov_base + skip);
1719                size_t len = min(p->iov_len - skip, size);
1720
1721                if (len) {
1722                        size -= len;
1723                        npages += DIV_ROUND_UP(offs + len, PAGE_SIZE);
1724                        if (unlikely(npages > maxpages))
1725                                return maxpages;
1726                }
1727        }
1728        return npages;
1729}
1730
1731static int bvec_npages(const struct iov_iter *i, int maxpages)
1732{
1733        size_t skip = i->iov_offset, size = i->count;
1734        const struct bio_vec *p;
1735        int npages = 0;
1736
1737        for (p = i->bvec; size; skip = 0, p++) {
1738                unsigned offs = (p->bv_offset + skip) % PAGE_SIZE;
1739                size_t len = min(p->bv_len - skip, size);
1740
1741                size -= len;
1742                npages += DIV_ROUND_UP(offs + len, PAGE_SIZE);
1743                if (unlikely(npages > maxpages))
1744                        return maxpages;
1745        }
1746        return npages;
1747}
1748
1749int iov_iter_npages(const struct iov_iter *i, int maxpages)
1750{
1751        if (unlikely(!i->count))
1752                return 0;
1753        /* iovec and kvec have identical layouts */
1754        if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
1755                return iov_npages(i, maxpages);
1756        if (iov_iter_is_bvec(i))
1757                return bvec_npages(i, maxpages);
1758        if (iov_iter_is_pipe(i)) {
1759                unsigned int iter_head;
1760                int npages;
1761                size_t off;
1762
1763                if (!sanity(i))
1764                        return 0;
1765
1766                data_start(i, &iter_head, &off);
1767                /* some of this one + all after this one */
1768                npages = pipe_space_for_user(iter_head, i->pipe->tail, i->pipe);
1769                return min(npages, maxpages);
1770        }
1771        if (iov_iter_is_xarray(i)) {
1772                unsigned offset = (i->xarray_start + i->iov_offset) % PAGE_SIZE;
1773                int npages = DIV_ROUND_UP(offset + i->count, PAGE_SIZE);
1774                return min(npages, maxpages);
1775        }
1776        return 0;
1777}
1778EXPORT_SYMBOL(iov_iter_npages);
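    /*
     * Illustrative sketch: iov_iter_npages() is typically used to bound an
     * allocation before walking the iterator, e.g. sizing a bio in a
     * direct-I/O style path (BIO_MAX_VECS comes from the block layer; the
     * pattern here is a sketch, not a definitive recipe).
     *
     *      int npages = iov_iter_npages(iter, BIO_MAX_VECS);
     *      struct bio *bio = bio_alloc(GFP_KERNEL, npages);
     */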
1779
1780const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags)
1781{
1782        *new = *old;
1783        if (unlikely(iov_iter_is_pipe(new))) {
1784                WARN_ON(1);
1785                return NULL;
1786        }
1787        if (unlikely(iov_iter_is_discard(new) || iov_iter_is_xarray(new)))
1788                return NULL;
1789        if (iov_iter_is_bvec(new))
1790                return new->bvec = kmemdup(new->bvec,
1791                                    new->nr_segs * sizeof(struct bio_vec),
1792                                    flags);
1793        else
1794                /* iovec and kvec have identical layout */
1795                return new->iov = kmemdup(new->iov,
1796                                   new->nr_segs * sizeof(struct iovec),
1797                                   flags);
1798}
1799EXPORT_SYMBOL(dup_iter);
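    /*
     * Illustrative sketch (hypothetical caller): dup_iter() is used when an
     * iterator must outlive the caller's iovec/kvec/bvec array, e.g. before
     * queueing async work.  For those iterator types the return value is
     * the duplicated vector (NULL means allocation failure) and must be
     * kfree()d once the copy is no longer needed.
     *
     *      struct iov_iter copy;
     *      const void *dup = dup_iter(&copy, orig, GFP_KERNEL);
     *
     *      if (!dup)
     *              return -ENOMEM;
     *      ... use @copy after orig's backing array may be gone ...
     *      kfree(dup);
     */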
1800
1801static int copy_compat_iovec_from_user(struct iovec *iov,
1802                const struct iovec __user *uvec, unsigned long nr_segs)
1803{
1804        const struct compat_iovec __user *uiov =
1805                (const struct compat_iovec __user *)uvec;
1806        int ret = -EFAULT, i;
1807
1808        if (!user_access_begin(uiov, nr_segs * sizeof(*uiov)))
1809                return -EFAULT;
1810
1811        for (i = 0; i < nr_segs; i++) {
1812                compat_uptr_t buf;
1813                compat_ssize_t len;
1814
1815                unsafe_get_user(len, &uiov[i].iov_len, uaccess_end);
1816                unsafe_get_user(buf, &uiov[i].iov_base, uaccess_end);
1817
1818                /* check for compat_size_t not fitting in compat_ssize_t .. */
1819                if (len < 0) {
1820                        ret = -EINVAL;
1821                        goto uaccess_end;
1822                }
1823                iov[i].iov_base = compat_ptr(buf);
1824                iov[i].iov_len = len;
1825        }
1826
1827        ret = 0;
1828uaccess_end:
1829        user_access_end();
1830        return ret;
1831}
1832
1833static int copy_iovec_from_user(struct iovec *iov,
1834                const struct iovec __user *uvec, unsigned long nr_segs)
1835{
1836        unsigned long seg;
1837
1838        if (copy_from_user(iov, uvec, nr_segs * sizeof(*uvec)))
1839                return -EFAULT;
1840        for (seg = 0; seg < nr_segs; seg++) {
1841                if ((ssize_t)iov[seg].iov_len < 0)
1842                        return -EINVAL;
1843        }
1844
1845        return 0;
1846}
1847
1848struct iovec *iovec_from_user(const struct iovec __user *uvec,
1849                unsigned long nr_segs, unsigned long fast_segs,
1850                struct iovec *fast_iov, bool compat)
1851{
1852        struct iovec *iov = fast_iov;
1853        int ret;
1854
1855        /*
1856         * SuS says "The readv() function *may* fail if the iovcnt argument was
1857         * less than or equal to 0, or greater than {IOV_MAX}."  Linux has
1858         * traditionally returned zero for zero segments, so...
1859         */
1860        if (nr_segs == 0)
1861                return iov;
1862        if (nr_segs > UIO_MAXIOV)
1863                return ERR_PTR(-EINVAL);
1864        if (nr_segs > fast_segs) {
1865                iov = kmalloc_array(nr_segs, sizeof(struct iovec), GFP_KERNEL);
1866                if (!iov)
1867                        return ERR_PTR(-ENOMEM);
1868        }
1869
1870        if (compat)
1871                ret = copy_compat_iovec_from_user(iov, uvec, nr_segs);
1872        else
1873                ret = copy_iovec_from_user(iov, uvec, nr_segs);
1874        if (ret) {
1875                if (iov != fast_iov)
1876                        kfree(iov);
1877                return ERR_PTR(ret);
1878        }
1879
1880        return iov;
1881}
1882
1883ssize_t __import_iovec(int type, const struct iovec __user *uvec,
1884                 unsigned nr_segs, unsigned fast_segs, struct iovec **iovp,
1885                 struct iov_iter *i, bool compat)
1886{
1887        ssize_t total_len = 0;
1888        unsigned long seg;
1889        struct iovec *iov;
1890
1891        iov = iovec_from_user(uvec, nr_segs, fast_segs, *iovp, compat);
1892        if (IS_ERR(iov)) {
1893                *iovp = NULL;
1894                return PTR_ERR(iov);
1895        }
1896
1897        /*
1898         * According to the Single Unix Specification we should return EINVAL if
1899         * an element length is < 0 when cast to ssize_t or if the total length
1900         * would overflow the ssize_t return value of the system call.
1901         *
1902         * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
1903         * overflow case.
1904         */
1905        for (seg = 0; seg < nr_segs; seg++) {
1906                ssize_t len = (ssize_t)iov[seg].iov_len;
1907
1908                if (!access_ok(iov[seg].iov_base, len)) {
1909                        if (iov != *iovp)
1910                                kfree(iov);
1911                        *iovp = NULL;
1912                        return -EFAULT;
1913                }
1914
1915                if (len > MAX_RW_COUNT - total_len) {
1916                        len = MAX_RW_COUNT - total_len;
1917                        iov[seg].iov_len = len;
1918                }
1919                total_len += len;
1920        }
1921
1922        iov_iter_init(i, type, iov, nr_segs, total_len);
1923        if (iov == *iovp)
1924                *iovp = NULL;
1925        else
1926                *iovp = iov;
1927        return total_len;
1928}
1929
1930/**
1931 * import_iovec() - Copy an array of &struct iovec from userspace
1932 *     into the kernel, check that it is valid, and initialize a new
1933 *     &struct iov_iter iterator to access it.
1934 *
1935 * @type: One of %READ or %WRITE.
1936 * @uvec: Pointer to the userspace array.
1937 * @nr_segs: Number of elements in userspace array.
1938 * @fast_segs: Number of elements in *@iovp.
1939 * @iovp: (input and output parameter) Pointer to pointer to (usually small
1940 *     on-stack) kernel array.
1941 * @i: Pointer to iterator that will be initialized on success.
1942 *
1943 * If the array pointed to by *@iovp is large enough to hold all @nr_segs,
1944 * then this function places %NULL in *@iovp on return. Otherwise, a new
1945 * array will be allocated and the result placed in *@iovp. This means that
1946 * the caller may call kfree() on *@iovp regardless of whether the small
1947 * on-stack array was used or not (and regardless of whether this function
1948 * returns an error or not).
1949 *
1950 * Return: Negative error code on error, bytes imported on success
1951 */
1952ssize_t import_iovec(int type, const struct iovec __user *uvec,
1953                 unsigned nr_segs, unsigned fast_segs,
1954                 struct iovec **iovp, struct iov_iter *i)
1955{
1956        return __import_iovec(type, uvec, nr_segs, fast_segs, iovp, i,
1957                              in_compat_syscall());
1958}
1959EXPORT_SYMBOL(import_iovec);
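    /*
     * Illustrative sketch of the canonical caller pattern (the helper
     * do_the_io() is hypothetical; the rest mirrors the read/write syscall
     * paths):
     *
     *      struct iovec iovstack[UIO_FASTIOV], *iov = iovstack;
     *      struct iov_iter iter;
     *      ssize_t ret;
     *
     *      ret = import_iovec(READ, uvec, nr_segs, ARRAY_SIZE(iovstack),
     *                         &iov, &iter);
     *      if (ret < 0)
     *              return ret;
     *      ret = do_the_io(&iter);
     *      kfree(iov);     (safe whether or not the on-stack array was used)
     *      return ret;
     */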
1960
1961int import_single_range(int rw, void __user *buf, size_t len,
1962                 struct iovec *iov, struct iov_iter *i)
1963{
1964        if (len > MAX_RW_COUNT)
1965                len = MAX_RW_COUNT;
1966        if (unlikely(!access_ok(buf, len)))
1967                return -EFAULT;
1968
1969        iov->iov_base = buf;
1970        iov->iov_len = len;
1971        iov_iter_init(i, rw, iov, 1, len);
1972        return 0;
1973}
1974EXPORT_SYMBOL(import_single_range);
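    /*
     * Illustrative sketch (hypothetical names): the single-buffer variant
     * used by plain read()/write() style paths.  Note that @iov must stay
     * alive for as long as @iter is in use, since the iterator points at it.
     *
     *      struct iovec iov;
     *      struct iov_iter iter;
     *      int ret = import_single_range(WRITE, ubuf, len, &iov, &iter);
     *
     *      if (ret)
     *              return ret;
     *      ... pass &iter to e.g. vfs_iter_write() ...
     */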
1975
1976/**
1977 * iov_iter_restore() - Restore a &struct iov_iter to the same state as when
1978 *     iov_iter_save_state() was called.
1979 *
1980 * @i: &struct iov_iter to restore
1981 * @state: state to restore from
1982 *
1983 * Used after iov_iter_save_state() to restore @i, if operations may
1984 * have advanced it.
1985 *
1986 * Note: only works on ITER_IOVEC, ITER_BVEC, and ITER_KVEC
1987 */
1988void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state)
1989{
1990        if (WARN_ON_ONCE(!iov_iter_is_bvec(i) && !iter_is_iovec(i) &&
1991                         !iov_iter_is_kvec(i)))
1992                return;
1993        i->iov_offset = state->iov_offset;
1994        i->count = state->count;
1995        /*
1996         * For the *vec iters, nr_segs + iov is constant - if we increment
1997         * the vec, then we also decrement the nr_segs count. Hence we don't
1998         * need to track both of these, just one is enough and we can deduce
1999         * the other from that. ITER_KVEC and ITER_IOVEC are the same struct
2000         * size, so we can just increment the iov pointer as they are unionized.
2001         * ITER_BVEC _may_ be the same size on some archs, but on others it is
2002         * not. Be safe and handle it separately.
2003         */
2004        BUILD_BUG_ON(sizeof(struct iovec) != sizeof(struct kvec));
2005        if (iov_iter_is_bvec(i))
2006                i->bvec -= state->nr_segs - i->nr_segs;
2007        else
2008                i->iov -= state->nr_segs - i->nr_segs;
2009        i->nr_segs = state->nr_segs;
2010}
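    /*
     * Illustrative sketch (hypothetical try_the_copy()): pairing with
     * iov_iter_save_state() to retry an operation that may have partially
     * advanced the iterator.  Only valid for the IOVEC/BVEC/KVEC iterator
     * types noted above.
     *
     *      struct iov_iter_state state;
     *      ssize_t ret;
     *
     *      iov_iter_save_state(iter, &state);
     *      ret = try_the_copy(iter);
     *      if (ret == -EAGAIN) {
     *              iov_iter_restore(iter, &state);
     *              ... requeue and retry later with an unmodified iterator ...
     *      }
     */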
2011