linux/lib/iov_iter.c
   1// SPDX-License-Identifier: GPL-2.0-only
   2#include <crypto/hash.h>
   3#include <linux/export.h>
   4#include <linux/bvec.h>
   5#include <linux/fault-inject-usercopy.h>
   6#include <linux/uio.h>
   7#include <linux/pagemap.h>
   8#include <linux/highmem.h>
   9#include <linux/slab.h>
  10#include <linux/vmalloc.h>
  11#include <linux/splice.h>
  12#include <linux/compat.h>
  13#include <net/checksum.h>
  14#include <linux/scatterlist.h>
  15#include <linux/instrumented.h>
  16
  17#define PIPE_PARANOIA /* for now */
  18
  19/* covers iovec and kvec alike */
  20#define iterate_iovec(i, n, base, len, off, __p, STEP) {        \
  21        size_t off = 0;                                         \
  22        size_t skip = i->iov_offset;                            \
  23        do {                                                    \
  24                len = min(n, __p->iov_len - skip);              \
  25                if (likely(len)) {                              \
  26                        base = __p->iov_base + skip;            \
  27                        len -= (STEP);                          \
  28                        off += len;                             \
  29                        skip += len;                            \
  30                        n -= len;                               \
  31                        if (skip < __p->iov_len)                \
  32                                break;                          \
  33                }                                               \
  34                __p++;                                          \
  35                skip = 0;                                       \
  36        } while (n);                                            \
  37        i->iov_offset = skip;                                   \
  38        n = off;                                                \
  39}
  40
  41#define iterate_bvec(i, n, base, len, off, p, STEP) {           \
  42        size_t off = 0;                                         \
  43        unsigned skip = i->iov_offset;                          \
  44        while (n) {                                             \
  45                unsigned offset = p->bv_offset + skip;          \
  46                unsigned left;                                  \
  47                void *kaddr = kmap_local_page(p->bv_page +      \
  48                                        offset / PAGE_SIZE);    \
  49                base = kaddr + offset % PAGE_SIZE;              \
  50                len = min(min(n, (size_t)(p->bv_len - skip)),   \
  51                     (size_t)(PAGE_SIZE - offset % PAGE_SIZE)); \
  52                left = (STEP);                                  \
  53                kunmap_local(kaddr);                            \
  54                len -= left;                                    \
  55                off += len;                                     \
  56                skip += len;                                    \
  57                if (skip == p->bv_len) {                        \
  58                        skip = 0;                               \
  59                        p++;                                    \
  60                }                                               \
  61                n -= len;                                       \
  62                if (left)                                       \
  63                        break;                                  \
  64        }                                                       \
  65        i->iov_offset = skip;                                   \
  66        n = off;                                                \
  67}
  68
  69#define iterate_xarray(i, n, base, len, __off, STEP) {          \
  70        __label__ __out;                                        \
  71        size_t __off = 0;                                       \
  72        struct page *head = NULL;                               \
  73        loff_t start = i->xarray_start + i->iov_offset;         \
  74        unsigned offset = start % PAGE_SIZE;                    \
  75        pgoff_t index = start / PAGE_SIZE;                      \
  76        int j;                                                  \
  77                                                                \
  78        XA_STATE(xas, i->xarray, index);                        \
  79                                                                \
  80        rcu_read_lock();                                        \
  81        xas_for_each(&xas, head, ULONG_MAX) {                   \
  82                unsigned left;                                  \
  83                if (xas_retry(&xas, head))                      \
  84                        continue;                               \
  85                if (WARN_ON(xa_is_value(head)))                 \
  86                        break;                                  \
  87                if (WARN_ON(PageHuge(head)))                    \
  88                        break;                                  \
  89                for (j = (head->index < index) ? index - head->index : 0; \
  90                     j < thp_nr_pages(head); j++) {             \
  91                        void *kaddr = kmap_local_page(head + j);        \
  92                        base = kaddr + offset;                  \
  93                        len = PAGE_SIZE - offset;               \
  94                        len = min(n, len);                      \
  95                        left = (STEP);                          \
  96                        kunmap_local(kaddr);                    \
  97                        len -= left;                            \
  98                        __off += len;                           \
  99                        n -= len;                               \
 100                        if (left || n == 0)                     \
 101                                goto __out;                     \
 102                        offset = 0;                             \
 103                }                                               \
 104        }                                                       \
 105__out:                                                          \
 106        rcu_read_unlock();                                      \
 107        i->iov_offset += __off;                                         \
 108        n = __off;                                              \
 109}
 110
 111#define __iterate_and_advance(i, n, base, len, off, I, K) {     \
 112        if (unlikely(i->count < n))                             \
 113                n = i->count;                                   \
 114        if (likely(n)) {                                        \
 115                if (likely(iter_is_iovec(i))) {                 \
 116                        const struct iovec *iov = i->iov;       \
 117                        void __user *base;                      \
 118                        size_t len;                             \
 119                        iterate_iovec(i, n, base, len, off,     \
 120                                                iov, (I))       \
 121                        i->nr_segs -= iov - i->iov;             \
 122                        i->iov = iov;                           \
 123                } else if (iov_iter_is_bvec(i)) {               \
 124                        const struct bio_vec *bvec = i->bvec;   \
 125                        void *base;                             \
 126                        size_t len;                             \
 127                        iterate_bvec(i, n, base, len, off,      \
 128                                                bvec, (K))      \
 129                        i->nr_segs -= bvec - i->bvec;           \
 130                        i->bvec = bvec;                         \
 131                } else if (iov_iter_is_kvec(i)) {               \
 132                        const struct kvec *kvec = i->kvec;      \
 133                        void *base;                             \
 134                        size_t len;                             \
 135                        iterate_iovec(i, n, base, len, off,     \
 136                                                kvec, (K))      \
 137                        i->nr_segs -= kvec - i->kvec;           \
 138                        i->kvec = kvec;                         \
 139                } else if (iov_iter_is_xarray(i)) {             \
 140                        void *base;                             \
 141                        size_t len;                             \
 142                        iterate_xarray(i, n, base, len, off,    \
 143                                                        (K))    \
 144                }                                               \
 145                i->count -= n;                                  \
 146        }                                                       \
 147}
 148#define iterate_and_advance(i, n, base, len, off, I, K) \
 149        __iterate_and_advance(i, n, base, len, off, I, ((void)(K),0))
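
/*
 * iterate_and_advance(i, n, base, len, off, I, K) is the walker used by the
 * copy helpers below.  For each contiguous chunk of the iterator it binds
 * base to the chunk's starting address, len to the chunk length and off to
 * the number of bytes already handled, then evaluates the step expression -
 * I for user-space chunks (ITER_IOVEC), K for kernel chunks (ITER_KVEC,
 * ITER_BVEC, ITER_XARRAY).  A step expression yields the number of bytes it
 * failed to process; a non-zero result ends the walk early.  On exit n holds
 * the number of bytes actually handled, and i->count, i->iov_offset and the
 * segment pointers have been advanced past them.  See _copy_to_iter() for a
 * typical user.
 */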
 150
 151static int copyout(void __user *to, const void *from, size_t n)
 152{
 153        if (should_fail_usercopy())
 154                return n;
 155        if (access_ok(to, n)) {
 156                instrument_copy_to_user(to, from, n);
 157                n = raw_copy_to_user(to, from, n);
 158        }
 159        return n;
 160}
 161
 162static int copyin(void *to, const void __user *from, size_t n)
 163{
 164        if (should_fail_usercopy())
 165                return n;
 166        if (access_ok(from, n)) {
 167                instrument_copy_from_user(to, from, n);
 168                n = raw_copy_from_user(to, from, n);
 169        }
 170        return n;
 171}
 172
 173static size_t copy_page_to_iter_iovec(struct page *page, size_t offset, size_t bytes,
 174                         struct iov_iter *i)
 175{
 176        size_t skip, copy, left, wanted;
 177        const struct iovec *iov;
 178        char __user *buf;
 179        void *kaddr, *from;
 180
 181        if (unlikely(bytes > i->count))
 182                bytes = i->count;
 183
 184        if (unlikely(!bytes))
 185                return 0;
 186
 187        might_fault();
 188        wanted = bytes;
 189        iov = i->iov;
 190        skip = i->iov_offset;
 191        buf = iov->iov_base + skip;
 192        copy = min(bytes, iov->iov_len - skip);
 193
 194        if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_pages_writeable(buf, copy)) {
 195                kaddr = kmap_atomic(page);
 196                from = kaddr + offset;
 197
 198                /* first chunk, usually the only one */
 199                left = copyout(buf, from, copy);
 200                copy -= left;
 201                skip += copy;
 202                from += copy;
 203                bytes -= copy;
 204
 205                while (unlikely(!left && bytes)) {
 206                        iov++;
 207                        buf = iov->iov_base;
 208                        copy = min(bytes, iov->iov_len);
 209                        left = copyout(buf, from, copy);
 210                        copy -= left;
 211                        skip = copy;
 212                        from += copy;
 213                        bytes -= copy;
 214                }
 215                if (likely(!bytes)) {
 216                        kunmap_atomic(kaddr);
 217                        goto done;
 218                }
 219                offset = from - kaddr;
 220                buf += copy;
 221                kunmap_atomic(kaddr);
 222                copy = min(bytes, iov->iov_len - skip);
 223        }
 224        /* Too bad - revert to non-atomic kmap */
 225
 226        kaddr = kmap(page);
 227        from = kaddr + offset;
 228        left = copyout(buf, from, copy);
 229        copy -= left;
 230        skip += copy;
 231        from += copy;
 232        bytes -= copy;
 233        while (unlikely(!left && bytes)) {
 234                iov++;
 235                buf = iov->iov_base;
 236                copy = min(bytes, iov->iov_len);
 237                left = copyout(buf, from, copy);
 238                copy -= left;
 239                skip = copy;
 240                from += copy;
 241                bytes -= copy;
 242        }
 243        kunmap(page);
 244
 245done:
 246        if (skip == iov->iov_len) {
 247                iov++;
 248                skip = 0;
 249        }
 250        i->count -= wanted - bytes;
 251        i->nr_segs -= iov - i->iov;
 252        i->iov = iov;
 253        i->iov_offset = skip;
 254        return wanted - bytes;
 255}
 256
 257static size_t copy_page_from_iter_iovec(struct page *page, size_t offset, size_t bytes,
 258                         struct iov_iter *i)
 259{
 260        size_t skip, copy, left, wanted;
 261        const struct iovec *iov;
 262        char __user *buf;
 263        void *kaddr, *to;
 264
 265        if (unlikely(bytes > i->count))
 266                bytes = i->count;
 267
 268        if (unlikely(!bytes))
 269                return 0;
 270
 271        might_fault();
 272        wanted = bytes;
 273        iov = i->iov;
 274        skip = i->iov_offset;
 275        buf = iov->iov_base + skip;
 276        copy = min(bytes, iov->iov_len - skip);
 277
 278        if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_pages_readable(buf, copy)) {
 279                kaddr = kmap_atomic(page);
 280                to = kaddr + offset;
 281
 282                /* first chunk, usually the only one */
 283                left = copyin(to, buf, copy);
 284                copy -= left;
 285                skip += copy;
 286                to += copy;
 287                bytes -= copy;
 288
 289                while (unlikely(!left && bytes)) {
 290                        iov++;
 291                        buf = iov->iov_base;
 292                        copy = min(bytes, iov->iov_len);
 293                        left = copyin(to, buf, copy);
 294                        copy -= left;
 295                        skip = copy;
 296                        to += copy;
 297                        bytes -= copy;
 298                }
 299                if (likely(!bytes)) {
 300                        kunmap_atomic(kaddr);
 301                        goto done;
 302                }
 303                offset = to - kaddr;
 304                buf += copy;
 305                kunmap_atomic(kaddr);
 306                copy = min(bytes, iov->iov_len - skip);
 307        }
 308        /* Too bad - revert to non-atomic kmap */
 309
 310        kaddr = kmap(page);
 311        to = kaddr + offset;
 312        left = copyin(to, buf, copy);
 313        copy -= left;
 314        skip += copy;
 315        to += copy;
 316        bytes -= copy;
 317        while (unlikely(!left && bytes)) {
 318                iov++;
 319                buf = iov->iov_base;
 320                copy = min(bytes, iov->iov_len);
 321                left = copyin(to, buf, copy);
 322                copy -= left;
 323                skip = copy;
 324                to += copy;
 325                bytes -= copy;
 326        }
 327        kunmap(page);
 328
 329done:
 330        if (skip == iov->iov_len) {
 331                iov++;
 332                skip = 0;
 333        }
 334        i->count -= wanted - bytes;
 335        i->nr_segs -= iov - i->iov;
 336        i->iov = iov;
 337        i->iov_offset = skip;
 338        return wanted - bytes;
 339}
 340
 341#ifdef PIPE_PARANOIA
 342static bool sanity(const struct iov_iter *i)
 343{
 344        struct pipe_inode_info *pipe = i->pipe;
 345        unsigned int p_head = pipe->head;
 346        unsigned int p_tail = pipe->tail;
 347        unsigned int p_mask = pipe->ring_size - 1;
 348        unsigned int p_occupancy = pipe_occupancy(p_head, p_tail);
 349        unsigned int i_head = i->head;
 350        unsigned int idx;
 351
 352        if (i->iov_offset) {
 353                struct pipe_buffer *p;
 354                if (unlikely(p_occupancy == 0))
 355                        goto Bad;       // pipe must be non-empty
 356                if (unlikely(i_head != p_head - 1))
 357                        goto Bad;       // must be at the last buffer...
 358
 359                p = &pipe->bufs[i_head & p_mask];
 360                if (unlikely(p->offset + p->len != i->iov_offset))
 361                        goto Bad;       // ... at the end of segment
 362        } else {
 363                if (i_head != p_head)
 364                        goto Bad;       // must be right after the last buffer
 365        }
 366        return true;
 367Bad:
 368        printk(KERN_ERR "idx = %d, offset = %zd\n", i_head, i->iov_offset);
 369        printk(KERN_ERR "head = %d, tail = %d, buffers = %d\n",
 370                        p_head, p_tail, pipe->ring_size);
 371        for (idx = 0; idx < pipe->ring_size; idx++)
 372                printk(KERN_ERR "[%p %p %d %d]\n",
 373                        pipe->bufs[idx].ops,
 374                        pipe->bufs[idx].page,
 375                        pipe->bufs[idx].offset,
 376                        pipe->bufs[idx].len);
 377        WARN_ON(1);
 378        return false;
 379}
 380#else
 381#define sanity(i) true
 382#endif
 383
 384static size_t copy_page_to_iter_pipe(struct page *page, size_t offset, size_t bytes,
 385                         struct iov_iter *i)
 386{
 387        struct pipe_inode_info *pipe = i->pipe;
 388        struct pipe_buffer *buf;
 389        unsigned int p_tail = pipe->tail;
 390        unsigned int p_mask = pipe->ring_size - 1;
 391        unsigned int i_head = i->head;
 392        size_t off;
 393
 394        if (unlikely(bytes > i->count))
 395                bytes = i->count;
 396
 397        if (unlikely(!bytes))
 398                return 0;
 399
 400        if (!sanity(i))
 401                return 0;
 402
 403        off = i->iov_offset;
 404        buf = &pipe->bufs[i_head & p_mask];
 405        if (off) {
 406                if (offset == off && buf->page == page) {
 407                        /* merge with the last one */
 408                        buf->len += bytes;
 409                        i->iov_offset += bytes;
 410                        goto out;
 411                }
 412                i_head++;
 413                buf = &pipe->bufs[i_head & p_mask];
 414        }
 415        if (pipe_full(i_head, p_tail, pipe->max_usage))
 416                return 0;
 417
 418        buf->ops = &page_cache_pipe_buf_ops;
 419        get_page(page);
 420        buf->page = page;
 421        buf->offset = offset;
 422        buf->len = bytes;
 423
 424        pipe->head = i_head + 1;
 425        i->iov_offset = offset + bytes;
 426        i->head = i_head;
 427out:
 428        i->count -= bytes;
 429        return bytes;
 430}
 431
 432/*
 433 * Fault in one or more iovecs of the given iov_iter, to a maximum length of
 434 * @bytes.  For each iovec, fault in each page that constitutes the iovec.
 435 *
 436 * Return 0 on success, or non-zero if the memory could not be accessed (i.e.
 437 * because it is an invalid address).
 438 */
 439int iov_iter_fault_in_readable(const struct iov_iter *i, size_t bytes)
 440{
 441        if (iter_is_iovec(i)) {
 442                const struct iovec *p;
 443                size_t skip;
 444
 445                if (bytes > i->count)
 446                        bytes = i->count;
 447                for (p = i->iov, skip = i->iov_offset; bytes; p++, skip = 0) {
 448                        size_t len = min(bytes, p->iov_len - skip);
 449                        int err;
 450
 451                        if (unlikely(!len))
 452                                continue;
 453                        err = fault_in_pages_readable(p->iov_base + skip, len);
 454                        if (unlikely(err))
 455                                return err;
 456                        bytes -= len;
 457                }
 458        }
 459        return 0;
 460}
 461EXPORT_SYMBOL(iov_iter_fault_in_readable);
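
/*
 * A sketch of the usual calling pattern; fill_from_user_sketch() is a
 * hypothetical helper added for illustration.  Callers that must copy from
 * user memory with page faults disabled pre-fault the source with
 * iov_iter_fault_in_readable() and then tolerate a short copy_from_iter()
 * if the pages vanish again before the copy.
 */
static __maybe_unused size_t fill_from_user_sketch(void *dst, size_t bytes,
                                                   struct iov_iter *from)
{
        size_t copied;

        if (iov_iter_fault_in_readable(from, bytes))
                return 0;               /* source iovec is not readable */

        pagefault_disable();
        copied = copy_from_iter(dst, bytes, from);
        pagefault_enable();

        /* may be short if the faulted-in pages were reclaimed meanwhile */
        return copied;
}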
 462
 463void iov_iter_init(struct iov_iter *i, unsigned int direction,
 464                        const struct iovec *iov, unsigned long nr_segs,
 465                        size_t count)
 466{
 467        WARN_ON(direction & ~(READ | WRITE));
 468        *i = (struct iov_iter) {
 469                .iter_type = ITER_IOVEC,
 470                .data_source = direction,
 471                .iov = iov,
 472                .nr_segs = nr_segs,
 473                .iov_offset = 0,
 474                .count = count
 475        };
 476}
 477EXPORT_SYMBOL(iov_iter_init);
 478
 479static inline bool allocated(struct pipe_buffer *buf)
 480{
 481        return buf->ops == &default_pipe_buf_ops;
 482}
 483
 484static inline void data_start(const struct iov_iter *i,
 485                              unsigned int *iter_headp, size_t *offp)
 486{
 487        unsigned int p_mask = i->pipe->ring_size - 1;
 488        unsigned int iter_head = i->head;
 489        size_t off = i->iov_offset;
 490
 491        if (off && (!allocated(&i->pipe->bufs[iter_head & p_mask]) ||
 492                    off == PAGE_SIZE)) {
 493                iter_head++;
 494                off = 0;
 495        }
 496        *iter_headp = iter_head;
 497        *offp = off;
 498}
 499
 500static size_t push_pipe(struct iov_iter *i, size_t size,
 501                        int *iter_headp, size_t *offp)
 502{
 503        struct pipe_inode_info *pipe = i->pipe;
 504        unsigned int p_tail = pipe->tail;
 505        unsigned int p_mask = pipe->ring_size - 1;
 506        unsigned int iter_head;
 507        size_t off;
 508        ssize_t left;
 509
 510        if (unlikely(size > i->count))
 511                size = i->count;
 512        if (unlikely(!size))
 513                return 0;
 514
 515        left = size;
 516        data_start(i, &iter_head, &off);
 517        *iter_headp = iter_head;
 518        *offp = off;
 519        if (off) {
 520                left -= PAGE_SIZE - off;
 521                if (left <= 0) {
 522                        pipe->bufs[iter_head & p_mask].len += size;
 523                        return size;
 524                }
 525                pipe->bufs[iter_head & p_mask].len = PAGE_SIZE;
 526                iter_head++;
 527        }
 528        while (!pipe_full(iter_head, p_tail, pipe->max_usage)) {
 529                struct pipe_buffer *buf = &pipe->bufs[iter_head & p_mask];
 530                struct page *page = alloc_page(GFP_USER);
 531                if (!page)
 532                        break;
 533
 534                buf->ops = &default_pipe_buf_ops;
 535                buf->page = page;
 536                buf->offset = 0;
 537                buf->len = min_t(ssize_t, left, PAGE_SIZE);
 538                left -= buf->len;
 539                iter_head++;
 540                pipe->head = iter_head;
 541
 542                if (left == 0)
 543                        return size;
 544        }
 545        return size - left;
 546}
 547
 548static size_t copy_pipe_to_iter(const void *addr, size_t bytes,
 549                                struct iov_iter *i)
 550{
 551        struct pipe_inode_info *pipe = i->pipe;
 552        unsigned int p_mask = pipe->ring_size - 1;
 553        unsigned int i_head;
 554        size_t n, off;
 555
 556        if (!sanity(i))
 557                return 0;
 558
 559        bytes = n = push_pipe(i, bytes, &i_head, &off);
 560        if (unlikely(!n))
 561                return 0;
 562        do {
 563                size_t chunk = min_t(size_t, n, PAGE_SIZE - off);
 564                memcpy_to_page(pipe->bufs[i_head & p_mask].page, off, addr, chunk);
 565                i->head = i_head;
 566                i->iov_offset = off + chunk;
 567                n -= chunk;
 568                addr += chunk;
 569                off = 0;
 570                i_head++;
 571        } while (n);
 572        i->count -= bytes;
 573        return bytes;
 574}
 575
 576static __wsum csum_and_memcpy(void *to, const void *from, size_t len,
 577                              __wsum sum, size_t off)
 578{
 579        __wsum next = csum_partial_copy_nocheck(from, to, len);
 580        return csum_block_add(sum, next, off);
 581}
 582
 583static size_t csum_and_copy_to_pipe_iter(const void *addr, size_t bytes,
 584                                         struct iov_iter *i, __wsum *sump)
 585{
 586        struct pipe_inode_info *pipe = i->pipe;
 587        unsigned int p_mask = pipe->ring_size - 1;
 588        __wsum sum = *sump;
 589        size_t off = 0;
 590        unsigned int i_head;
 591        size_t r;
 592
 593        if (!sanity(i))
 594                return 0;
 595
 596        bytes = push_pipe(i, bytes, &i_head, &r);
 597        while (bytes) {
 598                size_t chunk = min_t(size_t, bytes, PAGE_SIZE - r);
 599                char *p = kmap_local_page(pipe->bufs[i_head & p_mask].page);
 600                sum = csum_and_memcpy(p + r, addr + off, chunk, sum, off);
 601                kunmap_local(p);
 602                i->head = i_head;
 603                i->iov_offset = r + chunk;
 604                bytes -= chunk;
 605                off += chunk;
 606                r = 0;
 607                i_head++;
 608        }
 609        *sump = sum;
 610        i->count -= off;
 611        return off;
 612}
 613
 614size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
 615{
 616        if (unlikely(iov_iter_is_pipe(i)))
 617                return copy_pipe_to_iter(addr, bytes, i);
 618        if (iter_is_iovec(i))
 619                might_fault();
 620        iterate_and_advance(i, bytes, base, len, off,
 621                copyout(base, addr + off, len),
 622                memcpy(base, addr + off, len)
 623        )
 624
 625        return bytes;
 626}
 627EXPORT_SYMBOL(_copy_to_iter);
 628
 629#ifdef CONFIG_ARCH_HAS_COPY_MC
 630static int copyout_mc(void __user *to, const void *from, size_t n)
 631{
 632        if (access_ok(to, n)) {
 633                instrument_copy_to_user(to, from, n);
 634                n = copy_mc_to_user((__force void *) to, from, n);
 635        }
 636        return n;
 637}
 638
 639static size_t copy_mc_pipe_to_iter(const void *addr, size_t bytes,
 640                                struct iov_iter *i)
 641{
 642        struct pipe_inode_info *pipe = i->pipe;
 643        unsigned int p_mask = pipe->ring_size - 1;
 644        unsigned int i_head;
 645        size_t n, off, xfer = 0;
 646
 647        if (!sanity(i))
 648                return 0;
 649
 650        n = push_pipe(i, bytes, &i_head, &off);
 651        while (n) {
 652                size_t chunk = min_t(size_t, n, PAGE_SIZE - off);
 653                char *p = kmap_local_page(pipe->bufs[i_head & p_mask].page);
 654                unsigned long rem;
 655                rem = copy_mc_to_kernel(p + off, addr + xfer, chunk);
 656                chunk -= rem;
 657                kunmap_local(p);
 658                i->head = i_head;
 659                i->iov_offset = off + chunk;
 660                xfer += chunk;
 661                if (rem)
 662                        break;
 663                n -= chunk;
 664                off = 0;
 665                i_head++;
 666        }
 667        i->count -= xfer;
 668        return xfer;
 669}
 670
 671/**
 672 * _copy_mc_to_iter - copy to iter with source memory error exception handling
 673 * @addr: source kernel address
 674 * @bytes: total transfer length
 675 * @i: destination iterator
 676 *
 677 * The pmem driver deploys this for the dax operation
 678 * (dax_copy_to_iter()) for dax reads (bypass page-cache and the
 679 * block-layer). Upon #MC read(2) aborts and returns EIO or the bytes
 680 * successfully copied.
 681 *
 682 * The main differences between this and typical _copy_to_iter() are:
 683 *
 684 * * Typical tail/residue handling after a fault retries the copy
 685 *   byte-by-byte until the fault happens again. Re-triggering machine
 686 *   checks is potentially fatal so the implementation uses source
 687 *   alignment and poison alignment assumptions to avoid re-triggering
 688 *   hardware exceptions.
 689 *
 690 * * ITER_KVEC, ITER_PIPE, and ITER_BVEC can return short copies.
 691 *   Compare to copy_to_iter() where only ITER_IOVEC attempts might return
 692 *   a short copy.
 693 */
 694size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
 695{
 696        if (unlikely(iov_iter_is_pipe(i)))
 697                return copy_mc_pipe_to_iter(addr, bytes, i);
 698        if (iter_is_iovec(i))
 699                might_fault();
 700        __iterate_and_advance(i, bytes, base, len, off,
 701                copyout_mc(base, addr + off, len),
 702                copy_mc_to_kernel(base, addr + off, len)
 703        )
 704
 705        return bytes;
 706}
 707EXPORT_SYMBOL_GPL(_copy_mc_to_iter);
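
/*
 * A sketch of how a dax/pmem read path might consume the short-copy
 * semantics described above; dax_read_sketch() and its kernel-mapped
 * @kaddr source are hypothetical illustrations.
 */
static __maybe_unused ssize_t dax_read_sketch(const void *kaddr, size_t len,
                                              struct iov_iter *iter)
{
        size_t done = _copy_mc_to_iter(kaddr, len, iter);

        /* a short return means either a user fault or consumed poison */
        if (done < len)
                return done ? done : -EIO;
        return done;
}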
 708#endif /* CONFIG_ARCH_HAS_COPY_MC */
 709
 710size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
 711{
 712        if (unlikely(iov_iter_is_pipe(i))) {
 713                WARN_ON(1);
 714                return 0;
 715        }
 716        if (iter_is_iovec(i))
 717                might_fault();
 718        iterate_and_advance(i, bytes, base, len, off,
 719                copyin(addr + off, base, len),
 720                memcpy(addr + off, base, len)
 721        )
 722
 723        return bytes;
 724}
 725EXPORT_SYMBOL(_copy_from_iter);
 726
 727size_t _copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i)
 728{
 729        if (unlikely(iov_iter_is_pipe(i))) {
 730                WARN_ON(1);
 731                return 0;
 732        }
 733        iterate_and_advance(i, bytes, base, len, off,
 734                __copy_from_user_inatomic_nocache(addr + off, base, len),
 735                memcpy(addr + off, base, len)
 736        )
 737
 738        return bytes;
 739}
 740EXPORT_SYMBOL(_copy_from_iter_nocache);
 741
 742#ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
 743/**
 744 * _copy_from_iter_flushcache - write destination through cpu cache
 745 * @addr: destination kernel address
 746 * @bytes: total transfer length
 747 * @i: source iterator
 748 *
 749 * The pmem driver arranges for filesystem-dax to use this facility via
 750 * dax_copy_from_iter() for ensuring that writes to persistent memory
 751 * are flushed through the CPU cache. It is differentiated from
 752 * _copy_from_iter_nocache() in that guarantees all data is flushed for
 753 * all iterator types. The _copy_from_iter_nocache() only attempts to
 754 * bypass the cache for the ITER_IOVEC case, and on some archs may use
 755 * instructions that strand dirty-data in the cache.
 756 */
 757size_t _copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i)
 758{
 759        if (unlikely(iov_iter_is_pipe(i))) {
 760                WARN_ON(1);
 761                return 0;
 762        }
 763        iterate_and_advance(i, bytes, base, len, off,
 764                __copy_from_user_flushcache(addr + off, base, len),
 765                memcpy_flushcache(addr + off, base, len)
 766        )
 767
 768        return bytes;
 769}
 770EXPORT_SYMBOL_GPL(_copy_from_iter_flushcache);
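
/*
 * A sketch of a pmem-style write path built on the helper above;
 * pmem_write_sketch() and its persistent-memory destination @pmem_addr are
 * hypothetical illustrations.
 */
static __maybe_unused size_t pmem_write_sketch(void *pmem_addr, size_t len,
                                               struct iov_iter *from)
{
        /*
         * Everything copied is pushed past the CPU cache; a short return
         * means part of the source iterator was not accessible.
         */
        return _copy_from_iter_flushcache(pmem_addr, len, from);
}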
 771#endif
 772
 773static inline bool page_copy_sane(struct page *page, size_t offset, size_t n)
 774{
 775        struct page *head;
 776        size_t v = n + offset;
 777
 778        /*
 779         * The general case needs to access the page order to compute the
 780         * page size.
 781         * However, we mostly deal with order-0 pages and thus can
 782         * avoid a possible cache line miss for requests that fit within
 783         * a single page, whatever the page order.
 784         */
 785        if (n <= v && v <= PAGE_SIZE)
 786                return true;
 787
 788        head = compound_head(page);
 789        v += (page - head) << PAGE_SHIFT;
 790
 791        if (likely(n <= v && v <= (page_size(head))))
 792                return true;
 793        WARN_ON(1);
 794        return false;
 795}
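
/*
 * Worked example of the check above, assuming 4KiB pages: for the third
 * subpage of an order-2 compound page (page_size(head) == 16384), a request
 * with offset == 1000 and n == 8000 gives v = 9000 + 2 * 4096 = 17192,
 * which exceeds 16384, so page_copy_sane() warns and returns false; the
 * same request against the first subpage passes (v == 9000).
 */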
 796
 797static size_t __copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
 798                         struct iov_iter *i)
 799{
 800        if (likely(iter_is_iovec(i)))
 801                return copy_page_to_iter_iovec(page, offset, bytes, i);
 802        if (iov_iter_is_bvec(i) || iov_iter_is_kvec(i) || iov_iter_is_xarray(i)) {
 803                void *kaddr = kmap_local_page(page);
 804                size_t wanted = _copy_to_iter(kaddr + offset, bytes, i);
 805                kunmap_local(kaddr);
 806                return wanted;
 807        }
 808        if (iov_iter_is_pipe(i))
 809                return copy_page_to_iter_pipe(page, offset, bytes, i);
 810        if (unlikely(iov_iter_is_discard(i))) {
 811                if (unlikely(i->count < bytes))
 812                        bytes = i->count;
 813                i->count -= bytes;
 814                return bytes;
 815        }
 816        WARN_ON(1);
 817        return 0;
 818}
 819
 820size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
 821                         struct iov_iter *i)
 822{
 823        size_t res = 0;
 824        if (unlikely(!page_copy_sane(page, offset, bytes)))
 825                return 0;
 826        page += offset / PAGE_SIZE; // first subpage
 827        offset %= PAGE_SIZE;
 828        while (1) {
 829                size_t n = __copy_page_to_iter(page, offset,
 830                                min(bytes, (size_t)PAGE_SIZE - offset), i);
 831                res += n;
 832                bytes -= n;
 833                if (!bytes || !n)
 834                        break;
 835                offset += n;
 836                if (offset == PAGE_SIZE) {
 837                        page++;
 838                        offset = 0;
 839                }
 840        }
 841        return res;
 842}
 843EXPORT_SYMBOL(copy_page_to_iter);
 844
 845size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes,
 846                         struct iov_iter *i)
 847{
 848        if (unlikely(!page_copy_sane(page, offset, bytes)))
 849                return 0;
 850        if (likely(iter_is_iovec(i)))
 851                return copy_page_from_iter_iovec(page, offset, bytes, i);
 852        if (iov_iter_is_bvec(i) || iov_iter_is_kvec(i) || iov_iter_is_xarray(i)) {
 853                void *kaddr = kmap_local_page(page);
 854                size_t wanted = _copy_from_iter(kaddr + offset, bytes, i);
 855                kunmap_local(kaddr);
 856                return wanted;
 857        }
 858        WARN_ON(1);
 859        return 0;
 860}
 861EXPORT_SYMBOL(copy_page_from_iter);
 862
 863static size_t pipe_zero(size_t bytes, struct iov_iter *i)
 864{
 865        struct pipe_inode_info *pipe = i->pipe;
 866        unsigned int p_mask = pipe->ring_size - 1;
 867        unsigned int i_head;
 868        size_t n, off;
 869
 870        if (!sanity(i))
 871                return 0;
 872
 873        bytes = n = push_pipe(i, bytes, &i_head, &off);
 874        if (unlikely(!n))
 875                return 0;
 876
 877        do {
 878                size_t chunk = min_t(size_t, n, PAGE_SIZE - off);
 879                char *p = kmap_local_page(pipe->bufs[i_head & p_mask].page);
 880                memset(p + off, 0, chunk);
 881                kunmap_local(p);
 882                i->head = i_head;
 883                i->iov_offset = off + chunk;
 884                n -= chunk;
 885                off = 0;
 886                i_head++;
 887        } while (n);
 888        i->count -= bytes;
 889        return bytes;
 890}
 891
 892size_t iov_iter_zero(size_t bytes, struct iov_iter *i)
 893{
 894        if (unlikely(iov_iter_is_pipe(i)))
 895                return pipe_zero(bytes, i);
 896        iterate_and_advance(i, bytes, base, len, count,
 897                clear_user(base, len),
 898                memset(base, 0, len)
 899        )
 900
 901        return bytes;
 902}
 903EXPORT_SYMBOL(iov_iter_zero);
 904
 905size_t copy_page_from_iter_atomic(struct page *page, unsigned offset, size_t bytes,
 906                                  struct iov_iter *i)
 907{
 908        char *kaddr = kmap_atomic(page), *p = kaddr + offset;
 909        if (unlikely(!page_copy_sane(page, offset, bytes))) {
 910                kunmap_atomic(kaddr);
 911                return 0;
 912        }
 913        if (unlikely(iov_iter_is_pipe(i) || iov_iter_is_discard(i))) {
 914                kunmap_atomic(kaddr);
 915                WARN_ON(1);
 916                return 0;
 917        }
 918        iterate_and_advance(i, bytes, base, len, off,
 919                copyin(p + off, base, len),
 920                memcpy(p + off, base, len)
 921        )
 922        kunmap_atomic(kaddr);
 923        return bytes;
 924}
 925EXPORT_SYMBOL(copy_page_from_iter_atomic);
 926
 927static inline void pipe_truncate(struct iov_iter *i)
 928{
 929        struct pipe_inode_info *pipe = i->pipe;
 930        unsigned int p_tail = pipe->tail;
 931        unsigned int p_head = pipe->head;
 932        unsigned int p_mask = pipe->ring_size - 1;
 933
 934        if (!pipe_empty(p_head, p_tail)) {
 935                struct pipe_buffer *buf;
 936                unsigned int i_head = i->head;
 937                size_t off = i->iov_offset;
 938
 939                if (off) {
 940                        buf = &pipe->bufs[i_head & p_mask];
 941                        buf->len = off - buf->offset;
 942                        i_head++;
 943                }
 944                while (p_head != i_head) {
 945                        p_head--;
 946                        pipe_buf_release(pipe, &pipe->bufs[p_head & p_mask]);
 947                }
 948
 949                pipe->head = p_head;
 950        }
 951}
 952
 953static void pipe_advance(struct iov_iter *i, size_t size)
 954{
 955        struct pipe_inode_info *pipe = i->pipe;
 956        if (size) {
 957                struct pipe_buffer *buf;
 958                unsigned int p_mask = pipe->ring_size - 1;
 959                unsigned int i_head = i->head;
 960                size_t off = i->iov_offset, left = size;
 961
 962                if (off) /* make it relative to the beginning of buffer */
 963                        left += off - pipe->bufs[i_head & p_mask].offset;
 964                while (1) {
 965                        buf = &pipe->bufs[i_head & p_mask];
 966                        if (left <= buf->len)
 967                                break;
 968                        left -= buf->len;
 969                        i_head++;
 970                }
 971                i->head = i_head;
 972                i->iov_offset = buf->offset + left;
 973        }
 974        i->count -= size;
 975        /* ... and discard everything past that point */
 976        pipe_truncate(i);
 977}
 978
 979static void iov_iter_bvec_advance(struct iov_iter *i, size_t size)
 980{
 981        struct bvec_iter bi;
 982
 983        bi.bi_size = i->count;
 984        bi.bi_bvec_done = i->iov_offset;
 985        bi.bi_idx = 0;
 986        bvec_iter_advance(i->bvec, &bi, size);
 987
 988        i->bvec += bi.bi_idx;
 989        i->nr_segs -= bi.bi_idx;
 990        i->count = bi.bi_size;
 991        i->iov_offset = bi.bi_bvec_done;
 992}
 993
 994static void iov_iter_iovec_advance(struct iov_iter *i, size_t size)
 995{
 996        const struct iovec *iov, *end;
 997
 998        if (!i->count)
 999                return;
1000        i->count -= size;
1001
1002        size += i->iov_offset; // from beginning of current segment
1003        for (iov = i->iov, end = iov + i->nr_segs; iov < end; iov++) {
1004                if (likely(size < iov->iov_len))
1005                        break;
1006                size -= iov->iov_len;
1007        }
1008        i->iov_offset = size;
1009        i->nr_segs -= iov - i->iov;
1010        i->iov = iov;
1011}
1012
1013void iov_iter_advance(struct iov_iter *i, size_t size)
1014{
1015        if (unlikely(i->count < size))
1016                size = i->count;
1017        if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) {
1018                /* iovec and kvec have identical layouts */
1019                iov_iter_iovec_advance(i, size);
1020        } else if (iov_iter_is_bvec(i)) {
1021                iov_iter_bvec_advance(i, size);
1022        } else if (iov_iter_is_pipe(i)) {
1023                pipe_advance(i, size);
1024        } else if (unlikely(iov_iter_is_xarray(i))) {
1025                i->iov_offset += size;
1026                i->count -= size;
1027        } else if (iov_iter_is_discard(i)) {
1028                i->count -= size;
1029        }
1030}
1031EXPORT_SYMBOL(iov_iter_advance);
1032
1033void iov_iter_revert(struct iov_iter *i, size_t unroll)
1034{
1035        if (!unroll)
1036                return;
1037        if (WARN_ON(unroll > MAX_RW_COUNT))
1038                return;
1039        i->count += unroll;
1040        if (unlikely(iov_iter_is_pipe(i))) {
1041                struct pipe_inode_info *pipe = i->pipe;
1042                unsigned int p_mask = pipe->ring_size - 1;
1043                unsigned int i_head = i->head;
1044                size_t off = i->iov_offset;
1045                while (1) {
1046                        struct pipe_buffer *b = &pipe->bufs[i_head & p_mask];
1047                        size_t n = off - b->offset;
1048                        if (unroll < n) {
1049                                off -= unroll;
1050                                break;
1051                        }
1052                        unroll -= n;
1053                        if (!unroll && i_head == i->start_head) {
1054                                off = 0;
1055                                break;
1056                        }
1057                        i_head--;
1058                        b = &pipe->bufs[i_head & p_mask];
1059                        off = b->offset + b->len;
1060                }
1061                i->iov_offset = off;
1062                i->head = i_head;
1063                pipe_truncate(i);
1064                return;
1065        }
1066        if (unlikely(iov_iter_is_discard(i)))
1067                return;
1068        if (unroll <= i->iov_offset) {
1069                i->iov_offset -= unroll;
1070                return;
1071        }
1072        unroll -= i->iov_offset;
1073        if (iov_iter_is_xarray(i)) {
1074                BUG(); /* We should never go beyond the start of the specified
1075                        * range since we might then be straying into pages that
1076                        * aren't pinned.
1077                        */
1078        } else if (iov_iter_is_bvec(i)) {
1079                const struct bio_vec *bvec = i->bvec;
1080                while (1) {
1081                        size_t n = (--bvec)->bv_len;
1082                        i->nr_segs++;
1083                        if (unroll <= n) {
1084                                i->bvec = bvec;
1085                                i->iov_offset = n - unroll;
1086                                return;
1087                        }
1088                        unroll -= n;
1089                }
1090        } else { /* same logics for iovec and kvec */
1091                const struct iovec *iov = i->iov;
1092                while (1) {
1093                        size_t n = (--iov)->iov_len;
1094                        i->nr_segs++;
1095                        if (unroll <= n) {
1096                                i->iov = iov;
1097                                i->iov_offset = n - unroll;
1098                                return;
1099                        }
1100                        unroll -= n;
1101                }
1102        }
1103}
1104EXPORT_SYMBOL(iov_iter_revert);
1105
1106/*
1107 * Return the count of just the current iov_iter segment.
1108 */
1109size_t iov_iter_single_seg_count(const struct iov_iter *i)
1110{
1111        if (i->nr_segs > 1) {
1112                if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
1113                        return min(i->count, i->iov->iov_len - i->iov_offset);
1114                if (iov_iter_is_bvec(i))
1115                        return min(i->count, i->bvec->bv_len - i->iov_offset);
1116        }
1117        return i->count;
1118}
1119EXPORT_SYMBOL(iov_iter_single_seg_count);
1120
1121void iov_iter_kvec(struct iov_iter *i, unsigned int direction,
1122                        const struct kvec *kvec, unsigned long nr_segs,
1123                        size_t count)
1124{
1125        WARN_ON(direction & ~(READ | WRITE));
1126        *i = (struct iov_iter){
1127                .iter_type = ITER_KVEC,
1128                .data_source = direction,
1129                .kvec = kvec,
1130                .nr_segs = nr_segs,
1131                .iov_offset = 0,
1132                .count = count
1133        };
1134}
1135EXPORT_SYMBOL(iov_iter_kvec);
1136
1137void iov_iter_bvec(struct iov_iter *i, unsigned int direction,
1138                        const struct bio_vec *bvec, unsigned long nr_segs,
1139                        size_t count)
1140{
1141        WARN_ON(direction & ~(READ | WRITE));
1142        *i = (struct iov_iter){
1143                .iter_type = ITER_BVEC,
1144                .data_source = direction,
1145                .bvec = bvec,
1146                .nr_segs = nr_segs,
1147                .iov_offset = 0,
1148                .count = count
1149        };
1150}
1151EXPORT_SYMBOL(iov_iter_bvec);
1152
1153void iov_iter_pipe(struct iov_iter *i, unsigned int direction,
1154                        struct pipe_inode_info *pipe,
1155                        size_t count)
1156{
1157        BUG_ON(direction != READ);
1158        WARN_ON(pipe_full(pipe->head, pipe->tail, pipe->ring_size));
1159        *i = (struct iov_iter){
1160                .iter_type = ITER_PIPE,
1161                .data_source = false,
1162                .pipe = pipe,
1163                .head = pipe->head,
1164                .start_head = pipe->head,
1165                .iov_offset = 0,
1166                .count = count
1167        };
1168}
1169EXPORT_SYMBOL(iov_iter_pipe);
1170
1171/**
1172 * iov_iter_xarray - Initialise an I/O iterator to use the pages in an xarray
1173 * @i: The iterator to initialise.
1174 * @direction: The direction of the transfer.
1175 * @xarray: The xarray to access.
1176 * @start: The start file position.
1177 * @count: The size of the I/O buffer in bytes.
1178 *
1179 * Set up an I/O iterator to either draw data out of the pages attached to an
1180 * inode or to inject data into those pages.  The pages *must* be prevented
1181 * from evaporation, either by taking a ref on them or locking them by the
1182 * caller.
1183 */
1184void iov_iter_xarray(struct iov_iter *i, unsigned int direction,
1185                     struct xarray *xarray, loff_t start, size_t count)
1186{
1187        BUG_ON(direction & ~1);
1188        *i = (struct iov_iter) {
1189                .iter_type = ITER_XARRAY,
1190                .data_source = direction,
1191                .xarray = xarray,
1192                .xarray_start = start,
1193                .count = count,
1194                .iov_offset = 0
1195        };
1196}
1197EXPORT_SYMBOL(iov_iter_xarray);
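
/*
 * A sketch of a typical user; netfs_read_sketch() is a hypothetical helper
 * added for illustration.  A network-filesystem read can wrap the inode's
 * pagecache in an ITER_XARRAY iterator and copy received data straight into
 * the already locked/pinned pages; address_space::i_pages is the backing
 * xarray.
 */
static __maybe_unused size_t netfs_read_sketch(struct address_space *mapping,
                                               loff_t pos, size_t len,
                                               const void *src)
{
        struct iov_iter iter;

        /* READ: the iterator is the destination of the transfer */
        iov_iter_xarray(&iter, READ, &mapping->i_pages, pos, len);
        return copy_to_iter(src, len, &iter);
}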
1198
1199/**
1200 * iov_iter_discard - Initialise an I/O iterator that discards data
1201 * @i: The iterator to initialise.
1202 * @direction: The direction of the transfer.
1203 * @count: The size of the I/O buffer in bytes.
1204 *
1205 * Set up an I/O iterator that just discards everything that's written to it.
1206 * It's only available as a READ iterator.
1207 */
1208void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count)
1209{
1210        BUG_ON(direction != READ);
1211        *i = (struct iov_iter){
1212                .iter_type = ITER_DISCARD,
1213                .data_source = false,
1214                .count = count,
1215                .iov_offset = 0
1216        };
1217}
1218EXPORT_SYMBOL(iov_iter_discard);
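
/*
 * A sketch of draining unwanted data; skip_bytes_sketch() is a hypothetical
 * helper added for illustration.  "Copying" a page into a discard iterator
 * throws the bytes away while still accounting for them.
 */
static __maybe_unused size_t skip_bytes_sketch(struct page *page,
                                               size_t offset, size_t count)
{
        struct iov_iter sink;

        iov_iter_discard(&sink, READ, count);
        /* nothing is stored; only sink.count is decremented */
        return copy_page_to_iter(page, offset, count, &sink);
}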
1219
1220static unsigned long iov_iter_alignment_iovec(const struct iov_iter *i)
1221{
1222        unsigned long res = 0;
1223        size_t size = i->count;
1224        size_t skip = i->iov_offset;
1225        unsigned k;
1226
1227        for (k = 0; k < i->nr_segs; k++, skip = 0) {
1228                size_t len = i->iov[k].iov_len - skip;
1229                if (len) {
1230                        res |= (unsigned long)i->iov[k].iov_base + skip;
1231                        if (len > size)
1232                                len = size;
1233                        res |= len;
1234                        size -= len;
1235                        if (!size)
1236                                break;
1237                }
1238        }
1239        return res;
1240}
1241
1242static unsigned long iov_iter_alignment_bvec(const struct iov_iter *i)
1243{
1244        unsigned res = 0;
1245        size_t size = i->count;
1246        unsigned skip = i->iov_offset;
1247        unsigned k;
1248
1249        for (k = 0; k < i->nr_segs; k++, skip = 0) {
1250                size_t len = i->bvec[k].bv_len - skip;
1251                res |= (unsigned long)i->bvec[k].bv_offset + skip;
1252                if (len > size)
1253                        len = size;
1254                res |= len;
1255                size -= len;
1256                if (!size)
1257                        break;
1258        }
1259        return res;
1260}
1261
1262unsigned long iov_iter_alignment(const struct iov_iter *i)
1263{
1264        /* iovec and kvec have identical layouts */
1265        if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
1266                return iov_iter_alignment_iovec(i);
1267
1268        if (iov_iter_is_bvec(i))
1269                return iov_iter_alignment_bvec(i);
1270
1271        if (iov_iter_is_pipe(i)) {
1272                unsigned int p_mask = i->pipe->ring_size - 1;
1273                size_t size = i->count;
1274
1275                if (size && i->iov_offset && allocated(&i->pipe->bufs[i->head & p_mask]))
1276                        return size | i->iov_offset;
1277                return size;
1278        }
1279
1280        if (iov_iter_is_xarray(i))
1281                return (i->xarray_start + i->iov_offset) | i->count;
1282
1283        return 0;
1284}
1285EXPORT_SYMBOL(iov_iter_alignment);
1286
1287unsigned long iov_iter_gap_alignment(const struct iov_iter *i)
1288{
1289        unsigned long res = 0;
1290        unsigned long v = 0;
1291        size_t size = i->count;
1292        unsigned k;
1293
1294        if (WARN_ON(!iter_is_iovec(i)))
1295                return ~0U;
1296
1297        for (k = 0; k < i->nr_segs; k++) {
1298                if (i->iov[k].iov_len) {
1299                        unsigned long base = (unsigned long)i->iov[k].iov_base;
1300                        if (v) // if not the first one
1301                                res |= base | v; // this start | previous end
1302                        v = base + i->iov[k].iov_len;
1303                        if (size <= i->iov[k].iov_len)
1304                                break;
1305                        size -= i->iov[k].iov_len;
1306                }
1307        }
1308        return res;
1309}
1310EXPORT_SYMBOL(iov_iter_gap_alignment);
1311
1312static inline ssize_t __pipe_get_pages(struct iov_iter *i,
1313                                size_t maxsize,
1314                                struct page **pages,
1315                                int iter_head,
1316                                size_t *start)
1317{
1318        struct pipe_inode_info *pipe = i->pipe;
1319        unsigned int p_mask = pipe->ring_size - 1;
1320        ssize_t n = push_pipe(i, maxsize, &iter_head, start);
1321        if (!n)
1322                return -EFAULT;
1323
1324        maxsize = n;
1325        n += *start;
1326        while (n > 0) {
1327                get_page(*pages++ = pipe->bufs[iter_head & p_mask].page);
1328                iter_head++;
1329                n -= PAGE_SIZE;
1330        }
1331
1332        return maxsize;
1333}
1334
1335static ssize_t pipe_get_pages(struct iov_iter *i,
1336                   struct page **pages, size_t maxsize, unsigned maxpages,
1337                   size_t *start)
1338{
1339        unsigned int iter_head, npages;
1340        size_t capacity;
1341
1342        if (!sanity(i))
1343                return -EFAULT;
1344
1345        data_start(i, &iter_head, start);
1346        /* Amount of free space: some of this one + all after this one */
1347        npages = pipe_space_for_user(iter_head, i->pipe->tail, i->pipe);
1348        capacity = min(npages, maxpages) * PAGE_SIZE - *start;
1349
1350        return __pipe_get_pages(i, min(maxsize, capacity), pages, iter_head, start);
1351}
1352
1353static ssize_t iter_xarray_populate_pages(struct page **pages, struct xarray *xa,
1354                                          pgoff_t index, unsigned int nr_pages)
1355{
1356        XA_STATE(xas, xa, index);
1357        struct page *page;
1358        unsigned int ret = 0;
1359
1360        rcu_read_lock();
1361        for (page = xas_load(&xas); page; page = xas_next(&xas)) {
1362                if (xas_retry(&xas, page))
1363                        continue;
1364
1365                /* Has the page moved or been split? */
1366                if (unlikely(page != xas_reload(&xas))) {
1367                        xas_reset(&xas);
1368                        continue;
1369                }
1370
1371                pages[ret] = find_subpage(page, xas.xa_index);
1372                get_page(pages[ret]);
1373                if (++ret == nr_pages)
1374                        break;
1375        }
1376        rcu_read_unlock();
1377        return ret;
1378}
1379
1380static ssize_t iter_xarray_get_pages(struct iov_iter *i,
1381                                     struct page **pages, size_t maxsize,
1382                                     unsigned maxpages, size_t *_start_offset)
1383{
1384        unsigned nr, offset;
1385        pgoff_t index, count;
1386        size_t size = maxsize, actual;
1387        loff_t pos;
1388
1389        if (!size || !maxpages)
1390                return 0;
1391
1392        pos = i->xarray_start + i->iov_offset;
1393        index = pos >> PAGE_SHIFT;
1394        offset = pos & ~PAGE_MASK;
1395        *_start_offset = offset;
1396
1397        count = 1;
1398        if (size > PAGE_SIZE - offset) {
1399                size -= PAGE_SIZE - offset;
1400                count += size >> PAGE_SHIFT;
1401                size &= ~PAGE_MASK;
1402                if (size)
1403                        count++;
1404        }
1405
1406        if (count > maxpages)
1407                count = maxpages;
1408
1409        nr = iter_xarray_populate_pages(pages, i->xarray, index, count);
1410        if (nr == 0)
1411                return 0;
1412
1413        actual = PAGE_SIZE * nr;
1414        actual -= offset;
1415        if (nr == count && size > 0) {
1416                unsigned last_offset = (nr > 1) ? 0 : offset;
1417                actual -= PAGE_SIZE - (last_offset + size);
1418        }
1419        return actual;
1420}
1421
1422/* must be done on non-empty ITER_IOVEC one */
1423static unsigned long first_iovec_segment(const struct iov_iter *i,
1424                                         size_t *size, size_t *start,
1425                                         size_t maxsize, unsigned maxpages)
1426{
1427        size_t skip;
1428        long k;
1429
1430        for (k = 0, skip = i->iov_offset; k < i->nr_segs; k++, skip = 0) {
1431                unsigned long addr = (unsigned long)i->iov[k].iov_base + skip;
1432                size_t len = i->iov[k].iov_len - skip;
1433
1434                if (unlikely(!len))
1435                        continue;
1436                if (len > maxsize)
1437                        len = maxsize;
1438                len += (*start = addr % PAGE_SIZE);
1439                if (len > maxpages * PAGE_SIZE)
1440                        len = maxpages * PAGE_SIZE;
1441                *size = len;
1442                return addr & PAGE_MASK;
1443        }
1444        BUG(); // if it had been empty, we wouldn't get called
1445}
1446
1447/* must be done on non-empty ITER_BVEC one */
1448static struct page *first_bvec_segment(const struct iov_iter *i,
1449                                       size_t *size, size_t *start,
1450                                       size_t maxsize, unsigned maxpages)
1451{
1452        struct page *page;
1453        size_t skip = i->iov_offset, len;
1454
1455        len = i->bvec->bv_len - skip;
1456        if (len > maxsize)
1457                len = maxsize;
1458        skip += i->bvec->bv_offset;
1459        page = i->bvec->bv_page + skip / PAGE_SIZE;
1460        len += (*start = skip % PAGE_SIZE);
1461        if (len > maxpages * PAGE_SIZE)
1462                len = maxpages * PAGE_SIZE;
1463        *size = len;
1464        return page;
1465}
1466
1467ssize_t iov_iter_get_pages(struct iov_iter *i,
1468                   struct page **pages, size_t maxsize, unsigned maxpages,
1469                   size_t *start)
1470{
1471        size_t len;
1472        int n, res;
1473
1474        if (maxsize > i->count)
1475                maxsize = i->count;
1476        if (!maxsize)
1477                return 0;
1478
1479        if (likely(iter_is_iovec(i))) {
1480                unsigned long addr;
1481
1482                addr = first_iovec_segment(i, &len, start, maxsize, maxpages);
1483                n = DIV_ROUND_UP(len, PAGE_SIZE);
1484                res = get_user_pages_fast(addr, n,
1485                                iov_iter_rw(i) != WRITE ? FOLL_WRITE : 0,
1486                                pages);
1487                if (unlikely(res < 0))
1488                        return res;
1489                return (res == n ? len : res * PAGE_SIZE) - *start;
1490        }
1491        if (iov_iter_is_bvec(i)) {
1492                struct page *page;
1493
1494                page = first_bvec_segment(i, &len, start, maxsize, maxpages);
1495                n = DIV_ROUND_UP(len, PAGE_SIZE);
1496                while (n--)
1497                        get_page(*pages++ = page++);
1498                return len - *start;
1499        }
1500        if (iov_iter_is_pipe(i))
1501                return pipe_get_pages(i, pages, maxsize, maxpages, start);
1502        if (iov_iter_is_xarray(i))
1503                return iter_xarray_get_pages(i, pages, maxsize, maxpages, start);
1504        return -EFAULT;
1505}
1506EXPORT_SYMBOL(iov_iter_get_pages);
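/*
 * Illustrative sketch, not part of the kernel source: one way a caller might
 * consume iov_iter_get_pages() for a user-backed iterator.  The helper name
 * example_pin_pages() and the fixed 16-page batch are hypothetical; real
 * callers typically hand the pinned pages to a bio or sk_buff before dropping
 * the references.
 */
#if 0	/* illustrative only */
static ssize_t example_pin_pages(struct iov_iter *iter)
{
        struct page *pages[16];
        size_t start;
        ssize_t bytes;
        int n;

        bytes = iov_iter_get_pages(iter, pages, 16 * PAGE_SIZE, 16, &start);
        if (bytes <= 0)
                return bytes;

        /* the data begins @start bytes into pages[0] */
        n = DIV_ROUND_UP(bytes + start, PAGE_SIZE);

        /* ... use the pinned pages here ... */

        while (n--)
                put_page(pages[n]);
        return bytes;
}
#endif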
1507
1508static struct page **get_pages_array(size_t n)
1509{
1510        return kvmalloc_array(n, sizeof(struct page *), GFP_KERNEL);
1511}
1512
1513static ssize_t pipe_get_pages_alloc(struct iov_iter *i,
1514                   struct page ***pages, size_t maxsize,
1515                   size_t *start)
1516{
1517        struct page **p;
1518        unsigned int iter_head, npages;
1519        ssize_t n;
1520
1521        if (!sanity(i))
1522                return -EFAULT;
1523
1524        data_start(i, &iter_head, start);
1525        /* Amount of free space: some of this one + all after this one */
1526        npages = pipe_space_for_user(iter_head, i->pipe->tail, i->pipe);
1527        n = npages * PAGE_SIZE - *start;
1528        if (maxsize > n)
1529                maxsize = n;
1530        else
1531                npages = DIV_ROUND_UP(maxsize + *start, PAGE_SIZE);
1532        p = get_pages_array(npages);
1533        if (!p)
1534                return -ENOMEM;
1535        n = __pipe_get_pages(i, maxsize, p, iter_head, start);
1536        if (n > 0)
1537                *pages = p;
1538        else
1539                kvfree(p);
1540        return n;
1541}
1542
1543static ssize_t iter_xarray_get_pages_alloc(struct iov_iter *i,
1544                                           struct page ***pages, size_t maxsize,
1545                                           size_t *_start_offset)
1546{
1547        struct page **p;
1548        unsigned nr, offset;
1549        pgoff_t index, count;
1550        size_t size = maxsize, actual;
1551        loff_t pos;
1552
1553        if (!size)
1554                return 0;
1555
1556        pos = i->xarray_start + i->iov_offset;
1557        index = pos >> PAGE_SHIFT;
1558        offset = pos & ~PAGE_MASK;
1559        *_start_offset = offset;
1560
1561        count = 1;
1562        if (size > PAGE_SIZE - offset) {
1563                size -= PAGE_SIZE - offset;
1564                count += size >> PAGE_SHIFT;
1565                size &= ~PAGE_MASK;
1566                if (size)
1567                        count++;
1568        }
1569
1570        p = get_pages_array(count);
1571        if (!p)
1572                return -ENOMEM;
1573        *pages = p;
1574
1575        nr = iter_xarray_populate_pages(p, i->xarray, index, count);
1576        if (nr == 0)
1577                return 0;
1578
1579        actual = PAGE_SIZE * nr;
1580        actual -= offset;
1581        if (nr == count && size > 0) {
1582                unsigned last_offset = (nr > 1) ? 0 : offset;
1583                actual -= PAGE_SIZE - (last_offset + size);
1584        }
1585        return actual;
1586}
1587
1588ssize_t iov_iter_get_pages_alloc(struct iov_iter *i,
1589                   struct page ***pages, size_t maxsize,
1590                   size_t *start)
1591{
1592        struct page **p;
1593        size_t len;
1594        int n, res;
1595
1596        if (maxsize > i->count)
1597                maxsize = i->count;
1598        if (!maxsize)
1599                return 0;
1600
1601        if (likely(iter_is_iovec(i))) {
1602                unsigned long addr;
1603
1604                addr = first_iovec_segment(i, &len, start, maxsize, ~0U);
1605                n = DIV_ROUND_UP(len, PAGE_SIZE);
1606                p = get_pages_array(n);
1607                if (!p)
1608                        return -ENOMEM;
1609                res = get_user_pages_fast(addr, n,
1610                                iov_iter_rw(i) != WRITE ? FOLL_WRITE : 0, p);
1611                if (unlikely(res < 0)) {
1612                        kvfree(p);
1613                        return res;
1614                }
1615                *pages = p;
1616                return (res == n ? len : res * PAGE_SIZE) - *start;
1617        }
1618        if (iov_iter_is_bvec(i)) {
1619                struct page *page;
1620
1621                page = first_bvec_segment(i, &len, start, maxsize, ~0U);
1622                n = DIV_ROUND_UP(len, PAGE_SIZE);
1623                *pages = p = get_pages_array(n);
1624                if (!p)
1625                        return -ENOMEM;
1626                while (n--)
1627                        get_page(*p++ = page++);
1628                return len - *start;
1629        }
1630        if (iov_iter_is_pipe(i))
1631                return pipe_get_pages_alloc(i, pages, maxsize, start);
1632        if (iov_iter_is_xarray(i))
1633                return iter_xarray_get_pages_alloc(i, pages, maxsize, start);
1634        return -EFAULT;
1635}
1636EXPORT_SYMBOL(iov_iter_get_pages_alloc);
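/*
 * Illustrative sketch, not part of the kernel source: unlike the fixed-array
 * variant above, iov_iter_get_pages_alloc() sizes and allocates the page
 * array itself, so the caller owns both the page references and the array.
 * A hypothetical cleanup helper:
 */
#if 0	/* illustrative only */
static void example_unpin_pages(struct page **pages, ssize_t bytes, size_t start)
{
        int n = DIV_ROUND_UP(bytes + start, PAGE_SIZE);

        while (n--)
                put_page(pages[n]);
        kvfree(pages);          /* the array came from kvmalloc_array() */
}
#endif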
1637
1638size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum,
1639                               struct iov_iter *i)
1640{
1641        __wsum sum, next;
1642        sum = *csum;
1643        if (unlikely(iov_iter_is_pipe(i) || iov_iter_is_discard(i))) {
1644                WARN_ON(1);
1645                return 0;
1646        }
1647        iterate_and_advance(i, bytes, base, len, off, ({
1648                next = csum_and_copy_from_user(base, addr + off, len);
1649                sum = csum_block_add(sum, next, off);
1650                next ? 0 : len;
1651        }), ({
1652                sum = csum_and_memcpy(addr + off, base, len, sum, off);
1653        })
1654        )
1655        *csum = sum;
1656        return bytes;
1657}
1658EXPORT_SYMBOL(csum_and_copy_from_iter);
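/*
 * Illustrative sketch, not part of the kernel source: a checksumming receive
 * path in miniature.  The helper name is hypothetical; networking callers
 * fold the updated *csum into the packet's running checksum.
 */
#if 0	/* illustrative only */
static bool example_copy_from_iter_csummed(void *to, size_t len,
                                           struct iov_iter *from, __wsum *csum)
{
        /* *csum is extended to cover whatever was actually copied */
        return csum_and_copy_from_iter(to, len, csum, from) == len;
}
#endif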
1659
1660size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *_csstate,
1661                             struct iov_iter *i)
1662{
1663        struct csum_state *csstate = _csstate;
1664        __wsum sum, next;
1665
1666        if (unlikely(iov_iter_is_discard(i))) {
1667                WARN_ON(1);     /* for now */
1668                return 0;
1669        }
1670
1671        sum = csum_shift(csstate->csum, csstate->off);
1672        if (unlikely(iov_iter_is_pipe(i)))
1673                bytes = csum_and_copy_to_pipe_iter(addr, bytes, i, &sum);
1674        else iterate_and_advance(i, bytes, base, len, off, ({
1675                next = csum_and_copy_to_user(addr + off, base, len);
1676                sum = csum_block_add(sum, next, off);
1677                next ? 0 : len;
1678        }), ({
1679                sum = csum_and_memcpy(base, addr + off, len, sum, off);
1680        })
1681        )
1682        csstate->csum = csum_shift(sum, csstate->off);
1683        csstate->off += bytes;
1684        return bytes;
1685}
1686EXPORT_SYMBOL(csum_and_copy_to_iter);
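/*
 * Illustrative sketch, not part of the kernel source: csum_and_copy_to_iter()
 * carries its running checksum in a struct csum_state (the csum/off pair the
 * function above dereferences) so it can be called repeatedly for one
 * datagram.  A minimal single-shot use with hypothetical names:
 */
#if 0	/* illustrative only */
static size_t example_copy_to_iter_csummed(const void *from, size_t len,
                                           struct iov_iter *to)
{
        struct csum_state csstate = { .csum = 0, .off = 0 };

        return csum_and_copy_to_iter(from, len, &csstate, to);
}
#endif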
1687
1688size_t hash_and_copy_to_iter(const void *addr, size_t bytes, void *hashp,
1689                struct iov_iter *i)
1690{
1691#ifdef CONFIG_CRYPTO_HASH
1692        struct ahash_request *hash = hashp;
1693        struct scatterlist sg;
1694        size_t copied;
1695
1696        copied = copy_to_iter(addr, bytes, i);
1697        sg_init_one(&sg, addr, copied);
1698        ahash_request_set_crypt(hash, &sg, NULL, copied);
1699        crypto_ahash_update(hash);
1700        return copied;
1701#else
1702        return 0;
1703#endif
1704}
1705EXPORT_SYMBOL(hash_and_copy_to_iter);
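/*
 * Illustrative note, not part of the kernel source: the caller is expected to
 * provide an already-allocated and initialised ahash_request; this function
 * only appends the bytes it managed to copy via crypto_ahash_update().  With
 * CONFIG_CRYPTO_HASH disabled it copies nothing and reports 0 bytes.
 */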
1706
1707static int iov_npages(const struct iov_iter *i, int maxpages)
1708{
1709        size_t skip = i->iov_offset, size = i->count;
1710        const struct iovec *p;
1711        int npages = 0;
1712
1713        for (p = i->iov; size; skip = 0, p++) {
1714                unsigned offs = offset_in_page(p->iov_base + skip);
1715                size_t len = min(p->iov_len - skip, size);
1716
1717                if (len) {
1718                        size -= len;
1719                        npages += DIV_ROUND_UP(offs + len, PAGE_SIZE);
1720                        if (unlikely(npages > maxpages))
1721                                return maxpages;
1722                }
1723        }
1724        return npages;
1725}
1726
1727static int bvec_npages(const struct iov_iter *i, int maxpages)
1728{
1729        size_t skip = i->iov_offset, size = i->count;
1730        const struct bio_vec *p;
1731        int npages = 0;
1732
1733        for (p = i->bvec; size; skip = 0, p++) {
1734                unsigned offs = (p->bv_offset + skip) % PAGE_SIZE;
1735                size_t len = min(p->bv_len - skip, size);
1736
1737                size -= len;
1738                npages += DIV_ROUND_UP(offs + len, PAGE_SIZE);
1739                if (unlikely(npages > maxpages))
1740                        return maxpages;
1741        }
1742        return npages;
1743}
1744
1745int iov_iter_npages(const struct iov_iter *i, int maxpages)
1746{
1747        if (unlikely(!i->count))
1748                return 0;
1749        /* iovec and kvec have identical layouts */
1750        if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
1751                return iov_npages(i, maxpages);
1752        if (iov_iter_is_bvec(i))
1753                return bvec_npages(i, maxpages);
1754        if (iov_iter_is_pipe(i)) {
1755                unsigned int iter_head;
1756                int npages;
1757                size_t off;
1758
1759                if (!sanity(i))
1760                        return 0;
1761
1762                data_start(i, &iter_head, &off);
1763                /* some of this one + all after this one */
1764                npages = pipe_space_for_user(iter_head, i->pipe->tail, i->pipe);
1765                return min(npages, maxpages);
1766        }
1767        if (iov_iter_is_xarray(i)) {
1768                unsigned offset = (i->xarray_start + i->iov_offset) % PAGE_SIZE;
1769                int npages = DIV_ROUND_UP(offset + i->count, PAGE_SIZE);
1770                return min(npages, maxpages);
1771        }
1772        return 0;
1773}
1774EXPORT_SYMBOL(iov_iter_npages);
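/*
 * Illustrative sketch, not part of the kernel source: iov_iter_npages() is
 * the usual way to size a page array before pinning pages; the 256-page cap
 * below is an arbitrary, hypothetical limit.
 */
#if 0	/* illustrative only */
static struct page **example_alloc_page_array(const struct iov_iter *iter,
                                              int *npages)
{
        *npages = iov_iter_npages(iter, 256);
        if (!*npages)
                return NULL;
        return kvmalloc_array(*npages, sizeof(struct page *), GFP_KERNEL);
}
#endif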
1775
1776const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags)
1777{
1778        *new = *old;
1779        if (unlikely(iov_iter_is_pipe(new))) {
1780                WARN_ON(1);
1781                return NULL;
1782        }
1783        if (unlikely(iov_iter_is_discard(new) || iov_iter_is_xarray(new)))
1784                return NULL;
1785        if (iov_iter_is_bvec(new))
1786                return new->bvec = kmemdup(new->bvec,
1787                                    new->nr_segs * sizeof(struct bio_vec),
1788                                    flags);
1789        else
1790                /* iovec and kvec have identical layout */
1791                return new->iov = kmemdup(new->iov,
1792                                   new->nr_segs * sizeof(struct iovec),
1793                                   flags);
1794}
1795EXPORT_SYMBOL(dup_iter);
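/*
 * Illustrative note, not part of the kernel source: for iovec, kvec and bvec
 * iterators the pointer returned by dup_iter() is the freshly kmemdup()ed
 * segment array that @new now refers to, and it is what the caller eventually
 * passes to kfree(); discard and xarray iterators carry no segment array, so
 * NULL is returned for them without that being an error.
 */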
1796
1797static int copy_compat_iovec_from_user(struct iovec *iov,
1798                const struct iovec __user *uvec, unsigned long nr_segs)
1799{
1800        const struct compat_iovec __user *uiov =
1801                (const struct compat_iovec __user *)uvec;
1802        int ret = -EFAULT, i;
1803
1804        if (!user_access_begin(uiov, nr_segs * sizeof(*uiov)))
1805                return -EFAULT;
1806
1807        for (i = 0; i < nr_segs; i++) {
1808                compat_uptr_t buf;
1809                compat_ssize_t len;
1810
1811                unsafe_get_user(len, &uiov[i].iov_len, uaccess_end);
1812                unsafe_get_user(buf, &uiov[i].iov_base, uaccess_end);
1813
1814        /* check for compat_size_t not fitting in compat_ssize_t ... */
1815                if (len < 0) {
1816                        ret = -EINVAL;
1817                        goto uaccess_end;
1818                }
1819                iov[i].iov_base = compat_ptr(buf);
1820                iov[i].iov_len = len;
1821        }
1822
1823        ret = 0;
1824uaccess_end:
1825        user_access_end();
1826        return ret;
1827}
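/*
 * Illustrative note, not part of the kernel source: the loop above is the
 * standard user_access_begin()/unsafe_get_user()/user_access_end() pattern:
 * a single access_ok()-style check opens the user-access window, the
 * unchecked reads branch to the uaccess_end label on a fault, and the window
 * is always closed on the way out.
 */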
1828
1829static int copy_iovec_from_user(struct iovec *iov,
1830                const struct iovec __user *uvec, unsigned long nr_segs)
1831{
1832        unsigned long seg;
1833
1834        if (copy_from_user(iov, uvec, nr_segs * sizeof(*uvec)))
1835                return -EFAULT;
1836        for (seg = 0; seg < nr_segs; seg++) {
1837                if ((ssize_t)iov[seg].iov_len < 0)
1838                        return -EINVAL;
1839        }
1840
1841        return 0;
1842}
1843
1844struct iovec *iovec_from_user(const struct iovec __user *uvec,
1845                unsigned long nr_segs, unsigned long fast_segs,
1846                struct iovec *fast_iov, bool compat)
1847{
1848        struct iovec *iov = fast_iov;
1849        int ret;
1850
1851        /*
1852         * SuS says "The readv() function *may* fail if the iovcnt argument was
1853         * less than or equal to 0, or greater than {IOV_MAX}."  Linux has
1854         * traditionally returned zero for zero segments, so...
1855         */
1856        if (nr_segs == 0)
1857                return iov;
1858        if (nr_segs > UIO_MAXIOV)
1859                return ERR_PTR(-EINVAL);
1860        if (nr_segs > fast_segs) {
1861                iov = kmalloc_array(nr_segs, sizeof(struct iovec), GFP_KERNEL);
1862                if (!iov)
1863                        return ERR_PTR(-ENOMEM);
1864        }
1865
1866        if (compat)
1867                ret = copy_compat_iovec_from_user(iov, uvec, nr_segs);
1868        else
1869                ret = copy_iovec_from_user(iov, uvec, nr_segs);
1870        if (ret) {
1871                if (iov != fast_iov)
1872                        kfree(iov);
1873                return ERR_PTR(ret);
1874        }
1875
1876        return iov;
1877}
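/*
 * Illustrative note, not part of the kernel source: the ownership contract of
 * iovec_from_user() is that the return value is either @fast_iov itself (when
 * everything fits) or a freshly kmalloc()ed array, so callers free it only
 * when it differs from @fast_iov, exactly as __import_iovec() below does on
 * its error paths.
 */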
1878
1879ssize_t __import_iovec(int type, const struct iovec __user *uvec,
1880                 unsigned nr_segs, unsigned fast_segs, struct iovec **iovp,
1881                 struct iov_iter *i, bool compat)
1882{
1883        ssize_t total_len = 0;
1884        unsigned long seg;
1885        struct iovec *iov;
1886
1887        iov = iovec_from_user(uvec, nr_segs, fast_segs, *iovp, compat);
1888        if (IS_ERR(iov)) {
1889                *iovp = NULL;
1890                return PTR_ERR(iov);
1891        }
1892
1893        /*
1894         * According to the Single Unix Specification we should return EINVAL if
1895         * an element length is < 0 when cast to ssize_t or if the total length
1896         * would overflow the ssize_t return value of the system call.
1897         *
1898         * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
1899         * overflow case.
1900         */
1901        for (seg = 0; seg < nr_segs; seg++) {
1902                ssize_t len = (ssize_t)iov[seg].iov_len;
1903
1904                if (!access_ok(iov[seg].iov_base, len)) {
1905                        if (iov != *iovp)
1906                                kfree(iov);
1907                        *iovp = NULL;
1908                        return -EFAULT;
1909                }
1910
1911                if (len > MAX_RW_COUNT - total_len) {
1912                        len = MAX_RW_COUNT - total_len;
1913                        iov[seg].iov_len = len;
1914                }
1915                total_len += len;
1916        }
1917
1918        iov_iter_init(i, type, iov, nr_segs, total_len);
1919        if (iov == *iovp)
1920                *iovp = NULL;
1921        else
1922                *iovp = iov;
1923        return total_len;
1924}
1925
1926/**
1927 * import_iovec() - Copy an array of &struct iovec from userspace
1928 *     into the kernel, check that it is valid, and initialize a new
1929 *     &struct iov_iter iterator to access it.
1930 *
1931 * @type: One of %READ or %WRITE.
1932 * @uvec: Pointer to the userspace array.
1933 * @nr_segs: Number of elements in userspace array.
1934 * @fast_segs: Number of elements in *@iovp.
1935 * @iovp: (input and output parameter) Pointer to pointer to (usually small
1936 *     on-stack) kernel array.
1937 * @i: Pointer to iterator that will be initialized on success.
1938 *
1939 * If the array pointed to by *@iovp is large enough to hold all @nr_segs,
1940 * then this function places %NULL in *@iovp on return. Otherwise, a new
1941 * array will be allocated and the result placed in *@iovp. This means that
1942 * the caller may call kfree() on *@iovp regardless of whether the small
1943 * on-stack array was used or not (and regardless of whether this function
1944 * returns an error or not).
1945 *
1946 * Return: Negative error code on error, bytes imported on success
1947 */
1948ssize_t import_iovec(int type, const struct iovec __user *uvec,
1949                 unsigned nr_segs, unsigned fast_segs,
1950                 struct iovec **iovp, struct iov_iter *i)
1951{
1952        return __import_iovec(type, uvec, nr_segs, fast_segs, iovp, i,
1953                              in_compat_syscall());
1954}
1955EXPORT_SYMBOL(import_iovec);
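/*
 * Illustrative sketch, not part of the kernel source: the canonical calling
 * pattern for import_iovec(), modelled on the readv()/writev() helpers.  The
 * on-stack array absorbs the common small-iovec case, and kfree(iov) is safe
 * whether or not a heap array was actually allocated.
 */
#if 0	/* illustrative only */
static ssize_t example_readv(const struct iovec __user *uvec, unsigned nr_segs)
{
        struct iovec iovstack[UIO_FASTIOV], *iov = iovstack;
        struct iov_iter iter;
        ssize_t ret;

        ret = import_iovec(READ, uvec, nr_segs, ARRAY_SIZE(iovstack),
                           &iov, &iter);
        if (ret < 0)
                return ret;

        /* ... copy_to_iter() / ->read_iter() against &iter here ... */

        kfree(iov);
        return ret;
}
#endif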
1956
1957int import_single_range(int rw, void __user *buf, size_t len,
1958                 struct iovec *iov, struct iov_iter *i)
1959{
1960        if (len > MAX_RW_COUNT)
1961                len = MAX_RW_COUNT;
1962        if (unlikely(!access_ok(buf, len)))
1963                return -EFAULT;
1964
1965        iov->iov_base = buf;
1966        iov->iov_len = len;
1967        iov_iter_init(i, rw, iov, 1, len);
1968        return 0;
1969}
1970EXPORT_SYMBOL(import_single_range);
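/*
 * Illustrative sketch, not part of the kernel source: import_single_range()
 * is the single-buffer counterpart of import_iovec(); the iovec lives on the
 * caller's stack (nothing to kfree) and only needs to stay alive as long as
 * the iterator is in use.
 */
#if 0	/* illustrative only */
static int example_write_buf(void __user *buf, size_t len)
{
        struct iovec iov;
        struct iov_iter iter;
        int ret;

        ret = import_single_range(WRITE, buf, len, &iov, &iter);
        if (ret)
                return ret;

        /* ... copy_from_iter() / ->write_iter() against &iter here ... */

        return 0;
}
#endif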
1971