linux/mm/gup.c
   1// SPDX-License-Identifier: GPL-2.0-only
   2#include <linux/kernel.h>
   3#include <linux/errno.h>
   4#include <linux/err.h>
   5#include <linux/spinlock.h>
   6
   7#include <linux/mm.h>
   8#include <linux/memremap.h>
   9#include <linux/pagemap.h>
  10#include <linux/rmap.h>
  11#include <linux/swap.h>
  12#include <linux/swapops.h>
  13#include <linux/secretmem.h>
  14
  15#include <linux/sched/signal.h>
  16#include <linux/rwsem.h>
  17#include <linux/hugetlb.h>
  18#include <linux/migrate.h>
  19#include <linux/mm_inline.h>
  20#include <linux/sched/mm.h>
  21
  22#include <asm/mmu_context.h>
  23#include <asm/tlbflush.h>
  24
  25#include "internal.h"
  26
  27struct follow_page_context {
  28        struct dev_pagemap *pgmap;
  29        unsigned int page_mask;
  30};
  31
  32static void hpage_pincount_add(struct page *page, int refs)
  33{
  34        VM_BUG_ON_PAGE(!hpage_pincount_available(page), page);
  35        VM_BUG_ON_PAGE(page != compound_head(page), page);
  36
  37        atomic_add(refs, compound_pincount_ptr(page));
  38}
  39
  40static void hpage_pincount_sub(struct page *page, int refs)
  41{
  42        VM_BUG_ON_PAGE(!hpage_pincount_available(page), page);
  43        VM_BUG_ON_PAGE(page != compound_head(page), page);
  44
  45        atomic_sub(refs, compound_pincount_ptr(page));
  46}
  47
  48/* Equivalent to calling put_page() @refs times. */
  49static void put_page_refs(struct page *page, int refs)
  50{
  51#ifdef CONFIG_DEBUG_VM
  52        if (VM_WARN_ON_ONCE_PAGE(page_ref_count(page) < refs, page))
  53                return;
  54#endif
  55
  56        /*
  57         * Calling put_page() for each ref is unnecessarily slow. Only the last
  58         * ref needs a put_page().
  59         */
  60        if (refs > 1)
  61                page_ref_sub(page, refs - 1);
  62        put_page(page);
  63}
  64
  65/*
  66 * Return the compound head page with ref appropriately incremented,
  67 * or NULL if that failed.
  68 */
  69static inline struct page *try_get_compound_head(struct page *page, int refs)
  70{
  71        struct page *head = compound_head(page);
  72
  73        if (WARN_ON_ONCE(page_ref_count(head) < 0))
  74                return NULL;
  75        if (unlikely(!page_cache_add_speculative(head, refs)))
  76                return NULL;
  77
  78        /*
  79         * At this point we have a stable reference to the head page; but it
  80         * could be that between the compound_head() lookup and the refcount
  81         * increment, the compound page was split, in which case we'd end up
  82         * holding a reference on a page that has nothing to do with the page
  83         * we were given anymore.
  84         * So now that the head page is stable, recheck that the pages still
  85         * belong together.
  86         */
  87        if (unlikely(compound_head(page) != head)) {
  88                put_page_refs(head, refs);
  89                return NULL;
  90        }
  91
  92        return head;
  93}
  94
  95/**
  96 * try_grab_compound_head() - attempt to elevate a page's refcount, by a
  97 * flags-dependent amount.
  98 *
  99 * Even though the name includes "compound_head", this function is still
 100 * appropriate for callers that have a non-compound @page to get.
 101 *
 102 * @page:  pointer to page to be grabbed
 103 * @refs:  the value to (effectively) add to the page's refcount
 104 * @flags: gup flags: these are the FOLL_* flag values.
 105 *
 106 * "grab" names in this file mean, "look at flags to decide whether to use
  107 * FOLL_PIN or FOLL_GET behavior, when incrementing the page's refcount".
 108 *
 109 * Either FOLL_PIN or FOLL_GET (or neither) must be set, but not both at the
 110 * same time. (That's true throughout the get_user_pages*() and
 111 * pin_user_pages*() APIs.) Cases:
 112 *
 113 *    FOLL_GET: page's refcount will be incremented by @refs.
 114 *
 115 *    FOLL_PIN on compound pages that are > two pages long: page's refcount will
 116 *    be incremented by @refs, and page[2].hpage_pinned_refcount will be
 117 *    incremented by @refs * GUP_PIN_COUNTING_BIAS.
 118 *
 119 *    FOLL_PIN on normal pages, or compound pages that are two pages long:
 120 *    page's refcount will be incremented by @refs * GUP_PIN_COUNTING_BIAS.
 121 *
 122 * Return: head page (with refcount appropriately incremented) for success, or
 123 * NULL upon failure. If neither FOLL_GET nor FOLL_PIN was set, that's
 124 * considered failure, and furthermore, a likely bug in the caller, so a warning
 125 * is also emitted.
 126 */
 127struct page *try_grab_compound_head(struct page *page,
 128                                    int refs, unsigned int flags)
 129{
 130        if (flags & FOLL_GET)
 131                return try_get_compound_head(page, refs);
 132        else if (flags & FOLL_PIN) {
 133                /*
 134                 * Can't do FOLL_LONGTERM + FOLL_PIN gup fast path if not in a
 135                 * right zone, so fail and let the caller fall back to the slow
 136                 * path.
 137                 */
 138                if (unlikely((flags & FOLL_LONGTERM) &&
 139                             !is_pinnable_page(page)))
 140                        return NULL;
 141
 142                /*
 143                 * CAUTION: Don't use compound_head() on the page before this
 144                 * point, the result won't be stable.
 145                 */
 146                page = try_get_compound_head(page, refs);
 147                if (!page)
 148                        return NULL;
 149
 150                /*
 151                 * When pinning a compound page of order > 1 (which is what
 152                 * hpage_pincount_available() checks for), use an exact count to
 153                 * track it, via hpage_pincount_add/_sub().
 154                 *
 155                 * However, be sure to *also* increment the normal page refcount
 156                 * field at least once, so that the page really is pinned.
 157                 * That's why the refcount from the earlier
 158                 * try_get_compound_head() is left intact.
 159                 */
 160                if (hpage_pincount_available(page))
 161                        hpage_pincount_add(page, refs);
 162                else
 163                        page_ref_add(page, refs * (GUP_PIN_COUNTING_BIAS - 1));
 164
 165                mod_node_page_state(page_pgdat(page), NR_FOLL_PIN_ACQUIRED,
 166                                    refs);
 167
 168                return page;
 169        }
 170
 171        WARN_ON_ONCE(1);
 172        return NULL;
 173}
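
/*
 * Example (illustrative sketch, not a recipe from this file): a gup-fast
 * style caller that has located a pte-mapped @page could pair a grab with a
 * matching release roughly like this; "head" is a hypothetical local and
 * put_compound_head() is defined just below:
 *
 *	head = try_grab_compound_head(page, 1, FOLL_PIN);
 *	if (!head)
 *		return 0;		(fall back to the slow path)
 *	... access the page contents ...
 *	put_compound_head(head, 1, FOLL_PIN);
 */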
 174
 175static void put_compound_head(struct page *page, int refs, unsigned int flags)
 176{
 177        if (flags & FOLL_PIN) {
 178                mod_node_page_state(page_pgdat(page), NR_FOLL_PIN_RELEASED,
 179                                    refs);
 180
 181                if (hpage_pincount_available(page))
 182                        hpage_pincount_sub(page, refs);
 183                else
 184                        refs *= GUP_PIN_COUNTING_BIAS;
 185        }
 186
 187        put_page_refs(page, refs);
 188}
 189
 190/**
 191 * try_grab_page() - elevate a page's refcount by a flag-dependent amount
 192 *
 193 * This might not do anything at all, depending on the flags argument.
 194 *
 195 * "grab" names in this file mean, "look at flags to decide whether to use
  196 * FOLL_PIN or FOLL_GET behavior, when incrementing the page's refcount".
 197 *
 198 * @page:    pointer to page to be grabbed
 199 * @flags:   gup flags: these are the FOLL_* flag values.
 200 *
 201 * Either FOLL_PIN or FOLL_GET (or neither) may be set, but not both at the same
 202 * time. Cases: please see the try_grab_compound_head() documentation, with
 203 * "refs=1".
 204 *
 205 * Return: true for success, or if no action was required (if neither FOLL_PIN
 206 * nor FOLL_GET was set, nothing is done). False for failure: FOLL_GET or
 207 * FOLL_PIN was set, but the page could not be grabbed.
 208 */
 209bool __must_check try_grab_page(struct page *page, unsigned int flags)
 210{
 211        if (!(flags & (FOLL_GET | FOLL_PIN)))
 212                return true;
 213
 214        return try_grab_compound_head(page, 1, flags);
 215}
 216
 217/**
 218 * unpin_user_page() - release a dma-pinned page
 219 * @page:            pointer to page to be released
 220 *
 221 * Pages that were pinned via pin_user_pages*() must be released via either
 222 * unpin_user_page(), or one of the unpin_user_pages*() routines. This is so
 223 * that such pages can be separately tracked and uniquely handled. In
 224 * particular, interactions with RDMA and filesystems need special handling.
 225 */
 226void unpin_user_page(struct page *page)
 227{
 228        put_compound_head(compound_head(page), 1, FOLL_PIN);
 229}
 230EXPORT_SYMBOL(unpin_user_page);
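
/*
 * Example (illustrative sketch): a typical short-lived pin of a single user
 * page and its release; "addr" and "page" are hypothetical caller variables:
 *
 *	if (pin_user_pages_fast(addr, 1, 0, &page) != 1)
 *		return -EFAULT;
 *	... read the page contents ...
 *	unpin_user_page(page);
 */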
 231
 232static inline void compound_range_next(unsigned long i, unsigned long npages,
 233                                       struct page **list, struct page **head,
 234                                       unsigned int *ntails)
 235{
 236        struct page *next, *page;
 237        unsigned int nr = 1;
 238
 239        if (i >= npages)
 240                return;
 241
 242        next = *list + i;
 243        page = compound_head(next);
 244        if (PageCompound(page) && compound_order(page) >= 1)
 245                nr = min_t(unsigned int,
 246                           page + compound_nr(page) - next, npages - i);
 247
 248        *head = page;
 249        *ntails = nr;
 250}
 251
 252#define for_each_compound_range(__i, __list, __npages, __head, __ntails) \
 253        for (__i = 0, \
 254             compound_range_next(__i, __npages, __list, &(__head), &(__ntails)); \
 255             __i < __npages; __i += __ntails, \
 256             compound_range_next(__i, __npages, __list, &(__head), &(__ntails)))
 257
 258static inline void compound_next(unsigned long i, unsigned long npages,
 259                                 struct page **list, struct page **head,
 260                                 unsigned int *ntails)
 261{
 262        struct page *page;
 263        unsigned int nr;
 264
 265        if (i >= npages)
 266                return;
 267
 268        page = compound_head(list[i]);
 269        for (nr = i + 1; nr < npages; nr++) {
 270                if (compound_head(list[nr]) != page)
 271                        break;
 272        }
 273
 274        *head = page;
 275        *ntails = nr - i;
 276}
 277
 278#define for_each_compound_head(__i, __list, __npages, __head, __ntails) \
 279        for (__i = 0, \
 280             compound_next(__i, __npages, __list, &(__head), &(__ntails)); \
 281             __i < __npages; __i += __ntails, \
 282             compound_next(__i, __npages, __list, &(__head), &(__ntails)))
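
/*
 * Example (illustrative): the iterator above batches consecutive entries of a
 * pages[] array that share the same compound head, so a release loop such as
 * the one in unpin_user_pages() below reduces to:
 *
 *	for_each_compound_head(i, pages, npages, head, ntails)
 *		put_compound_head(head, ntails, FOLL_PIN);
 *
 * where "i", "head" and "ntails" are locals provided by the caller.
 */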
 283
 284/**
 285 * unpin_user_pages_dirty_lock() - release and optionally dirty gup-pinned pages
 286 * @pages:  array of pages to be maybe marked dirty, and definitely released.
 287 * @npages: number of pages in the @pages array.
 288 * @make_dirty: whether to mark the pages dirty
 289 *
 290 * "gup-pinned page" refers to a page that has had one of the get_user_pages()
 291 * variants called on that page.
 292 *
 293 * For each page in the @pages array, make that page (or its head page, if a
 294 * compound page) dirty, if @make_dirty is true, and if the page was previously
 295 * listed as clean. In any case, releases all pages using unpin_user_page(),
 296 * possibly via unpin_user_pages(), for the non-dirty case.
 297 *
 298 * Please see the unpin_user_page() documentation for details.
 299 *
 300 * set_page_dirty_lock() is used internally. If instead, set_page_dirty() is
 301 * required, then the caller should a) verify that this is really correct,
 302 * because _lock() is usually required, and b) hand code it:
  303 * set_page_dirty(), unpin_user_page().
 304 *
 305 */
 306void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages,
 307                                 bool make_dirty)
 308{
 309        unsigned long index;
 310        struct page *head;
 311        unsigned int ntails;
 312
 313        if (!make_dirty) {
 314                unpin_user_pages(pages, npages);
 315                return;
 316        }
 317
 318        for_each_compound_head(index, pages, npages, head, ntails) {
 319                /*
 320                 * Checking PageDirty at this point may race with
 321                 * clear_page_dirty_for_io(), but that's OK. Two key
 322                 * cases:
 323                 *
 324                 * 1) This code sees the page as already dirty, so it
 325                 * skips the call to set_page_dirty(). That could happen
 326                 * because clear_page_dirty_for_io() called
 327                 * page_mkclean(), followed by set_page_dirty().
 328                 * However, now the page is going to get written back,
 329                 * which meets the original intention of setting it
 330                 * dirty, so all is well: clear_page_dirty_for_io() goes
 331                 * on to call TestClearPageDirty(), and write the page
 332                 * back.
 333                 *
 334                 * 2) This code sees the page as clean, so it calls
 335                 * set_page_dirty(). The page stays dirty, despite being
 336                 * written back, so it gets written back again in the
 337                 * next writeback cycle. This is harmless.
 338                 */
 339                if (!PageDirty(head))
 340                        set_page_dirty_lock(head);
 341                put_compound_head(head, ntails, FOLL_PIN);
 342        }
 343}
 344EXPORT_SYMBOL(unpin_user_pages_dirty_lock);
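
/*
 * Example (illustrative sketch): a driver that pinned user pages as a DMA
 * target might release them once the device has written the data; "addr",
 * "nr_req" and "pages" are hypothetical caller state:
 *
 *	nr = pin_user_pages_fast(addr, nr_req, FOLL_WRITE, pages);
 *	if (nr <= 0)
 *		return nr ? nr : -EFAULT;
 *	... device DMA writes into the pages ...
 *	unpin_user_pages_dirty_lock(pages, nr, true);
 */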
 345
 346/**
 347 * unpin_user_page_range_dirty_lock() - release and optionally dirty
 348 * gup-pinned page range
 349 *
 350 * @page:  the starting page of a range maybe marked dirty, and definitely released.
 351 * @npages: number of consecutive pages to release.
 352 * @make_dirty: whether to mark the pages dirty
 353 *
 354 * "gup-pinned page range" refers to a range of pages that has had one of the
 355 * pin_user_pages() variants called on that page.
 356 *
 357 * For the page ranges defined by [page .. page+npages], make that range (or
 358 * its head pages, if a compound page) dirty, if @make_dirty is true, and if the
 359 * page range was previously listed as clean.
 360 *
 361 * set_page_dirty_lock() is used internally. If instead, set_page_dirty() is
 362 * required, then the caller should a) verify that this is really correct,
 363 * because _lock() is usually required, and b) hand code it:
  364 * set_page_dirty(), unpin_user_page().
 365 *
 366 */
 367void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages,
 368                                      bool make_dirty)
 369{
 370        unsigned long index;
 371        struct page *head;
 372        unsigned int ntails;
 373
 374        for_each_compound_range(index, &page, npages, head, ntails) {
 375                if (make_dirty && !PageDirty(head))
 376                        set_page_dirty_lock(head);
 377                put_compound_head(head, ntails, FOLL_PIN);
 378        }
 379}
 380EXPORT_SYMBOL(unpin_user_page_range_dirty_lock);
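
/*
 * Example (illustrative): for a physically contiguous pinned range (for
 * instance one spanning a compound page), only the first page and a count are
 * needed, so no pages[] array has to be kept around; "first_page" and
 * "npinned" are hypothetical caller state:
 *
 *	unpin_user_page_range_dirty_lock(first_page, npinned, true);
 */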
 381
 382/**
 383 * unpin_user_pages() - release an array of gup-pinned pages.
 384 * @pages:  array of pages to be marked dirty and released.
 385 * @npages: number of pages in the @pages array.
 386 *
 387 * For each page in the @pages array, release the page using unpin_user_page().
 388 *
 389 * Please see the unpin_user_page() documentation for details.
 390 */
 391void unpin_user_pages(struct page **pages, unsigned long npages)
 392{
 393        unsigned long index;
 394        struct page *head;
 395        unsigned int ntails;
 396
 397        /*
 398         * If this WARN_ON() fires, then the system *might* be leaking pages (by
 399         * leaving them pinned), but probably not. More likely, gup/pup returned
 400         * a hard -ERRNO error to the caller, who erroneously passed it here.
 401         */
 402        if (WARN_ON(IS_ERR_VALUE(npages)))
 403                return;
 404
 405        for_each_compound_head(index, pages, npages, head, ntails)
 406                put_compound_head(head, ntails, FOLL_PIN);
 407}
 408EXPORT_SYMBOL(unpin_user_pages);
 409
 410/*
 411 * Set the MMF_HAS_PINNED if not set yet; after set it'll be there for the mm's
 412 * lifecycle.  Avoid setting the bit unless necessary, or it might cause write
 413 * cache bouncing on large SMP machines for concurrent pinned gups.
 414 */
 415static inline void mm_set_has_pinned_flag(unsigned long *mm_flags)
 416{
 417        if (!test_bit(MMF_HAS_PINNED, mm_flags))
 418                set_bit(MMF_HAS_PINNED, mm_flags);
 419}
 420
 421#ifdef CONFIG_MMU
 422static struct page *no_page_table(struct vm_area_struct *vma,
 423                unsigned int flags)
 424{
 425        /*
 426         * When core dumping an enormous anonymous area that nobody
 427         * has touched so far, we don't want to allocate unnecessary pages or
 428         * page tables.  Return error instead of NULL to skip handle_mm_fault,
 429         * then get_dump_page() will return NULL to leave a hole in the dump.
 430         * But we can only make this optimization where a hole would surely
 431         * be zero-filled if handle_mm_fault() actually did handle it.
 432         */
 433        if ((flags & FOLL_DUMP) &&
 434                        (vma_is_anonymous(vma) || !vma->vm_ops->fault))
 435                return ERR_PTR(-EFAULT);
 436        return NULL;
 437}
 438
 439static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address,
 440                pte_t *pte, unsigned int flags)
 441{
 442        /* No page to get reference */
 443        if (flags & FOLL_GET)
 444                return -EFAULT;
 445
 446        if (flags & FOLL_TOUCH) {
 447                pte_t entry = *pte;
 448
 449                if (flags & FOLL_WRITE)
 450                        entry = pte_mkdirty(entry);
 451                entry = pte_mkyoung(entry);
 452
 453                if (!pte_same(*pte, entry)) {
 454                        set_pte_at(vma->vm_mm, address, pte, entry);
 455                        update_mmu_cache(vma, address, pte);
 456                }
 457        }
 458
 459        /* Proper page table entry exists, but no corresponding struct page */
 460        return -EEXIST;
 461}
 462
 463/*
 464 * FOLL_FORCE can write to even unwritable pte's, but only
 465 * after we've gone through a COW cycle and they are dirty.
 466 */
 467static inline bool can_follow_write_pte(pte_t pte, unsigned int flags)
 468{
 469        return pte_write(pte) ||
 470                ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pte_dirty(pte));
 471}
 472
 473static struct page *follow_page_pte(struct vm_area_struct *vma,
 474                unsigned long address, pmd_t *pmd, unsigned int flags,
 475                struct dev_pagemap **pgmap)
 476{
 477        struct mm_struct *mm = vma->vm_mm;
 478        struct page *page;
 479        spinlock_t *ptl;
 480        pte_t *ptep, pte;
 481        int ret;
 482
 483        /* FOLL_GET and FOLL_PIN are mutually exclusive. */
 484        if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
 485                         (FOLL_PIN | FOLL_GET)))
 486                return ERR_PTR(-EINVAL);
 487retry:
 488        if (unlikely(pmd_bad(*pmd)))
 489                return no_page_table(vma, flags);
 490
 491        ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
 492        pte = *ptep;
 493        if (!pte_present(pte)) {
 494                swp_entry_t entry;
 495                /*
 496                 * KSM's break_ksm() relies upon recognizing a ksm page
 497                 * even while it is being migrated, so for that case we
 498                 * need migration_entry_wait().
 499                 */
 500                if (likely(!(flags & FOLL_MIGRATION)))
 501                        goto no_page;
 502                if (pte_none(pte))
 503                        goto no_page;
 504                entry = pte_to_swp_entry(pte);
 505                if (!is_migration_entry(entry))
 506                        goto no_page;
 507                pte_unmap_unlock(ptep, ptl);
 508                migration_entry_wait(mm, pmd, address);
 509                goto retry;
 510        }
 511        if ((flags & FOLL_NUMA) && pte_protnone(pte))
 512                goto no_page;
 513        if ((flags & FOLL_WRITE) && !can_follow_write_pte(pte, flags)) {
 514                pte_unmap_unlock(ptep, ptl);
 515                return NULL;
 516        }
 517
 518        page = vm_normal_page(vma, address, pte);
 519        if (!page && pte_devmap(pte) && (flags & (FOLL_GET | FOLL_PIN))) {
 520                /*
 521                 * Only return device mapping pages in the FOLL_GET or FOLL_PIN
 522                 * case since they are only valid while holding the pgmap
 523                 * reference.
 524                 */
 525                *pgmap = get_dev_pagemap(pte_pfn(pte), *pgmap);
 526                if (*pgmap)
 527                        page = pte_page(pte);
 528                else
 529                        goto no_page;
 530        } else if (unlikely(!page)) {
 531                if (flags & FOLL_DUMP) {
 532                        /* Avoid special (like zero) pages in core dumps */
 533                        page = ERR_PTR(-EFAULT);
 534                        goto out;
 535                }
 536
 537                if (is_zero_pfn(pte_pfn(pte))) {
 538                        page = pte_page(pte);
 539                } else {
 540                        ret = follow_pfn_pte(vma, address, ptep, flags);
 541                        page = ERR_PTR(ret);
 542                        goto out;
 543                }
 544        }
 545
 546        /* try_grab_page() does nothing unless FOLL_GET or FOLL_PIN is set. */
 547        if (unlikely(!try_grab_page(page, flags))) {
 548                page = ERR_PTR(-ENOMEM);
 549                goto out;
 550        }
 551        /*
 552         * We need to make the page accessible if and only if we are going
 553         * to access its content (the FOLL_PIN case).  Please see
 554         * Documentation/core-api/pin_user_pages.rst for details.
 555         */
 556        if (flags & FOLL_PIN) {
 557                ret = arch_make_page_accessible(page);
 558                if (ret) {
 559                        unpin_user_page(page);
 560                        page = ERR_PTR(ret);
 561                        goto out;
 562                }
 563        }
 564        if (flags & FOLL_TOUCH) {
 565                if ((flags & FOLL_WRITE) &&
 566                    !pte_dirty(pte) && !PageDirty(page))
 567                        set_page_dirty(page);
 568                /*
 569                 * pte_mkyoung() would be more correct here, but atomic care
 570                 * is needed to avoid losing the dirty bit: it is easier to use
 571                 * mark_page_accessed().
 572                 */
 573                mark_page_accessed(page);
 574        }
 575        if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
 576                /* Do not mlock pte-mapped THP */
 577                if (PageTransCompound(page))
 578                        goto out;
 579
 580                /*
 581                 * The preliminary mapping check is mainly to avoid the
 582                 * pointless overhead of lock_page on the ZERO_PAGE
 583                 * which might bounce very badly if there is contention.
 584                 *
 585                 * If the page is already locked, we don't need to
 586                 * handle it now - vmscan will handle it later if and
 587                 * when it attempts to reclaim the page.
 588                 */
 589                if (page->mapping && trylock_page(page)) {
 590                        lru_add_drain();  /* push cached pages to LRU */
 591                        /*
 592                         * Because we lock page here, and migration is
 593                         * blocked by the pte's page reference, and we
 594                         * know the page is still mapped, we don't even
 595                         * need to check for file-cache page truncation.
 596                         */
 597                        mlock_vma_page(page);
 598                        unlock_page(page);
 599                }
 600        }
 601out:
 602        pte_unmap_unlock(ptep, ptl);
 603        return page;
 604no_page:
 605        pte_unmap_unlock(ptep, ptl);
 606        if (!pte_none(pte))
 607                return NULL;
 608        return no_page_table(vma, flags);
 609}
 610
 611static struct page *follow_pmd_mask(struct vm_area_struct *vma,
 612                                    unsigned long address, pud_t *pudp,
 613                                    unsigned int flags,
 614                                    struct follow_page_context *ctx)
 615{
 616        pmd_t *pmd, pmdval;
 617        spinlock_t *ptl;
 618        struct page *page;
 619        struct mm_struct *mm = vma->vm_mm;
 620
 621        pmd = pmd_offset(pudp, address);
 622        /*
 623         * The READ_ONCE() will stabilize the pmdval in a register or
 624         * on the stack so that it will stop changing under the code.
 625         */
 626        pmdval = READ_ONCE(*pmd);
 627        if (pmd_none(pmdval))
 628                return no_page_table(vma, flags);
 629        if (pmd_huge(pmdval) && is_vm_hugetlb_page(vma)) {
 630                page = follow_huge_pmd(mm, address, pmd, flags);
 631                if (page)
 632                        return page;
 633                return no_page_table(vma, flags);
 634        }
 635        if (is_hugepd(__hugepd(pmd_val(pmdval)))) {
 636                page = follow_huge_pd(vma, address,
 637                                      __hugepd(pmd_val(pmdval)), flags,
 638                                      PMD_SHIFT);
 639                if (page)
 640                        return page;
 641                return no_page_table(vma, flags);
 642        }
 643retry:
 644        if (!pmd_present(pmdval)) {
 645                if (likely(!(flags & FOLL_MIGRATION)))
 646                        return no_page_table(vma, flags);
 647                VM_BUG_ON(thp_migration_supported() &&
 648                                  !is_pmd_migration_entry(pmdval));
 649                if (is_pmd_migration_entry(pmdval))
 650                        pmd_migration_entry_wait(mm, pmd);
 651                pmdval = READ_ONCE(*pmd);
 652                /*
 653                 * MADV_DONTNEED may convert the pmd to null because
 654                 * mmap_lock is held in read mode
 655                 */
 656                if (pmd_none(pmdval))
 657                        return no_page_table(vma, flags);
 658                goto retry;
 659        }
 660        if (pmd_devmap(pmdval)) {
 661                ptl = pmd_lock(mm, pmd);
 662                page = follow_devmap_pmd(vma, address, pmd, flags, &ctx->pgmap);
 663                spin_unlock(ptl);
 664                if (page)
 665                        return page;
 666        }
 667        if (likely(!pmd_trans_huge(pmdval)))
 668                return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
 669
 670        if ((flags & FOLL_NUMA) && pmd_protnone(pmdval))
 671                return no_page_table(vma, flags);
 672
 673retry_locked:
 674        ptl = pmd_lock(mm, pmd);
 675        if (unlikely(pmd_none(*pmd))) {
 676                spin_unlock(ptl);
 677                return no_page_table(vma, flags);
 678        }
 679        if (unlikely(!pmd_present(*pmd))) {
 680                spin_unlock(ptl);
 681                if (likely(!(flags & FOLL_MIGRATION)))
 682                        return no_page_table(vma, flags);
 683                pmd_migration_entry_wait(mm, pmd);
 684                goto retry_locked;
 685        }
 686        if (unlikely(!pmd_trans_huge(*pmd))) {
 687                spin_unlock(ptl);
 688                return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
 689        }
 690        if (flags & FOLL_SPLIT_PMD) {
 691                int ret;
 692                page = pmd_page(*pmd);
 693                if (is_huge_zero_page(page)) {
 694                        spin_unlock(ptl);
 695                        ret = 0;
 696                        split_huge_pmd(vma, pmd, address);
 697                        if (pmd_trans_unstable(pmd))
 698                                ret = -EBUSY;
 699                } else {
 700                        spin_unlock(ptl);
 701                        split_huge_pmd(vma, pmd, address);
 702                        ret = pte_alloc(mm, pmd) ? -ENOMEM : 0;
 703                }
 704
 705                return ret ? ERR_PTR(ret) :
 706                        follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
 707        }
 708        page = follow_trans_huge_pmd(vma, address, pmd, flags);
 709        spin_unlock(ptl);
 710        ctx->page_mask = HPAGE_PMD_NR - 1;
 711        return page;
 712}
 713
 714static struct page *follow_pud_mask(struct vm_area_struct *vma,
 715                                    unsigned long address, p4d_t *p4dp,
 716                                    unsigned int flags,
 717                                    struct follow_page_context *ctx)
 718{
 719        pud_t *pud;
 720        spinlock_t *ptl;
 721        struct page *page;
 722        struct mm_struct *mm = vma->vm_mm;
 723
 724        pud = pud_offset(p4dp, address);
 725        if (pud_none(*pud))
 726                return no_page_table(vma, flags);
 727        if (pud_huge(*pud) && is_vm_hugetlb_page(vma)) {
 728                page = follow_huge_pud(mm, address, pud, flags);
 729                if (page)
 730                        return page;
 731                return no_page_table(vma, flags);
 732        }
 733        if (is_hugepd(__hugepd(pud_val(*pud)))) {
 734                page = follow_huge_pd(vma, address,
 735                                      __hugepd(pud_val(*pud)), flags,
 736                                      PUD_SHIFT);
 737                if (page)
 738                        return page;
 739                return no_page_table(vma, flags);
 740        }
 741        if (pud_devmap(*pud)) {
 742                ptl = pud_lock(mm, pud);
 743                page = follow_devmap_pud(vma, address, pud, flags, &ctx->pgmap);
 744                spin_unlock(ptl);
 745                if (page)
 746                        return page;
 747        }
 748        if (unlikely(pud_bad(*pud)))
 749                return no_page_table(vma, flags);
 750
 751        return follow_pmd_mask(vma, address, pud, flags, ctx);
 752}
 753
 754static struct page *follow_p4d_mask(struct vm_area_struct *vma,
 755                                    unsigned long address, pgd_t *pgdp,
 756                                    unsigned int flags,
 757                                    struct follow_page_context *ctx)
 758{
 759        p4d_t *p4d;
 760        struct page *page;
 761
 762        p4d = p4d_offset(pgdp, address);
 763        if (p4d_none(*p4d))
 764                return no_page_table(vma, flags);
 765        BUILD_BUG_ON(p4d_huge(*p4d));
 766        if (unlikely(p4d_bad(*p4d)))
 767                return no_page_table(vma, flags);
 768
 769        if (is_hugepd(__hugepd(p4d_val(*p4d)))) {
 770                page = follow_huge_pd(vma, address,
 771                                      __hugepd(p4d_val(*p4d)), flags,
 772                                      P4D_SHIFT);
 773                if (page)
 774                        return page;
 775                return no_page_table(vma, flags);
 776        }
 777        return follow_pud_mask(vma, address, p4d, flags, ctx);
 778}
 779
 780/**
 781 * follow_page_mask - look up a page descriptor from a user-virtual address
 782 * @vma: vm_area_struct mapping @address
 783 * @address: virtual address to look up
 784 * @flags: flags modifying lookup behaviour
 785 * @ctx: contains dev_pagemap for %ZONE_DEVICE memory pinning and a
 786 *       pointer to output page_mask
 787 *
 788 * @flags can have FOLL_ flags set, defined in <linux/mm.h>
 789 *
 790 * When getting pages from ZONE_DEVICE memory, the @ctx->pgmap caches
 791 * the device's dev_pagemap metadata to avoid repeating expensive lookups.
 792 *
 793 * On output, the @ctx->page_mask is set according to the size of the page.
 794 *
 795 * Return: the mapped (struct page *), %NULL if no mapping exists, or
 796 * an error pointer if there is a mapping to something not represented
 797 * by a page descriptor (see also vm_normal_page()).
 798 */
 799static struct page *follow_page_mask(struct vm_area_struct *vma,
 800                              unsigned long address, unsigned int flags,
 801                              struct follow_page_context *ctx)
 802{
 803        pgd_t *pgd;
 804        struct page *page;
 805        struct mm_struct *mm = vma->vm_mm;
 806
 807        ctx->page_mask = 0;
 808
 809        /* make this handle hugepd */
 810        page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
 811        if (!IS_ERR(page)) {
 812                WARN_ON_ONCE(flags & (FOLL_GET | FOLL_PIN));
 813                return page;
 814        }
 815
 816        pgd = pgd_offset(mm, address);
 817
 818        if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
 819                return no_page_table(vma, flags);
 820
 821        if (pgd_huge(*pgd)) {
 822                page = follow_huge_pgd(mm, address, pgd, flags);
 823                if (page)
 824                        return page;
 825                return no_page_table(vma, flags);
 826        }
 827        if (is_hugepd(__hugepd(pgd_val(*pgd)))) {
 828                page = follow_huge_pd(vma, address,
 829                                      __hugepd(pgd_val(*pgd)), flags,
 830                                      PGDIR_SHIFT);
 831                if (page)
 832                        return page;
 833                return no_page_table(vma, flags);
 834        }
 835
 836        return follow_p4d_mask(vma, address, pgd, flags, ctx);
 837}
 838
 839struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
 840                         unsigned int foll_flags)
 841{
 842        struct follow_page_context ctx = { NULL };
 843        struct page *page;
 844
 845        if (vma_is_secretmem(vma))
 846                return NULL;
 847
 848        page = follow_page_mask(vma, address, foll_flags, &ctx);
 849        if (ctx.pgmap)
 850                put_dev_pagemap(ctx.pgmap);
 851        return page;
 852}
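
/*
 * Example (illustrative sketch): a caller that already holds mmap_lock can
 * look up and take a reference on the page backing "addr" in "vma" like this;
 * both names are hypothetical:
 *
 *	page = follow_page(vma, addr, FOLL_GET);
 *	if (IS_ERR_OR_NULL(page))
 *		return -EFAULT;
 *	... inspect the page ...
 *	put_page(page);
 */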
 853
 854static int get_gate_page(struct mm_struct *mm, unsigned long address,
 855                unsigned int gup_flags, struct vm_area_struct **vma,
 856                struct page **page)
 857{
 858        pgd_t *pgd;
 859        p4d_t *p4d;
 860        pud_t *pud;
 861        pmd_t *pmd;
 862        pte_t *pte;
 863        int ret = -EFAULT;
 864
 865        /* user gate pages are read-only */
 866        if (gup_flags & FOLL_WRITE)
 867                return -EFAULT;
 868        if (address > TASK_SIZE)
 869                pgd = pgd_offset_k(address);
 870        else
 871                pgd = pgd_offset_gate(mm, address);
 872        if (pgd_none(*pgd))
 873                return -EFAULT;
 874        p4d = p4d_offset(pgd, address);
 875        if (p4d_none(*p4d))
 876                return -EFAULT;
 877        pud = pud_offset(p4d, address);
 878        if (pud_none(*pud))
 879                return -EFAULT;
 880        pmd = pmd_offset(pud, address);
 881        if (!pmd_present(*pmd))
 882                return -EFAULT;
 883        VM_BUG_ON(pmd_trans_huge(*pmd));
 884        pte = pte_offset_map(pmd, address);
 885        if (pte_none(*pte))
 886                goto unmap;
 887        *vma = get_gate_vma(mm);
 888        if (!page)
 889                goto out;
 890        *page = vm_normal_page(*vma, address, *pte);
 891        if (!*page) {
 892                if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(*pte)))
 893                        goto unmap;
 894                *page = pte_page(*pte);
 895        }
 896        if (unlikely(!try_grab_page(*page, gup_flags))) {
 897                ret = -ENOMEM;
 898                goto unmap;
 899        }
 900out:
 901        ret = 0;
 902unmap:
 903        pte_unmap(pte);
 904        return ret;
 905}
 906
 907/*
 908 * mmap_lock must be held on entry.  If @locked != NULL and *@flags
 909 * does not include FOLL_NOWAIT, the mmap_lock may be released.  If it
 910 * is, *@locked will be set to 0 and -EBUSY returned.
 911 */
 912static int faultin_page(struct vm_area_struct *vma,
 913                unsigned long address, unsigned int *flags, int *locked)
 914{
 915        unsigned int fault_flags = 0;
 916        vm_fault_t ret;
 917
 918        /* mlock all present pages, but do not fault in new pages */
 919        if ((*flags & (FOLL_POPULATE | FOLL_MLOCK)) == FOLL_MLOCK)
 920                return -ENOENT;
 921        if (*flags & FOLL_NOFAULT)
 922                return -EFAULT;
 923        if (*flags & FOLL_WRITE)
 924                fault_flags |= FAULT_FLAG_WRITE;
 925        if (*flags & FOLL_REMOTE)
 926                fault_flags |= FAULT_FLAG_REMOTE;
 927        if (locked)
 928                fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
 929        if (*flags & FOLL_NOWAIT)
 930                fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT;
 931        if (*flags & FOLL_TRIED) {
 932                /*
 933                 * Note: FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_TRIED
 934                 * can co-exist
 935                 */
 936                fault_flags |= FAULT_FLAG_TRIED;
 937        }
 938
 939        ret = handle_mm_fault(vma, address, fault_flags, NULL);
 940        if (ret & VM_FAULT_ERROR) {
 941                int err = vm_fault_to_errno(ret, *flags);
 942
 943                if (err)
 944                        return err;
 945                BUG();
 946        }
 947
 948        if (ret & VM_FAULT_RETRY) {
 949                if (locked && !(fault_flags & FAULT_FLAG_RETRY_NOWAIT))
 950                        *locked = 0;
 951                return -EBUSY;
 952        }
 953
 954        /*
 955         * The VM_FAULT_WRITE bit tells us that do_wp_page has broken COW when
 956         * necessary, even if maybe_mkwrite decided not to set pte_write. We
 957         * can thus safely do subsequent page lookups as if they were reads.
 958         * But only do so when looping for pte_write is futile: in some cases
 959         * userspace may also be wanting to write to the gotten user page,
 960         * which a read fault here might prevent (a readonly page might get
 961         * reCOWed by userspace write).
 962         */
 963        if ((ret & VM_FAULT_WRITE) && !(vma->vm_flags & VM_WRITE))
 964                *flags |= FOLL_COW;
 965        return 0;
 966}
 967
 968static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
 969{
 970        vm_flags_t vm_flags = vma->vm_flags;
 971        int write = (gup_flags & FOLL_WRITE);
 972        int foreign = (gup_flags & FOLL_REMOTE);
 973
 974        if (vm_flags & (VM_IO | VM_PFNMAP))
 975                return -EFAULT;
 976
 977        if (gup_flags & FOLL_ANON && !vma_is_anonymous(vma))
 978                return -EFAULT;
 979
 980        if ((gup_flags & FOLL_LONGTERM) && vma_is_fsdax(vma))
 981                return -EOPNOTSUPP;
 982
 983        if (vma_is_secretmem(vma))
 984                return -EFAULT;
 985
 986        if (write) {
 987                if (!(vm_flags & VM_WRITE)) {
 988                        if (!(gup_flags & FOLL_FORCE))
 989                                return -EFAULT;
 990                        /*
 991                         * We used to let the write,force case do COW in a
 992                         * VM_MAYWRITE VM_SHARED !VM_WRITE vma, so ptrace could
 993                         * set a breakpoint in a read-only mapping of an
 994                         * executable, without corrupting the file (yet only
 995                         * when that file had been opened for writing!).
 996                         * Anon pages in shared mappings are surprising: now
 997                         * just reject it.
 998                         */
 999                        if (!is_cow_mapping(vm_flags))
1000                                return -EFAULT;
1001                }
1002        } else if (!(vm_flags & VM_READ)) {
1003                if (!(gup_flags & FOLL_FORCE))
1004                        return -EFAULT;
1005                /*
1006                 * Is there actually any vma we can reach here which does not
1007                 * have VM_MAYREAD set?
1008                 */
1009                if (!(vm_flags & VM_MAYREAD))
1010                        return -EFAULT;
1011        }
1012        /*
1013         * gups are always data accesses, not instruction
1014         * fetches, so execute=false here
1015         */
1016        if (!arch_vma_access_permitted(vma, write, false, foreign))
1017                return -EFAULT;
1018        return 0;
1019}
1020
1021/**
1022 * __get_user_pages() - pin user pages in memory
1023 * @mm:         mm_struct of target mm
1024 * @start:      starting user address
1025 * @nr_pages:   number of pages from start to pin
1026 * @gup_flags:  flags modifying pin behaviour
1027 * @pages:      array that receives pointers to the pages pinned.
1028 *              Should be at least nr_pages long. Or NULL, if caller
1029 *              only intends to ensure the pages are faulted in.
1030 * @vmas:       array of pointers to vmas corresponding to each page.
1031 *              Or NULL if the caller does not require them.
1032 * @locked:     whether we're still with the mmap_lock held
1033 *
1034 * Returns either number of pages pinned (which may be less than the
1035 * number requested), or an error. Details about the return value:
1036 *
1037 * -- If nr_pages is 0, returns 0.
1038 * -- If nr_pages is >0, but no pages were pinned, returns -errno.
1039 * -- If nr_pages is >0, and some pages were pinned, returns the number of
1040 *    pages pinned. Again, this may be less than nr_pages.
1041 * -- 0 return value is possible when the fault would need to be retried.
1042 *
1043 * The caller is responsible for releasing returned @pages, via put_page().
1044 *
1045 * @vmas are valid only as long as mmap_lock is held.
1046 *
1047 * Must be called with mmap_lock held.  It may be released.  See below.
1048 *
1049 * __get_user_pages walks a process's page tables and takes a reference to
1050 * each struct page that each user address corresponds to at a given
1051 * instant. That is, it takes the page that would be accessed if a user
1052 * thread accesses the given user virtual address at that instant.
1053 *
1054 * This does not guarantee that the page exists in the user mappings when
1055 * __get_user_pages returns, and there may even be a completely different
1056 * page there in some cases (eg. if mmapped pagecache has been invalidated
 1057 * and subsequently refaulted). However it does guarantee that the page
1058 * won't be freed completely. And mostly callers simply care that the page
1059 * contains data that was valid *at some point in time*. Typically, an IO
1060 * or similar operation cannot guarantee anything stronger anyway because
1061 * locks can't be held over the syscall boundary.
1062 *
1063 * If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If
1064 * the page is written to, set_page_dirty (or set_page_dirty_lock, as
1065 * appropriate) must be called after the page is finished with, and
1066 * before put_page is called.
1067 *
1068 * If @locked != NULL, *@locked will be set to 0 when mmap_lock is
1069 * released by an up_read().  That can happen if @gup_flags does not
1070 * have FOLL_NOWAIT.
1071 *
1072 * A caller using such a combination of @locked and @gup_flags
1073 * must therefore hold the mmap_lock for reading only, and recognize
1074 * when it's been released.  Otherwise, it must be held for either
1075 * reading or writing and will not be released.
1076 *
1077 * In most cases, get_user_pages or get_user_pages_fast should be used
1078 * instead of __get_user_pages. __get_user_pages should be used only if
1079 * you need some special @gup_flags.
1080 */
1081static long __get_user_pages(struct mm_struct *mm,
1082                unsigned long start, unsigned long nr_pages,
1083                unsigned int gup_flags, struct page **pages,
1084                struct vm_area_struct **vmas, int *locked)
1085{
1086        long ret = 0, i = 0;
1087        struct vm_area_struct *vma = NULL;
1088        struct follow_page_context ctx = { NULL };
1089
1090        if (!nr_pages)
1091                return 0;
1092
1093        start = untagged_addr(start);
1094
1095        VM_BUG_ON(!!pages != !!(gup_flags & (FOLL_GET | FOLL_PIN)));
1096
1097        /*
1098         * If FOLL_FORCE is set then do not force a full fault as the hinting
1099         * fault information is unrelated to the reference behaviour of a task
1100         * using the address space
1101         */
1102        if (!(gup_flags & FOLL_FORCE))
1103                gup_flags |= FOLL_NUMA;
1104
1105        do {
1106                struct page *page;
1107                unsigned int foll_flags = gup_flags;
1108                unsigned int page_increm;
1109
1110                /* first iteration or cross vma bound */
1111                if (!vma || start >= vma->vm_end) {
1112                        vma = find_extend_vma(mm, start);
1113                        if (!vma && in_gate_area(mm, start)) {
1114                                ret = get_gate_page(mm, start & PAGE_MASK,
1115                                                gup_flags, &vma,
1116                                                pages ? &pages[i] : NULL);
1117                                if (ret)
1118                                        goto out;
1119                                ctx.page_mask = 0;
1120                                goto next_page;
1121                        }
1122
1123                        if (!vma) {
1124                                ret = -EFAULT;
1125                                goto out;
1126                        }
1127                        ret = check_vma_flags(vma, gup_flags);
1128                        if (ret)
1129                                goto out;
1130
1131                        if (is_vm_hugetlb_page(vma)) {
1132                                i = follow_hugetlb_page(mm, vma, pages, vmas,
1133                                                &start, &nr_pages, i,
1134                                                gup_flags, locked);
1135                                if (locked && *locked == 0) {
1136                                        /*
1137                                         * We've got a VM_FAULT_RETRY
1138                                         * and we've lost mmap_lock.
1139                                         * We must stop here.
1140                                         */
1141                                        BUG_ON(gup_flags & FOLL_NOWAIT);
1142                                        goto out;
1143                                }
1144                                continue;
1145                        }
1146                }
1147retry:
1148                /*
1149                 * If we have a pending SIGKILL, don't keep faulting pages and
1150                 * potentially allocating memory.
1151                 */
1152                if (fatal_signal_pending(current)) {
1153                        ret = -EINTR;
1154                        goto out;
1155                }
1156                cond_resched();
1157
1158                page = follow_page_mask(vma, start, foll_flags, &ctx);
1159                if (!page) {
1160                        ret = faultin_page(vma, start, &foll_flags, locked);
1161                        switch (ret) {
1162                        case 0:
1163                                goto retry;
1164                        case -EBUSY:
1165                                ret = 0;
1166                                fallthrough;
1167                        case -EFAULT:
1168                        case -ENOMEM:
1169                        case -EHWPOISON:
1170                                goto out;
1171                        case -ENOENT:
1172                                goto next_page;
1173                        }
1174                        BUG();
1175                } else if (PTR_ERR(page) == -EEXIST) {
1176                        /*
1177                         * Proper page table entry exists, but no corresponding
1178                         * struct page.
1179                         */
1180                        goto next_page;
1181                } else if (IS_ERR(page)) {
1182                        ret = PTR_ERR(page);
1183                        goto out;
1184                }
1185                if (pages) {
1186                        pages[i] = page;
1187                        flush_anon_page(vma, page, start);
1188                        flush_dcache_page(page);
1189                        ctx.page_mask = 0;
1190                }
1191next_page:
1192                if (vmas) {
1193                        vmas[i] = vma;
1194                        ctx.page_mask = 0;
1195                }
1196                page_increm = 1 + (~(start >> PAGE_SHIFT) & ctx.page_mask);
1197                if (page_increm > nr_pages)
1198                        page_increm = nr_pages;
1199                i += page_increm;
1200                start += page_increm * PAGE_SIZE;
1201                nr_pages -= page_increm;
1202        } while (nr_pages);
1203out:
1204        if (ctx.pgmap)
1205                put_dev_pagemap(ctx.pgmap);
1206        return i ? i : ret;
1207}
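
/*
 * Example (illustrative sketch): the dirty/release protocol described in the
 * comment above, using the public get_user_pages() wrapper; "addr" and "page"
 * are hypothetical caller variables:
 *
 *	mmap_read_lock(current->mm);
 *	ret = get_user_pages(addr, 1, FOLL_WRITE, &page, NULL);
 *	mmap_read_unlock(current->mm);
 *	if (ret != 1)
 *		return ret < 0 ? ret : -EFAULT;
 *	... modify the page contents ...
 *	set_page_dirty_lock(page);
 *	put_page(page);
 */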
1208
1209static bool vma_permits_fault(struct vm_area_struct *vma,
1210                              unsigned int fault_flags)
1211{
1212        bool write   = !!(fault_flags & FAULT_FLAG_WRITE);
1213        bool foreign = !!(fault_flags & FAULT_FLAG_REMOTE);
1214        vm_flags_t vm_flags = write ? VM_WRITE : VM_READ;
1215
1216        if (!(vm_flags & vma->vm_flags))
1217                return false;
1218
1219        /*
1220         * The architecture might have a hardware protection
1221         * mechanism other than read/write that can deny access.
1222         *
1223         * gup always represents data access, not instruction
1224         * fetches, so execute=false here:
1225         */
1226        if (!arch_vma_access_permitted(vma, write, false, foreign))
1227                return false;
1228
1229        return true;
1230}
1231
1232/**
1233 * fixup_user_fault() - manually resolve a user page fault
1234 * @mm:         mm_struct of target mm
1235 * @address:    user address
1236 * @fault_flags:flags to pass down to handle_mm_fault()
1237 * @unlocked:   did we unlock the mmap_lock while retrying, maybe NULL if caller
1238 *              does not allow retry. If NULL, the caller must guarantee
1239 *              that fault_flags does not contain FAULT_FLAG_ALLOW_RETRY.
1240 *
1241 * This is meant to be called in the specific scenario where for locking reasons
1242 * we try to access user memory in atomic context (within a pagefault_disable()
1243 * section), this returns -EFAULT, and we want to resolve the user fault before
1244 * trying again.
1245 *
1246 * Typically this is meant to be used by the futex code.
1247 *
1248 * The main difference with get_user_pages() is that this function will
1249 * unconditionally call handle_mm_fault() which will in turn perform all the
1250 * necessary SW fixup of the dirty and young bits in the PTE, while
1251 * get_user_pages() only guarantees to update these in the struct page.
1252 *
1253 * This is important for some architectures where those bits also gate the
1254 * access permission to the page because they are maintained in software.  On
1255 * such architectures, gup() will not be enough to make a subsequent access
1256 * succeed.
1257 *
 1258 * This function will not return with an unlocked mmap_lock. So it does not have
 1259 * the same semantics wrt the @mm->mmap_lock as filemap_fault() does.
1260 */
1261int fixup_user_fault(struct mm_struct *mm,
1262                     unsigned long address, unsigned int fault_flags,
1263                     bool *unlocked)
1264{
1265        struct vm_area_struct *vma;
1266        vm_fault_t ret;
1267
1268        address = untagged_addr(address);
1269
1270        if (unlocked)
1271                fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
1272
1273retry:
1274        vma = find_extend_vma(mm, address);
1275        if (!vma || address < vma->vm_start)
1276                return -EFAULT;
1277
1278        if (!vma_permits_fault(vma, fault_flags))
1279                return -EFAULT;
1280
1281        if ((fault_flags & FAULT_FLAG_KILLABLE) &&
1282            fatal_signal_pending(current))
1283                return -EINTR;
1284
1285        ret = handle_mm_fault(vma, address, fault_flags, NULL);
1286        if (ret & VM_FAULT_ERROR) {
1287                int err = vm_fault_to_errno(ret, 0);
1288
1289                if (err)
1290                        return err;
1291                BUG();
1292        }
1293
1294        if (ret & VM_FAULT_RETRY) {
1295                mmap_read_lock(mm);
1296                *unlocked = true;
1297                fault_flags |= FAULT_FLAG_TRIED;
1298                goto retry;
1299        }
1300
1301        return 0;
1302}
1303EXPORT_SYMBOL_GPL(fixup_user_fault);
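
/*
 * Example (illustrative sketch, close to what the futex code does): after an
 * access under pagefault_disable() has failed, resolve the fault and retry;
 * "uaddr" is a hypothetical user pointer:
 *
 *	mmap_read_lock(current->mm);
 *	ret = fixup_user_fault(current->mm, (unsigned long)uaddr,
 *			       FAULT_FLAG_WRITE, NULL);
 *	mmap_read_unlock(current->mm);
 *	if (ret < 0)
 *		return ret;
 *	(now retry the faulting access)
 */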
1304
1305/*
 1306 * Please note that this function, unlike __get_user_pages(), will not
 1307 * return 0 for nr_pages > 0 without FOLL_NOWAIT.
1308 */
1309static __always_inline long __get_user_pages_locked(struct mm_struct *mm,
1310                                                unsigned long start,
1311                                                unsigned long nr_pages,
1312                                                struct page **pages,
1313                                                struct vm_area_struct **vmas,
1314                                                int *locked,
1315                                                unsigned int flags)
1316{
1317        long ret, pages_done;
1318        bool lock_dropped;
1319
1320        if (locked) {
1321                /* if VM_FAULT_RETRY can be returned, vmas become invalid */
1322                BUG_ON(vmas);
1323                /* check caller initialized locked */
1324                BUG_ON(*locked != 1);
1325        }
1326
1327        if (flags & FOLL_PIN)
1328                mm_set_has_pinned_flag(&mm->flags);
1329
1330        /*
1331         * FOLL_PIN and FOLL_GET are mutually exclusive. Traditional behavior
1332         * is to set FOLL_GET if the caller wants pages[] filled in (but has
1333         * carelessly failed to specify FOLL_GET), so keep doing that, but only
1334         * for FOLL_GET, not for the newer FOLL_PIN.
1335         *
1336         * FOLL_PIN always expects pages to be non-null, but no need to assert
1337         * that here, as any failures will be obvious enough.
1338         */
1339        if (pages && !(flags & FOLL_PIN))
1340                flags |= FOLL_GET;
1341
1342        pages_done = 0;
1343        lock_dropped = false;
1344        for (;;) {
1345                ret = __get_user_pages(mm, start, nr_pages, flags, pages,
1346                                       vmas, locked);
1347                if (!locked)
1348                        /* VM_FAULT_RETRY couldn't trigger, bypass */
1349                        return ret;
1350
1351                /* VM_FAULT_RETRY cannot return errors */
1352                if (!*locked) {
1353                        BUG_ON(ret < 0);
1354                        BUG_ON(ret >= nr_pages);
1355                }
1356
1357                if (ret > 0) {
1358                        nr_pages -= ret;
1359                        pages_done += ret;
1360                        if (!nr_pages)
1361                                break;
1362                }
1363                if (*locked) {
1364                        /*
1365                         * VM_FAULT_RETRY didn't trigger or it was a
1366                         * FOLL_NOWAIT.
1367                         */
1368                        if (!pages_done)
1369                                pages_done = ret;
1370                        break;
1371                }
1372                /*
1373                 * VM_FAULT_RETRY triggered, so seek to the faulting offset.
1374                 * For the prefault case (!pages) we only update counts.
1375                 */
1376                if (likely(pages))
1377                        pages += ret;
1378                start += ret << PAGE_SHIFT;
1379                lock_dropped = true;
1380
1381retry:
1382                /*
1383                 * Repeat on the address that fired VM_FAULT_RETRY
1384                 * with both FAULT_FLAG_ALLOW_RETRY and
1385                 * FAULT_FLAG_TRIED.  Note that GUP can be interrupted
1386                 * by fatal signals, so we need to check it before we
1387                 * start trying again otherwise it can loop forever.
1388                 */
1389
1390                if (fatal_signal_pending(current)) {
1391                        if (!pages_done)
1392                                pages_done = -EINTR;
1393                        break;
1394                }
1395
1396                ret = mmap_read_lock_killable(mm);
1397                if (ret) {
1398                        BUG_ON(ret > 0);
1399                        if (!pages_done)
1400                                pages_done = ret;
1401                        break;
1402                }
1403
1404                *locked = 1;
1405                ret = __get_user_pages(mm, start, 1, flags | FOLL_TRIED,
1406                                       pages, NULL, locked);
1407                if (!*locked) {
1408                        /* Continue to retry until we succeed */
1409                        BUG_ON(ret != 0);
1410                        goto retry;
1411                }
1412                if (ret != 1) {
1413                        BUG_ON(ret > 1);
1414                        if (!pages_done)
1415                                pages_done = ret;
1416                        break;
1417                }
1418                nr_pages--;
1419                pages_done++;
1420                if (!nr_pages)
1421                        break;
1422                if (likely(pages))
1423                        pages++;
1424                start += PAGE_SIZE;
1425        }
1426        if (lock_dropped && *locked) {
1427                /*
1428                 * We must let the caller know we temporarily dropped the lock
1429                 * and so the critical section protected by it was lost.
1430                 */
1431                mmap_read_unlock(mm);
1432                *locked = 0;
1433        }
1434        return pages_done;
1435}
1436
1437/**
1438 * populate_vma_page_range() -  populate a range of pages in the vma.
1439 * @vma:   target vma
1440 * @start: start address
1441 * @end:   end address
1442 * @locked: whether the mmap_lock is still held
1443 *
1444 * This takes care of mlocking the pages too if VM_LOCKED is set.
1445 *
1446 * Return either number of pages pinned in the vma, or a negative error
1447 * code on error.
1448 *
1449 * vma->vm_mm->mmap_lock must be held.
1450 *
1451 * If @locked is NULL, it may be held for read or write and will
1452 * be unperturbed.
1453 *
1454 * If @locked is non-NULL, it must be held for read only and may be
1455 * released.  If it's released, *@locked will be set to 0.
1456 */
1457long populate_vma_page_range(struct vm_area_struct *vma,
1458                unsigned long start, unsigned long end, int *locked)
1459{
1460        struct mm_struct *mm = vma->vm_mm;
1461        unsigned long nr_pages = (end - start) / PAGE_SIZE;
1462        int gup_flags;
1463
1464        VM_BUG_ON(!PAGE_ALIGNED(start));
1465        VM_BUG_ON(!PAGE_ALIGNED(end));
1466        VM_BUG_ON_VMA(start < vma->vm_start, vma);
1467        VM_BUG_ON_VMA(end   > vma->vm_end, vma);
1468        mmap_assert_locked(mm);
1469
1470        gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK;
1471        if (vma->vm_flags & VM_LOCKONFAULT)
1472                gup_flags &= ~FOLL_POPULATE;
1473        /*
1474         * We want to touch writable mappings with a write fault in order
1475         * to break COW, except for shared mappings because these don't COW
1476         * and we would not want to dirty them for nothing.
1477         */
1478        if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
1479                gup_flags |= FOLL_WRITE;
1480
1481        /*
1482         * We want mlock to succeed for regions that have any permissions
1483         * other than PROT_NONE.
1484         */
1485        if (vma_is_accessible(vma))
1486                gup_flags |= FOLL_FORCE;
1487
1488        /*
1489         * We made sure addr is within a VMA, so the following will
1490         * not result in a stack expansion that recurses back here.
1491         */
1492        return __get_user_pages(mm, start, nr_pages, gup_flags,
1493                                NULL, NULL, locked);
1494}
1495
1496/*
1497 * faultin_vma_page_range() - populate (prefault) page tables inside the
1498 *                            given VMA range readable/writable
1499 *
1500 * This takes care of mlocking the pages, too, if VM_LOCKED is set.
1501 *
1502 * @vma: target vma
1503 * @start: start address
1504 * @end: end address
1505 * @write: whether to prefault readable or writable
1506 * @locked: whether the mmap_lock is still held
1507 *
1508 * Returns either number of processed pages in the vma, or a negative error
1509 * code on error (see __get_user_pages()).
1510 *
1511 * vma->vm_mm->mmap_lock must be held. The range must be page-aligned and
1512 * covered by the VMA.
1513 *
1514 * If @locked is NULL, it may be held for read or write and will be unperturbed.
1515 *
1516 * If @locked is non-NULL, it must be held for read only and may be released.  If
1517 * it's released, *@locked will be set to 0.
1518 */
1519long faultin_vma_page_range(struct vm_area_struct *vma, unsigned long start,
1520                            unsigned long end, bool write, int *locked)
1521{
1522        struct mm_struct *mm = vma->vm_mm;
1523        unsigned long nr_pages = (end - start) / PAGE_SIZE;
1524        int gup_flags;
1525
1526        VM_BUG_ON(!PAGE_ALIGNED(start));
1527        VM_BUG_ON(!PAGE_ALIGNED(end));
1528        VM_BUG_ON_VMA(start < vma->vm_start, vma);
1529        VM_BUG_ON_VMA(end > vma->vm_end, vma);
1530        mmap_assert_locked(mm);
1531
1532        /*
1533         * FOLL_TOUCH: Mark page accessed and thereby young; will also mark
1534         *             the page dirty with FOLL_WRITE -- which doesn't make a
1535         *             difference with !FOLL_FORCE, because the page is writable
1536         *             in the page table.
1537         * FOLL_HWPOISON: Return -EHWPOISON instead of -EFAULT when we hit
1538         *                a poisoned page.
1539         * FOLL_POPULATE: Always populate memory with VM_LOCKONFAULT.
1540         * !FOLL_FORCE: Require proper access permissions.
1541         */
1542        gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK | FOLL_HWPOISON;
1543        if (write)
1544                gup_flags |= FOLL_WRITE;
1545
1546        /*
1547         * We want to report -EINVAL instead of -EFAULT for any permission
1548         * problems or incompatible mappings.
1549         */
1550        if (check_vma_flags(vma, gup_flags))
1551                return -EINVAL;
1552
1553        return __get_user_pages(mm, start, nr_pages, gup_flags,
1554                                NULL, NULL, locked);
1555}
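
/*
 * Illustrative sketch (not part of this file): a MADV_POPULATE_WRITE-style
 * caller might prefault one VMA's worth of a range roughly as follows; the
 * clamping of [start, end) to the VMA and all error handling are omitted,
 * and every name other than faultin_vma_page_range() is a placeholder:
 *
 *      int locked = 1;
 *      long ret = -EFAULT;
 *
 *      mmap_read_lock(mm);
 *      vma = find_vma(mm, start);
 *      if (vma && start >= vma->vm_start && end <= vma->vm_end)
 *              ret = faultin_vma_page_range(vma, start, end, true, &locked);
 *      if (locked)
 *              mmap_read_unlock(mm);
 */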
1556
1557/*
1558 * __mm_populate - populate and/or mlock pages within a range of address space.
1559 *
1560 * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap
1561 * flags. VMAs must be already marked with the desired vm_flags, and
1562 * mmap_lock must not be held.
1563 */
1564int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
1565{
1566        struct mm_struct *mm = current->mm;
1567        unsigned long end, nstart, nend;
1568        struct vm_area_struct *vma = NULL;
1569        int locked = 0;
1570        long ret = 0;
1571
1572        end = start + len;
1573
1574        for (nstart = start; nstart < end; nstart = nend) {
1575                /*
1576                 * We want to fault in pages for [nstart; end) address range.
1577                 * Find first corresponding VMA.
1578                 */
1579                if (!locked) {
1580                        locked = 1;
1581                        mmap_read_lock(mm);
1582                        vma = find_vma(mm, nstart);
1583                } else if (nstart >= vma->vm_end)
1584                        vma = vma->vm_next;
1585                if (!vma || vma->vm_start >= end)
1586                        break;
1587                /*
1588                 * Set [nstart; nend) to intersection of desired address
1589                 * range with the first VMA. Also, skip undesirable VMA types.
1590                 */
1591                nend = min(end, vma->vm_end);
1592                if (vma->vm_flags & (VM_IO | VM_PFNMAP))
1593                        continue;
1594                if (nstart < vma->vm_start)
1595                        nstart = vma->vm_start;
1596                /*
1597                 * Now fault in a range of pages. populate_vma_page_range()
1598                 * double checks the vma flags, so that it won't mlock pages
1599                 * if the vma was already munlocked.
1600                 */
1601                ret = populate_vma_page_range(vma, nstart, nend, &locked);
1602                if (ret < 0) {
1603                        if (ignore_errors) {
1604                                ret = 0;
1605                                continue;       /* continue at next VMA */
1606                        }
1607                        break;
1608                }
1609                nend = nstart + ret * PAGE_SIZE;
1610                ret = 0;
1611        }
1612        if (locked)
1613                mmap_read_unlock(mm);
1614        return ret;     /* 0 or negative error code */
1615}
1616#else /* CONFIG_MMU */
1617static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start,
1618                unsigned long nr_pages, struct page **pages,
1619                struct vm_area_struct **vmas, int *locked,
1620                unsigned int foll_flags)
1621{
1622        struct vm_area_struct *vma;
1623        unsigned long vm_flags;
1624        long i;
1625
1626        /* calculate required read or write permissions.
1627         * If FOLL_FORCE is set, we only require the "MAY" flags.
1628         */
1629        vm_flags  = (foll_flags & FOLL_WRITE) ?
1630                        (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
1631        vm_flags &= (foll_flags & FOLL_FORCE) ?
1632                        (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
1633
1634        for (i = 0; i < nr_pages; i++) {
1635                vma = find_vma(mm, start);
1636                if (!vma)
1637                        goto finish_or_fault;
1638
1639                /* protect what we can, including chardevs */
1640                if ((vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
1641                    !(vm_flags & vma->vm_flags))
1642                        goto finish_or_fault;
1643
1644                if (pages) {
1645                        pages[i] = virt_to_page(start);
1646                        if (pages[i])
1647                                get_page(pages[i]);
1648                }
1649                if (vmas)
1650                        vmas[i] = vma;
1651                start = (start + PAGE_SIZE) & PAGE_MASK;
1652        }
1653
1654        return i;
1655
1656finish_or_fault:
1657        return i ? : -EFAULT;
1658}
1659#endif /* !CONFIG_MMU */
1660
1661/**
1662 * fault_in_writeable - fault in userspace address range for writing
1663 * @uaddr: start of address range
1664 * @size: size of address range
1665 *
1666 * Returns the number of bytes not faulted in (like copy_to_user() and
1667 * copy_from_user()).
1668 */
1669size_t fault_in_writeable(char __user *uaddr, size_t size)
1670{
1671        char __user *start = uaddr, *end;
1672
1673        if (unlikely(size == 0))
1674                return 0;
1675        if (!PAGE_ALIGNED(uaddr)) {
1676                if (unlikely(__put_user(0, uaddr) != 0))
1677                        return size;
1678                uaddr = (char __user *)PAGE_ALIGN((unsigned long)uaddr);
1679        }
1680        end = (char __user *)PAGE_ALIGN((unsigned long)start + size);
1681        if (unlikely(end < start))
1682                end = NULL;
1683        while (uaddr != end) {
1684                if (unlikely(__put_user(0, uaddr) != 0))
1685                        goto out;
1686                uaddr += PAGE_SIZE;
1687        }
1688
1689out:
1690        if (size > uaddr - start)
1691                return size - (uaddr - start);
1692        return 0;
1693}
1694EXPORT_SYMBOL(fault_in_writeable);
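
/*
 * Illustrative sketch (not part of this file): fault_in_writeable() is
 * commonly paired with a copy performed while page faults are disabled
 * (for instance because a lock is held), retrying until either the copy
 * completes or the destination really is unusable. ubuf, kbuf and len are
 * placeholders in this sketch:
 *
 *      size_t done = 0, left;
 *
 *      while (done < len) {
 *              pagefault_disable();
 *              left = copy_to_user(ubuf + done, kbuf + done, len - done);
 *              pagefault_enable();
 *              done += len - done - left;
 *              if (!left)
 *                      break;
 *              if (fault_in_writeable(ubuf + done, left) == left)
 *                      return -EFAULT; /* nothing could be faulted in */
 *      }
 */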
1695
1696/*
1697 * fault_in_safe_writeable - fault in an address range for writing
1698 * @uaddr: start of address range
1699 * @size: length of address range
1700 *
1701 * Faults in an address range using get_user_pages, i.e., without triggering
1702 * hardware page faults.  This is primarily useful when we already know that
1703 * some or all of the pages in the address range aren't in memory.
1704 *
1705 * Unlike fault_in_writeable(), this function is non-destructive.
1706 *
1707 * Note that we don't pin or otherwise hold the pages that we fault in;
1708 * there's no guarantee that they'll stay in memory for any length of
1709 * time.
1710 *
1711 * Returns the number of bytes not faulted in, like copy_to_user() and
1712 * copy_from_user().
1713 */
1714size_t fault_in_safe_writeable(const char __user *uaddr, size_t size)
1715{
1716        unsigned long start = (unsigned long)untagged_addr(uaddr);
1717        unsigned long end, nstart, nend;
1718        struct mm_struct *mm = current->mm;
1719        struct vm_area_struct *vma = NULL;
1720        int locked = 0;
1721
1722        nstart = start & PAGE_MASK;
1723        end = PAGE_ALIGN(start + size);
1724        if (end < nstart)
1725                end = 0;
1726        for (; nstart != end; nstart = nend) {
1727                unsigned long nr_pages;
1728                long ret;
1729
1730                if (!locked) {
1731                        locked = 1;
1732                        mmap_read_lock(mm);
1733                        vma = find_vma(mm, nstart);
1734                } else if (nstart >= vma->vm_end)
1735                        vma = vma->vm_next;
1736                if (!vma || vma->vm_start >= end)
1737                        break;
1738                nend = end ? min(end, vma->vm_end) : vma->vm_end;
1739                if (vma->vm_flags & (VM_IO | VM_PFNMAP))
1740                        continue;
1741                if (nstart < vma->vm_start)
1742                        nstart = vma->vm_start;
1743                nr_pages = (nend - nstart) / PAGE_SIZE;
1744                ret = __get_user_pages_locked(mm, nstart, nr_pages,
1745                                              NULL, NULL, &locked,
1746                                              FOLL_TOUCH | FOLL_WRITE);
1747                if (ret <= 0)
1748                        break;
1749                nend = nstart + ret * PAGE_SIZE;
1750        }
1751        if (locked)
1752                mmap_read_unlock(mm);
1753        if (nstart == end)
1754                return 0;
1755        return size - min_t(size_t, nstart - start, size);
1756}
1757EXPORT_SYMBOL(fault_in_safe_writeable);
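
/*
 * Illustrative sketch (not part of this file): in a copy-retry loop, a caller
 * whose user buffer may already hold partially copied data uses this
 * non-destructive variant rather than fault_in_writeable(), since faulting in
 * with __put_user(0, ...) would clobber those bytes. ubuf, done and left are
 * placeholders:
 *
 *      if (fault_in_safe_writeable(ubuf + done, left) == left)
 *              return done ? done : -EFAULT;
 */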
1758
1759/**
1760 * fault_in_readable - fault in userspace address range for reading
1761 * @uaddr: start of user address range
1762 * @size: size of user address range
1763 *
1764 * Returns the number of bytes not faulted in (like copy_to_user() and
1765 * copy_from_user()).
1766 */
1767size_t fault_in_readable(const char __user *uaddr, size_t size)
1768{
1769        const char __user *start = uaddr, *end;
1770        volatile char c;
1771
1772        if (unlikely(size == 0))
1773                return 0;
1774        if (!PAGE_ALIGNED(uaddr)) {
1775                if (unlikely(__get_user(c, uaddr) != 0))
1776                        return size;
1777                uaddr = (const char __user *)PAGE_ALIGN((unsigned long)uaddr);
1778        }
1779        end = (const char __user *)PAGE_ALIGN((unsigned long)start + size);
1780        if (unlikely(end < start))
1781                end = NULL;
1782        while (uaddr != end) {
1783                if (unlikely(__get_user(c, uaddr) != 0))
1784                        goto out;
1785                uaddr += PAGE_SIZE;
1786        }
1787
1788out:
1789        (void)c;
1790        if (size > uaddr - start)
1791                return size - (uaddr - start);
1792        return 0;
1793}
1794EXPORT_SYMBOL(fault_in_readable);
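
/*
 * Illustrative sketch (not part of this file): the read-side counterpart,
 * e.g. a write() path that must copy from the user buffer while page faults
 * are disabled. ubuf, kbuf and len are placeholders:
 *
 *      while (len) {
 *              if (fault_in_readable(ubuf, len) == len)
 *                      return -EFAULT;         /* nothing readable */
 *              pagefault_disable();
 *              left = copy_from_user(kbuf, ubuf, len);
 *              pagefault_enable();
 *              kbuf += len - left;
 *              ubuf += len - left;
 *              len = left;
 *      }
 */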
1795
1796/**
1797 * get_dump_page() - pin user page in memory while writing it to core dump
1798 * @addr: user address
1799 *
1800 * Returns struct page pointer of user page pinned for dump,
1801 * to be freed afterwards by put_page().
1802 *
1803 * Returns NULL on any kind of failure - a hole must then be inserted into
1804 * the corefile, to preserve alignment with its headers; and also returns
1805 * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found -
1806 * allowing a hole to be left in the corefile to save disk space.
1807 *
1808 * Called without mmap_lock (takes and releases the mmap_lock by itself).
1809 */
1810#ifdef CONFIG_ELF_CORE
1811struct page *get_dump_page(unsigned long addr)
1812{
1813        struct mm_struct *mm = current->mm;
1814        struct page *page;
1815        int locked = 1;
1816        int ret;
1817
1818        if (mmap_read_lock_killable(mm))
1819                return NULL;
1820        ret = __get_user_pages_locked(mm, addr, 1, &page, NULL, &locked,
1821                                      FOLL_FORCE | FOLL_DUMP | FOLL_GET);
1822        if (locked)
1823                mmap_read_unlock(mm);
1824        return (ret == 1) ? page : NULL;
1825}
1826#endif /* CONFIG_ELF_CORE */
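
/*
 * Illustrative sketch (not part of this file): a coredump writer walks the
 * dumped range and emits a hole for every NULL return. cprm, dump_page() and
 * dump_skip_hole() are hypothetical stand-ins for the real coredump output
 * helpers:
 *
 *      for (addr = start; addr < end; addr += PAGE_SIZE) {
 *              struct page *page = get_dump_page(addr);
 *
 *              if (page) {
 *                      int ok = dump_page(cprm, page);
 *
 *                      put_page(page);
 *                      if (!ok)
 *                              return 0;
 *              } else {
 *                      dump_skip_hole(cprm, PAGE_SIZE);
 *              }
 *      }
 */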
1827
1828#ifdef CONFIG_MIGRATION
1829/*
1830 * Check whether all pages are pinnable; if so, return the number of pages.  If some
1831 * pages are not pinnable, migrate them, and unpin all pages. Return zero if
1832 * pages were migrated, or if some pages were not successfully isolated.
1833 * Return negative error if migration fails.
1834 */
1835static long check_and_migrate_movable_pages(unsigned long nr_pages,
1836                                            struct page **pages,
1837                                            unsigned int gup_flags)
1838{
1839        unsigned long i;
1840        unsigned long isolation_error_count = 0;
1841        bool drain_allow = true;
1842        LIST_HEAD(movable_page_list);
1843        long ret = 0;
1844        struct page *prev_head = NULL;
1845        struct page *head;
1846        struct migration_target_control mtc = {
1847                .nid = NUMA_NO_NODE,
1848                .gfp_mask = GFP_USER | __GFP_NOWARN,
1849        };
1850
1851        for (i = 0; i < nr_pages; i++) {
1852                head = compound_head(pages[i]);
1853                if (head == prev_head)
1854                        continue;
1855                prev_head = head;
1856                /*
1857                 * If we get a movable page, since we are going to be pinning
1858                 * these entries, try to move them out if possible.
1859                 */
1860                if (!is_pinnable_page(head)) {
1861                        if (PageHuge(head)) {
1862                                if (!isolate_huge_page(head, &movable_page_list))
1863                                        isolation_error_count++;
1864                        } else {
1865                                if (!PageLRU(head) && drain_allow) {
1866                                        lru_add_drain_all();
1867                                        drain_allow = false;
1868                                }
1869
1870                                if (isolate_lru_page(head)) {
1871                                        isolation_error_count++;
1872                                        continue;
1873                                }
1874                                list_add_tail(&head->lru, &movable_page_list);
1875                                mod_node_page_state(page_pgdat(head),
1876                                                    NR_ISOLATED_ANON +
1877                                                    page_is_file_lru(head),
1878                                                    thp_nr_pages(head));
1879                        }
1880                }
1881        }
1882
1883        /*
1884         * If the list is empty and there were no isolation errors, all pages
1885         * are in the correct zone.
1886         */
1887        if (list_empty(&movable_page_list) && !isolation_error_count)
1888                return nr_pages;
1889
1890        if (gup_flags & FOLL_PIN) {
1891                unpin_user_pages(pages, nr_pages);
1892        } else {
1893                for (i = 0; i < nr_pages; i++)
1894                        put_page(pages[i]);
1895        }
1896        if (!list_empty(&movable_page_list)) {
1897                ret = migrate_pages(&movable_page_list, alloc_migration_target,
1898                                    NULL, (unsigned long)&mtc, MIGRATE_SYNC,
1899                                    MR_LONGTERM_PIN, NULL);
1900                if (ret && !list_empty(&movable_page_list))
1901                        putback_movable_pages(&movable_page_list);
1902        }
1903
1904        return ret > 0 ? -ENOMEM : ret;
1905}
1906#else
1907static long check_and_migrate_movable_pages(unsigned long nr_pages,
1908                                            struct page **pages,
1909                                            unsigned int gup_flags)
1910{
1911        return nr_pages;
1912}
1913#endif /* CONFIG_MIGRATION */
1914
1915/*
1916 * __gup_longterm_locked() is a wrapper for __get_user_pages_locked which
1917 * allows us to process the FOLL_LONGTERM flag.
1918 */
1919static long __gup_longterm_locked(struct mm_struct *mm,
1920                                  unsigned long start,
1921                                  unsigned long nr_pages,
1922                                  struct page **pages,
1923                                  struct vm_area_struct **vmas,
1924                                  unsigned int gup_flags)
1925{
1926        unsigned int flags;
1927        long rc;
1928
1929        if (!(gup_flags & FOLL_LONGTERM))
1930                return __get_user_pages_locked(mm, start, nr_pages, pages, vmas,
1931                                               NULL, gup_flags);
1932        flags = memalloc_pin_save();
1933        do {
1934                rc = __get_user_pages_locked(mm, start, nr_pages, pages, vmas,
1935                                             NULL, gup_flags);
1936                if (rc <= 0)
1937                        break;
1938                rc = check_and_migrate_movable_pages(rc, pages, gup_flags);
1939        } while (!rc);
1940        memalloc_pin_restore(flags);
1941
1942        return rc;
1943}
1944
1945static bool is_valid_gup_flags(unsigned int gup_flags)
1946{
1947        /*
1948         * FOLL_PIN must only be set internally by the pin_user_pages*() APIs,
1949         * never directly by the caller, so enforce that with an assertion:
1950         */
1951        if (WARN_ON_ONCE(gup_flags & FOLL_PIN))
1952                return false;
1953        /*
1954         * FOLL_PIN is a prerequisite to FOLL_LONGTERM. Another way of saying
1955         * that is, FOLL_LONGTERM is a particular, more restrictive case of
1956         * FOLL_PIN.
1957         */
1958        if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
1959                return false;
1960
1961        return true;
1962}
1963
1964#ifdef CONFIG_MMU
1965static long __get_user_pages_remote(struct mm_struct *mm,
1966                                    unsigned long start, unsigned long nr_pages,
1967                                    unsigned int gup_flags, struct page **pages,
1968                                    struct vm_area_struct **vmas, int *locked)
1969{
1970        /*
1971         * Parts of FOLL_LONGTERM behavior are incompatible with
1972         * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
1973         * vmas. However, this only comes up if locked is set, and there are
1974         * callers that do request FOLL_LONGTERM, but do not set locked. So,
1975         * allow what we can.
1976         */
1977        if (gup_flags & FOLL_LONGTERM) {
1978                if (WARN_ON_ONCE(locked))
1979                        return -EINVAL;
1980                /*
1981                 * This will check the vmas (even if our vmas arg is NULL)
1982                 * and return -ENOTSUPP if DAX isn't allowed in this case:
1983                 */
1984                return __gup_longterm_locked(mm, start, nr_pages, pages,
1985                                             vmas, gup_flags | FOLL_TOUCH |
1986                                             FOLL_REMOTE);
1987        }
1988
1989        return __get_user_pages_locked(mm, start, nr_pages, pages, vmas,
1990                                       locked,
1991                                       gup_flags | FOLL_TOUCH | FOLL_REMOTE);
1992}
1993
1994/**
1995 * get_user_pages_remote() - pin user pages in memory
1996 * @mm:         mm_struct of target mm
1997 * @start:      starting user address
1998 * @nr_pages:   number of pages from start to pin
1999 * @gup_flags:  flags modifying lookup behaviour
2000 * @pages:      array that receives pointers to the pages pinned.
2001 *              Should be at least nr_pages long. Or NULL, if caller
2002 *              only intends to ensure the pages are faulted in.
2003 * @vmas:       array of pointers to vmas corresponding to each page.
2004 *              Or NULL if the caller does not require them.
2005 * @locked:     pointer to lock flag indicating whether lock is held and
2006 *              subsequently whether VM_FAULT_RETRY functionality can be
2007 *              utilised. Lock must initially be held.
2008 *
2009 * Returns either number of pages pinned (which may be less than the
2010 * number requested), or an error. Details about the return value:
2011 *
2012 * -- If nr_pages is 0, returns 0.
2013 * -- If nr_pages is >0, but no pages were pinned, returns -errno.
2014 * -- If nr_pages is >0, and some pages were pinned, returns the number of
2015 *    pages pinned. Again, this may be less than nr_pages.
2016 *
2017 * The caller is responsible for releasing returned @pages, via put_page().
2018 *
2019 * @vmas are valid only as long as mmap_lock is held.
2020 *
2021 * Must be called with mmap_lock held for read or write.
2022 *
2023 * get_user_pages_remote walks a process's page tables and takes a reference
2024 * to each struct page that each user address corresponds to at a given
2025 * instant. That is, it takes the page that would be accessed if a user
2026 * thread accesses the given user virtual address at that instant.
2027 *
2028 * This does not guarantee that the page exists in the user mappings when
2029 * get_user_pages_remote returns, and there may even be a completely different
2030 * page there in some cases (e.g. if mmapped pagecache has been invalidated
2031 * and subsequently re-faulted). However, it does guarantee that the page
2032 * won't be freed completely. And mostly callers simply care that the page
2033 * contains data that was valid *at some point in time*. Typically, an IO
2034 * or similar operation cannot guarantee anything stronger anyway because
2035 * locks can't be held over the syscall boundary.
2036 *
2037 * If gup_flags & FOLL_WRITE == 0, the page must not be written to. If the page
2038 * is written to, set_page_dirty (or set_page_dirty_lock, as appropriate) must
2039 * be called after the page is finished with, and before put_page is called.
2040 *
2041 * get_user_pages_remote is typically used for fewer-copy IO operations,
2042 * to get a handle on the memory by some means other than accesses
2043 * via the user virtual addresses. The pages may be submitted for
2044 * DMA to devices or accessed via their kernel linear mapping (via the
2045 * kmap APIs). Care should be taken to use the correct cache flushing APIs.
2046 *
2047 * See also get_user_pages_fast, for performance critical applications.
2048 *
2049 * get_user_pages_remote should be phased out in favor of
2050 * get_user_pages_locked|unlocked or get_user_pages_fast where possible;
2051 * callers that pass a NULL @locked cannot make use of
2052 * FAULT_FLAG_ALLOW_RETRY in handle_mm_fault.
2053 */
2054long get_user_pages_remote(struct mm_struct *mm,
2055                unsigned long start, unsigned long nr_pages,
2056                unsigned int gup_flags, struct page **pages,
2057                struct vm_area_struct **vmas, int *locked)
2058{
2059        if (!is_valid_gup_flags(gup_flags))
2060                return -EINVAL;
2061
2062        return __get_user_pages_remote(mm, start, nr_pages, gup_flags,
2063                                       pages, vmas, locked);
2064}
2065EXPORT_SYMBOL(get_user_pages_remote);
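
/*
 * Illustrative sketch (not part of this file): pinning pages of another
 * process's address space with the @locked protocol, so the fault path may
 * temporarily drop mmap_lock. mm, start, npages and pages are placeholders
 * and error handling is abbreviated:
 *
 *      int locked = 1;
 *      long i, pinned;
 *
 *      mmap_read_lock(mm);
 *      pinned = get_user_pages_remote(mm, start, npages, FOLL_WRITE,
 *                                     pages, NULL, &locked);
 *      if (locked)
 *              mmap_read_unlock(mm);
 *
 *      for (i = 0; i < pinned; i++) {
 *              ... access the page, e.g. via kmap_local_page() ...
 *              set_page_dirty_lock(pages[i]);
 *              put_page(pages[i]);
 *      }
 */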
2066
2067#else /* CONFIG_MMU */
2068long get_user_pages_remote(struct mm_struct *mm,
2069                           unsigned long start, unsigned long nr_pages,
2070                           unsigned int gup_flags, struct page **pages,
2071                           struct vm_area_struct **vmas, int *locked)
2072{
2073        return 0;
2074}
2075
2076static long __get_user_pages_remote(struct mm_struct *mm,
2077                                    unsigned long start, unsigned long nr_pages,
2078                                    unsigned int gup_flags, struct page **pages,
2079                                    struct vm_area_struct **vmas, int *locked)
2080{
2081        return 0;
2082}
2083#endif /* !CONFIG_MMU */
2084
2085/**
2086 * get_user_pages() - pin user pages in memory
2087 * @start:      starting user address
2088 * @nr_pages:   number of pages from start to pin
2089 * @gup_flags:  flags modifying lookup behaviour
2090 * @pages:      array that receives pointers to the pages pinned.
2091 *              Should be at least nr_pages long. Or NULL, if caller
2092 *              only intends to ensure the pages are faulted in.
2093 * @vmas:       array of pointers to vmas corresponding to each page.
2094 *              Or NULL if the caller does not require them.
2095 *
2096 * This is the same as get_user_pages_remote(), just with a less-flexible
2097 * calling convention where we assume that the mm being operated on belongs to
2098 * the current task, and doesn't allow passing of a locked parameter.  We also
2099 * obviously don't pass FOLL_REMOTE in here.
2100 */
2101long get_user_pages(unsigned long start, unsigned long nr_pages,
2102                unsigned int gup_flags, struct page **pages,
2103                struct vm_area_struct **vmas)
2104{
2105        if (!is_valid_gup_flags(gup_flags))
2106                return -EINVAL;
2107
2108        return __gup_longterm_locked(current->mm, start, nr_pages,
2109                                     pages, vmas, gup_flags | FOLL_TOUCH);
2110}
2111EXPORT_SYMBOL(get_user_pages);
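
/*
 * Illustrative sketch (not part of this file): pinning a buffer in the
 * current task's address space for a short-lived write into it, then
 * releasing it (long-term or DMA pinning would use the pin_user_pages*()
 * APIs instead). uaddr, npages and pages are placeholders:
 *
 *      long i, pinned;
 *
 *      mmap_read_lock(current->mm);
 *      pinned = get_user_pages(uaddr, npages, FOLL_WRITE, pages, NULL);
 *      mmap_read_unlock(current->mm);
 *      if (pinned <= 0)
 *              return pinned ? pinned : -EFAULT;
 *
 *      ... fill the pages ...
 *
 *      for (i = 0; i < pinned; i++) {
 *              set_page_dirty_lock(pages[i]);
 *              put_page(pages[i]);
 *      }
 */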
2112
2113/**
2114 * get_user_pages_locked() - variant of get_user_pages()
2115 *
2116 * @start:      starting user address
2117 * @nr_pages:   number of pages from start to pin
2118 * @gup_flags:  flags modifying lookup behaviour
2119 * @pages:      array that receives pointers to the pages pinned.
2120 *              Should be at least nr_pages long. Or NULL, if caller
2121 *              only intends to ensure the pages are faulted in.
2122 * @locked:     pointer to lock flag indicating whether lock is held and
2123 *              subsequently whether VM_FAULT_RETRY functionality can be
2124 *              utilised. Lock must initially be held.
2125 *
2126 * It is suitable to replace the form:
2127 *
2128 *      mmap_read_lock(mm);
2129 *      do_something()
2130 *      get_user_pages(..., pages, NULL);
2131 *      mmap_read_unlock(mm);
2132 *
2133 *  to:
2134 *
2135 *      int locked = 1;
2136 *      mmap_read_lock(mm);
2137 *      do_something()
2138 *      get_user_pages_locked(..., pages, &locked);
2139 *      if (locked)
2140 *          mmap_read_unlock(mm);
2141 *
2142 * We can leverage the VM_FAULT_RETRY functionality in the page fault
2143 * paths better by using either get_user_pages_locked() or
2144 * get_user_pages_unlocked().
2145 *
2146 */
2147long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
2148                           unsigned int gup_flags, struct page **pages,
2149                           int *locked)
2150{
2151        /*
2152         * FIXME: Current FOLL_LONGTERM behavior is incompatible with
2153         * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
2154         * vmas.  As there are no users of this flag in this call we simply
2155         * disallow this option for now.
2156         */
2157        if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
2158                return -EINVAL;
2159        /*
2160         * FOLL_PIN must only be set internally by the pin_user_pages*() APIs,
2161         * never directly by the caller, so enforce that:
2162         */
2163        if (WARN_ON_ONCE(gup_flags & FOLL_PIN))
2164                return -EINVAL;
2165
2166        return __get_user_pages_locked(current->mm, start, nr_pages,
2167                                       pages, NULL, locked,
2168                                       gup_flags | FOLL_TOUCH);
2169}
2170EXPORT_SYMBOL(get_user_pages_locked);
2171
2172/*
2173 * get_user_pages_unlocked() is suitable to replace the form:
2174 *
2175 *      mmap_read_lock(mm);
2176 *      get_user_pages(..., pages, NULL);
2177 *      mmap_read_unlock(mm);
2178 *
2179 *  with:
2180 *
2181 *      get_user_pages_unlocked(..., pages);
2182 *
2183 * It is functionally equivalent to get_user_pages_fast, so
2184 * get_user_pages_fast should be used instead if specific gup_flags
2185 * (e.g. FOLL_FORCE) are not required.
2186 */
2187long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
2188                             struct page **pages, unsigned int gup_flags)
2189{
2190        struct mm_struct *mm = current->mm;
2191        int locked = 1;
2192        long ret;
2193
2194        /*
2195         * FIXME: Current FOLL_LONGTERM behavior is incompatible with
2196         * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
2197         * vmas.  As there are no users of this flag in this call we simply
2198         * disallow this option for now.
2199         */
2200        if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
2201                return -EINVAL;
2202
2203        mmap_read_lock(mm);
2204        ret = __get_user_pages_locked(mm, start, nr_pages, pages, NULL,
2205                                      &locked, gup_flags | FOLL_TOUCH);
2206        if (locked)
2207                mmap_read_unlock(mm);
2208        return ret;
2209}
2210EXPORT_SYMBOL(get_user_pages_unlocked);
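
/*
 * Illustrative sketch (not part of this file): because this variant takes and
 * drops mmap_lock internally, a caller simply pins and later releases. uaddr,
 * npages and pages are placeholders:
 *
 *      long i, pinned;
 *
 *      pinned = get_user_pages_unlocked(uaddr, npages, pages, FOLL_WRITE);
 *      for (i = 0; i < pinned; i++) {
 *              set_page_dirty_lock(pages[i]);
 *              put_page(pages[i]);
 *      }
 */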
2211
2212/*
2213 * Fast GUP
2214 *
2215 * get_user_pages_fast attempts to pin user pages by walking the page
2216 * tables directly and avoids taking locks. Thus the walker needs to be
2217 * protected from page table pages being freed from under it, and should
2218 * block any THP splits.
2219 *
2220 * One way to achieve this is to have the walker disable interrupts, and
2221 * rely on IPIs from the TLB flushing code blocking before the page table
2222 * pages are freed. This is unsuitable for architectures that do not need
2223 * to broadcast an IPI when invalidating TLBs.
2224 *
2225 * Another way to achieve this is to batch up the pages containing page tables
2226 * belonging to more than one mm_user, then rcu_sched a callback to free those
2227 * pages. Disabling interrupts will allow the fast_gup walker to both block
2228 * the rcu_sched callback, and an IPI that we broadcast for splitting THPs
2229 * (which is a relatively rare event). The code below adopts this strategy.
2230 *
2231 * Before activating this code, please be aware that the following assumptions
2232 * are currently made:
2233 *
2234 *  *) Either MMU_GATHER_RCU_TABLE_FREE is enabled, and tlb_remove_table() is used to
2235 *  free pages containing page tables, or TLB flushing requires IPI broadcast.
2236 *
2237 *  *) ptes can be read atomically by the architecture.
2238 *
2239 *  *) access_ok is sufficient to validate userspace address ranges.
2240 *
2241 * The last two assumptions can be relaxed by the addition of helper functions.
2242 *
2243 * This code is based heavily on the PowerPC implementation by Nick Piggin.
2244 */
2245#ifdef CONFIG_HAVE_FAST_GUP
2246
2247static void __maybe_unused undo_dev_pagemap(int *nr, int nr_start,
2248                                            unsigned int flags,
2249                                            struct page **pages)
2250{
2251        while ((*nr) - nr_start) {
2252                struct page *page = pages[--(*nr)];
2253
2254                ClearPageReferenced(page);
2255                if (flags & FOLL_PIN)
2256                        unpin_user_page(page);
2257                else
2258                        put_page(page);
2259        }
2260}
2261
2262#ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
2263static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
2264                         unsigned int flags, struct page **pages, int *nr)
2265{
2266        struct dev_pagemap *pgmap = NULL;
2267        int nr_start = *nr, ret = 0;
2268        pte_t *ptep, *ptem;
2269
2270        ptem = ptep = pte_offset_map(&pmd, addr);
2271        do {
2272                pte_t pte = ptep_get_lockless(ptep);
2273                struct page *head, *page;
2274
2275                /*
2276                 * Similar to the PMD case below, NUMA hinting must take slow
2277                 * path using the pte_protnone check.
2278                 */
2279                if (pte_protnone(pte))
2280                        goto pte_unmap;
2281
2282                if (!pte_access_permitted(pte, flags & FOLL_WRITE))
2283                        goto pte_unmap;
2284
2285                if (pte_devmap(pte)) {
2286                        if (unlikely(flags & FOLL_LONGTERM))
2287                                goto pte_unmap;
2288
2289                        pgmap = get_dev_pagemap(pte_pfn(pte), pgmap);
2290                        if (unlikely(!pgmap)) {
2291                                undo_dev_pagemap(nr, nr_start, flags, pages);
2292                                goto pte_unmap;
2293                        }
2294                } else if (pte_special(pte))
2295                        goto pte_unmap;
2296
2297                VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
2298                page = pte_page(pte);
2299
2300                head = try_grab_compound_head(page, 1, flags);
2301                if (!head)
2302                        goto pte_unmap;
2303
2304                if (unlikely(page_is_secretmem(page))) {
2305                        put_compound_head(head, 1, flags);
2306                        goto pte_unmap;
2307                }
2308
2309                if (unlikely(pte_val(pte) != pte_val(*ptep))) {
2310                        put_compound_head(head, 1, flags);
2311                        goto pte_unmap;
2312                }
2313
2314                VM_BUG_ON_PAGE(compound_head(page) != head, page);
2315
2316                /*
2317                 * We need to make the page accessible if and only if we are
2318                 * going to access its content (the FOLL_PIN case).  Please
2319                 * see Documentation/core-api/pin_user_pages.rst for
2320                 * details.
2321                 */
2322                if (flags & FOLL_PIN) {
2323                        ret = arch_make_page_accessible(page);
2324                        if (ret) {
2325                                unpin_user_page(page);
2326                                goto pte_unmap;
2327                        }
2328                }
2329                SetPageReferenced(page);
2330                pages[*nr] = page;
2331                (*nr)++;
2332
2333        } while (ptep++, addr += PAGE_SIZE, addr != end);
2334
2335        ret = 1;
2336
2337pte_unmap:
2338        if (pgmap)
2339                put_dev_pagemap(pgmap);
2340        pte_unmap(ptem);
2341        return ret;
2342}
2343#else
2344
2345/*
2346 * If we can't determine whether or not a pte is special, then fail immediately
2347 * for ptes. Note, we can still pin HugeTLB and THP as these are guaranteed not
2348 * to be special.
2349 *
2350 * For a futex to be placed on a THP tail page, get_futex_key requires a
2351 * get_user_pages_fast_only implementation that can pin pages. Thus it's still
2352 * useful to have gup_huge_pmd even if we can't operate on ptes.
2353 */
2354static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
2355                         unsigned int flags, struct page **pages, int *nr)
2356{
2357        return 0;
2358}
2359#endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */
2360
2361#if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
2362static int __gup_device_huge(unsigned long pfn, unsigned long addr,
2363                             unsigned long end, unsigned int flags,
2364                             struct page **pages, int *nr)
2365{
2366        int nr_start = *nr;
2367        struct dev_pagemap *pgmap = NULL;
2368
2369        do {
2370                struct page *page = pfn_to_page(pfn);
2371
2372                pgmap = get_dev_pagemap(pfn, pgmap);
2373                if (unlikely(!pgmap)) {
2374                        undo_dev_pagemap(nr, nr_start, flags, pages);
2375                        break;
2376                }
2377                SetPageReferenced(page);
2378                pages[*nr] = page;
2379                if (unlikely(!try_grab_page(page, flags))) {
2380                        undo_dev_pagemap(nr, nr_start, flags, pages);
2381                        break;
2382                }
2383                (*nr)++;
2384                pfn++;
2385        } while (addr += PAGE_SIZE, addr != end);
2386
2387        put_dev_pagemap(pgmap);
2388        return addr == end;
2389}
2390
2391static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
2392                                 unsigned long end, unsigned int flags,
2393                                 struct page **pages, int *nr)
2394{
2395        unsigned long fault_pfn;
2396        int nr_start = *nr;
2397
2398        fault_pfn = pmd_pfn(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
2399        if (!__gup_device_huge(fault_pfn, addr, end, flags, pages, nr))
2400                return 0;
2401
2402        if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
2403                undo_dev_pagemap(nr, nr_start, flags, pages);
2404                return 0;
2405        }
2406        return 1;
2407}
2408
2409static int __gup_device_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
2410                                 unsigned long end, unsigned int flags,
2411                                 struct page **pages, int *nr)
2412{
2413        unsigned long fault_pfn;
2414        int nr_start = *nr;
2415
2416        fault_pfn = pud_pfn(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
2417        if (!__gup_device_huge(fault_pfn, addr, end, flags, pages, nr))
2418                return 0;
2419
2420        if (unlikely(pud_val(orig) != pud_val(*pudp))) {
2421                undo_dev_pagemap(nr, nr_start, flags, pages);
2422                return 0;
2423        }
2424        return 1;
2425}
2426#else
2427static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
2428                                 unsigned long end, unsigned int flags,
2429                                 struct page **pages, int *nr)
2430{
2431        BUILD_BUG();
2432        return 0;
2433}
2434
2435static int __gup_device_huge_pud(pud_t pud, pud_t *pudp, unsigned long addr,
2436                                 unsigned long end, unsigned int flags,
2437                                 struct page **pages, int *nr)
2438{
2439        BUILD_BUG();
2440        return 0;
2441}
2442#endif
2443
2444static int record_subpages(struct page *page, unsigned long addr,
2445                           unsigned long end, struct page **pages)
2446{
2447        int nr;
2448
2449        for (nr = 0; addr != end; addr += PAGE_SIZE)
2450                pages[nr++] = page++;
2451
2452        return nr;
2453}
2454
2455#ifdef CONFIG_ARCH_HAS_HUGEPD
2456static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
2457                                      unsigned long sz)
2458{
2459        unsigned long __boundary = (addr + sz) & ~(sz-1);
2460        return (__boundary - 1 < end - 1) ? __boundary : end;
2461}
2462
2463static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
2464                       unsigned long end, unsigned int flags,
2465                       struct page **pages, int *nr)
2466{
2467        unsigned long pte_end;
2468        struct page *head, *page;
2469        pte_t pte;
2470        int refs;
2471
2472        pte_end = (addr + sz) & ~(sz-1);
2473        if (pte_end < end)
2474                end = pte_end;
2475
2476        pte = huge_ptep_get(ptep);
2477
2478        if (!pte_access_permitted(pte, flags & FOLL_WRITE))
2479                return 0;
2480
2481        /* hugepages are never "special" */
2482        VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
2483
2484        head = pte_page(pte);
2485        page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
2486        refs = record_subpages(page, addr, end, pages + *nr);
2487
2488        head = try_grab_compound_head(head, refs, flags);
2489        if (!head)
2490                return 0;
2491
2492        if (unlikely(pte_val(pte) != pte_val(*ptep))) {
2493                put_compound_head(head, refs, flags);
2494                return 0;
2495        }
2496
2497        *nr += refs;
2498        SetPageReferenced(head);
2499        return 1;
2500}
2501
2502static int gup_huge_pd(hugepd_t hugepd, unsigned long addr,
2503                unsigned int pdshift, unsigned long end, unsigned int flags,
2504                struct page **pages, int *nr)
2505{
2506        pte_t *ptep;
2507        unsigned long sz = 1UL << hugepd_shift(hugepd);
2508        unsigned long next;
2509
2510        ptep = hugepte_offset(hugepd, addr, pdshift);
2511        do {
2512                next = hugepte_addr_end(addr, end, sz);
2513                if (!gup_hugepte(ptep, sz, addr, end, flags, pages, nr))
2514                        return 0;
2515        } while (ptep++, addr = next, addr != end);
2516
2517        return 1;
2518}
2519#else
2520static inline int gup_huge_pd(hugepd_t hugepd, unsigned long addr,
2521                unsigned int pdshift, unsigned long end, unsigned int flags,
2522                struct page **pages, int *nr)
2523{
2524        return 0;
2525}
2526#endif /* CONFIG_ARCH_HAS_HUGEPD */
2527
2528static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
2529                        unsigned long end, unsigned int flags,
2530                        struct page **pages, int *nr)
2531{
2532        struct page *head, *page;
2533        int refs;
2534
2535        if (!pmd_access_permitted(orig, flags & FOLL_WRITE))
2536                return 0;
2537
2538        if (pmd_devmap(orig)) {
2539                if (unlikely(flags & FOLL_LONGTERM))
2540                        return 0;
2541                return __gup_device_huge_pmd(orig, pmdp, addr, end, flags,
2542                                             pages, nr);
2543        }
2544
2545        page = pmd_page(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
2546        refs = record_subpages(page, addr, end, pages + *nr);
2547
2548        head = try_grab_compound_head(pmd_page(orig), refs, flags);
2549        if (!head)
2550                return 0;
2551
2552        if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
2553                put_compound_head(head, refs, flags);
2554                return 0;
2555        }
2556
2557        *nr += refs;
2558        SetPageReferenced(head);
2559        return 1;
2560}
2561
2562static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
2563                        unsigned long end, unsigned int flags,
2564                        struct page **pages, int *nr)
2565{
2566        struct page *head, *page;
2567        int refs;
2568
2569        if (!pud_access_permitted(orig, flags & FOLL_WRITE))
2570                return 0;
2571
2572        if (pud_devmap(orig)) {
2573                if (unlikely(flags & FOLL_LONGTERM))
2574                        return 0;
2575                return __gup_device_huge_pud(orig, pudp, addr, end, flags,
2576                                             pages, nr);
2577        }
2578
2579        page = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
2580        refs = record_subpages(page, addr, end, pages + *nr);
2581
2582        head = try_grab_compound_head(pud_page(orig), refs, flags);
2583        if (!head)
2584                return 0;
2585
2586        if (unlikely(pud_val(orig) != pud_val(*pudp))) {
2587                put_compound_head(head, refs, flags);
2588                return 0;
2589        }
2590
2591        *nr += refs;
2592        SetPageReferenced(head);
2593        return 1;
2594}
2595
2596static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
2597                        unsigned long end, unsigned int flags,
2598                        struct page **pages, int *nr)
2599{
2600        int refs;
2601        struct page *head, *page;
2602
2603        if (!pgd_access_permitted(orig, flags & FOLL_WRITE))
2604                return 0;
2605
2606        BUILD_BUG_ON(pgd_devmap(orig));
2607
2608        page = pgd_page(orig) + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT);
2609        refs = record_subpages(page, addr, end, pages + *nr);
2610
2611        head = try_grab_compound_head(pgd_page(orig), refs, flags);
2612        if (!head)
2613                return 0;
2614
2615        if (unlikely(pgd_val(orig) != pgd_val(*pgdp))) {
2616                put_compound_head(head, refs, flags);
2617                return 0;
2618        }
2619
2620        *nr += refs;
2621        SetPageReferenced(head);
2622        return 1;
2623}
2624
2625static int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, unsigned long end,
2626                unsigned int flags, struct page **pages, int *nr)
2627{
2628        unsigned long next;
2629        pmd_t *pmdp;
2630
2631        pmdp = pmd_offset_lockless(pudp, pud, addr);
2632        do {
2633                pmd_t pmd = READ_ONCE(*pmdp);
2634
2635                next = pmd_addr_end(addr, end);
2636                if (!pmd_present(pmd))
2637                        return 0;
2638
2639                if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd) ||
2640                             pmd_devmap(pmd))) {
2641                        /*
2642                         * NUMA hinting faults need to be handled in the GUP
2643                         * slowpath for accounting purposes and so that they
2644                         * can be serialised against THP migration.
2645                         */
2646                        if (pmd_protnone(pmd))
2647                                return 0;
2648
2649                        if (!gup_huge_pmd(pmd, pmdp, addr, next, flags,
2650                                pages, nr))
2651                                return 0;
2652
2653                } else if (unlikely(is_hugepd(__hugepd(pmd_val(pmd))))) {
2654                        /*
2655                         * architectures may use a different format for the
2656                         * hugetlbfs pmd than for the THP pmd
2657                         */
2658                        if (!gup_huge_pd(__hugepd(pmd_val(pmd)), addr,
2659                                         PMD_SHIFT, next, flags, pages, nr))
2660                                return 0;
2661                } else if (!gup_pte_range(pmd, addr, next, flags, pages, nr))
2662                        return 0;
2663        } while (pmdp++, addr = next, addr != end);
2664
2665        return 1;
2666}
2667
2668static int gup_pud_range(p4d_t *p4dp, p4d_t p4d, unsigned long addr, unsigned long end,
2669                         unsigned int flags, struct page **pages, int *nr)
2670{
2671        unsigned long next;
2672        pud_t *pudp;
2673
2674        pudp = pud_offset_lockless(p4dp, p4d, addr);
2675        do {
2676                pud_t pud = READ_ONCE(*pudp);
2677
2678                next = pud_addr_end(addr, end);
2679                if (unlikely(!pud_present(pud)))
2680                        return 0;
2681                if (unlikely(pud_huge(pud))) {
2682                        if (!gup_huge_pud(pud, pudp, addr, next, flags,
2683                                          pages, nr))
2684                                return 0;
2685                } else if (unlikely(is_hugepd(__hugepd(pud_val(pud))))) {
2686                        if (!gup_huge_pd(__hugepd(pud_val(pud)), addr,
2687                                         PUD_SHIFT, next, flags, pages, nr))
2688                                return 0;
2689                } else if (!gup_pmd_range(pudp, pud, addr, next, flags, pages, nr))
2690                        return 0;
2691        } while (pudp++, addr = next, addr != end);
2692
2693        return 1;
2694}
2695
2696static int gup_p4d_range(pgd_t *pgdp, pgd_t pgd, unsigned long addr, unsigned long end,
2697                         unsigned int flags, struct page **pages, int *nr)
2698{
2699        unsigned long next;
2700        p4d_t *p4dp;
2701
2702        p4dp = p4d_offset_lockless(pgdp, pgd, addr);
2703        do {
2704                p4d_t p4d = READ_ONCE(*p4dp);
2705
2706                next = p4d_addr_end(addr, end);
2707                if (p4d_none(p4d))
2708                        return 0;
2709                BUILD_BUG_ON(p4d_huge(p4d));
2710                if (unlikely(is_hugepd(__hugepd(p4d_val(p4d))))) {
2711                        if (!gup_huge_pd(__hugepd(p4d_val(p4d)), addr,
2712                                         P4D_SHIFT, next, flags, pages, nr))
2713                                return 0;
2714                } else if (!gup_pud_range(p4dp, p4d, addr, next, flags, pages, nr))
2715                        return 0;
2716        } while (p4dp++, addr = next, addr != end);
2717
2718        return 1;
2719}
2720
2721static void gup_pgd_range(unsigned long addr, unsigned long end,
2722                unsigned int flags, struct page **pages, int *nr)
2723{
2724        unsigned long next;
2725        pgd_t *pgdp;
2726
2727        pgdp = pgd_offset(current->mm, addr);
2728        do {
2729                pgd_t pgd = READ_ONCE(*pgdp);
2730
2731                next = pgd_addr_end(addr, end);
2732                if (pgd_none(pgd))
2733                        return;
2734                if (unlikely(pgd_huge(pgd))) {
2735                        if (!gup_huge_pgd(pgd, pgdp, addr, next, flags,
2736                                          pages, nr))
2737                                return;
2738                } else if (unlikely(is_hugepd(__hugepd(pgd_val(pgd))))) {
2739                        if (!gup_huge_pd(__hugepd(pgd_val(pgd)), addr,
2740                                         PGDIR_SHIFT, next, flags, pages, nr))
2741                                return;
2742                } else if (!gup_p4d_range(pgdp, pgd, addr, next, flags, pages, nr))
2743                        return;
2744        } while (pgdp++, addr = next, addr != end);
2745}
2746#else
2747static inline void gup_pgd_range(unsigned long addr, unsigned long end,
2748                unsigned int flags, struct page **pages, int *nr)
2749{
2750}
2751#endif /* CONFIG_HAVE_FAST_GUP */
2752
2753#ifndef gup_fast_permitted
2754/*
2755 * Check if it's allowed to use get_user_pages_fast_only() for the range, or
2756 * we need to fall back to the slow version:
2757 */
2758static bool gup_fast_permitted(unsigned long start, unsigned long end)
2759{
2760        return true;
2761}
2762#endif
2763
2764static int __gup_longterm_unlocked(unsigned long start, int nr_pages,
2765                                   unsigned int gup_flags, struct page **pages)
2766{
2767        int ret;
2768
2769        /*
2770         * FIXME: FOLL_LONGTERM does not work with
2771         * get_user_pages_unlocked() (see comments in that function)
2772         */
2773        if (gup_flags & FOLL_LONGTERM) {
2774                mmap_read_lock(current->mm);
2775                ret = __gup_longterm_locked(current->mm,
2776                                            start, nr_pages,
2777                                            pages, NULL, gup_flags);
2778                mmap_read_unlock(current->mm);
2779        } else {
2780                ret = get_user_pages_unlocked(start, nr_pages,
2781                                              pages, gup_flags);
2782        }
2783
2784        return ret;
2785}
2786
2787static unsigned long lockless_pages_from_mm(unsigned long start,
2788                                            unsigned long end,
2789                                            unsigned int gup_flags,
2790                                            struct page **pages)
2791{
2792        unsigned long flags;
2793        int nr_pinned = 0;
2794        unsigned seq;
2795
2796        if (!IS_ENABLED(CONFIG_HAVE_FAST_GUP) ||
2797            !gup_fast_permitted(start, end))
2798                return 0;
2799
2800        if (gup_flags & FOLL_PIN) {
2801                seq = raw_read_seqcount(&current->mm->write_protect_seq);
2802                if (seq & 1)
2803                        return 0;
2804        }
2805
2806        /*
2807         * Disable interrupts. The nested form (local_irq_save()) is used so that
2808         * this routine stays safe to call even when interrupts are already disabled.
2809         *
2810         * With interrupts disabled, we block page table pages from being freed
2811         * from under us. See struct mmu_table_batch comments in
2812         * include/asm-generic/tlb.h for more details.
2813         *
2814         * We do not use rcu_read_lock() here because we also want to block the
2815         * IPIs that are sent when a THP is split.
2816         */
2817        local_irq_save(flags);
2818        gup_pgd_range(start, end, gup_flags, pages, &nr_pinned);
2819        local_irq_restore(flags);
2820
2821        /*
2822         * When pinning pages for DMA there could be a concurrent write protect
2823         * from fork() via copy_page_range(); in that case, always fail fast GUP.
2824         */
2825        if (gup_flags & FOLL_PIN) {
2826                if (read_seqcount_retry(&current->mm->write_protect_seq, seq)) {
2827                        unpin_user_pages(pages, nr_pinned);
2828                        return 0;
2829                }
2830        }
2831        return nr_pinned;
2832}
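
/*
 * For reference, a paraphrased sketch (not code from this file; see
 * copy_page_range() in mm/memory.c, details vary by kernel version) of the
 * writer side of write_protect_seq: fork() wraps its COW write-protection
 * pass in the seqcount, which is what the FOLL_PIN checks in
 * lockless_pages_from_mm() race against:
 *
 *	mmap_assert_write_locked(src_mm);
 *	raw_write_seqcount_begin(&src_mm->write_protect_seq);
 *	... write-protect the parent's PTEs while copying them ...
 *	raw_write_seqcount_end(&src_mm->write_protect_seq);
 */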
2833
2834static int internal_get_user_pages_fast(unsigned long start,
2835                                        unsigned long nr_pages,
2836                                        unsigned int gup_flags,
2837                                        struct page **pages)
2838{
2839        unsigned long len, end;
2840        unsigned long nr_pinned;
2841        int ret;
2842
2843        if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM |
2844                                       FOLL_FORCE | FOLL_PIN | FOLL_GET |
2845                                       FOLL_FAST_ONLY | FOLL_NOFAULT)))
2846                return -EINVAL;
2847
2848        if (gup_flags & FOLL_PIN)
2849                mm_set_has_pinned_flag(&current->mm->flags);
2850
2851        if (!(gup_flags & FOLL_FAST_ONLY))
2852                might_lock_read(&current->mm->mmap_lock);
2853
2854        start = untagged_addr(start) & PAGE_MASK;
2855        len = nr_pages << PAGE_SHIFT;
2856        if (check_add_overflow(start, len, &end))
2857                return 0;
2858        if (unlikely(!access_ok((void __user *)start, len)))
2859                return -EFAULT;
2860
2861        nr_pinned = lockless_pages_from_mm(start, end, gup_flags, pages);
2862        if (nr_pinned == nr_pages || gup_flags & FOLL_FAST_ONLY)
2863                return nr_pinned;
2864
2865        /* Slow path: try to get the remaining pages with get_user_pages */
2866        start += nr_pinned << PAGE_SHIFT;
2867        pages += nr_pinned;
2868        ret = __gup_longterm_unlocked(start, nr_pages - nr_pinned, gup_flags,
2869                                      pages);
2870        if (ret < 0) {
2871                /*
2872                 * The caller has to unpin the pages we already pinned, so it must
2873                 * be told how many there are: returning -errno is not an option.
2874                 */
2875                if (nr_pinned)
2876                        return nr_pinned;
2877                return ret;
2878        }
2879        return ret + nr_pinned;
2880}
2881
2882/**
2883 * get_user_pages_fast_only() - pin user pages in memory
2884 * @start:      starting user address
2885 * @nr_pages:   number of pages from start to pin
2886 * @gup_flags:  flags modifying pin behaviour
2887 * @pages:      array that receives pointers to the pages pinned.
2888 *              Should be at least nr_pages long.
2889 *
2890 * Like get_user_pages_fast() except that it is IRQ-safe: it will not fall back
2891 * to the regular ("slow") GUP.
2892 * Note one difference from get_user_pages_fast(): this function always returns
2893 * the number of pages pinned, and 0 if no pages were pinned.
2894 *
2895 * If the architecture does not support this function, it simply returns with
2896 * no pages pinned.
2897 *
2898 * Careful, careful! COW breaking can go either way, so a non-write
2899 * access can get ambiguous page results. If you call this function without
2900 * 'write' set, you'd better be sure that you're ok with that ambiguity.
2901 */
2902int get_user_pages_fast_only(unsigned long start, int nr_pages,
2903                             unsigned int gup_flags, struct page **pages)
2904{
2905        int nr_pinned;
2906        /*
2907         * Internally (within mm/gup.c), gup fast variants must set FOLL_GET,
2908         * because gup fast is always a "pin with a +1 page refcount" request.
2909         *
2910         * FOLL_FAST_ONLY is required in order to match the API description of
2911         * this routine: no fall back to regular ("slow") GUP.
2912         */
2913        gup_flags |= FOLL_GET | FOLL_FAST_ONLY;
2914
2915        nr_pinned = internal_get_user_pages_fast(start, nr_pages, gup_flags,
2916                                                 pages);
2917
2918        /*
2919         * As specified in the API description above, this routine is not
2920         * allowed to return negative values. However, the common core
2921         * routine internal_get_user_pages_fast() *can* return -errno.
2922         * Therefore, correct for that here:
2923         */
2924        if (nr_pinned < 0)
2925                nr_pinned = 0;
2926
2927        return nr_pinned;
2928}
2929EXPORT_SYMBOL_GPL(get_user_pages_fast_only);
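
/*
 * Illustrative usage sketch, not part of this file: a hypothetical caller
 * that must not sleep, fault, or take mmap_lock, so it relies on the
 * "fast only" behaviour and treats 0 as "retry on a sleepable path".
 * The example_* name is made up for illustration.
 */
static struct page *example_try_get_page_fast(unsigned long uaddr, bool write)
{
	unsigned int gup_flags = write ? FOLL_WRITE : 0;
	struct page *page;

	/* Returns the number of pages pinned (0 or 1 here), never -errno. */
	if (get_user_pages_fast_only(uaddr, 1, gup_flags, &page) != 1)
		return NULL;

	/* The reference is released later with put_page(). */
	return page;
}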
2930
2931/**
2932 * get_user_pages_fast() - pin user pages in memory
2933 * @start:      starting user address
2934 * @nr_pages:   number of pages from start to pin
2935 * @gup_flags:  flags modifying pin behaviour
2936 * @pages:      array that receives pointers to the pages pinned.
2937 *              Should be at least nr_pages long.
2938 *
2939 * Attempt to pin user pages in memory without taking mm->mmap_lock.
2940 * If not successful, it will fall back to taking the lock and
2941 * calling get_user_pages().
2942 *
2943 * Returns number of pages pinned. This may be fewer than the number requested.
2944 * If nr_pages is 0 or negative, returns 0. If no pages were pinned, returns
2945 * -errno.
2946 */
2947int get_user_pages_fast(unsigned long start, int nr_pages,
2948                        unsigned int gup_flags, struct page **pages)
2949{
2950        if (!is_valid_gup_flags(gup_flags))
2951                return -EINVAL;
2952
2953        /*
2954         * The caller may or may not have explicitly set FOLL_GET; either way is
2955         * OK. However, internally (within mm/gup.c), gup fast variants must set
2956         * FOLL_GET, because gup fast is always a "pin with a +1 page refcount"
2957         * request.
2958         */
2959        gup_flags |= FOLL_GET;
2960        return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages);
2961}
2962EXPORT_SYMBOL_GPL(get_user_pages_fast);
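
/*
 * Illustrative usage sketch, not part of this file: pin an entire user buffer
 * with get_user_pages_fast() and drop the references with put_page() when the
 * kernel is done with it. All example_* names are hypothetical.
 */
static int example_touch_user_buffer(unsigned long uaddr, size_t len)
{
	int nr_pages = DIV_ROUND_UP(offset_in_page(uaddr) + len, PAGE_SIZE);
	struct page **pages;
	int i, pinned;

	pages = kvmalloc_array(nr_pages, sizeof(*pages), GFP_KERNEL);
	if (!pages)
		return -ENOMEM;

	pinned = get_user_pages_fast(uaddr, nr_pages, FOLL_WRITE, pages);
	if (pinned < 0) {
		kvfree(pages);
		return pinned;
	}

	/* ... map and modify pages[0..pinned-1] here ... */

	for (i = 0; i < pinned; i++) {
		set_page_dirty_lock(pages[i]);	/* the buffer was written to */
		put_page(pages[i]);
	}
	kvfree(pages);
	return pinned == nr_pages ? 0 : -EFAULT;
}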
2963
2964/**
2965 * pin_user_pages_fast() - pin user pages in memory without taking locks
2966 *
2967 * @start:      starting user address
2968 * @nr_pages:   number of pages from start to pin
2969 * @gup_flags:  flags modifying pin behaviour
2970 * @pages:      array that receives pointers to the pages pinned.
2971 *              Should be at least nr_pages long.
2972 *
2973 * Nearly the same as get_user_pages_fast(), except that FOLL_PIN is set. See
2974 * get_user_pages_fast() for documentation on the function arguments, because
2975 * the arguments here are identical.
2976 *
2977 * FOLL_PIN means that the pages must be released via unpin_user_page(). Please
2978 * see Documentation/core-api/pin_user_pages.rst for further details.
2979 */
2980int pin_user_pages_fast(unsigned long start, int nr_pages,
2981                        unsigned int gup_flags, struct page **pages)
2982{
2983        /* FOLL_GET and FOLL_PIN are mutually exclusive. */
2984        if (WARN_ON_ONCE(gup_flags & FOLL_GET))
2985                return -EINVAL;
2986
2987        gup_flags |= FOLL_PIN;
2988        return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages);
2989}
2990EXPORT_SYMBOL_GPL(pin_user_pages_fast);
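
/*
 * Illustrative usage sketch, not part of this file: a hypothetical driver
 * pinning a user buffer as a DMA target. FOLL_PIN pages are released with
 * the unpin_user_page*() family, never with put_page().
 */
static int example_pin_for_dma(unsigned long uaddr, int nr_pages,
			       struct page **pages)
{
	int pinned;

	pinned = pin_user_pages_fast(uaddr, nr_pages, FOLL_WRITE, pages);
	if (pinned < 0)
		return pinned;

	/* ... program the device to DMA into pages[0..pinned-1] ... */

	/* On completion: mark the pages dirty and drop the pins in one call. */
	unpin_user_pages_dirty_lock(pages, pinned, true);
	return pinned;
}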
2991
2992/*
2993 * This is the FOLL_PIN equivalent of get_user_pages_fast_only(). Behavior
2994 * is the same, except that this one sets FOLL_PIN instead of FOLL_GET.
2995 *
2996 * The API rules are the same, too: no negative values may be returned.
2997 */
2998int pin_user_pages_fast_only(unsigned long start, int nr_pages,
2999                             unsigned int gup_flags, struct page **pages)
3000{
3001        int nr_pinned;
3002
3003        /*
3004         * FOLL_GET and FOLL_PIN are mutually exclusive. Note that the API
3005         * rules require returning 0, rather than -errno:
3006         */
3007        if (WARN_ON_ONCE(gup_flags & FOLL_GET))
3008                return 0;
3009        /*
3010         * FOLL_FAST_ONLY is required in order to match the API description of
3011         * this routine: no fall back to regular ("slow") GUP.
3012         */
3013        gup_flags |= (FOLL_PIN | FOLL_FAST_ONLY);
3014        nr_pinned = internal_get_user_pages_fast(start, nr_pages, gup_flags,
3015                                                 pages);
3016        /*
3017         * This routine is not allowed to return negative values. However,
3018         * internal_get_user_pages_fast() *can* return -errno. Therefore,
3019         * correct for that here:
3020         */
3021        if (nr_pinned < 0)
3022                nr_pinned = 0;
3023
3024        return nr_pinned;
3025}
3026EXPORT_SYMBOL_GPL(pin_user_pages_fast_only);
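
/*
 * Illustrative usage sketch, not part of this file: the FOLL_PIN counterpart
 * of the get_user_pages_fast_only() example above; a successful pin is later
 * released with unpin_user_page(). Hypothetical caller code.
 */
static bool example_try_pin_one_page(unsigned long uaddr, struct page **pagep)
{
	/* Never sleeps or faults; returns the number pinned (0 or 1), never -errno. */
	return pin_user_pages_fast_only(uaddr, 1, FOLL_WRITE, pagep) == 1;
}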
3027
3028/**
3029 * pin_user_pages_remote() - pin pages of a remote process
3030 *
3031 * @mm:         mm_struct of target mm
3032 * @start:      starting user address
3033 * @nr_pages:   number of pages from start to pin
3034 * @gup_flags:  flags modifying lookup behaviour
3035 * @pages:      array that receives pointers to the pages pinned.
3036 *              Should be at least nr_pages long. Or NULL, if caller
3037 *              only intends to ensure the pages are faulted in.
3038 * @vmas:       array of pointers to vmas corresponding to each page.
3039 *              Or NULL if the caller does not require them.
3040 * @locked:     pointer to a lock flag indicating whether mmap_lock is held and,
3041 *              consequently, whether VM_FAULT_RETRY functionality can be
3042 *              utilised. The lock must initially be held.
3043 *
3044 * Nearly the same as get_user_pages_remote(), except that FOLL_PIN is set. See
3045 * get_user_pages_remote() for documentation on the function arguments, because
3046 * the arguments here are identical.
3047 *
3048 * FOLL_PIN means that the pages must be released via unpin_user_page(). Please
3049 * see Documentation/core-api/pin_user_pages.rst for details.
3050 */
3051long pin_user_pages_remote(struct mm_struct *mm,
3052                           unsigned long start, unsigned long nr_pages,
3053                           unsigned int gup_flags, struct page **pages,
3054                           struct vm_area_struct **vmas, int *locked)
3055{
3056        /* FOLL_GET and FOLL_PIN are mutually exclusive. */
3057        if (WARN_ON_ONCE(gup_flags & FOLL_GET))
3058                return -EINVAL;
3059
3060        gup_flags |= FOLL_PIN;
3061        return __get_user_pages_remote(mm, start, nr_pages, gup_flags,
3062                                       pages, vmas, locked);
3063}
3064EXPORT_SYMBOL(pin_user_pages_remote);
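
/*
 * Illustrative usage sketch, not part of this file: pin one page of another
 * process's address space. The hypothetical caller already holds a reference
 * on @mm (e.g. from get_task_mm()) and is responsible for mmap_lock, as the
 * kernel-doc above requires.
 */
static struct page *example_pin_remote_page(struct mm_struct *mm,
					    unsigned long uaddr)
{
	struct page *page;
	int locked = 1;
	long pinned;

	mmap_read_lock(mm);
	pinned = pin_user_pages_remote(mm, uaddr, 1, FOLL_WRITE, &page,
				       NULL, &locked);
	/* On VM_FAULT_RETRY the lock may already have been dropped for us. */
	if (locked)
		mmap_read_unlock(mm);

	/* On success, the page is released later with unpin_user_page(). */
	return pinned == 1 ? page : NULL;
}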
3065
3066/**
3067 * pin_user_pages() - pin user pages in memory for use by other devices
3068 *
3069 * @start:      starting user address
3070 * @nr_pages:   number of pages from start to pin
3071 * @gup_flags:  flags modifying lookup behaviour
3072 * @pages:      array that receives pointers to the pages pinned.
3073 *              Should be at least nr_pages long. Or NULL, if caller
3074 *              only intends to ensure the pages are faulted in.
3075 * @vmas:       array of pointers to vmas corresponding to each page.
3076 *              Or NULL if the caller does not require them.
3077 *
3078 * Nearly the same as get_user_pages(), except that FOLL_TOUCH is not set, and
3079 * FOLL_PIN is set.
3080 *
3081 * FOLL_PIN means that the pages must be released via unpin_user_page(). Please
3082 * see Documentation/core-api/pin_user_pages.rst for details.
3083 */
3084long pin_user_pages(unsigned long start, unsigned long nr_pages,
3085                    unsigned int gup_flags, struct page **pages,
3086                    struct vm_area_struct **vmas)
3087{
3088        /* FOLL_GET and FOLL_PIN are mutually exclusive. */
3089        if (WARN_ON_ONCE(gup_flags & FOLL_GET))
3090                return -EINVAL;
3091
3092        gup_flags |= FOLL_PIN;
3093        return __gup_longterm_locked(current->mm, start, nr_pages,
3094                                     pages, vmas, gup_flags);
3095}
3096EXPORT_SYMBOL(pin_user_pages);
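
/*
 * Illustrative usage sketch, not part of this file: a long-lived, RDMA-style
 * pin of the current task's pages, so FOLL_LONGTERM is added and the caller
 * takes mmap_lock around the call. All example_* names are hypothetical.
 */
static long example_longterm_pin(unsigned long uaddr, unsigned long nr_pages,
				 struct page **pages)
{
	long pinned;

	mmap_read_lock(current->mm);
	pinned = pin_user_pages(uaddr, nr_pages,
				FOLL_WRITE | FOLL_LONGTERM, pages, NULL);
	mmap_read_unlock(current->mm);
	if (pinned <= 0)
		return pinned ? pinned : -EFAULT;

	/* ... hand pages[0..pinned-1] to the device ... */
	/* Much later, on teardown: unpin_user_pages(pages, pinned); */
	return pinned;
}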
3097
3098/*
3099 * pin_user_pages_unlocked() is the FOLL_PIN variant of
3100 * get_user_pages_unlocked(). Behavior is the same, except that this one sets
3101 * FOLL_PIN and rejects FOLL_GET.
3102 */
3103long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
3104                             struct page **pages, unsigned int gup_flags)
3105{
3106        /* FOLL_GET and FOLL_PIN are mutually exclusive. */
3107        if (WARN_ON_ONCE(gup_flags & FOLL_GET))
3108                return -EINVAL;
3109
3110        gup_flags |= FOLL_PIN;
3111        return get_user_pages_unlocked(start, nr_pages, pages, gup_flags);
3112}
3113EXPORT_SYMBOL(pin_user_pages_unlocked);
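
/*
 * Illustrative usage sketch, not part of this file: same idea as the
 * pin_user_pages() example above, but mmap_lock handling is internal, so the
 * hypothetical caller must not hold it. Note the argument order of this
 * variant: @pages comes before @gup_flags.
 */
static long example_pin_unlocked(unsigned long uaddr, unsigned long nr_pages,
				 struct page **pages)
{
	return pin_user_pages_unlocked(uaddr, nr_pages, pages, FOLL_WRITE);
}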
3114
3115/*
3116 * pin_user_pages_locked() is the FOLL_PIN variant of get_user_pages_locked().
3117 * Behavior is the same, except that this one sets FOLL_PIN and rejects
3118 * FOLL_GET.
3119 */
3120long pin_user_pages_locked(unsigned long start, unsigned long nr_pages,
3121                           unsigned int gup_flags, struct page **pages,
3122                           int *locked)
3123{
3124        /*
3125         * FIXME: Current FOLL_LONGTERM behavior is incompatible with
3126         * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
3127         * vmas.  As there are no users of this flag in this call we simply
3128         * disallow this option for now.
3129         */
3130        if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
3131                return -EINVAL;
3132
3133        /* FOLL_GET and FOLL_PIN are mutually exclusive. */
3134        if (WARN_ON_ONCE(gup_flags & FOLL_GET))
3135                return -EINVAL;
3136
3137        gup_flags |= FOLL_PIN;
3138        return __get_user_pages_locked(current->mm, start, nr_pages,
3139                                       pages, NULL, locked,
3140                                       gup_flags | FOLL_TOUCH);
3141}
3142EXPORT_SYMBOL(pin_user_pages_locked);
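
/*
 * Illustrative usage sketch, not part of this file: the "_locked" variant is
 * entered with mmap_lock held and may drop it on VM_FAULT_RETRY, reporting
 * that back through @locked. Hypothetical caller code.
 */
static long example_pin_locked(unsigned long uaddr, unsigned long nr_pages,
			       struct page **pages)
{
	int locked = 1;
	long pinned;

	mmap_read_lock(current->mm);
	pinned = pin_user_pages_locked(uaddr, nr_pages, FOLL_WRITE,
				       pages, &locked);
	/* If GUP dropped the lock on retry, do not unlock it again. */
	if (locked)
		mmap_read_unlock(current->mm);

	return pinned;
}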
3143