linux/mm/gup.c
   1// SPDX-License-Identifier: GPL-2.0-only
   2#include <linux/kernel.h>
   3#include <linux/errno.h>
   4#include <linux/err.h>
   5#include <linux/spinlock.h>
   6
   7#include <linux/mm.h>
   8#include <linux/memremap.h>
   9#include <linux/pagemap.h>
  10#include <linux/rmap.h>
  11#include <linux/swap.h>
  12#include <linux/swapops.h>
  13#include <linux/secretmem.h>
  14
  15#include <linux/sched/signal.h>
  16#include <linux/rwsem.h>
  17#include <linux/hugetlb.h>
  18#include <linux/migrate.h>
  19#include <linux/mm_inline.h>
  20#include <linux/sched/mm.h>
  21
  22#include <asm/mmu_context.h>
  23#include <asm/tlbflush.h>
  24
  25#include "internal.h"
  26
  27struct follow_page_context {
  28        struct dev_pagemap *pgmap;
  29        unsigned int page_mask;
  30};
  31
  32static inline void sanity_check_pinned_pages(struct page **pages,
  33                                             unsigned long npages)
  34{
  35        if (!IS_ENABLED(CONFIG_DEBUG_VM))
  36                return;
  37
  38        /*
   39         * We only pin anonymous pages if they are exclusive. Once pinned, they
   40         * can no longer become shared, and PageAnonExclusive() will stick
   41         * around until the page is freed.
  42         *
  43         * We'd like to verify that our pinned anonymous pages are still mapped
  44         * exclusively. The issue with anon THP is that we don't know how
  45         * they are/were mapped when pinning them. However, for anon
  46         * THP we can assume that either the given page (PTE-mapped THP) or
  47         * the head page (PMD-mapped THP) should be PageAnonExclusive(). If
  48         * neither is the case, there is certainly something wrong.
  49         */
  50        for (; npages; npages--, pages++) {
  51                struct page *page = *pages;
  52                struct folio *folio = page_folio(page);
  53
  54                if (!folio_test_anon(folio))
  55                        continue;
  56                if (!folio_test_large(folio) || folio_test_hugetlb(folio))
  57                        VM_BUG_ON_PAGE(!PageAnonExclusive(&folio->page), page);
  58                else
  59                        /* Either a PTE-mapped or a PMD-mapped THP. */
  60                        VM_BUG_ON_PAGE(!PageAnonExclusive(&folio->page) &&
  61                                       !PageAnonExclusive(page), page);
  62        }
  63}
  64
  65/*
  66 * Return the folio with ref appropriately incremented,
  67 * or NULL if that failed.
  68 */
  69static inline struct folio *try_get_folio(struct page *page, int refs)
  70{
  71        struct folio *folio;
  72
  73retry:
  74        folio = page_folio(page);
  75        if (WARN_ON_ONCE(folio_ref_count(folio) < 0))
  76                return NULL;
  77        if (unlikely(!folio_ref_try_add_rcu(folio, refs)))
  78                return NULL;
  79
  80        /*
  81         * At this point we have a stable reference to the folio; but it
  82         * could be that between calling page_folio() and the refcount
  83         * increment, the folio was split, in which case we'd end up
  84         * holding a reference on a folio that has nothing to do with the page
  85         * we were given anymore.
  86         * So now that the folio is stable, recheck that the page still
  87         * belongs to this folio.
  88         */
  89        if (unlikely(page_folio(page) != folio)) {
  90                if (!put_devmap_managed_page_refs(&folio->page, refs))
  91                        folio_put_refs(folio, refs);
  92                goto retry;
  93        }
  94
  95        return folio;
  96}
  97
  98/**
  99 * try_grab_folio() - Attempt to get or pin a folio.
 100 * @page:  pointer to page to be grabbed
 101 * @refs:  the value to (effectively) add to the folio's refcount
 102 * @flags: gup flags: these are the FOLL_* flag values.
 103 *
 104 * "grab" names in this file mean, "look at flags to decide whether to use
 105 * FOLL_PIN or FOLL_GET behavior, when incrementing the folio's refcount.
 106 *
 107 * Either FOLL_PIN or FOLL_GET (or neither) must be set, but not both at the
 108 * same time. (That's true throughout the get_user_pages*() and
 109 * pin_user_pages*() APIs.) Cases:
 110 *
 111 *    FOLL_GET: folio's refcount will be incremented by @refs.
 112 *
 113 *    FOLL_PIN on large folios: folio's refcount will be incremented by
 114 *    @refs, and its compound_pincount will be incremented by @refs.
 115 *
 116 *    FOLL_PIN on single-page folios: folio's refcount will be incremented by
 117 *    @refs * GUP_PIN_COUNTING_BIAS.
 118 *
 119 * Return: The folio containing @page (with refcount appropriately
 120 * incremented) for success, or NULL upon failure. If neither FOLL_GET
 121 * nor FOLL_PIN was set, that's considered failure, and furthermore,
 122 * a likely bug in the caller, so a warning is also emitted.
 123 */
 124struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags)
 125{
 126        if (flags & FOLL_GET)
 127                return try_get_folio(page, refs);
 128        else if (flags & FOLL_PIN) {
 129                struct folio *folio;
 130
 131                /*
 132                 * Can't do FOLL_LONGTERM + FOLL_PIN gup fast path if not in a
 133                 * right zone, so fail and let the caller fall back to the slow
 134                 * path.
 135                 */
 136                if (unlikely((flags & FOLL_LONGTERM) &&
 137                             !is_pinnable_page(page)))
 138                        return NULL;
 139
 140                /*
 141                 * CAUTION: Don't use compound_head() on the page before this
 142                 * point, the result won't be stable.
 143                 */
 144                folio = try_get_folio(page, refs);
 145                if (!folio)
 146                        return NULL;
 147
 148                /*
 149                 * When pinning a large folio, use an exact count to track it.
 150                 *
 151                 * However, be sure to *also* increment the normal folio
 152                 * refcount field at least once, so that the folio really
 153                 * is pinned.  That's why the refcount from the earlier
 154                 * try_get_folio() is left intact.
 155                 */
 156                if (folio_test_large(folio))
 157                        atomic_add(refs, folio_pincount_ptr(folio));
 158                else
 159                        folio_ref_add(folio,
 160                                        refs * (GUP_PIN_COUNTING_BIAS - 1));
 161                node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, refs);
 162
 163                return folio;
 164        }
 165
 166        WARN_ON_ONCE(1);
 167        return NULL;
 168}
 169
 170static void gup_put_folio(struct folio *folio, int refs, unsigned int flags)
 171{
 172        if (flags & FOLL_PIN) {
 173                node_stat_mod_folio(folio, NR_FOLL_PIN_RELEASED, refs);
 174                if (folio_test_large(folio))
 175                        atomic_sub(refs, folio_pincount_ptr(folio));
 176                else
 177                        refs *= GUP_PIN_COUNTING_BIAS;
 178        }
 179
 180        if (!put_devmap_managed_page_refs(&folio->page, refs))
 181                folio_put_refs(folio, refs);
 182}
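
/*
 * A minimal usage sketch of the grab/put pairing above (hypothetical helper,
 * for illustration only).  Assuming GUP_PIN_COUNTING_BIAS (1024, as defined
 * in <linux/mm.h>): FOLL_GET on a single-page folio bumps the refcount by 1,
 * while FOLL_PIN bumps it by 1024; large folios instead get refcount +1 and
 * pincount +1.
 */
static __maybe_unused void example_grab_modes(struct page *page)
{
        struct folio *folio;

        /* FOLL_GET: plain +1 on the folio refcount, dropped with folio_put(). */
        folio = try_grab_folio(page, 1, FOLL_GET);
        if (folio)
                folio_put(folio);

        /*
         * FOLL_PIN: +GUP_PIN_COUNTING_BIAS on small folios, or +1 refcount
         * plus +1 pincount on large folios; dropped with gup_put_folio().
         */
        folio = try_grab_folio(page, 1, FOLL_PIN);
        if (folio)
                gup_put_folio(folio, 1, FOLL_PIN);
}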
 183
 184/**
 185 * try_grab_page() - elevate a page's refcount by a flag-dependent amount
 186 * @page:    pointer to page to be grabbed
 187 * @flags:   gup flags: these are the FOLL_* flag values.
 188 *
 189 * This might not do anything at all, depending on the flags argument.
 190 *
 191 * "grab" names in this file mean, "look at flags to decide whether to use
 192 * FOLL_PIN or FOLL_GET behavior, when incrementing the page's refcount.
 193 *
 194 * Either FOLL_PIN or FOLL_GET (or neither) may be set, but not both at the same
 195 * time. Cases: please see the try_grab_folio() documentation, with
 196 * "refs=1".
 197 *
 198 * Return: true for success, or if no action was required (if neither FOLL_PIN
 199 * nor FOLL_GET was set, nothing is done). False for failure: FOLL_GET or
 200 * FOLL_PIN was set, but the page could not be grabbed.
 201 */
 202bool __must_check try_grab_page(struct page *page, unsigned int flags)
 203{
 204        struct folio *folio = page_folio(page);
 205
 206        WARN_ON_ONCE((flags & (FOLL_GET | FOLL_PIN)) == (FOLL_GET | FOLL_PIN));
 207        if (WARN_ON_ONCE(folio_ref_count(folio) <= 0))
 208                return false;
 209
 210        if (flags & FOLL_GET)
 211                folio_ref_inc(folio);
 212        else if (flags & FOLL_PIN) {
 213                /*
 214                 * Similar to try_grab_folio(): be sure to *also*
 215                 * increment the normal page refcount field at least once,
 216                 * so that the page really is pinned.
 217                 */
 218                if (folio_test_large(folio)) {
 219                        folio_ref_add(folio, 1);
 220                        atomic_add(1, folio_pincount_ptr(folio));
 221                } else {
 222                        folio_ref_add(folio, GUP_PIN_COUNTING_BIAS);
 223                }
 224
 225                node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, 1);
 226        }
 227
 228        return true;
 229}
 230
 231/**
 232 * unpin_user_page() - release a dma-pinned page
 233 * @page:            pointer to page to be released
 234 *
 235 * Pages that were pinned via pin_user_pages*() must be released via either
 236 * unpin_user_page(), or one of the unpin_user_pages*() routines. This is so
 237 * that such pages can be separately tracked and uniquely handled. In
 238 * particular, interactions with RDMA and filesystems need special handling.
 239 */
 240void unpin_user_page(struct page *page)
 241{
 242        sanity_check_pinned_pages(&page, 1);
 243        gup_put_folio(page_folio(page), 1, FOLL_PIN);
 244}
 245EXPORT_SYMBOL(unpin_user_page);
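
/*
 * A minimal sketch of the required pairing (hypothetical helper, for
 * illustration): a page obtained via pin_user_pages*() must be released with
 * unpin_user_page(), never with a bare put_page(), so that the FOLL_PIN
 * accounting stays balanced.
 */
static __maybe_unused int example_pin_single_page(unsigned long uaddr)
{
        struct page *page;
        int ret;

        ret = pin_user_pages_fast(uaddr, 1, 0, &page);
        if (ret != 1)
                return ret < 0 ? ret : -EFAULT;

        /* ... read from the pinned page, e.g. via kmap_local_page() ... */

        unpin_user_page(page);
        return 0;
}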
 246
 247static inline struct folio *gup_folio_range_next(struct page *start,
 248                unsigned long npages, unsigned long i, unsigned int *ntails)
 249{
 250        struct page *next = nth_page(start, i);
 251        struct folio *folio = page_folio(next);
 252        unsigned int nr = 1;
 253
 254        if (folio_test_large(folio))
 255                nr = min_t(unsigned int, npages - i,
 256                           folio_nr_pages(folio) - folio_page_idx(folio, next));
 257
 258        *ntails = nr;
 259        return folio;
 260}
 261
 262static inline struct folio *gup_folio_next(struct page **list,
 263                unsigned long npages, unsigned long i, unsigned int *ntails)
 264{
 265        struct folio *folio = page_folio(list[i]);
 266        unsigned int nr;
 267
 268        for (nr = i + 1; nr < npages; nr++) {
 269                if (page_folio(list[nr]) != folio)
 270                        break;
 271        }
 272
 273        *ntails = nr - i;
 274        return folio;
 275}
 276
 277/**
 278 * unpin_user_pages_dirty_lock() - release and optionally dirty gup-pinned pages
 279 * @pages:  array of pages to be maybe marked dirty, and definitely released.
 280 * @npages: number of pages in the @pages array.
 281 * @make_dirty: whether to mark the pages dirty
 282 *
 283 * "gup-pinned page" refers to a page that has had one of the get_user_pages()
 284 * variants called on that page.
 285 *
  286 * For each page in the @pages array, make that page (or its head page, if a
  287 * compound page) dirty, if @make_dirty is true and the page was previously
  288 * listed as clean. In either case, all pages are released using
  289 * unpin_user_page() (possibly via unpin_user_pages() for the non-dirty case).
 290 *
 291 * Please see the unpin_user_page() documentation for details.
 292 *
 293 * set_page_dirty_lock() is used internally. If instead, set_page_dirty() is
 294 * required, then the caller should a) verify that this is really correct,
 295 * because _lock() is usually required, and b) hand code it:
  296 * set_page_dirty(), unpin_user_page().
 297 *
 298 */
 299void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages,
 300                                 bool make_dirty)
 301{
 302        unsigned long i;
 303        struct folio *folio;
 304        unsigned int nr;
 305
 306        if (!make_dirty) {
 307                unpin_user_pages(pages, npages);
 308                return;
 309        }
 310
 311        sanity_check_pinned_pages(pages, npages);
 312        for (i = 0; i < npages; i += nr) {
 313                folio = gup_folio_next(pages, npages, i, &nr);
 314                /*
 315                 * Checking PageDirty at this point may race with
 316                 * clear_page_dirty_for_io(), but that's OK. Two key
 317                 * cases:
 318                 *
 319                 * 1) This code sees the page as already dirty, so it
 320                 * skips the call to set_page_dirty(). That could happen
 321                 * because clear_page_dirty_for_io() called
 322                 * page_mkclean(), followed by set_page_dirty().
 323                 * However, now the page is going to get written back,
 324                 * which meets the original intention of setting it
 325                 * dirty, so all is well: clear_page_dirty_for_io() goes
 326                 * on to call TestClearPageDirty(), and write the page
 327                 * back.
 328                 *
 329                 * 2) This code sees the page as clean, so it calls
 330                 * set_page_dirty(). The page stays dirty, despite being
 331                 * written back, so it gets written back again in the
 332                 * next writeback cycle. This is harmless.
 333                 */
 334                if (!folio_test_dirty(folio)) {
 335                        folio_lock(folio);
 336                        folio_mark_dirty(folio);
 337                        folio_unlock(folio);
 338                }
 339                gup_put_folio(folio, nr, FOLL_PIN);
 340        }
 341}
 342EXPORT_SYMBOL(unpin_user_pages_dirty_lock);
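
/*
 * A minimal end-to-end sketch (hypothetical caller, for illustration): pin a
 * user buffer that a device will DMA into, and on completion mark the pages
 * dirty and drop the pins in one call, so the new data is not discarded by
 * reclaim.
 */
static __maybe_unused int example_dma_into_user_buf(unsigned long uaddr,
                                                    int nr_pages,
                                                    struct page **pages)
{
        int pinned;

        pinned = pin_user_pages_fast(uaddr, nr_pages,
                                     FOLL_WRITE | FOLL_LONGTERM, pages);
        if (pinned < 0)
                return pinned;
        if (pinned != nr_pages) {
                /* Partial pin: release what we did get and report failure. */
                unpin_user_pages(pages, pinned);
                return -EFAULT;
        }

        /* ... program the device to DMA into these pages and wait ... */

        /* The device wrote the pages: dirty them and drop the pins. */
        unpin_user_pages_dirty_lock(pages, pinned, true);
        return 0;
}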
 343
 344/**
 345 * unpin_user_page_range_dirty_lock() - release and optionally dirty
 346 * gup-pinned page range
 347 *
 348 * @page:  the starting page of a range maybe marked dirty, and definitely released.
 349 * @npages: number of consecutive pages to release.
 350 * @make_dirty: whether to mark the pages dirty
 351 *
 352 * "gup-pinned page range" refers to a range of pages that has had one of the
  353 * pin_user_pages() variants called on that range.
  354 *
  355 * For the page range defined by [page .. page + npages - 1], make that range
  356 * (or its head pages, if compound pages) dirty, if @make_dirty is true and if
  357 * the page range was previously listed as clean.
 358 *
 359 * set_page_dirty_lock() is used internally. If instead, set_page_dirty() is
 360 * required, then the caller should a) verify that this is really correct,
 361 * because _lock() is usually required, and b) hand code it:
  362 * set_page_dirty(), unpin_user_page().
 363 *
 364 */
 365void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages,
 366                                      bool make_dirty)
 367{
 368        unsigned long i;
 369        struct folio *folio;
 370        unsigned int nr;
 371
 372        for (i = 0; i < npages; i += nr) {
 373                folio = gup_folio_range_next(page, npages, i, &nr);
 374                if (make_dirty && !folio_test_dirty(folio)) {
 375                        folio_lock(folio);
 376                        folio_mark_dirty(folio);
 377                        folio_unlock(folio);
 378                }
 379                gup_put_folio(folio, nr, FOLL_PIN);
 380        }
 381}
 382EXPORT_SYMBOL(unpin_user_page_range_dirty_lock);
 383
 384static void unpin_user_pages_lockless(struct page **pages, unsigned long npages)
 385{
 386        unsigned long i;
 387        struct folio *folio;
 388        unsigned int nr;
 389
 390        /*
 391         * Don't perform any sanity checks because we might have raced with
 392         * fork() and some anonymous pages might now actually be shared --
 393         * which is why we're unpinning after all.
 394         */
 395        for (i = 0; i < npages; i += nr) {
 396                folio = gup_folio_next(pages, npages, i, &nr);
 397                gup_put_folio(folio, nr, FOLL_PIN);
 398        }
 399}
 400
 401/**
 402 * unpin_user_pages() - release an array of gup-pinned pages.
  403 * @pages:  array of pages to be released.
 404 * @npages: number of pages in the @pages array.
 405 *
 406 * For each page in the @pages array, release the page using unpin_user_page().
 407 *
 408 * Please see the unpin_user_page() documentation for details.
 409 */
 410void unpin_user_pages(struct page **pages, unsigned long npages)
 411{
 412        unsigned long i;
 413        struct folio *folio;
 414        unsigned int nr;
 415
 416        /*
 417         * If this WARN_ON() fires, then the system *might* be leaking pages (by
 418         * leaving them pinned), but probably not. More likely, gup/pup returned
 419         * a hard -ERRNO error to the caller, who erroneously passed it here.
 420         */
 421        if (WARN_ON(IS_ERR_VALUE(npages)))
 422                return;
 423
 424        sanity_check_pinned_pages(pages, npages);
 425        for (i = 0; i < npages; i += nr) {
 426                folio = gup_folio_next(pages, npages, i, &nr);
 427                gup_put_folio(folio, nr, FOLL_PIN);
 428        }
 429}
 430EXPORT_SYMBOL(unpin_user_pages);
 431
 432/*
  433 * Set MMF_HAS_PINNED if it is not set yet; once set, it stays for the mm's
  434 * lifetime.  Avoid setting the bit unless necessary, or it might cause write
  435 * cache-line bouncing on large SMP machines for concurrent pinned gups.
 436 */
 437static inline void mm_set_has_pinned_flag(unsigned long *mm_flags)
 438{
 439        if (!test_bit(MMF_HAS_PINNED, mm_flags))
 440                set_bit(MMF_HAS_PINNED, mm_flags);
 441}
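
/*
 * A minimal reader-side sketch (hypothetical helper, for illustration):
 * because the bit is only ever set and never cleared, consumers such as the
 * fork()/COW path only need a racy test_bit() to decide whether this mm may
 * contain pinned pages.
 */
static __maybe_unused bool example_mm_maybe_has_pinned_pages(struct mm_struct *mm)
{
        return test_bit(MMF_HAS_PINNED, &mm->flags);
}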
 442
 443#ifdef CONFIG_MMU
 444static struct page *no_page_table(struct vm_area_struct *vma,
 445                unsigned int flags)
 446{
 447        /*
 448         * When core dumping an enormous anonymous area that nobody
 449         * has touched so far, we don't want to allocate unnecessary pages or
 450         * page tables.  Return error instead of NULL to skip handle_mm_fault,
 451         * then get_dump_page() will return NULL to leave a hole in the dump.
 452         * But we can only make this optimization where a hole would surely
 453         * be zero-filled if handle_mm_fault() actually did handle it.
 454         */
 455        if ((flags & FOLL_DUMP) &&
 456                        (vma_is_anonymous(vma) || !vma->vm_ops->fault))
 457                return ERR_PTR(-EFAULT);
 458        return NULL;
 459}
 460
 461static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address,
 462                pte_t *pte, unsigned int flags)
 463{
 464        if (flags & FOLL_TOUCH) {
 465                pte_t entry = *pte;
 466
 467                if (flags & FOLL_WRITE)
 468                        entry = pte_mkdirty(entry);
 469                entry = pte_mkyoung(entry);
 470
 471                if (!pte_same(*pte, entry)) {
 472                        set_pte_at(vma->vm_mm, address, pte, entry);
 473                        update_mmu_cache(vma, address, pte);
 474                }
 475        }
 476
 477        /* Proper page table entry exists, but no corresponding struct page */
 478        return -EEXIST;
 479}
 480
 481/*
 482 * FOLL_FORCE can write to even unwritable pte's, but only
 483 * after we've gone through a COW cycle and they are dirty.
 484 */
 485static inline bool can_follow_write_pte(pte_t pte, unsigned int flags)
 486{
 487        return pte_write(pte) ||
 488                ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pte_dirty(pte));
 489}
 490
 491static struct page *follow_page_pte(struct vm_area_struct *vma,
 492                unsigned long address, pmd_t *pmd, unsigned int flags,
 493                struct dev_pagemap **pgmap)
 494{
 495        struct mm_struct *mm = vma->vm_mm;
 496        struct page *page;
 497        spinlock_t *ptl;
 498        pte_t *ptep, pte;
 499        int ret;
 500
 501        /* FOLL_GET and FOLL_PIN are mutually exclusive. */
 502        if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
 503                         (FOLL_PIN | FOLL_GET)))
 504                return ERR_PTR(-EINVAL);
 505retry:
 506        if (unlikely(pmd_bad(*pmd)))
 507                return no_page_table(vma, flags);
 508
 509        ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
 510        pte = *ptep;
 511        if (!pte_present(pte)) {
 512                swp_entry_t entry;
 513                /*
 514                 * KSM's break_ksm() relies upon recognizing a ksm page
 515                 * even while it is being migrated, so for that case we
 516                 * need migration_entry_wait().
 517                 */
 518                if (likely(!(flags & FOLL_MIGRATION)))
 519                        goto no_page;
 520                if (pte_none(pte))
 521                        goto no_page;
 522                entry = pte_to_swp_entry(pte);
 523                if (!is_migration_entry(entry))
 524                        goto no_page;
 525                pte_unmap_unlock(ptep, ptl);
 526                migration_entry_wait(mm, pmd, address);
 527                goto retry;
 528        }
 529        if ((flags & FOLL_NUMA) && pte_protnone(pte))
 530                goto no_page;
 531        if ((flags & FOLL_WRITE) && !can_follow_write_pte(pte, flags)) {
 532                pte_unmap_unlock(ptep, ptl);
 533                return NULL;
 534        }
 535
 536        page = vm_normal_page(vma, address, pte);
 537        if (!page && pte_devmap(pte) && (flags & (FOLL_GET | FOLL_PIN))) {
 538                /*
 539                 * Only return device mapping pages in the FOLL_GET or FOLL_PIN
 540                 * case since they are only valid while holding the pgmap
 541                 * reference.
 542                 */
 543                *pgmap = get_dev_pagemap(pte_pfn(pte), *pgmap);
 544                if (*pgmap)
 545                        page = pte_page(pte);
 546                else
 547                        goto no_page;
 548        } else if (unlikely(!page)) {
 549                if (flags & FOLL_DUMP) {
 550                        /* Avoid special (like zero) pages in core dumps */
 551                        page = ERR_PTR(-EFAULT);
 552                        goto out;
 553                }
 554
 555                if (is_zero_pfn(pte_pfn(pte))) {
 556                        page = pte_page(pte);
 557                } else {
 558                        ret = follow_pfn_pte(vma, address, ptep, flags);
 559                        page = ERR_PTR(ret);
 560                        goto out;
 561                }
 562        }
 563
 564        if (!pte_write(pte) && gup_must_unshare(flags, page)) {
 565                page = ERR_PTR(-EMLINK);
 566                goto out;
 567        }
 568
 569        VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
 570                       !PageAnonExclusive(page), page);
 571
 572        /* try_grab_page() does nothing unless FOLL_GET or FOLL_PIN is set. */
 573        if (unlikely(!try_grab_page(page, flags))) {
 574                page = ERR_PTR(-ENOMEM);
 575                goto out;
 576        }
 577        /*
 578         * We need to make the page accessible if and only if we are going
 579         * to access its content (the FOLL_PIN case).  Please see
 580         * Documentation/core-api/pin_user_pages.rst for details.
 581         */
 582        if (flags & FOLL_PIN) {
 583                ret = arch_make_page_accessible(page);
 584                if (ret) {
 585                        unpin_user_page(page);
 586                        page = ERR_PTR(ret);
 587                        goto out;
 588                }
 589        }
 590        if (flags & FOLL_TOUCH) {
 591                if ((flags & FOLL_WRITE) &&
 592                    !pte_dirty(pte) && !PageDirty(page))
 593                        set_page_dirty(page);
 594                /*
 595                 * pte_mkyoung() would be more correct here, but atomic care
 596                 * is needed to avoid losing the dirty bit: it is easier to use
 597                 * mark_page_accessed().
 598                 */
 599                mark_page_accessed(page);
 600        }
 601out:
 602        pte_unmap_unlock(ptep, ptl);
 603        return page;
 604no_page:
 605        pte_unmap_unlock(ptep, ptl);
 606        if (!pte_none(pte))
 607                return NULL;
 608        return no_page_table(vma, flags);
 609}
 610
 611static struct page *follow_pmd_mask(struct vm_area_struct *vma,
 612                                    unsigned long address, pud_t *pudp,
 613                                    unsigned int flags,
 614                                    struct follow_page_context *ctx)
 615{
 616        pmd_t *pmd, pmdval;
 617        spinlock_t *ptl;
 618        struct page *page;
 619        struct mm_struct *mm = vma->vm_mm;
 620
 621        pmd = pmd_offset(pudp, address);
 622        /*
 623         * The READ_ONCE() will stabilize the pmdval in a register or
 624         * on the stack so that it will stop changing under the code.
 625         */
 626        pmdval = READ_ONCE(*pmd);
 627        if (pmd_none(pmdval))
 628                return no_page_table(vma, flags);
 629        if (pmd_huge(pmdval) && is_vm_hugetlb_page(vma)) {
 630                page = follow_huge_pmd(mm, address, pmd, flags);
 631                if (page)
 632                        return page;
 633                return no_page_table(vma, flags);
 634        }
 635        if (is_hugepd(__hugepd(pmd_val(pmdval)))) {
 636                page = follow_huge_pd(vma, address,
 637                                      __hugepd(pmd_val(pmdval)), flags,
 638                                      PMD_SHIFT);
 639                if (page)
 640                        return page;
 641                return no_page_table(vma, flags);
 642        }
 643retry:
 644        if (!pmd_present(pmdval)) {
 645                /*
  646                 * We should never reach here if THP migration is not supported;
  647                 * otherwise, this must be a THP migration entry.
 648                 */
 649                VM_BUG_ON(!thp_migration_supported() ||
 650                                  !is_pmd_migration_entry(pmdval));
 651
 652                if (likely(!(flags & FOLL_MIGRATION)))
 653                        return no_page_table(vma, flags);
 654
 655                pmd_migration_entry_wait(mm, pmd);
 656                pmdval = READ_ONCE(*pmd);
 657                /*
 658                 * MADV_DONTNEED may convert the pmd to null because
 659                 * mmap_lock is held in read mode
 660                 */
 661                if (pmd_none(pmdval))
 662                        return no_page_table(vma, flags);
 663                goto retry;
 664        }
 665        if (pmd_devmap(pmdval)) {
 666                ptl = pmd_lock(mm, pmd);
 667                page = follow_devmap_pmd(vma, address, pmd, flags, &ctx->pgmap);
 668                spin_unlock(ptl);
 669                if (page)
 670                        return page;
 671        }
 672        if (likely(!pmd_trans_huge(pmdval)))
 673                return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
 674
 675        if ((flags & FOLL_NUMA) && pmd_protnone(pmdval))
 676                return no_page_table(vma, flags);
 677
 678retry_locked:
 679        ptl = pmd_lock(mm, pmd);
 680        if (unlikely(pmd_none(*pmd))) {
 681                spin_unlock(ptl);
 682                return no_page_table(vma, flags);
 683        }
 684        if (unlikely(!pmd_present(*pmd))) {
 685                spin_unlock(ptl);
 686                if (likely(!(flags & FOLL_MIGRATION)))
 687                        return no_page_table(vma, flags);
 688                pmd_migration_entry_wait(mm, pmd);
 689                goto retry_locked;
 690        }
 691        if (unlikely(!pmd_trans_huge(*pmd))) {
 692                spin_unlock(ptl);
 693                return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
 694        }
 695        if (flags & FOLL_SPLIT_PMD) {
 696                int ret;
 697                page = pmd_page(*pmd);
 698                if (is_huge_zero_page(page)) {
 699                        spin_unlock(ptl);
 700                        ret = 0;
 701                        split_huge_pmd(vma, pmd, address);
 702                        if (pmd_trans_unstable(pmd))
 703                                ret = -EBUSY;
 704                } else {
 705                        spin_unlock(ptl);
 706                        split_huge_pmd(vma, pmd, address);
 707                        ret = pte_alloc(mm, pmd) ? -ENOMEM : 0;
 708                }
 709
 710                return ret ? ERR_PTR(ret) :
 711                        follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
 712        }
 713        page = follow_trans_huge_pmd(vma, address, pmd, flags);
 714        spin_unlock(ptl);
 715        ctx->page_mask = HPAGE_PMD_NR - 1;
 716        return page;
 717}
 718
 719static struct page *follow_pud_mask(struct vm_area_struct *vma,
 720                                    unsigned long address, p4d_t *p4dp,
 721                                    unsigned int flags,
 722                                    struct follow_page_context *ctx)
 723{
 724        pud_t *pud;
 725        spinlock_t *ptl;
 726        struct page *page;
 727        struct mm_struct *mm = vma->vm_mm;
 728
 729        pud = pud_offset(p4dp, address);
 730        if (pud_none(*pud))
 731                return no_page_table(vma, flags);
 732        if (pud_huge(*pud) && is_vm_hugetlb_page(vma)) {
 733                page = follow_huge_pud(mm, address, pud, flags);
 734                if (page)
 735                        return page;
 736                return no_page_table(vma, flags);
 737        }
 738        if (is_hugepd(__hugepd(pud_val(*pud)))) {
 739                page = follow_huge_pd(vma, address,
 740                                      __hugepd(pud_val(*pud)), flags,
 741                                      PUD_SHIFT);
 742                if (page)
 743                        return page;
 744                return no_page_table(vma, flags);
 745        }
 746        if (pud_devmap(*pud)) {
 747                ptl = pud_lock(mm, pud);
 748                page = follow_devmap_pud(vma, address, pud, flags, &ctx->pgmap);
 749                spin_unlock(ptl);
 750                if (page)
 751                        return page;
 752        }
 753        if (unlikely(pud_bad(*pud)))
 754                return no_page_table(vma, flags);
 755
 756        return follow_pmd_mask(vma, address, pud, flags, ctx);
 757}
 758
 759static struct page *follow_p4d_mask(struct vm_area_struct *vma,
 760                                    unsigned long address, pgd_t *pgdp,
 761                                    unsigned int flags,
 762                                    struct follow_page_context *ctx)
 763{
 764        p4d_t *p4d;
 765        struct page *page;
 766
 767        p4d = p4d_offset(pgdp, address);
 768        if (p4d_none(*p4d))
 769                return no_page_table(vma, flags);
 770        BUILD_BUG_ON(p4d_huge(*p4d));
 771        if (unlikely(p4d_bad(*p4d)))
 772                return no_page_table(vma, flags);
 773
 774        if (is_hugepd(__hugepd(p4d_val(*p4d)))) {
 775                page = follow_huge_pd(vma, address,
 776                                      __hugepd(p4d_val(*p4d)), flags,
 777                                      P4D_SHIFT);
 778                if (page)
 779                        return page;
 780                return no_page_table(vma, flags);
 781        }
 782        return follow_pud_mask(vma, address, p4d, flags, ctx);
 783}
 784
 785/**
 786 * follow_page_mask - look up a page descriptor from a user-virtual address
 787 * @vma: vm_area_struct mapping @address
 788 * @address: virtual address to look up
 789 * @flags: flags modifying lookup behaviour
 790 * @ctx: contains dev_pagemap for %ZONE_DEVICE memory pinning and a
 791 *       pointer to output page_mask
 792 *
 793 * @flags can have FOLL_ flags set, defined in <linux/mm.h>
 794 *
 795 * When getting pages from ZONE_DEVICE memory, the @ctx->pgmap caches
 796 * the device's dev_pagemap metadata to avoid repeating expensive lookups.
 797 *
 798 * When getting an anonymous page and the caller has to trigger unsharing
 799 * of a shared anonymous page first, -EMLINK is returned. The caller should
 800 * trigger a fault with FAULT_FLAG_UNSHARE set. Note that unsharing is only
 801 * relevant with FOLL_PIN and !FOLL_WRITE.
 802 *
 803 * On output, the @ctx->page_mask is set according to the size of the page.
 804 *
 805 * Return: the mapped (struct page *), %NULL if no mapping exists, or
 806 * an error pointer if there is a mapping to something not represented
 807 * by a page descriptor (see also vm_normal_page()).
 808 */
 809static struct page *follow_page_mask(struct vm_area_struct *vma,
 810                              unsigned long address, unsigned int flags,
 811                              struct follow_page_context *ctx)
 812{
 813        pgd_t *pgd;
 814        struct page *page;
 815        struct mm_struct *mm = vma->vm_mm;
 816
 817        ctx->page_mask = 0;
 818
 819        /* make this handle hugepd */
 820        page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
 821        if (!IS_ERR(page)) {
 822                WARN_ON_ONCE(flags & (FOLL_GET | FOLL_PIN));
 823                return page;
 824        }
 825
 826        pgd = pgd_offset(mm, address);
 827
 828        if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
 829                return no_page_table(vma, flags);
 830
 831        if (pgd_huge(*pgd)) {
 832                page = follow_huge_pgd(mm, address, pgd, flags);
 833                if (page)
 834                        return page;
 835                return no_page_table(vma, flags);
 836        }
 837        if (is_hugepd(__hugepd(pgd_val(*pgd)))) {
 838                page = follow_huge_pd(vma, address,
 839                                      __hugepd(pgd_val(*pgd)), flags,
 840                                      PGDIR_SHIFT);
 841                if (page)
 842                        return page;
 843                return no_page_table(vma, flags);
 844        }
 845
 846        return follow_p4d_mask(vma, address, pgd, flags, ctx);
 847}
 848
 849struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
 850                         unsigned int foll_flags)
 851{
 852        struct follow_page_context ctx = { NULL };
 853        struct page *page;
 854
 855        if (vma_is_secretmem(vma))
 856                return NULL;
 857
 858        if (foll_flags & FOLL_PIN)
 859                return NULL;
 860
 861        page = follow_page_mask(vma, address, foll_flags, &ctx);
 862        if (ctx.pgmap)
 863                put_dev_pagemap(ctx.pgmap);
 864        return page;
 865}
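
/*
 * A minimal usage sketch (hypothetical helper, for illustration), assuming the
 * caller already holds the mmap_lock of vma->vm_mm: look up whatever currently
 * backs @addr, take a reference with FOLL_GET, and drop it again.  FOLL_DUMP
 * makes holes and special pages (like the zero page) return an error rather
 * than a page pointer.
 */
static __maybe_unused int example_probe_mapping(struct vm_area_struct *vma,
                                                unsigned long addr)
{
        struct page *page;

        page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
        if (IS_ERR_OR_NULL(page))
                return page ? PTR_ERR(page) : -ENOENT;

        /* ... inspect the page (e.g. page_to_nid(page)) ... */
        put_page(page);
        return 0;
}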
 866
 867static int get_gate_page(struct mm_struct *mm, unsigned long address,
 868                unsigned int gup_flags, struct vm_area_struct **vma,
 869                struct page **page)
 870{
 871        pgd_t *pgd;
 872        p4d_t *p4d;
 873        pud_t *pud;
 874        pmd_t *pmd;
 875        pte_t *pte;
 876        int ret = -EFAULT;
 877
 878        /* user gate pages are read-only */
 879        if (gup_flags & FOLL_WRITE)
 880                return -EFAULT;
 881        if (address > TASK_SIZE)
 882                pgd = pgd_offset_k(address);
 883        else
 884                pgd = pgd_offset_gate(mm, address);
 885        if (pgd_none(*pgd))
 886                return -EFAULT;
 887        p4d = p4d_offset(pgd, address);
 888        if (p4d_none(*p4d))
 889                return -EFAULT;
 890        pud = pud_offset(p4d, address);
 891        if (pud_none(*pud))
 892                return -EFAULT;
 893        pmd = pmd_offset(pud, address);
 894        if (!pmd_present(*pmd))
 895                return -EFAULT;
 896        VM_BUG_ON(pmd_trans_huge(*pmd));
 897        pte = pte_offset_map(pmd, address);
 898        if (pte_none(*pte))
 899                goto unmap;
 900        *vma = get_gate_vma(mm);
 901        if (!page)
 902                goto out;
 903        *page = vm_normal_page(*vma, address, *pte);
 904        if (!*page) {
 905                if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(*pte)))
 906                        goto unmap;
 907                *page = pte_page(*pte);
 908        }
 909        if (unlikely(!try_grab_page(*page, gup_flags))) {
 910                ret = -ENOMEM;
 911                goto unmap;
 912        }
 913out:
 914        ret = 0;
 915unmap:
 916        pte_unmap(pte);
 917        return ret;
 918}
 919
 920/*
 921 * mmap_lock must be held on entry.  If @locked != NULL and *@flags
 922 * does not include FOLL_NOWAIT, the mmap_lock may be released.  If it
 923 * is, *@locked will be set to 0 and -EBUSY returned.
 924 */
 925static int faultin_page(struct vm_area_struct *vma,
 926                unsigned long address, unsigned int *flags, bool unshare,
 927                int *locked)
 928{
 929        unsigned int fault_flags = 0;
 930        vm_fault_t ret;
 931
 932        if (*flags & FOLL_NOFAULT)
 933                return -EFAULT;
 934        if (*flags & FOLL_WRITE)
 935                fault_flags |= FAULT_FLAG_WRITE;
 936        if (*flags & FOLL_REMOTE)
 937                fault_flags |= FAULT_FLAG_REMOTE;
 938        if (locked)
 939                fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
 940        if (*flags & FOLL_NOWAIT)
 941                fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT;
 942        if (*flags & FOLL_TRIED) {
 943                /*
 944                 * Note: FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_TRIED
 945                 * can co-exist
 946                 */
 947                fault_flags |= FAULT_FLAG_TRIED;
 948        }
 949        if (unshare) {
 950                fault_flags |= FAULT_FLAG_UNSHARE;
 951                /* FAULT_FLAG_WRITE and FAULT_FLAG_UNSHARE are incompatible */
 952                VM_BUG_ON(fault_flags & FAULT_FLAG_WRITE);
 953        }
 954
 955        ret = handle_mm_fault(vma, address, fault_flags, NULL);
 956        if (ret & VM_FAULT_ERROR) {
 957                int err = vm_fault_to_errno(ret, *flags);
 958
 959                if (err)
 960                        return err;
 961                BUG();
 962        }
 963
 964        if (ret & VM_FAULT_RETRY) {
 965                if (locked && !(fault_flags & FAULT_FLAG_RETRY_NOWAIT))
 966                        *locked = 0;
 967                return -EBUSY;
 968        }
 969
 970        /*
 971         * The VM_FAULT_WRITE bit tells us that do_wp_page has broken COW when
 972         * necessary, even if maybe_mkwrite decided not to set pte_write. We
 973         * can thus safely do subsequent page lookups as if they were reads.
 974         * But only do so when looping for pte_write is futile: in some cases
 975         * userspace may also be wanting to write to the gotten user page,
 976         * which a read fault here might prevent (a readonly page might get
 977         * reCOWed by userspace write).
 978         */
 979        if ((ret & VM_FAULT_WRITE) && !(vma->vm_flags & VM_WRITE))
 980                *flags |= FOLL_COW;
 981        return 0;
 982}
 983
 984static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
 985{
 986        vm_flags_t vm_flags = vma->vm_flags;
 987        int write = (gup_flags & FOLL_WRITE);
 988        int foreign = (gup_flags & FOLL_REMOTE);
 989
 990        if (vm_flags & (VM_IO | VM_PFNMAP))
 991                return -EFAULT;
 992
 993        if (gup_flags & FOLL_ANON && !vma_is_anonymous(vma))
 994                return -EFAULT;
 995
 996        if ((gup_flags & FOLL_LONGTERM) && vma_is_fsdax(vma))
 997                return -EOPNOTSUPP;
 998
 999        if (vma_is_secretmem(vma))
1000                return -EFAULT;
1001
1002        if (write) {
1003                if (!(vm_flags & VM_WRITE)) {
1004                        if (!(gup_flags & FOLL_FORCE))
1005                                return -EFAULT;
1006                        /*
1007                         * We used to let the write,force case do COW in a
1008                         * VM_MAYWRITE VM_SHARED !VM_WRITE vma, so ptrace could
1009                         * set a breakpoint in a read-only mapping of an
1010                         * executable, without corrupting the file (yet only
1011                         * when that file had been opened for writing!).
1012                         * Anon pages in shared mappings are surprising: now
1013                         * just reject it.
1014                         */
1015                        if (!is_cow_mapping(vm_flags))
1016                                return -EFAULT;
1017                }
1018        } else if (!(vm_flags & VM_READ)) {
1019                if (!(gup_flags & FOLL_FORCE))
1020                        return -EFAULT;
1021                /*
1022                 * Is there actually any vma we can reach here which does not
1023                 * have VM_MAYREAD set?
1024                 */
1025                if (!(vm_flags & VM_MAYREAD))
1026                        return -EFAULT;
1027        }
1028        /*
1029         * gups are always data accesses, not instruction
1030         * fetches, so execute=false here
1031         */
1032        if (!arch_vma_access_permitted(vma, write, false, foreign))
1033                return -EFAULT;
1034        return 0;
1035}
1036
1037/**
1038 * __get_user_pages() - pin user pages in memory
1039 * @mm:         mm_struct of target mm
1040 * @start:      starting user address
1041 * @nr_pages:   number of pages from start to pin
1042 * @gup_flags:  flags modifying pin behaviour
1043 * @pages:      array that receives pointers to the pages pinned.
1044 *              Should be at least nr_pages long. Or NULL, if caller
1045 *              only intends to ensure the pages are faulted in.
1046 * @vmas:       array of pointers to vmas corresponding to each page.
1047 *              Or NULL if the caller does not require them.
1048 * @locked:     whether we're still with the mmap_lock held
1049 *
1050 * Returns either number of pages pinned (which may be less than the
1051 * number requested), or an error. Details about the return value:
1052 *
1053 * -- If nr_pages is 0, returns 0.
1054 * -- If nr_pages is >0, but no pages were pinned, returns -errno.
1055 * -- If nr_pages is >0, and some pages were pinned, returns the number of
1056 *    pages pinned. Again, this may be less than nr_pages.
1057 * -- 0 return value is possible when the fault would need to be retried.
1058 *
1059 * The caller is responsible for releasing returned @pages, via put_page().
1060 *
1061 * @vmas are valid only as long as mmap_lock is held.
1062 *
1063 * Must be called with mmap_lock held.  It may be released.  See below.
1064 *
1065 * __get_user_pages walks a process's page tables and takes a reference to
1066 * each struct page that each user address corresponds to at a given
1067 * instant. That is, it takes the page that would be accessed if a user
1068 * thread accesses the given user virtual address at that instant.
1069 *
1070 * This does not guarantee that the page exists in the user mappings when
1071 * __get_user_pages returns, and there may even be a completely different
1072 * page there in some cases (eg. if mmapped pagecache has been invalidated
 1073 * and subsequently re-faulted). However, it does guarantee that the page
1074 * won't be freed completely. And mostly callers simply care that the page
1075 * contains data that was valid *at some point in time*. Typically, an IO
1076 * or similar operation cannot guarantee anything stronger anyway because
1077 * locks can't be held over the syscall boundary.
1078 *
1079 * If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If
1080 * the page is written to, set_page_dirty (or set_page_dirty_lock, as
1081 * appropriate) must be called after the page is finished with, and
1082 * before put_page is called.
1083 *
1084 * If @locked != NULL, *@locked will be set to 0 when mmap_lock is
1085 * released by an up_read().  That can happen if @gup_flags does not
1086 * have FOLL_NOWAIT.
1087 *
1088 * A caller using such a combination of @locked and @gup_flags
1089 * must therefore hold the mmap_lock for reading only, and recognize
1090 * when it's been released.  Otherwise, it must be held for either
1091 * reading or writing and will not be released.
1092 *
1093 * In most cases, get_user_pages or get_user_pages_fast should be used
1094 * instead of __get_user_pages. __get_user_pages should be used only if
1095 * you need some special @gup_flags.
1096 */
1097static long __get_user_pages(struct mm_struct *mm,
1098                unsigned long start, unsigned long nr_pages,
1099                unsigned int gup_flags, struct page **pages,
1100                struct vm_area_struct **vmas, int *locked)
1101{
1102        long ret = 0, i = 0;
1103        struct vm_area_struct *vma = NULL;
1104        struct follow_page_context ctx = { NULL };
1105
1106        if (!nr_pages)
1107                return 0;
1108
1109        start = untagged_addr(start);
1110
1111        VM_BUG_ON(!!pages != !!(gup_flags & (FOLL_GET | FOLL_PIN)));
1112
1113        /*
1114         * If FOLL_FORCE is set then do not force a full fault as the hinting
1115         * fault information is unrelated to the reference behaviour of a task
1116         * using the address space
1117         */
1118        if (!(gup_flags & FOLL_FORCE))
1119                gup_flags |= FOLL_NUMA;
1120
1121        do {
1122                struct page *page;
1123                unsigned int foll_flags = gup_flags;
1124                unsigned int page_increm;
1125
 1126                /* first iteration, or crossing a vma boundary */
1127                if (!vma || start >= vma->vm_end) {
1128                        vma = find_extend_vma(mm, start);
1129                        if (!vma && in_gate_area(mm, start)) {
1130                                ret = get_gate_page(mm, start & PAGE_MASK,
1131                                                gup_flags, &vma,
1132                                                pages ? &pages[i] : NULL);
1133                                if (ret)
1134                                        goto out;
1135                                ctx.page_mask = 0;
1136                                goto next_page;
1137                        }
1138
1139                        if (!vma) {
1140                                ret = -EFAULT;
1141                                goto out;
1142                        }
1143                        ret = check_vma_flags(vma, gup_flags);
1144                        if (ret)
1145                                goto out;
1146
1147                        if (is_vm_hugetlb_page(vma)) {
1148                                i = follow_hugetlb_page(mm, vma, pages, vmas,
1149                                                &start, &nr_pages, i,
1150                                                gup_flags, locked);
1151                                if (locked && *locked == 0) {
1152                                        /*
1153                                         * We've got a VM_FAULT_RETRY
1154                                         * and we've lost mmap_lock.
1155                                         * We must stop here.
1156                                         */
1157                                        BUG_ON(gup_flags & FOLL_NOWAIT);
1158                                        goto out;
1159                                }
1160                                continue;
1161                        }
1162                }
1163retry:
1164                /*
1165                 * If we have a pending SIGKILL, don't keep faulting pages and
1166                 * potentially allocating memory.
1167                 */
1168                if (fatal_signal_pending(current)) {
1169                        ret = -EINTR;
1170                        goto out;
1171                }
1172                cond_resched();
1173
1174                page = follow_page_mask(vma, start, foll_flags, &ctx);
1175                if (!page || PTR_ERR(page) == -EMLINK) {
1176                        ret = faultin_page(vma, start, &foll_flags,
1177                                           PTR_ERR(page) == -EMLINK, locked);
1178                        switch (ret) {
1179                        case 0:
1180                                goto retry;
1181                        case -EBUSY:
1182                                ret = 0;
1183                                fallthrough;
1184                        case -EFAULT:
1185                        case -ENOMEM:
1186                        case -EHWPOISON:
1187                                goto out;
1188                        }
1189                        BUG();
1190                } else if (PTR_ERR(page) == -EEXIST) {
1191                        /*
1192                         * Proper page table entry exists, but no corresponding
1193                         * struct page. If the caller expects **pages to be
1194                         * filled in, bail out now, because that can't be done
1195                         * for this page.
1196                         */
1197                        if (pages) {
1198                                ret = PTR_ERR(page);
1199                                goto out;
1200                        }
1201
1202                        goto next_page;
1203                } else if (IS_ERR(page)) {
1204                        ret = PTR_ERR(page);
1205                        goto out;
1206                }
1207                if (pages) {
1208                        pages[i] = page;
1209                        flush_anon_page(vma, page, start);
1210                        flush_dcache_page(page);
1211                        ctx.page_mask = 0;
1212                }
1213next_page:
1214                if (vmas) {
1215                        vmas[i] = vma;
1216                        ctx.page_mask = 0;
1217                }
1218                page_increm = 1 + (~(start >> PAGE_SHIFT) & ctx.page_mask);
1219                if (page_increm > nr_pages)
1220                        page_increm = nr_pages;
1221                i += page_increm;
1222                start += page_increm * PAGE_SIZE;
1223                nr_pages -= page_increm;
1224        } while (nr_pages);
1225out:
1226        if (ctx.pgmap)
1227                put_dev_pagemap(ctx.pgmap);
1228        return i ? i : ret;
1229}
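
/*
 * A minimal sketch of handling the "fewer pages than requested" return
 * convention documented above (hypothetical caller using the public
 * get_user_pages_fast() wrapper, for illustration).
 */
static __maybe_unused int example_gup_all_or_nothing(unsigned long start,
                                                     int nr_pages,
                                                     struct page **pages)
{
        int got = get_user_pages_fast(start, nr_pages, FOLL_WRITE, pages);

        if (got < 0)
                return got;             /* hard error, nothing was pinned */
        if (got != nr_pages) {
                /* Partial success: drop the references we did get. */
                while (got)
                        put_page(pages[--got]);
                return -EFAULT;
        }
        return nr_pages;
}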
1230
1231static bool vma_permits_fault(struct vm_area_struct *vma,
1232                              unsigned int fault_flags)
1233{
1234        bool write   = !!(fault_flags & FAULT_FLAG_WRITE);
1235        bool foreign = !!(fault_flags & FAULT_FLAG_REMOTE);
1236        vm_flags_t vm_flags = write ? VM_WRITE : VM_READ;
1237
1238        if (!(vm_flags & vma->vm_flags))
1239                return false;
1240
1241        /*
1242         * The architecture might have a hardware protection
1243         * mechanism other than read/write that can deny access.
1244         *
1245         * gup always represents data access, not instruction
1246         * fetches, so execute=false here:
1247         */
1248        if (!arch_vma_access_permitted(vma, write, false, foreign))
1249                return false;
1250
1251        return true;
1252}
1253
1254/**
1255 * fixup_user_fault() - manually resolve a user page fault
1256 * @mm:         mm_struct of target mm
1257 * @address:    user address
1258 * @fault_flags:flags to pass down to handle_mm_fault()
1259 * @unlocked:   did we unlock the mmap_lock while retrying, maybe NULL if caller
1260 *              does not allow retry. If NULL, the caller must guarantee
1261 *              that fault_flags does not contain FAULT_FLAG_ALLOW_RETRY.
1262 *
 1263 * This is meant to be called in the specific scenario where, for locking
 1264 * reasons, we try to access user memory in atomic context (within a
 1265 * pagefault_disable() section), that access fails with -EFAULT, and we want to
 1266 * resolve the user fault before trying again.
1267 *
1268 * Typically this is meant to be used by the futex code.
1269 *
1270 * The main difference with get_user_pages() is that this function will
1271 * unconditionally call handle_mm_fault() which will in turn perform all the
1272 * necessary SW fixup of the dirty and young bits in the PTE, while
1273 * get_user_pages() only guarantees to update these in the struct page.
1274 *
1275 * This is important for some architectures where those bits also gate the
1276 * access permission to the page because they are maintained in software.  On
1277 * such architectures, gup() will not be enough to make a subsequent access
1278 * succeed.
1279 *
 1280 * This function will not return with an unlocked mmap_lock. So it does not
 1281 * have the same semantics w.r.t. the @mm->mmap_lock as filemap_fault() does.
1282 */
1283int fixup_user_fault(struct mm_struct *mm,
1284                     unsigned long address, unsigned int fault_flags,
1285                     bool *unlocked)
1286{
1287        struct vm_area_struct *vma;
1288        vm_fault_t ret;
1289
1290        address = untagged_addr(address);
1291
1292        if (unlocked)
1293                fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
1294
1295retry:
1296        vma = find_extend_vma(mm, address);
1297        if (!vma || address < vma->vm_start)
1298                return -EFAULT;
1299
1300        if (!vma_permits_fault(vma, fault_flags))
1301                return -EFAULT;
1302
1303        if ((fault_flags & FAULT_FLAG_KILLABLE) &&
1304            fatal_signal_pending(current))
1305                return -EINTR;
1306
1307        ret = handle_mm_fault(vma, address, fault_flags, NULL);
1308        if (ret & VM_FAULT_ERROR) {
1309                int err = vm_fault_to_errno(ret, 0);
1310
1311                if (err)
1312                        return err;
1313                BUG();
1314        }
1315
1316        if (ret & VM_FAULT_RETRY) {
1317                mmap_read_lock(mm);
1318                *unlocked = true;
1319                fault_flags |= FAULT_FLAG_TRIED;
1320                goto retry;
1321        }
1322
1323        return 0;
1324}
1325EXPORT_SYMBOL_GPL(fixup_user_fault);
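
/*
 * A minimal sketch modelled on the futex code's fault_in_user_writeable()
 * (hypothetical helper, for illustration): after an access with page faults
 * disabled has failed, resolve the write fault under mmap_lock so the caller
 * can retry the access.
 */
static __maybe_unused int example_fault_in_writeable(unsigned long address)
{
        struct mm_struct *mm = current->mm;
        bool unlocked = false;
        int ret;

        mmap_read_lock(mm);
        ret = fixup_user_fault(mm, address, FAULT_FLAG_WRITE, &unlocked);
        mmap_read_unlock(mm);

        return ret;
}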
1326
1327/*
 1328 * Please note that this function, unlike __get_user_pages(), will not
 1329 * return 0 for nr_pages > 0 unless FOLL_NOWAIT is used.
1330 */
1331static __always_inline long __get_user_pages_locked(struct mm_struct *mm,
1332                                                unsigned long start,
1333                                                unsigned long nr_pages,
1334                                                struct page **pages,
1335                                                struct vm_area_struct **vmas,
1336                                                int *locked,
1337                                                unsigned int flags)
1338{
1339        long ret, pages_done;
1340        bool lock_dropped;
1341
1342        if (locked) {
1343                /* if VM_FAULT_RETRY can be returned, vmas become invalid */
1344                BUG_ON(vmas);
1345                /* check caller initialized locked */
1346                BUG_ON(*locked != 1);
1347        }
1348
1349        if (flags & FOLL_PIN)
1350                mm_set_has_pinned_flag(&mm->flags);
1351
1352        /*
1353         * FOLL_PIN and FOLL_GET are mutually exclusive. Traditional behavior
1354         * is to set FOLL_GET if the caller wants pages[] filled in (but has
1355         * carelessly failed to specify FOLL_GET), so keep doing that, but only
1356         * for FOLL_GET, not for the newer FOLL_PIN.
1357         *
1358         * FOLL_PIN always expects pages to be non-null, but no need to assert
1359         * that here, as any failures will be obvious enough.
1360         */
1361        if (pages && !(flags & FOLL_PIN))
1362                flags |= FOLL_GET;
1363
1364        pages_done = 0;
1365        lock_dropped = false;
1366        for (;;) {
1367                ret = __get_user_pages(mm, start, nr_pages, flags, pages,
1368                                       vmas, locked);
1369                if (!locked)
1370                        /* VM_FAULT_RETRY couldn't trigger, bypass */
1371                        return ret;
1372
1373                /* VM_FAULT_RETRY cannot return errors */
1374                if (!*locked) {
1375                        BUG_ON(ret < 0);
1376                        BUG_ON(ret >= nr_pages);
1377                }
1378
1379                if (ret > 0) {
1380                        nr_pages -= ret;
1381                        pages_done += ret;
1382                        if (!nr_pages)
1383                                break;
1384                }
1385                if (*locked) {
1386                        /*
1387                         * VM_FAULT_RETRY didn't trigger or it was a
1388                         * FOLL_NOWAIT.
1389                         */
1390                        if (!pages_done)
1391                                pages_done = ret;
1392                        break;
1393                }
1394                /*
1395                 * VM_FAULT_RETRY triggered, so seek to the faulting offset.
1396                 * For the prefault case (!pages) we only update counts.
1397                 */
1398                if (likely(pages))
1399                        pages += ret;
1400                start += ret << PAGE_SHIFT;
1401                lock_dropped = true;
1402
1403retry:
1404                /*
1405                 * Repeat on the address that fired VM_FAULT_RETRY
1406                 * with both FAULT_FLAG_ALLOW_RETRY and
1407                 * FAULT_FLAG_TRIED.  Note that GUP can be interrupted
1408                 * by fatal signals, so we need to check it before we
1409                 * start trying again otherwise it can loop forever.
1410                 */
1411
1412                if (fatal_signal_pending(current)) {
1413                        if (!pages_done)
1414                                pages_done = -EINTR;
1415                        break;
1416                }
1417
1418                ret = mmap_read_lock_killable(mm);
1419                if (ret) {
1420                        BUG_ON(ret > 0);
1421                        if (!pages_done)
1422                                pages_done = ret;
1423                        break;
1424                }
1425
1426                *locked = 1;
1427                ret = __get_user_pages(mm, start, 1, flags | FOLL_TRIED,
1428                                       pages, NULL, locked);
1429                if (!*locked) {
1430                        /* Continue to retry until we succeed */
1431                        BUG_ON(ret != 0);
1432                        goto retry;
1433                }
1434                if (ret != 1) {
1435                        BUG_ON(ret > 1);
1436                        if (!pages_done)
1437                                pages_done = ret;
1438                        break;
1439                }
1440                nr_pages--;
1441                pages_done++;
1442                if (!nr_pages)
1443                        break;
1444                if (likely(pages))
1445                        pages++;
1446                start += PAGE_SIZE;
1447        }
1448        if (lock_dropped && *locked) {
1449                /*
1450                 * We must let the caller know we temporarily dropped the lock
1451                 * and so the critical section protected by it was lost.
1452                 */
1453                mmap_read_unlock(mm);
1454                *locked = 0;
1455        }
1456        return pages_done;
1457}
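
/*
 * Editor's note -- illustrative sketch only: the "locked" protocol that
 * __get_user_pages_locked() expects from its callers, as used by the
 * wrappers further down in this file (get_user_pages_unlocked(),
 * get_dump_page()).  Variable names are placeholders.
 *
 *	int locked = 1;		// caller-initialized, mmap_lock held for read
 *	long ret;
 *
 *	mmap_read_lock(mm);
 *	ret = __get_user_pages_locked(mm, start, nr_pages, pages, NULL,
 *				      &locked, gup_flags);
 *	if (locked)		// 0 here means the lock was already dropped
 *		mmap_read_unlock(mm);
 */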
1458
1459/**
1460 * populate_vma_page_range() -  populate a range of pages in the vma.
1461 * @vma:   target vma
1462 * @start: start address
1463 * @end:   end address
1464 * @locked: whether the mmap_lock is still held
1465 *
1466 * This takes care of mlocking the pages too if VM_LOCKED is set.
1467 *
1468 * Return either number of pages pinned in the vma, or a negative error
1469 * code on error.
1470 *
1471 * vma->vm_mm->mmap_lock must be held.
1472 *
1473 * If @locked is NULL, it may be held for read or write and will
1474 * be unperturbed.
1475 *
1476 * If @locked is non-NULL, it must be held for read only and may be
1477 * released.  If it's released, *@locked will be set to 0.
1478 */
1479long populate_vma_page_range(struct vm_area_struct *vma,
1480                unsigned long start, unsigned long end, int *locked)
1481{
1482        struct mm_struct *mm = vma->vm_mm;
1483        unsigned long nr_pages = (end - start) / PAGE_SIZE;
1484        int gup_flags;
1485        long ret;
1486
1487        VM_BUG_ON(!PAGE_ALIGNED(start));
1488        VM_BUG_ON(!PAGE_ALIGNED(end));
1489        VM_BUG_ON_VMA(start < vma->vm_start, vma);
1490        VM_BUG_ON_VMA(end   > vma->vm_end, vma);
1491        mmap_assert_locked(mm);
1492
1493        /*
1494         * Rightly or wrongly, the VM_LOCKONFAULT case has never used
1495         * faultin_page() to break COW, so it has no work to do here.
1496         */
1497        if (vma->vm_flags & VM_LOCKONFAULT)
1498                return nr_pages;
1499
1500        gup_flags = FOLL_TOUCH;
1501        /*
1502         * We want to touch writable mappings with a write fault in order
1503         * to break COW, except for shared mappings because these don't COW
1504         * and we would not want to dirty them for nothing.
1505         */
1506        if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
1507                gup_flags |= FOLL_WRITE;
1508
1509        /*
1510         * We want mlock to succeed for regions that have any permissions
1511         * other than PROT_NONE.
1512         */
1513        if (vma_is_accessible(vma))
1514                gup_flags |= FOLL_FORCE;
1515
1516        /*
1517         * We made sure addr is within a VMA, so the following will
1518         * not result in a stack expansion that recurses back here.
1519         */
1520        ret = __get_user_pages(mm, start, nr_pages, gup_flags,
1521                                NULL, NULL, locked);
1522        lru_add_drain();
1523        return ret;
1524}
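
/*
 * Editor's note -- illustrative sketch only: a minimal caller of
 * populate_vma_page_range() honouring the constraints documented above
 * (page-aligned range inside the VMA, mmap_lock held for read, *locked
 * initialized to 1).  See __mm_populate() below for the real in-tree
 * usage; names here are placeholders.
 *
 *	int locked = 1;
 *	long ret;
 *
 *	mmap_read_lock(vma->vm_mm);
 *	ret = populate_vma_page_range(vma, vma->vm_start, vma->vm_end,
 *				      &locked);
 *	if (locked)
 *		mmap_read_unlock(vma->vm_mm);
 */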
1525
1526/*
1527 * faultin_vma_page_range() - populate (prefault) page tables inside the
1528 *                            given VMA range readable/writable
1529 *
1530 * This takes care of mlocking the pages, too, if VM_LOCKED is set.
1531 *
1532 * @vma: target vma
1533 * @start: start address
1534 * @end: end address
1535 * @write: whether to prefault readable or writable
1536 * @locked: whether the mmap_lock is still held
1537 *
1538 * Returns either number of processed pages in the vma, or a negative error
1539 * code on error (see __get_user_pages()).
1540 *
1541 * vma->vm_mm->mmap_lock must be held. The range must be page-aligned and
1542 * covered by the VMA.
1543 *
1544 * If @locked is NULL, it may be held for read or write and will be unperturbed.
1545 *
1546 * If @locked is non-NULL, it must be held for read only and may be released.  If
1547 * it's released, *@locked will be set to 0.
1548 */
1549long faultin_vma_page_range(struct vm_area_struct *vma, unsigned long start,
1550                            unsigned long end, bool write, int *locked)
1551{
1552        struct mm_struct *mm = vma->vm_mm;
1553        unsigned long nr_pages = (end - start) / PAGE_SIZE;
1554        int gup_flags;
1555        long ret;
1556
1557        VM_BUG_ON(!PAGE_ALIGNED(start));
1558        VM_BUG_ON(!PAGE_ALIGNED(end));
1559        VM_BUG_ON_VMA(start < vma->vm_start, vma);
1560        VM_BUG_ON_VMA(end > vma->vm_end, vma);
1561        mmap_assert_locked(mm);
1562
1563        /*
1564         * FOLL_TOUCH: Mark page accessed and thereby young; will also mark
1565         *             the page dirty with FOLL_WRITE -- which doesn't make a
1566         *             difference with !FOLL_FORCE, because the page is writable
1567         *             in the page table.
1568         * FOLL_HWPOISON: Return -EHWPOISON instead of -EFAULT when we hit
1569         *                a poisoned page.
1570         * !FOLL_FORCE: Require proper access permissions.
1571         */
1572        gup_flags = FOLL_TOUCH | FOLL_HWPOISON;
1573        if (write)
1574                gup_flags |= FOLL_WRITE;
1575
1576        /*
1577         * We want to report -EINVAL instead of -EFAULT for any permission
1578         * problems or incompatible mappings.
1579         */
1580        if (check_vma_flags(vma, gup_flags))
1581                return -EINVAL;
1582
1583        ret = __get_user_pages(mm, start, nr_pages, gup_flags,
1584                                NULL, NULL, locked);
1585        lru_add_drain();
1586        return ret;
1587}
1588
1589/*
1590 * __mm_populate - populate and/or mlock pages within a range of address space.
1591 *
1592 * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap
1593 * flags. VMAs must be already marked with the desired vm_flags, and
1594 * mmap_lock must not be held.
1595 */
1596int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
1597{
1598        struct mm_struct *mm = current->mm;
1599        unsigned long end, nstart, nend;
1600        struct vm_area_struct *vma = NULL;
1601        int locked = 0;
1602        long ret = 0;
1603
1604        end = start + len;
1605
1606        for (nstart = start; nstart < end; nstart = nend) {
1607                /*
1608                 * We want to fault in pages for [nstart; end) address range.
1609                 * Find first corresponding VMA.
1610                 */
1611                if (!locked) {
1612                        locked = 1;
1613                        mmap_read_lock(mm);
1614                        vma = find_vma(mm, nstart);
1615                } else if (nstart >= vma->vm_end)
1616                        vma = vma->vm_next;
1617                if (!vma || vma->vm_start >= end)
1618                        break;
1619                /*
1620                 * Set [nstart; nend) to intersection of desired address
1621                 * range with the first VMA. Also, skip undesirable VMA types.
1622                 */
1623                nend = min(end, vma->vm_end);
1624                if (vma->vm_flags & (VM_IO | VM_PFNMAP))
1625                        continue;
1626                if (nstart < vma->vm_start)
1627                        nstart = vma->vm_start;
1628                /*
1629                 * Now fault in a range of pages. populate_vma_page_range()
1630                 * double checks the vma flags, so that it won't mlock pages
1631                 * if the vma was already munlocked.
1632                 */
1633                ret = populate_vma_page_range(vma, nstart, nend, &locked);
1634                if (ret < 0) {
1635                        if (ignore_errors) {
1636                                ret = 0;
1637                                continue;       /* continue at next VMA */
1638                        }
1639                        break;
1640                }
1641                nend = nstart + ret * PAGE_SIZE;
1642                ret = 0;
1643        }
1644        if (locked)
1645                mmap_read_unlock(mm);
1646        return ret;     /* 0 or negative error code */
1647}
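
/*
 * Editor's note -- illustrative sketch only: how an mlock()/MAP_POPULATE
 * style caller might invoke __mm_populate(), per the comment above (VMAs
 * already marked with the desired vm_flags, mmap_lock not held).  The
 * start/len values are placeholders and assumed page-aligned.
 *
 *	int error;
 *
 *	error = __mm_populate(start, len, 1);	// 1: best effort, ignore errors
 */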
1648#else /* CONFIG_MMU */
1649static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start,
1650                unsigned long nr_pages, struct page **pages,
1651                struct vm_area_struct **vmas, int *locked,
1652                unsigned int foll_flags)
1653{
1654        struct vm_area_struct *vma;
1655        unsigned long vm_flags;
1656        long i;
1657
1658        /* calculate required read or write permissions.
1659         * If FOLL_FORCE is set, we only require the "MAY" flags.
1660         */
1661        vm_flags  = (foll_flags & FOLL_WRITE) ?
1662                        (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
1663        vm_flags &= (foll_flags & FOLL_FORCE) ?
1664                        (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
1665
1666        for (i = 0; i < nr_pages; i++) {
1667                vma = find_vma(mm, start);
1668                if (!vma)
1669                        goto finish_or_fault;
1670
1671                /* protect what we can, including chardevs */
1672                if ((vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
1673                    !(vm_flags & vma->vm_flags))
1674                        goto finish_or_fault;
1675
1676                if (pages) {
1677                        pages[i] = virt_to_page(start);
1678                        if (pages[i])
1679                                get_page(pages[i]);
1680                }
1681                if (vmas)
1682                        vmas[i] = vma;
1683                start = (start + PAGE_SIZE) & PAGE_MASK;
1684        }
1685
1686        return i;
1687
1688finish_or_fault:
1689        return i ? : -EFAULT;
1690}
1691#endif /* !CONFIG_MMU */
1692
1693/**
1694 * fault_in_writeable - fault in userspace address range for writing
1695 * @uaddr: start of address range
1696 * @size: size of address range
1697 *
1698 * Returns the number of bytes not faulted in (like copy_to_user() and
1699 * copy_from_user()).
1700 */
1701size_t fault_in_writeable(char __user *uaddr, size_t size)
1702{
1703        char __user *start = uaddr, *end;
1704
1705        if (unlikely(size == 0))
1706                return 0;
1707        if (!user_write_access_begin(uaddr, size))
1708                return size;
1709        if (!PAGE_ALIGNED(uaddr)) {
1710                unsafe_put_user(0, uaddr, out);
1711                uaddr = (char __user *)PAGE_ALIGN((unsigned long)uaddr);
1712        }
1713        end = (char __user *)PAGE_ALIGN((unsigned long)start + size);
1714        if (unlikely(end < start))
1715                end = NULL;
1716        while (uaddr != end) {
1717                unsafe_put_user(0, uaddr, out);
1718                uaddr += PAGE_SIZE;
1719        }
1720
1721out:
1722        user_write_access_end();
1723        if (size > uaddr - start)
1724                return size - (uaddr - start);
1725        return 0;
1726}
1727EXPORT_SYMBOL(fault_in_writeable);
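
/*
 * Editor's note -- illustrative sketch only: the usual retry pattern built
 * around fault_in_writeable().  If the copy cannot make progress, fault the
 * destination in and try again; give up only when nothing at all could be
 * faulted in.  Real callers typically resume at the failing offset instead
 * of restarting the whole copy, and callers that cannot otherwise guarantee
 * forward progress should use fault_in_subpage_writeable() (below).  The
 * helper name and buffers are placeholders.
 *
 *	static int demo_copy_to_user(char __user *ubuf, const void *kbuf,
 *				     size_t len)
 *	{
 *		while (copy_to_user(ubuf, kbuf, len)) {
 *			if (fault_in_writeable(ubuf, len) == len)
 *				return -EFAULT;	// nothing was faulted in
 *		}
 *		return 0;
 *	}
 */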
1728
1729/**
1730 * fault_in_subpage_writeable - fault in an address range for writing
1731 * @uaddr: start of address range
1732 * @size: size of address range
1733 *
1734 * Fault in a user address range for writing while checking for permissions at
1735 * sub-page granularity (e.g. arm64 MTE). This function should be used when
1736 * the caller cannot guarantee forward progress of a copy_to_user() loop.
1737 *
1738 * Returns the number of bytes not faulted in (like copy_to_user() and
1739 * copy_from_user()).
1740 */
1741size_t fault_in_subpage_writeable(char __user *uaddr, size_t size)
1742{
1743        size_t faulted_in;
1744
1745        /*
1746         * Attempt faulting in at page granularity first for page table
1747         * permission checking. The arch-specific probe_subpage_writeable()
1748         * functions may not check for this.
1749         */
1750        faulted_in = size - fault_in_writeable(uaddr, size);
1751        if (faulted_in)
1752                faulted_in -= probe_subpage_writeable(uaddr, faulted_in);
1753
1754        return size - faulted_in;
1755}
1756EXPORT_SYMBOL(fault_in_subpage_writeable);
1757
1758/*
1759 * fault_in_safe_writeable - fault in an address range for writing
1760 * @uaddr: start of address range
1761 * @size: length of address range
1762 *
1763 * Faults in an address range for writing.  This is primarily useful when we
1764 * already know that some or all of the pages in the address range aren't in
1765 * memory.
1766 *
1767 * Unlike fault_in_writeable(), this function is non-destructive.
1768 *
1769 * Note that we don't pin or otherwise hold the pages that we fault
1770 * in.  There's no guarantee that they'll stay in memory for any duration of
1771 * time.
1772 *
1773 * Returns the number of bytes not faulted in, like copy_to_user() and
1774 * copy_from_user().
1775 */
1776size_t fault_in_safe_writeable(const char __user *uaddr, size_t size)
1777{
1778        unsigned long start = (unsigned long)uaddr, end;
1779        struct mm_struct *mm = current->mm;
1780        bool unlocked = false;
1781
1782        if (unlikely(size == 0))
1783                return 0;
1784        end = PAGE_ALIGN(start + size);
1785        if (end < start)
1786                end = 0;
1787
1788        mmap_read_lock(mm);
1789        do {
1790                if (fixup_user_fault(mm, start, FAULT_FLAG_WRITE, &unlocked))
1791                        break;
1792                start = (start + PAGE_SIZE) & PAGE_MASK;
1793        } while (start != end);
1794        mmap_read_unlock(mm);
1795
1796        if (size > (unsigned long)uaddr - start)
1797                return size - ((unsigned long)uaddr - start);
1798        return 0;
1799}
1800EXPORT_SYMBOL(fault_in_safe_writeable);
1801
1802/**
1803 * fault_in_readable - fault in userspace address range for reading
1804 * @uaddr: start of user address range
1805 * @size: size of user address range
1806 *
1807 * Returns the number of bytes not faulted in (like copy_to_user() and
1808 * copy_from_user()).
1809 */
1810size_t fault_in_readable(const char __user *uaddr, size_t size)
1811{
1812        const char __user *start = uaddr, *end;
1813        volatile char c;
1814
1815        if (unlikely(size == 0))
1816                return 0;
1817        if (!user_read_access_begin(uaddr, size))
1818                return size;
1819        if (!PAGE_ALIGNED(uaddr)) {
1820                unsafe_get_user(c, uaddr, out);
1821                uaddr = (const char __user *)PAGE_ALIGN((unsigned long)uaddr);
1822        }
1823        end = (const char __user *)PAGE_ALIGN((unsigned long)start + size);
1824        if (unlikely(end < start))
1825                end = NULL;
1826        while (uaddr != end) {
1827                unsafe_get_user(c, uaddr, out);
1828                uaddr += PAGE_SIZE;
1829        }
1830
1831out:
1832        user_read_access_end();
1833        (void)c;
1834        if (size > uaddr - start)
1835                return size - (uaddr - start);
1836        return 0;
1837}
1838EXPORT_SYMBOL(fault_in_readable);
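
/*
 * Editor's note -- illustrative sketch only: the "prefault, then copy with
 * page faults disabled" pattern that fault_in_readable() supports, loosely
 * modelled on filesystem write paths that must not fault while holding
 * their own locks.  The names and the locking step are placeholders, not a
 * specific in-tree user.
 *
 *	if (fault_in_readable(ubuf, len) == len)
 *		return -EFAULT;			// nothing could be faulted in
 *
 *	// ...take locks that must not be held across a page fault...
 *	pagefault_disable();
 *	left = copy_from_user(kbuf, ubuf, len);	// returns bytes not copied
 *	pagefault_enable();
 *	// ...drop the locks; if left != 0, prefault again and retry...
 */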
1839
1840/**
1841 * get_dump_page() - pin user page in memory while writing it to core dump
1842 * @addr: user address
1843 *
1844 * Returns struct page pointer of user page pinned for dump,
1845 * to be freed afterwards by put_page().
1846 *
1847 * Returns NULL on any kind of failure - a hole must then be inserted into
1848 * the corefile, to preserve alignment with its headers; and also returns
1849 * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found -
1850 * allowing a hole to be left in the corefile to save disk space.
1851 *
1852 * Called without mmap_lock (takes and releases the mmap_lock by itself).
1853 */
1854#ifdef CONFIG_ELF_CORE
1855struct page *get_dump_page(unsigned long addr)
1856{
1857        struct mm_struct *mm = current->mm;
1858        struct page *page;
1859        int locked = 1;
1860        int ret;
1861
1862        if (mmap_read_lock_killable(mm))
1863                return NULL;
1864        ret = __get_user_pages_locked(mm, addr, 1, &page, NULL, &locked,
1865                                      FOLL_FORCE | FOLL_DUMP | FOLL_GET);
1866        if (locked)
1867                mmap_read_unlock(mm);
1868        return (ret == 1) ? page : NULL;
1869}
1870#endif /* CONFIG_ELF_CORE */
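
/*
 * Editor's note -- illustrative sketch only: how a core-dump writer might
 * consume get_dump_page(), per the kernel-doc above.  A NULL return means a
 * hole should be emitted in the core file; the write/skip steps below stand
 * in for the real coredump machinery.
 *
 *	struct page *page = get_dump_page(addr);
 *
 *	if (page) {
 *		// ...write PAGE_SIZE bytes of the page to the core file...
 *		put_page(page);
 *	} else {
 *		// ...emit a PAGE_SIZE hole to keep the layout aligned...
 *	}
 */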
1871
1872#ifdef CONFIG_MIGRATION
1873/*
1874 * Check whether all pages are pinnable; if so, return the number of pages.  If
1875 * some pages are not pinnable, migrate them and unpin all pages. Return zero if
1876 * pages were migrated, or if some pages could not be isolated.
1877 * Return a negative error code if migration fails.
1878 */
1879static long check_and_migrate_movable_pages(unsigned long nr_pages,
1880                                            struct page **pages,
1881                                            unsigned int gup_flags)
1882{
1883        unsigned long isolation_error_count = 0, i;
1884        struct folio *prev_folio = NULL;
1885        LIST_HEAD(movable_page_list);
1886        bool drain_allow = true;
1887        int ret = 0;
1888
1889        for (i = 0; i < nr_pages; i++) {
1890                struct folio *folio = page_folio(pages[i]);
1891
1892                if (folio == prev_folio)
1893                        continue;
1894                prev_folio = folio;
1895
1896                if (folio_is_pinnable(folio))
1897                        continue;
1898
1899                /*
1900                 * Try to move out any movable page before pinning the range.
1901                 */
1902                if (folio_test_hugetlb(folio)) {
1903                        if (!isolate_huge_page(&folio->page,
1904                                                &movable_page_list))
1905                                isolation_error_count++;
1906                        continue;
1907                }
1908
1909                if (!folio_test_lru(folio) && drain_allow) {
1910                        lru_add_drain_all();
1911                        drain_allow = false;
1912                }
1913
1914                if (folio_isolate_lru(folio)) {
1915                        isolation_error_count++;
1916                        continue;
1917                }
1918                list_add_tail(&folio->lru, &movable_page_list);
1919                node_stat_mod_folio(folio,
1920                                    NR_ISOLATED_ANON + folio_is_file_lru(folio),
1921                                    folio_nr_pages(folio));
1922        }
1923
1924        if (!list_empty(&movable_page_list) || isolation_error_count)
1925                goto unpin_pages;
1926
1927        /*
1928         * If the list is empty and there were no isolation errors, all pages
1929         * are in the correct zone.
1930         */
1931        return nr_pages;
1932
1933unpin_pages:
1934        if (gup_flags & FOLL_PIN) {
1935                unpin_user_pages(pages, nr_pages);
1936        } else {
1937                for (i = 0; i < nr_pages; i++)
1938                        put_page(pages[i]);
1939        }
1940
1941        if (!list_empty(&movable_page_list)) {
1942                struct migration_target_control mtc = {
1943                        .nid = NUMA_NO_NODE,
1944                        .gfp_mask = GFP_USER | __GFP_NOWARN,
1945                };
1946
1947                ret = migrate_pages(&movable_page_list, alloc_migration_target,
1948                                    NULL, (unsigned long)&mtc, MIGRATE_SYNC,
1949                                    MR_LONGTERM_PIN, NULL);
1950                if (ret > 0) /* number of pages not migrated */
1951                        ret = -ENOMEM;
1952        }
1953
1954        if (ret && !list_empty(&movable_page_list))
1955                putback_movable_pages(&movable_page_list);
1956        return ret;
1957}
1958#else
1959static long check_and_migrate_movable_pages(unsigned long nr_pages,
1960                                            struct page **pages,
1961                                            unsigned int gup_flags)
1962{
1963        return nr_pages;
1964}
1965#endif /* CONFIG_MIGRATION */
1966
1967/*
1968 * __gup_longterm_locked() is a wrapper for __get_user_pages_locked which
1969 * allows us to process the FOLL_LONGTERM flag.
1970 */
1971static long __gup_longterm_locked(struct mm_struct *mm,
1972                                  unsigned long start,
1973                                  unsigned long nr_pages,
1974                                  struct page **pages,
1975                                  struct vm_area_struct **vmas,
1976                                  unsigned int gup_flags)
1977{
1978        unsigned int flags;
1979        long rc;
1980
1981        if (!(gup_flags & FOLL_LONGTERM))
1982                return __get_user_pages_locked(mm, start, nr_pages, pages, vmas,
1983                                               NULL, gup_flags);
1984        flags = memalloc_pin_save();
1985        do {
1986                rc = __get_user_pages_locked(mm, start, nr_pages, pages, vmas,
1987                                             NULL, gup_flags);
1988                if (rc <= 0)
1989                        break;
1990                rc = check_and_migrate_movable_pages(rc, pages, gup_flags);
1991        } while (!rc);
1992        memalloc_pin_restore(flags);
1993
1994        return rc;
1995}
1996
1997static bool is_valid_gup_flags(unsigned int gup_flags)
1998{
1999        /*
2000         * FOLL_PIN must only be set internally by the pin_user_pages*() APIs,
2001         * never directly by the caller, so enforce that with an assertion:
2002         */
2003        if (WARN_ON_ONCE(gup_flags & FOLL_PIN))
2004                return false;
2005        /*
2006         * FOLL_PIN is a prerequisite to FOLL_LONGTERM. Another way of saying
2007         * that is, FOLL_LONGTERM is a specific, more restrictive case of
2008         * FOLL_PIN.
2009         */
2010        if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
2011                return false;
2012
2013        return true;
2014}
2015
2016#ifdef CONFIG_MMU
2017static long __get_user_pages_remote(struct mm_struct *mm,
2018                                    unsigned long start, unsigned long nr_pages,
2019                                    unsigned int gup_flags, struct page **pages,
2020                                    struct vm_area_struct **vmas, int *locked)
2021{
2022        /*
2023         * Parts of FOLL_LONGTERM behavior are incompatible with
2024         * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
2025         * vmas. However, this only comes up if locked is set, and there are
2026         * callers that do request FOLL_LONGTERM, but do not set locked. So,
2027         * allow what we can.
2028         */
2029        if (gup_flags & FOLL_LONGTERM) {
2030                if (WARN_ON_ONCE(locked))
2031                        return -EINVAL;
2032                /*
2033                 * This will check the vmas (even if our vmas arg is NULL)
2034                 * and return -ENOTSUPP if DAX isn't allowed in this case:
2035                 */
2036                return __gup_longterm_locked(mm, start, nr_pages, pages,
2037                                             vmas, gup_flags | FOLL_TOUCH |
2038                                             FOLL_REMOTE);
2039        }
2040
2041        return __get_user_pages_locked(mm, start, nr_pages, pages, vmas,
2042                                       locked,
2043                                       gup_flags | FOLL_TOUCH | FOLL_REMOTE);
2044}
2045
2046/**
2047 * get_user_pages_remote() - pin user pages in memory
2048 * @mm:         mm_struct of target mm
2049 * @start:      starting user address
2050 * @nr_pages:   number of pages from start to pin
2051 * @gup_flags:  flags modifying lookup behaviour
2052 * @pages:      array that receives pointers to the pages pinned.
2053 *              Should be at least nr_pages long. Or NULL, if caller
2054 *              only intends to ensure the pages are faulted in.
2055 * @vmas:       array of pointers to vmas corresponding to each page.
2056 *              Or NULL if the caller does not require them.
2057 * @locked:     pointer to lock flag indicating whether lock is held and
2058 *              subsequently whether VM_FAULT_RETRY functionality can be
2059 *              utilised. Lock must initially be held.
2060 *
2061 * Returns either number of pages pinned (which may be less than the
2062 * number requested), or an error. Details about the return value:
2063 *
2064 * -- If nr_pages is 0, returns 0.
2065 * -- If nr_pages is >0, but no pages were pinned, returns -errno.
2066 * -- If nr_pages is >0, and some pages were pinned, returns the number of
2067 *    pages pinned. Again, this may be less than nr_pages.
2068 *
2069 * The caller is responsible for releasing returned @pages, via put_page().
2070 *
2071 * @vmas are valid only as long as mmap_lock is held.
2072 *
2073 * Must be called with mmap_lock held for read or write.
2074 *
2075 * get_user_pages_remote walks a process's page tables and takes a reference
2076 * to each struct page that each user address corresponds to at a given
2077 * instant. That is, it takes the page that would be accessed if a user
2078 * thread accesses the given user virtual address at that instant.
2079 *
2080 * This does not guarantee that the page exists in the user mappings when
2081 * get_user_pages_remote returns, and there may even be a completely different
2082 * page there in some cases (e.g. if mmapped pagecache has been invalidated
2083 * and subsequently refaulted). However, it does guarantee that the page
2084 * won't be freed completely. And mostly callers simply care that the page
2085 * contains data that was valid *at some point in time*. Typically, an IO
2086 * or similar operation cannot guarantee anything stronger anyway because
2087 * locks can't be held over the syscall boundary.
2088 *
2089 * If gup_flags & FOLL_WRITE == 0, the page must not be written to. If the page
2090 * is written to, set_page_dirty (or set_page_dirty_lock, as appropriate) must
2091 * be called after the page is finished with, and before put_page is called.
2092 *
2093 * get_user_pages_remote is typically used for fewer-copy IO operations,
2094 * to get a handle on the memory by some means other than accesses
2095 * via the user virtual addresses. The pages may be submitted for
2096 * DMA to devices or accessed via their kernel linear mapping (via the
2097 * kmap APIs). Care should be taken to use the correct cache flushing APIs.
2098 *
2099 * See also get_user_pages_fast, for performance critical applications.
2100 *
2101 * get_user_pages_remote should be phased out in favor of
2102 * get_user_pages_locked|unlocked or get_user_pages_fast. Nothing
2103 * should use get_user_pages_remote because it cannot pass
2104 * FAULT_FLAG_ALLOW_RETRY to handle_mm_fault.
2105 */
2106long get_user_pages_remote(struct mm_struct *mm,
2107                unsigned long start, unsigned long nr_pages,
2108                unsigned int gup_flags, struct page **pages,
2109                struct vm_area_struct **vmas, int *locked)
2110{
2111        if (!is_valid_gup_flags(gup_flags))
2112                return -EINVAL;
2113
2114        return __get_user_pages_remote(mm, start, nr_pages, gup_flags,
2115                                       pages, vmas, locked);
2116}
2117EXPORT_SYMBOL(get_user_pages_remote);
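
/*
 * Editor's note -- illustrative sketch only: pinning one page of another
 * process's address space with get_user_pages_remote(), in the spirit of
 * access_remote_vm()-style users.  Error handling is trimmed and the names
 * are placeholders.
 *
 *	struct page *page;
 *	int locked = 1;
 *	long got;
 *
 *	mmap_read_lock(mm);
 *	got = get_user_pages_remote(mm, addr, 1, FOLL_WRITE, &page, NULL,
 *				    &locked);
 *	if (locked)
 *		mmap_read_unlock(mm);
 *	if (got == 1) {
 *		// ...kmap_local_page(), modify, set_page_dirty_lock()...
 *		put_page(page);
 *	}
 */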
2118
2119#else /* CONFIG_MMU */
2120long get_user_pages_remote(struct mm_struct *mm,
2121                           unsigned long start, unsigned long nr_pages,
2122                           unsigned int gup_flags, struct page **pages,
2123                           struct vm_area_struct **vmas, int *locked)
2124{
2125        return 0;
2126}
2127
2128static long __get_user_pages_remote(struct mm_struct *mm,
2129                                    unsigned long start, unsigned long nr_pages,
2130                                    unsigned int gup_flags, struct page **pages,
2131                                    struct vm_area_struct **vmas, int *locked)
2132{
2133        return 0;
2134}
2135#endif /* !CONFIG_MMU */
2136
2137/**
2138 * get_user_pages() - pin user pages in memory
2139 * @start:      starting user address
2140 * @nr_pages:   number of pages from start to pin
2141 * @gup_flags:  flags modifying lookup behaviour
2142 * @pages:      array that receives pointers to the pages pinned.
2143 *              Should be at least nr_pages long. Or NULL, if caller
2144 *              only intends to ensure the pages are faulted in.
2145 * @vmas:       array of pointers to vmas corresponding to each page.
2146 *              Or NULL if the caller does not require them.
2147 *
2148 * This is the same as get_user_pages_remote(), just with a less-flexible
2149 * calling convention where we assume that the mm being operated on belongs to
2150 * the current task, and doesn't allow passing of a locked parameter.  We also
2151 * obviously don't pass FOLL_REMOTE in here.
2152 */
2153long get_user_pages(unsigned long start, unsigned long nr_pages,
2154                unsigned int gup_flags, struct page **pages,
2155                struct vm_area_struct **vmas)
2156{
2157        if (!is_valid_gup_flags(gup_flags))
2158                return -EINVAL;
2159
2160        return __gup_longterm_locked(current->mm, start, nr_pages,
2161                                     pages, vmas, gup_flags | FOLL_TOUCH);
2162}
2163EXPORT_SYMBOL(get_user_pages);
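
/*
 * Editor's note -- illustrative sketch only: pinning a few pages of the
 * current task's address space with get_user_pages().  As with
 * get_user_pages_remote() above, the caller holds mmap_lock and releases
 * each page with put_page() when done.  The names and page count are
 * placeholders.
 *
 *	struct page *pages[4];
 *	long i, got;
 *
 *	mmap_read_lock(current->mm);
 *	got = get_user_pages(uaddr, 4, FOLL_WRITE, pages, NULL);
 *	mmap_read_unlock(current->mm);
 *
 *	for (i = 0; i < got; i++) {
 *		// ...access the page contents...
 *		put_page(pages[i]);
 *	}
 */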
2164
2165/*
2166 * get_user_pages_unlocked() is suitable to replace the form:
2167 *
2168 *      mmap_read_lock(mm);
2169 *      get_user_pages(mm, ..., pages, NULL);
2170 *      mmap_read_unlock(mm);
2171 *
2172 *  with:
2173 *
2174 *      get_user_pages_unlocked(mm, ..., pages);
2175 *
2176 * It is functionally equivalent to get_user_pages_fast, so
2177 * get_user_pages_fast should be used instead if specific gup_flags
2178 * (e.g. FOLL_FORCE) are not required.
2179 */
2180long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
2181                             struct page **pages, unsigned int gup_flags)
2182{
2183        struct mm_struct *mm = current->mm;
2184        int locked = 1;
2185        long ret;
2186
2187        /*
2188         * FIXME: Current FOLL_LONGTERM behavior is incompatible with
2189         * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
2190         * vmas.  As there are no users of this flag in this call we simply
2191         * disallow this option for now.
2192         */
2193        if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
2194                return -EINVAL;
2195
2196        mmap_read_lock(mm);
2197        ret = __get_user_pages_locked(mm, start, nr_pages, pages, NULL,
2198                                      &locked, gup_flags | FOLL_TOUCH);
2199        if (locked)
2200                mmap_read_unlock(mm);
2201        return ret;
2202}
2203EXPORT_SYMBOL(get_user_pages_unlocked);
2204
2205/*
2206 * Fast GUP
2207 *
2208 * get_user_pages_fast attempts to pin user pages by walking the page
2209 * tables directly and avoids taking locks. Thus the walker needs to be
2210 * protected from page table pages being freed from under it, and should
2211 * block any THP splits.
2212 *
2213 * One way to achieve this is to have the walker disable interrupts, and
2214 * rely on IPIs from the TLB flushing code blocking before the page table
2215 * pages are freed. This is unsuitable for architectures that do not need
2216 * to broadcast an IPI when invalidating TLBs.
2217 *
2218 * Another way to achieve this is to batch up the page-table pages belonging
2219 * to more than one mm_user, then schedule an rcu_sched callback to free those
2220 * pages. Disabling interrupts will allow the fast_gup walker to both block
2221 * the rcu_sched callback and an IPI that we broadcast for splitting THPs
2222 * (which is a relatively rare event). The code below adopts this strategy.
2223 *
2224 * Before activating this code, please be aware that the following assumptions
2225 * are currently made:
2226 *
2227 *  *) Either MMU_GATHER_RCU_TABLE_FREE is enabled and tlb_remove_table() is used to
2228 *  free pages containing page tables, or TLB flushing requires IPI broadcast.
2229 *
2230 *  *) ptes can be read atomically by the architecture.
2231 *
2232 *  *) access_ok is sufficient to validate userspace address ranges.
2233 *
2234 * The last two assumptions can be relaxed by the addition of helper functions.
2235 *
2236 * This code is based heavily on the PowerPC implementation by Nick Piggin.
2237 */
2238#ifdef CONFIG_HAVE_FAST_GUP
2239
2240static void __maybe_unused undo_dev_pagemap(int *nr, int nr_start,
2241                                            unsigned int flags,
2242                                            struct page **pages)
2243{
2244        while ((*nr) - nr_start) {
2245                struct page *page = pages[--(*nr)];
2246
2247                ClearPageReferenced(page);
2248                if (flags & FOLL_PIN)
2249                        unpin_user_page(page);
2250                else
2251                        put_page(page);
2252        }
2253}
2254
2255#ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
2256static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
2257                         unsigned int flags, struct page **pages, int *nr)
2258{
2259        struct dev_pagemap *pgmap = NULL;
2260        int nr_start = *nr, ret = 0;
2261        pte_t *ptep, *ptem;
2262
2263        ptem = ptep = pte_offset_map(&pmd, addr);
2264        do {
2265                pte_t pte = ptep_get_lockless(ptep);
2266                struct page *page;
2267                struct folio *folio;
2268
2269                /*
2270                 * Similar to the PMD case below, NUMA hinting must take slow
2271                 * path using the pte_protnone check.
2272                 */
2273                if (pte_protnone(pte))
2274                        goto pte_unmap;
2275
2276                if (!pte_access_permitted(pte, flags & FOLL_WRITE))
2277                        goto pte_unmap;
2278
2279                if (pte_devmap(pte)) {
2280                        if (unlikely(flags & FOLL_LONGTERM))
2281                                goto pte_unmap;
2282
2283                        pgmap = get_dev_pagemap(pte_pfn(pte), pgmap);
2284                        if (unlikely(!pgmap)) {
2285                                undo_dev_pagemap(nr, nr_start, flags, pages);
2286                                goto pte_unmap;
2287                        }
2288                } else if (pte_special(pte))
2289                        goto pte_unmap;
2290
2291                VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
2292                page = pte_page(pte);
2293
2294                folio = try_grab_folio(page, 1, flags);
2295                if (!folio)
2296                        goto pte_unmap;
2297
2298                if (unlikely(page_is_secretmem(page))) {
2299                        gup_put_folio(folio, 1, flags);
2300                        goto pte_unmap;
2301                }
2302
2303                if (unlikely(pte_val(pte) != pte_val(*ptep))) {
2304                        gup_put_folio(folio, 1, flags);
2305                        goto pte_unmap;
2306                }
2307
2308                if (!pte_write(pte) && gup_must_unshare(flags, page)) {
2309                        gup_put_folio(folio, 1, flags);
2310                        goto pte_unmap;
2311                }
2312
2313                /*
2314                 * We need to make the page accessible if and only if we are
2315                 * going to access its content (the FOLL_PIN case).  Please
2316                 * see Documentation/core-api/pin_user_pages.rst for
2317                 * details.
2318                 */
2319                if (flags & FOLL_PIN) {
2320                        ret = arch_make_page_accessible(page);
2321                        if (ret) {
2322                                gup_put_folio(folio, 1, flags);
2323                                goto pte_unmap;
2324                        }
2325                }
2326                folio_set_referenced(folio);
2327                pages[*nr] = page;
2328                (*nr)++;
2329        } while (ptep++, addr += PAGE_SIZE, addr != end);
2330
2331        ret = 1;
2332
2333pte_unmap:
2334        if (pgmap)
2335                put_dev_pagemap(pgmap);
2336        pte_unmap(ptem);
2337        return ret;
2338}
2339#else
2340
2341/*
2342 * If we can't determine whether or not a pte is special, then fail immediately
2343 * for ptes. Note, we can still pin HugeTLB and THP as these are guaranteed not
2344 * to be special.
2345 *
2346 * For a futex to be placed on a THP tail page, get_futex_key requires a
2347 * get_user_pages_fast_only implementation that can pin pages. Thus it's still
2348 * useful to have gup_huge_pmd even if we can't operate on ptes.
2349 */
2350static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
2351                         unsigned int flags, struct page **pages, int *nr)
2352{
2353        return 0;
2354}
2355#endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */
2356
2357#if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
2358static int __gup_device_huge(unsigned long pfn, unsigned long addr,
2359                             unsigned long end, unsigned int flags,
2360                             struct page **pages, int *nr)
2361{
2362        int nr_start = *nr;
2363        struct dev_pagemap *pgmap = NULL;
2364
2365        do {
2366                struct page *page = pfn_to_page(pfn);
2367
2368                pgmap = get_dev_pagemap(pfn, pgmap);
2369                if (unlikely(!pgmap)) {
2370                        undo_dev_pagemap(nr, nr_start, flags, pages);
2371                        break;
2372                }
2373                SetPageReferenced(page);
2374                pages[*nr] = page;
2375                if (unlikely(!try_grab_page(page, flags))) {
2376                        undo_dev_pagemap(nr, nr_start, flags, pages);
2377                        break;
2378                }
2379                (*nr)++;
2380                pfn++;
2381        } while (addr += PAGE_SIZE, addr != end);
2382
2383        put_dev_pagemap(pgmap);
2384        return addr == end;
2385}
2386
2387static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
2388                                 unsigned long end, unsigned int flags,
2389                                 struct page **pages, int *nr)
2390{
2391        unsigned long fault_pfn;
2392        int nr_start = *nr;
2393
2394        fault_pfn = pmd_pfn(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
2395        if (!__gup_device_huge(fault_pfn, addr, end, flags, pages, nr))
2396                return 0;
2397
2398        if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
2399                undo_dev_pagemap(nr, nr_start, flags, pages);
2400                return 0;
2401        }
2402        return 1;
2403}
2404
2405static int __gup_device_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
2406                                 unsigned long end, unsigned int flags,
2407                                 struct page **pages, int *nr)
2408{
2409        unsigned long fault_pfn;
2410        int nr_start = *nr;
2411
2412        fault_pfn = pud_pfn(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
2413        if (!__gup_device_huge(fault_pfn, addr, end, flags, pages, nr))
2414                return 0;
2415
2416        if (unlikely(pud_val(orig) != pud_val(*pudp))) {
2417                undo_dev_pagemap(nr, nr_start, flags, pages);
2418                return 0;
2419        }
2420        return 1;
2421}
2422#else
2423static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
2424                                 unsigned long end, unsigned int flags,
2425                                 struct page **pages, int *nr)
2426{
2427        BUILD_BUG();
2428        return 0;
2429}
2430
2431static int __gup_device_huge_pud(pud_t pud, pud_t *pudp, unsigned long addr,
2432                                 unsigned long end, unsigned int flags,
2433                                 struct page **pages, int *nr)
2434{
2435        BUILD_BUG();
2436        return 0;
2437}
2438#endif
2439
2440static int record_subpages(struct page *page, unsigned long addr,
2441                           unsigned long end, struct page **pages)
2442{
2443        int nr;
2444
2445        for (nr = 0; addr != end; nr++, addr += PAGE_SIZE)
2446                pages[nr] = nth_page(page, nr);
2447
2448        return nr;
2449}
2450
2451#ifdef CONFIG_ARCH_HAS_HUGEPD
2452static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
2453                                      unsigned long sz)
2454{
2455        unsigned long __boundary = (addr + sz) & ~(sz-1);
2456        return (__boundary - 1 < end - 1) ? __boundary : end;
2457}
2458
2459static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
2460                       unsigned long end, unsigned int flags,
2461                       struct page **pages, int *nr)
2462{
2463        unsigned long pte_end;
2464        struct page *page;
2465        struct folio *folio;
2466        pte_t pte;
2467        int refs;
2468
2469        pte_end = (addr + sz) & ~(sz-1);
2470        if (pte_end < end)
2471                end = pte_end;
2472
2473        pte = huge_ptep_get(ptep);
2474
2475        if (!pte_access_permitted(pte, flags & FOLL_WRITE))
2476                return 0;
2477
2478        /* hugepages are never "special" */
2479        VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
2480
2481        page = nth_page(pte_page(pte), (addr & (sz - 1)) >> PAGE_SHIFT);
2482        refs = record_subpages(page, addr, end, pages + *nr);
2483
2484        folio = try_grab_folio(page, refs, flags);
2485        if (!folio)
2486                return 0;
2487
2488        if (unlikely(pte_val(pte) != pte_val(*ptep))) {
2489                gup_put_folio(folio, refs, flags);
2490                return 0;
2491        }
2492
2493        if (!pte_write(pte) && gup_must_unshare(flags, &folio->page)) {
2494                gup_put_folio(folio, refs, flags);
2495                return 0;
2496        }
2497
2498        *nr += refs;
2499        folio_set_referenced(folio);
2500        return 1;
2501}
2502
2503static int gup_huge_pd(hugepd_t hugepd, unsigned long addr,
2504                unsigned int pdshift, unsigned long end, unsigned int flags,
2505                struct page **pages, int *nr)
2506{
2507        pte_t *ptep;
2508        unsigned long sz = 1UL << hugepd_shift(hugepd);
2509        unsigned long next;
2510
2511        ptep = hugepte_offset(hugepd, addr, pdshift);
2512        do {
2513                next = hugepte_addr_end(addr, end, sz);
2514                if (!gup_hugepte(ptep, sz, addr, end, flags, pages, nr))
2515                        return 0;
2516        } while (ptep++, addr = next, addr != end);
2517
2518        return 1;
2519}
2520#else
2521static inline int gup_huge_pd(hugepd_t hugepd, unsigned long addr,
2522                unsigned int pdshift, unsigned long end, unsigned int flags,
2523                struct page **pages, int *nr)
2524{
2525        return 0;
2526}
2527#endif /* CONFIG_ARCH_HAS_HUGEPD */
2528
2529static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
2530                        unsigned long end, unsigned int flags,
2531                        struct page **pages, int *nr)
2532{
2533        struct page *page;
2534        struct folio *folio;
2535        int refs;
2536
2537        if (!pmd_access_permitted(orig, flags & FOLL_WRITE))
2538                return 0;
2539
2540        if (pmd_devmap(orig)) {
2541                if (unlikely(flags & FOLL_LONGTERM))
2542                        return 0;
2543                return __gup_device_huge_pmd(orig, pmdp, addr, end, flags,
2544                                             pages, nr);
2545        }
2546
2547        page = nth_page(pmd_page(orig), (addr & ~PMD_MASK) >> PAGE_SHIFT);
2548        refs = record_subpages(page, addr, end, pages + *nr);
2549
2550        folio = try_grab_folio(page, refs, flags);
2551        if (!folio)
2552                return 0;
2553
2554        if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
2555                gup_put_folio(folio, refs, flags);
2556                return 0;
2557        }
2558
2559        if (!pmd_write(orig) && gup_must_unshare(flags, &folio->page)) {
2560                gup_put_folio(folio, refs, flags);
2561                return 0;
2562        }
2563
2564        *nr += refs;
2565        folio_set_referenced(folio);
2566        return 1;
2567}
2568
2569static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
2570                        unsigned long end, unsigned int flags,
2571                        struct page **pages, int *nr)
2572{
2573        struct page *page;
2574        struct folio *folio;
2575        int refs;
2576
2577        if (!pud_access_permitted(orig, flags & FOLL_WRITE))
2578                return 0;
2579
2580        if (pud_devmap(orig)) {
2581                if (unlikely(flags & FOLL_LONGTERM))
2582                        return 0;
2583                return __gup_device_huge_pud(orig, pudp, addr, end, flags,
2584                                             pages, nr);
2585        }
2586
2587        page = nth_page(pud_page(orig), (addr & ~PUD_MASK) >> PAGE_SHIFT);
2588        refs = record_subpages(page, addr, end, pages + *nr);
2589
2590        folio = try_grab_folio(page, refs, flags);
2591        if (!folio)
2592                return 0;
2593
2594        if (unlikely(pud_val(orig) != pud_val(*pudp))) {
2595                gup_put_folio(folio, refs, flags);
2596                return 0;
2597        }
2598
2599        if (!pud_write(orig) && gup_must_unshare(flags, &folio->page)) {
2600                gup_put_folio(folio, refs, flags);
2601                return 0;
2602        }
2603
2604        *nr += refs;
2605        folio_set_referenced(folio);
2606        return 1;
2607}
2608
2609static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
2610                        unsigned long end, unsigned int flags,
2611                        struct page **pages, int *nr)
2612{
2613        int refs;
2614        struct page *page;
2615        struct folio *folio;
2616
2617        if (!pgd_access_permitted(orig, flags & FOLL_WRITE))
2618                return 0;
2619
2620        BUILD_BUG_ON(pgd_devmap(orig));
2621
2622        page = nth_page(pgd_page(orig), (addr & ~PGDIR_MASK) >> PAGE_SHIFT);
2623        refs = record_subpages(page, addr, end, pages + *nr);
2624
2625        folio = try_grab_folio(page, refs, flags);
2626        if (!folio)
2627                return 0;
2628
2629        if (unlikely(pgd_val(orig) != pgd_val(*pgdp))) {
2630                gup_put_folio(folio, refs, flags);
2631                return 0;
2632        }
2633
2634        *nr += refs;
2635        folio_set_referenced(folio);
2636        return 1;
2637}
2638
2639static int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, unsigned long end,
2640                unsigned int flags, struct page **pages, int *nr)
2641{
2642        unsigned long next;
2643        pmd_t *pmdp;
2644
2645        pmdp = pmd_offset_lockless(pudp, pud, addr);
2646        do {
2647                pmd_t pmd = READ_ONCE(*pmdp);
2648
2649                next = pmd_addr_end(addr, end);
2650                if (!pmd_present(pmd))
2651                        return 0;
2652
2653                if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd) ||
2654                             pmd_devmap(pmd))) {
2655                        /*
2656                         * NUMA hinting faults need to be handled in the GUP
2657                         * slowpath for accounting purposes and so that they
2658                         * can be serialised against THP migration.
2659                         */
2660                        if (pmd_protnone(pmd))
2661                                return 0;
2662
2663                        if (!gup_huge_pmd(pmd, pmdp, addr, next, flags,
2664                                pages, nr))
2665                                return 0;
2666
2667                } else if (unlikely(is_hugepd(__hugepd(pmd_val(pmd))))) {
2668                        /*
2669                         * architectures may use different formats for the
2670                         * hugetlbfs pmd and the THP pmd
2671                         */
2672                        if (!gup_huge_pd(__hugepd(pmd_val(pmd)), addr,
2673                                         PMD_SHIFT, next, flags, pages, nr))
2674                                return 0;
2675                } else if (!gup_pte_range(pmd, addr, next, flags, pages, nr))
2676                        return 0;
2677        } while (pmdp++, addr = next, addr != end);
2678
2679        return 1;
2680}
2681
2682static int gup_pud_range(p4d_t *p4dp, p4d_t p4d, unsigned long addr, unsigned long end,
2683                         unsigned int flags, struct page **pages, int *nr)
2684{
2685        unsigned long next;
2686        pud_t *pudp;
2687
2688        pudp = pud_offset_lockless(p4dp, p4d, addr);
2689        do {
2690                pud_t pud = READ_ONCE(*pudp);
2691
2692                next = pud_addr_end(addr, end);
2693                if (unlikely(!pud_present(pud)))
2694                        return 0;
2695                if (unlikely(pud_huge(pud))) {
2696                        if (!gup_huge_pud(pud, pudp, addr, next, flags,
2697                                          pages, nr))
2698                                return 0;
2699                } else if (unlikely(is_hugepd(__hugepd(pud_val(pud))))) {
2700                        if (!gup_huge_pd(__hugepd(pud_val(pud)), addr,
2701                                         PUD_SHIFT, next, flags, pages, nr))
2702                                return 0;
2703                } else if (!gup_pmd_range(pudp, pud, addr, next, flags, pages, nr))
2704                        return 0;
2705        } while (pudp++, addr = next, addr != end);
2706
2707        return 1;
2708}
2709
2710static int gup_p4d_range(pgd_t *pgdp, pgd_t pgd, unsigned long addr, unsigned long end,
2711                         unsigned int flags, struct page **pages, int *nr)
2712{
2713        unsigned long next;
2714        p4d_t *p4dp;
2715
2716        p4dp = p4d_offset_lockless(pgdp, pgd, addr);
2717        do {
2718                p4d_t p4d = READ_ONCE(*p4dp);
2719
2720                next = p4d_addr_end(addr, end);
2721                if (p4d_none(p4d))
2722                        return 0;
2723                BUILD_BUG_ON(p4d_huge(p4d));
2724                if (unlikely(is_hugepd(__hugepd(p4d_val(p4d))))) {
2725                        if (!gup_huge_pd(__hugepd(p4d_val(p4d)), addr,
2726                                         P4D_SHIFT, next, flags, pages, nr))
2727                                return 0;
2728                } else if (!gup_pud_range(p4dp, p4d, addr, next, flags, pages, nr))
2729                        return 0;
2730        } while (p4dp++, addr = next, addr != end);
2731
2732        return 1;
2733}
2734
2735static void gup_pgd_range(unsigned long addr, unsigned long end,
2736                unsigned int flags, struct page **pages, int *nr)
2737{
2738        unsigned long next;
2739        pgd_t *pgdp;
2740
2741        pgdp = pgd_offset(current->mm, addr);
2742        do {
2743                pgd_t pgd = READ_ONCE(*pgdp);
2744
2745                next = pgd_addr_end(addr, end);
2746                if (pgd_none(pgd))
2747                        return;
2748                if (unlikely(pgd_huge(pgd))) {
2749                        if (!gup_huge_pgd(pgd, pgdp, addr, next, flags,
2750                                          pages, nr))
2751                                return;
2752                } else if (unlikely(is_hugepd(__hugepd(pgd_val(pgd))))) {
2753                        if (!gup_huge_pd(__hugepd(pgd_val(pgd)), addr,
2754                                         PGDIR_SHIFT, next, flags, pages, nr))
2755                                return;
2756                } else if (!gup_p4d_range(pgdp, pgd, addr, next, flags, pages, nr))
2757                        return;
2758        } while (pgdp++, addr = next, addr != end);
2759}
2760#else
2761static inline void gup_pgd_range(unsigned long addr, unsigned long end,
2762                unsigned int flags, struct page **pages, int *nr)
2763{
2764}
2765#endif /* CONFIG_HAVE_FAST_GUP */
2766
2767#ifndef gup_fast_permitted
2768/*
2769 * Check whether it's allowed to use get_user_pages_fast_only() for the range,
2770 * or whether we need to fall back to the slow version:
2771 */
2772static bool gup_fast_permitted(unsigned long start, unsigned long end)
2773{
2774        return true;
2775}
2776#endif
2777
2778static int __gup_longterm_unlocked(unsigned long start, int nr_pages,
2779                                   unsigned int gup_flags, struct page **pages)
2780{
2781        int ret;
2782
2783        /*
2784         * FIXME: FOLL_LONGTERM does not work with
2785         * get_user_pages_unlocked() (see comments in that function)
2786         */
2787        if (gup_flags & FOLL_LONGTERM) {
2788                mmap_read_lock(current->mm);
2789                ret = __gup_longterm_locked(current->mm,
2790                                            start, nr_pages,
2791                                            pages, NULL, gup_flags);
2792                mmap_read_unlock(current->mm);
2793        } else {
2794                ret = get_user_pages_unlocked(start, nr_pages,
2795                                              pages, gup_flags);
2796        }
2797
2798        return ret;
2799}
2800
2801static unsigned long lockless_pages_from_mm(unsigned long start,
2802                                            unsigned long end,
2803                                            unsigned int gup_flags,
2804                                            struct page **pages)
2805{
2806        unsigned long flags;
2807        int nr_pinned = 0;
2808        unsigned seq;
2809
2810        if (!IS_ENABLED(CONFIG_HAVE_FAST_GUP) ||
2811            !gup_fast_permitted(start, end))
2812                return 0;
2813
2814        if (gup_flags & FOLL_PIN) {
2815                seq = raw_read_seqcount(&current->mm->write_protect_seq);
2816                if (seq & 1)
2817                        return 0;
2818        }
2819
2820        /*
2821         * Disable interrupts. The nested form is used in order to allow full,
2822         * general-purpose use of this routine.
2823         *
2824         * With interrupts disabled, we block page table pages from being freed
2825         * from under us. See struct mmu_table_batch comments in
2826         * include/asm-generic/tlb.h for more details.
2827         *
2828         * We do not adopt an rcu_read_lock() here as we also want to block IPIs
2829         * that come from THPs splitting.
2830         */
2831        local_irq_save(flags);
2832        gup_pgd_range(start, end, gup_flags, pages, &nr_pinned);
2833        local_irq_restore(flags);
2834
2835        /*
2836         * When pinning pages for DMA there could be a concurrent write protect
2837         * from fork() via copy_page_range(); in this case, always fail fast GUP.
2838         */
2839        if (gup_flags & FOLL_PIN) {
2840                if (read_seqcount_retry(&current->mm->write_protect_seq, seq)) {
2841                        unpin_user_pages_lockless(pages, nr_pinned);
2842                        return 0;
2843                } else {
2844                        sanity_check_pinned_pages(pages, nr_pinned);
2845                }
2846        }
2847        return nr_pinned;
2848}
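
/*
 * Illustrative sketch, not part of this file: the write side of the
 * write_protect_seq protocol that the FOLL_PIN check above guards against.
 * This is a simplified, hypothetical rendering of what the fork() path does
 * around write-protecting PTEs in copy_page_range(); all details are elided.
 */
static void __maybe_unused example_write_protect_side(struct mm_struct *src_mm)
{
        mmap_assert_write_locked(src_mm);
        raw_write_seqcount_begin(&src_mm->write_protect_seq);

        /* ... write-protect the parent's COW-able PTEs ... */

        raw_write_seqcount_end(&src_mm->write_protect_seq);
}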
2849
2850static int internal_get_user_pages_fast(unsigned long start,
2851                                        unsigned long nr_pages,
2852                                        unsigned int gup_flags,
2853                                        struct page **pages)
2854{
2855        unsigned long len, end;
2856        unsigned long nr_pinned;
2857        int ret;
2858
2859        if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM |
2860                                       FOLL_FORCE | FOLL_PIN | FOLL_GET |
2861                                       FOLL_FAST_ONLY | FOLL_NOFAULT)))
2862                return -EINVAL;
2863
2864        if (gup_flags & FOLL_PIN)
2865                mm_set_has_pinned_flag(&current->mm->flags);
2866
2867        if (!(gup_flags & FOLL_FAST_ONLY))
2868                might_lock_read(&current->mm->mmap_lock);
2869
2870        start = untagged_addr(start) & PAGE_MASK;
2871        len = nr_pages << PAGE_SHIFT;
2872        if (check_add_overflow(start, len, &end))
2873                return 0;
2874        if (unlikely(!access_ok((void __user *)start, len)))
2875                return -EFAULT;
2876
2877        nr_pinned = lockless_pages_from_mm(start, end, gup_flags, pages);
2878        if (nr_pinned == nr_pages || gup_flags & FOLL_FAST_ONLY)
2879                return nr_pinned;
2880
2881        /* Slow path: try to get the remaining pages with get_user_pages */
2882        start += nr_pinned << PAGE_SHIFT;
2883        pages += nr_pinned;
2884        ret = __gup_longterm_unlocked(start, nr_pages - nr_pinned, gup_flags,
2885                                      pages);
2886        if (ret < 0) {
2887                /*
2888                 * The caller has to unpin the pages we already pinned so
2889                  * The caller has to unpin the pages we already pinned, so
2890                  * returning -errno is not an option.
2891                if (nr_pinned)
2892                        return nr_pinned;
2893                return ret;
2894        }
2895        return ret + nr_pinned;
2896}
2897
2898/**
2899 * get_user_pages_fast_only() - pin user pages in memory
2900 * @start:      starting user address
2901 * @nr_pages:   number of pages from start to pin
2902 * @gup_flags:  flags modifying pin behaviour
2903 * @pages:      array that receives pointers to the pages pinned.
2904 *              Should be at least nr_pages long.
2905 *
2906 * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall back to
2907 * the regular GUP.
2908 * Note a difference from get_user_pages_fast(): this always returns the
2909 * number of pages pinned, or 0 if no pages were pinned.
2910 *
2911 * If the architecture does not support this function, simply return with no
2912 * pages pinned.
2913 *
2914 * Careful, careful! COW breaking can go either way, so a non-write
2915 * access can get ambiguous page results. If you call this function without
2916 * 'write' set, you'd better be sure that you're ok with that ambiguity.
2917 */
2918int get_user_pages_fast_only(unsigned long start, int nr_pages,
2919                             unsigned int gup_flags, struct page **pages)
2920{
2921        int nr_pinned;
2922        /*
2923         * Internally (within mm/gup.c), gup fast variants must set FOLL_GET,
2924         * because gup fast is always a "pin with a +1 page refcount" request.
2925         *
2926         * FOLL_FAST_ONLY is required in order to match the API description of
2927 * this routine: no fallback to regular ("slow") GUP.
2928         */
2929        gup_flags |= FOLL_GET | FOLL_FAST_ONLY;
2930
2931        nr_pinned = internal_get_user_pages_fast(start, nr_pages, gup_flags,
2932                                                 pages);
2933
2934        /*
2935         * As specified in the API description above, this routine is not
2936         * allowed to return negative values. However, the common core
2937         * routine internal_get_user_pages_fast() *can* return -errno.
2938         * Therefore, correct for that here:
2939         */
2940        if (nr_pinned < 0)
2941                nr_pinned = 0;
2942
2943        return nr_pinned;
2944}
2945EXPORT_SYMBOL_GPL(get_user_pages_fast_only);
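
/*
 * Illustrative usage sketch, not part of this file: look at a few user pages
 * without ever taking mmap_lock and without sleeping. The references taken
 * here are plain (FOLL_GET-style) references, so they are dropped with
 * put_page(). The helper name is hypothetical.
 */
static int __maybe_unused example_peek_user_range(unsigned long uaddr, int nr,
                                                  struct page **pages)
{
        int i, pinned;

        /* Never returns a negative value: 0 means "nothing was pinned". */
        pinned = get_user_pages_fast_only(uaddr, nr, FOLL_WRITE, pages);

        /* ... use pages[0..pinned) ... */

        for (i = 0; i < pinned; i++)
                put_page(pages[i]);
        return pinned;
}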
2946
2947/**
2948 * get_user_pages_fast() - pin user pages in memory
2949 * @start:      starting user address
2950 * @nr_pages:   number of pages from start to pin
2951 * @gup_flags:  flags modifying pin behaviour
2952 * @pages:      array that receives pointers to the pages pinned.
2953 *              Should be at least nr_pages long.
2954 *
2955 * Attempt to pin user pages in memory without taking mm->mmap_lock.
2956 * If not successful, it will fall back to taking the lock and
2957 * calling get_user_pages().
2958 *
2959 * Returns number of pages pinned. This may be fewer than the number requested.
2960 * If nr_pages is 0 or negative, returns 0. If no pages were pinned, returns
2961 * -errno.
2962 */
2963int get_user_pages_fast(unsigned long start, int nr_pages,
2964                        unsigned int gup_flags, struct page **pages)
2965{
2966        if (!is_valid_gup_flags(gup_flags))
2967                return -EINVAL;
2968
2969        /*
2970         * The caller may or may not have explicitly set FOLL_GET; either way is
2971         * OK. However, internally (within mm/gup.c), gup fast variants must set
2972         * FOLL_GET, because gup fast is always a "pin with a +1 page refcount"
2973         * request.
2974         */
2975        gup_flags |= FOLL_GET;
2976        return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages);
2977}
2978EXPORT_SYMBOL_GPL(get_user_pages_fast);
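
/*
 * Illustrative usage sketch, not part of this file: pin a user buffer for a
 * short-lived kernel access, falling back to the slow path transparently.
 * get_user_pages_fast() hands back ordinary page references, released with
 * put_page(). The helper name is hypothetical.
 */
static int __maybe_unused example_access_user_buffer(unsigned long uaddr,
                                                     int nr,
                                                     struct page **pages)
{
        int i, ret;

        ret = get_user_pages_fast(uaddr, nr, FOLL_WRITE, pages);
        if (ret < 0)
                return ret;

        /* ret may be fewer than nr; only pages[0..ret) are valid. */

        for (i = 0; i < ret; i++)
                put_page(pages[i]);
        return ret;
}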
2979
2980/**
2981 * pin_user_pages_fast() - pin user pages in memory without taking locks
2982 *
2983 * @start:      starting user address
2984 * @nr_pages:   number of pages from start to pin
2985 * @gup_flags:  flags modifying pin behaviour
2986 * @pages:      array that receives pointers to the pages pinned.
2987 *              Should be at least nr_pages long.
2988 *
2989 * Nearly the same as get_user_pages_fast(), except that FOLL_PIN is set. See
2990 * get_user_pages_fast() for documentation on the function arguments, because
2991 * the arguments here are identical.
2992 *
2993 * FOLL_PIN means that the pages must be released via unpin_user_page(). Please
2994 * see Documentation/core-api/pin_user_pages.rst for further details.
2995 */
2996int pin_user_pages_fast(unsigned long start, int nr_pages,
2997                        unsigned int gup_flags, struct page **pages)
2998{
2999        /* FOLL_GET and FOLL_PIN are mutually exclusive. */
3000        if (WARN_ON_ONCE(gup_flags & FOLL_GET))
3001                return -EINVAL;
3002
3003        if (WARN_ON_ONCE(!pages))
3004                return -EINVAL;
3005
3006        gup_flags |= FOLL_PIN;
3007        return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages);
3008}
3009EXPORT_SYMBOL_GPL(pin_user_pages_fast);
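
/*
 * Illustrative usage sketch, not part of this file: pin pages that a device
 * will DMA into. FOLL_PIN references must be dropped with unpin_user_page()
 * or unpin_user_pages(), never put_page(). The helper name is hypothetical.
 */
static int __maybe_unused example_pin_dma_buffer(unsigned long uaddr, int nr,
                                                 struct page **pages)
{
        int pinned;

        pinned = pin_user_pages_fast(uaddr, nr, FOLL_WRITE | FOLL_LONGTERM,
                                     pages);
        if (pinned < 0)
                return pinned;

        /* ... map and run the DMA against pages[0..pinned) ... */

        unpin_user_pages(pages, pinned);
        return pinned;
}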
3010
3011/*
3012 * This is the FOLL_PIN equivalent of get_user_pages_fast_only(). Behavior
3013 * is the same, except that this one sets FOLL_PIN instead of FOLL_GET.
3014 *
3015 * The API rules are the same, too: no negative values may be returned.
3016 */
3017int pin_user_pages_fast_only(unsigned long start, int nr_pages,
3018                             unsigned int gup_flags, struct page **pages)
3019{
3020        int nr_pinned;
3021
3022        /*
3023         * FOLL_GET and FOLL_PIN are mutually exclusive. Note that the API
3024         * rules require returning 0, rather than -errno:
3025         */
3026        if (WARN_ON_ONCE(gup_flags & FOLL_GET))
3027                return 0;
3028
3029        if (WARN_ON_ONCE(!pages))
3030                return 0;
3031        /*
3032         * FOLL_FAST_ONLY is required in order to match the API description of
3033 * this routine: no fallback to regular ("slow") GUP.
3034         */
3035        gup_flags |= (FOLL_PIN | FOLL_FAST_ONLY);
3036        nr_pinned = internal_get_user_pages_fast(start, nr_pages, gup_flags,
3037                                                 pages);
3038        /*
3039         * This routine is not allowed to return negative values. However,
3040         * internal_get_user_pages_fast() *can* return -errno. Therefore,
3041         * correct for that here:
3042         */
3043        if (nr_pinned < 0)
3044                nr_pinned = 0;
3045
3046        return nr_pinned;
3047}
3048EXPORT_SYMBOL_GPL(pin_user_pages_fast_only);
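
/*
 * Illustrative usage sketch, not part of this file: a caller that cannot
 * sleep tries to pin a single page and simply backs off if the fast path
 * cannot satisfy the request. The helper name and the -EAGAIN policy are
 * hypothetical.
 */
static int __maybe_unused example_try_pin_nowait(unsigned long uaddr,
                                                 struct page **page)
{
        if (!pin_user_pages_fast_only(uaddr, 1, FOLL_WRITE, page))
                return -EAGAIN; /* retry later from a sleepable context */

        /* ... short, non-sleeping access to *page ... */

        unpin_user_page(*page);
        return 0;
}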
3049
3050/**
3051 * pin_user_pages_remote() - pin pages of a remote process
3052 *
3053 * @mm:         mm_struct of target mm
3054 * @start:      starting user address
3055 * @nr_pages:   number of pages from start to pin
3056 * @gup_flags:  flags modifying lookup behaviour
3057 * @pages:      array that receives pointers to the pages pinned.
3058 *              Should be at least nr_pages long.
3059 * @vmas:       array of pointers to vmas corresponding to each page.
3060 *              Or NULL if the caller does not require them.
3061 * @locked:     pointer to lock flag indicating whether lock is held and
3062 *              subsequently whether VM_FAULT_RETRY functionality can be
3063 *              utilised. Lock must initially be held.
3064 *
3065 * Nearly the same as get_user_pages_remote(), except that FOLL_PIN is set. See
3066 * get_user_pages_remote() for documentation on the function arguments, because
3067 * the arguments here are identical.
3068 *
3069 * FOLL_PIN means that the pages must be released via unpin_user_page(). Please
3070 * see Documentation/core-api/pin_user_pages.rst for details.
3071 */
3072long pin_user_pages_remote(struct mm_struct *mm,
3073                           unsigned long start, unsigned long nr_pages,
3074                           unsigned int gup_flags, struct page **pages,
3075                           struct vm_area_struct **vmas, int *locked)
3076{
3077        /* FOLL_GET and FOLL_PIN are mutually exclusive. */
3078        if (WARN_ON_ONCE(gup_flags & FOLL_GET))
3079                return -EINVAL;
3080
3081        if (WARN_ON_ONCE(!pages))
3082                return -EINVAL;
3083
3084        gup_flags |= FOLL_PIN;
3085        return __get_user_pages_remote(mm, start, nr_pages, gup_flags,
3086                                       pages, vmas, locked);
3087}
3088EXPORT_SYMBOL(pin_user_pages_remote);
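
/*
 * Illustrative usage sketch, not part of this file: pin one page of another
 * process's address space. mmap_lock must be held on entry; on return,
 * @locked says whether the callee dropped it while handling a fault. The
 * helper name is hypothetical.
 */
static long __maybe_unused example_pin_remote_page(struct mm_struct *mm,
                                                   unsigned long uaddr,
                                                   struct page **page)
{
        int locked = 1;
        long pinned;

        mmap_read_lock(mm);
        pinned = pin_user_pages_remote(mm, uaddr, 1, FOLL_WRITE, page,
                                       NULL, &locked);
        if (locked)
                mmap_read_unlock(mm);
        if (pinned <= 0)
                return pinned ? pinned : -EFAULT;

        /* ... access the page, e.g. via kmap_local_page() ... */

        unpin_user_page(*page);
        return 0;
}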
3089
3090/**
3091 * pin_user_pages() - pin user pages in memory for use by other devices
3092 *
3093 * @start:      starting user address
3094 * @nr_pages:   number of pages from start to pin
3095 * @gup_flags:  flags modifying lookup behaviour
3096 * @pages:      array that receives pointers to the pages pinned.
3097 *              Should be at least nr_pages long.
3098 * @vmas:       array of pointers to vmas corresponding to each page.
3099 *              Or NULL if the caller does not require them.
3100 *
3101 * Nearly the same as get_user_pages(), except that FOLL_TOUCH is not set, and
3102 * FOLL_PIN is set.
3103 *
3104 * FOLL_PIN means that the pages must be released via unpin_user_page(). Please
3105 * see Documentation/core-api/pin_user_pages.rst for details.
3106 */
3107long pin_user_pages(unsigned long start, unsigned long nr_pages,
3108                    unsigned int gup_flags, struct page **pages,
3109                    struct vm_area_struct **vmas)
3110{
3111        /* FOLL_GET and FOLL_PIN are mutually exclusive. */
3112        if (WARN_ON_ONCE(gup_flags & FOLL_GET))
3113                return -EINVAL;
3114
3115        if (WARN_ON_ONCE(!pages))
3116                return -EINVAL;
3117
3118        gup_flags |= FOLL_PIN;
3119        return __gup_longterm_locked(current->mm, start, nr_pages,
3120                                     pages, vmas, gup_flags);
3121}
3122EXPORT_SYMBOL(pin_user_pages);
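
/*
 * Illustrative usage sketch, not part of this file: pin a buffer in the
 * current process for a device that will write to it, then release the pins
 * and mark the pages dirty in one go. The helper name is hypothetical.
 */
static long __maybe_unused example_pin_writable_buffer(unsigned long uaddr,
                                                       unsigned long nr,
                                                       struct page **pages)
{
        long pinned;

        pinned = pin_user_pages(uaddr, nr, FOLL_WRITE | FOLL_LONGTERM,
                                pages, NULL);
        if (pinned < 0)
                return pinned;

        /* ... the device fills pages[0..pinned) ... */

        unpin_user_pages_dirty_lock(pages, pinned, true);
        return pinned;
}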
3123
3124/*
3125 * pin_user_pages_unlocked() is the FOLL_PIN variant of
3126 * get_user_pages_unlocked(). Behavior is the same, except that this one sets
3127 * FOLL_PIN and rejects FOLL_GET.
3128 */
3129long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
3130                             struct page **pages, unsigned int gup_flags)
3131{
3132        /* FOLL_GET and FOLL_PIN are mutually exclusive. */
3133        if (WARN_ON_ONCE(gup_flags & FOLL_GET))
3134                return -EINVAL;
3135
3136        if (WARN_ON_ONCE(!pages))
3137                return -EINVAL;
3138
3139        gup_flags |= FOLL_PIN;
3140        return get_user_pages_unlocked(start, nr_pages, pages, gup_flags);
3141}
3142EXPORT_SYMBOL(pin_user_pages_unlocked);
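
/*
 * Illustrative usage sketch, not part of this file: the unlocked variant
 * takes and drops mmap_lock internally, so the caller must not already hold
 * it. The helper name is hypothetical.
 */
static long __maybe_unused example_pin_unlocked(unsigned long uaddr,
                                                unsigned long nr,
                                                struct page **pages)
{
        long pinned = pin_user_pages_unlocked(uaddr, nr, pages, FOLL_WRITE);

        if (pinned > 0)
                unpin_user_pages(pages, pinned);
        return pinned;
}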
3143