linux/mm/gup.c
   1// SPDX-License-Identifier: GPL-2.0-only
   2#include <linux/kernel.h>
   3#include <linux/errno.h>
   4#include <linux/err.h>
   5#include <linux/spinlock.h>
   6
   7#include <linux/mm.h>
   8#include <linux/memremap.h>
   9#include <linux/pagemap.h>
  10#include <linux/rmap.h>
  11#include <linux/swap.h>
  12#include <linux/swapops.h>
  13
  14#include <linux/sched/signal.h>
  15#include <linux/rwsem.h>
  16#include <linux/hugetlb.h>
  17#include <linux/migrate.h>
  18#include <linux/mm_inline.h>
  19#include <linux/sched/mm.h>
  20
  21#include <asm/mmu_context.h>
  22#include <asm/pgtable.h>
  23#include <asm/tlbflush.h>
  24
  25#include "internal.h"
  26
  27struct follow_page_context {
  28        struct dev_pagemap *pgmap;
  29        unsigned int page_mask;
  30};
  31
  32typedef int (*set_dirty_func_t)(struct page *page);
  33
  34static void __put_user_pages_dirty(struct page **pages,
  35                                   unsigned long npages,
  36                                   set_dirty_func_t sdf)
  37{
  38        unsigned long index;
  39
  40        for (index = 0; index < npages; index++) {
  41                struct page *page = compound_head(pages[index]);
  42
  43                /*
  44                 * Checking PageDirty at this point may race with
  45                 * clear_page_dirty_for_io(), but that's OK. Two key cases:
  46                 *
  47                 * 1) This code sees the page as already dirty, so it skips
  48                 * the call to sdf(). That could happen because
  49                 * clear_page_dirty_for_io() called page_mkclean(),
  50                 * followed by set_page_dirty(). However, now the page is
  51                 * going to get written back, which meets the original
  52                 * intention of setting it dirty, so all is well:
  53                 * clear_page_dirty_for_io() goes on to call
  54                 * TestClearPageDirty(), and write the page back.
  55                 *
  56                 * 2) This code sees the page as clean, so it calls sdf().
  57                 * The page stays dirty, despite being written back, so it
  58                 * gets written back again in the next writeback cycle.
  59                 * This is harmless.
  60                 */
  61                if (!PageDirty(page))
  62                        sdf(page);
  63
  64                put_user_page(page);
  65        }
  66}
  67
  68/**
  69 * put_user_pages_dirty() - release and dirty an array of gup-pinned pages
  70 * @pages:  array of pages to be marked dirty and released.
  71 * @npages: number of pages in the @pages array.
  72 *
  73 * "gup-pinned page" refers to a page that has had one of the get_user_pages()
  74 * variants called on that page.
  75 *
  76 * For each page in the @pages array, make that page (or its head page, if a
  77 * compound page) dirty, if it was previously listed as clean. Then, release
  78 * the page using put_user_page().
  79 *
  80 * Please see the put_user_page() documentation for details.
  81 *
  82 * set_page_dirty(), which does not lock the page, is used here.
  83 * Therefore, it is the caller's responsibility to ensure that this is
  84 * safe. If not, then put_user_pages_dirty_lock() should be called instead.
  85 *
  86 */
  87void put_user_pages_dirty(struct page **pages, unsigned long npages)
  88{
  89        __put_user_pages_dirty(pages, npages, set_page_dirty);
  90}
  91EXPORT_SYMBOL(put_user_pages_dirty);
  92
  93/**
  94 * put_user_pages_dirty_lock() - release and dirty an array of gup-pinned pages
  95 * @pages:  array of pages to be marked dirty and released.
  96 * @npages: number of pages in the @pages array.
  97 *
  98 * For each page in the @pages array, make that page (or its head page, if a
  99 * compound page) dirty, if it was previously listed as clean. Then, release
 100 * the page using put_user_page().
 101 *
 102 * Please see the put_user_page() documentation for details.
 103 *
 104 * This is just like put_user_pages_dirty(), except that it invokes
 105 * set_page_dirty_lock(), instead of set_page_dirty().
 106 *
 107 */
 108void put_user_pages_dirty_lock(struct page **pages, unsigned long npages)
 109{
 110        __put_user_pages_dirty(pages, npages, set_page_dirty_lock);
 111}
 112EXPORT_SYMBOL(put_user_pages_dirty_lock);
 113
 114/**
 115 * put_user_pages() - release an array of gup-pinned pages.
 116 * @pages:  array of pages to be marked dirty and released.
 117 * @npages: number of pages in the @pages array.
 118 *
 119 * For each page in the @pages array, release the page using put_user_page().
 120 *
 121 * Please see the put_user_page() documentation for details.
 122 */
 123void put_user_pages(struct page **pages, unsigned long npages)
 124{
 125        unsigned long index;
 126
 127        /*
 128         * TODO: this can be optimized for huge pages: if a series of pages is
 129         * physically contiguous and part of the same compound page, then a
 130         * single operation to the head page should suffice.
 131         */
 132        for (index = 0; index < npages; index++)
 133                put_user_page(pages[index]);
 134}
 135EXPORT_SYMBOL(put_user_pages);
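
/*
 * Illustrative sketch, not part of this file: a typical release path for a
 * caller (e.g. a driver doing direct IO) that earlier pinned an array of
 * pages with one of the get_user_pages() variants.  The function name and
 * the "was_written" parameter are hypothetical.
 */
static void __maybe_unused put_user_pages_example(struct page **pages,
                                                  unsigned long npages,
                                                  bool was_written)
{
        if (was_written)
                /*
                 * The device or kernel wrote into the pages: mark them dirty
                 * while releasing.  The _lock variant is used because nothing
                 * here guarantees that plain set_page_dirty() would be safe.
                 */
                put_user_pages_dirty_lock(pages, npages);
        else
                /* Read-only use: just drop the pins. */
                put_user_pages(pages, npages);
}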
 136
 137#ifdef CONFIG_MMU
 138static struct page *no_page_table(struct vm_area_struct *vma,
 139                unsigned int flags)
 140{
 141        /*
 142         * When core dumping an enormous anonymous area that nobody
 143         * has touched so far, we don't want to allocate unnecessary pages or
 144         * page tables.  Return error instead of NULL to skip handle_mm_fault,
 145         * then get_dump_page() will return NULL to leave a hole in the dump.
 146         * But we can only make this optimization where a hole would surely
 147         * be zero-filled if handle_mm_fault() actually did handle it.
 148         */
 149        if ((flags & FOLL_DUMP) && (!vma->vm_ops || !vma->vm_ops->fault))
 150                return ERR_PTR(-EFAULT);
 151        return NULL;
 152}
 153
 154static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address,
 155                pte_t *pte, unsigned int flags)
 156{
 157        /* No page to get reference */
 158        if (flags & FOLL_GET)
 159                return -EFAULT;
 160
 161        if (flags & FOLL_TOUCH) {
 162                pte_t entry = *pte;
 163
 164                if (flags & FOLL_WRITE)
 165                        entry = pte_mkdirty(entry);
 166                entry = pte_mkyoung(entry);
 167
 168                if (!pte_same(*pte, entry)) {
 169                        set_pte_at(vma->vm_mm, address, pte, entry);
 170                        update_mmu_cache(vma, address, pte);
 171                }
 172        }
 173
 174        /* Proper page table entry exists, but no corresponding struct page */
 175        return -EEXIST;
 176}
 177
 178/*
 179 * FOLL_FORCE can write to even unwritable pte's, but only
 180 * after we've gone through a COW cycle and they are dirty.
 181 */
 182static inline bool can_follow_write_pte(pte_t pte, unsigned int flags)
 183{
 184        return pte_write(pte) ||
 185                ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pte_dirty(pte));
 186}
 187
 188static struct page *follow_page_pte(struct vm_area_struct *vma,
 189                unsigned long address, pmd_t *pmd, unsigned int flags,
 190                struct dev_pagemap **pgmap)
 191{
 192        struct mm_struct *mm = vma->vm_mm;
 193        struct page *page;
 194        spinlock_t *ptl;
 195        pte_t *ptep, pte;
 196
 197retry:
 198        if (unlikely(pmd_bad(*pmd)))
 199                return no_page_table(vma, flags);
 200
 201        ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
 202        pte = *ptep;
 203        if (!pte_present(pte)) {
 204                swp_entry_t entry;
 205                /*
 206                 * KSM's break_ksm() relies upon recognizing a ksm page
 207                 * even while it is being migrated, so for that case we
 208                 * need migration_entry_wait().
 209                 */
 210                if (likely(!(flags & FOLL_MIGRATION)))
 211                        goto no_page;
 212                if (pte_none(pte))
 213                        goto no_page;
 214                entry = pte_to_swp_entry(pte);
 215                if (!is_migration_entry(entry))
 216                        goto no_page;
 217                pte_unmap_unlock(ptep, ptl);
 218                migration_entry_wait(mm, pmd, address);
 219                goto retry;
 220        }
 221        if ((flags & FOLL_NUMA) && pte_protnone(pte))
 222                goto no_page;
 223        if ((flags & FOLL_WRITE) && !can_follow_write_pte(pte, flags)) {
 224                pte_unmap_unlock(ptep, ptl);
 225                return NULL;
 226        }
 227
 228        page = vm_normal_page(vma, address, pte);
 229        if (!page && pte_devmap(pte) && (flags & FOLL_GET)) {
 230                /*
 231                 * Only return device mapping pages in the FOLL_GET case since
 232                 * they are only valid while holding the pgmap reference.
 233                 */
 234                *pgmap = get_dev_pagemap(pte_pfn(pte), *pgmap);
 235                if (*pgmap)
 236                        page = pte_page(pte);
 237                else
 238                        goto no_page;
 239        } else if (unlikely(!page)) {
 240                if (flags & FOLL_DUMP) {
 241                        /* Avoid special (like zero) pages in core dumps */
 242                        page = ERR_PTR(-EFAULT);
 243                        goto out;
 244                }
 245
 246                if (is_zero_pfn(pte_pfn(pte))) {
 247                        page = pte_page(pte);
 248                } else {
 249                        int ret;
 250
 251                        ret = follow_pfn_pte(vma, address, ptep, flags);
 252                        page = ERR_PTR(ret);
 253                        goto out;
 254                }
 255        }
 256
 257        if (flags & FOLL_SPLIT && PageTransCompound(page)) {
 258                int ret;
 259                get_page(page);
 260                pte_unmap_unlock(ptep, ptl);
 261                lock_page(page);
 262                ret = split_huge_page(page);
 263                unlock_page(page);
 264                put_page(page);
 265                if (ret)
 266                        return ERR_PTR(ret);
 267                goto retry;
 268        }
 269
 270        if (flags & FOLL_GET) {
 271                if (unlikely(!try_get_page(page))) {
 272                        page = ERR_PTR(-ENOMEM);
 273                        goto out;
 274                }
 275        }
 276        if (flags & FOLL_TOUCH) {
 277                if ((flags & FOLL_WRITE) &&
 278                    !pte_dirty(pte) && !PageDirty(page))
 279                        set_page_dirty(page);
 280                /*
 281                 * pte_mkyoung() would be more correct here, but atomic care
 282                 * is needed to avoid losing the dirty bit: it is easier to use
 283                 * mark_page_accessed().
 284                 */
 285                mark_page_accessed(page);
 286        }
 287        if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
 288                /* Do not mlock pte-mapped THP */
 289                if (PageTransCompound(page))
 290                        goto out;
 291
 292                /*
 293                 * The preliminary mapping check is mainly to avoid the
 294                 * pointless overhead of lock_page on the ZERO_PAGE
 295                 * which might bounce very badly if there is contention.
 296                 *
 297                 * If the page is already locked, we don't need to
 298                 * handle it now - vmscan will handle it later if and
 299                 * when it attempts to reclaim the page.
 300                 */
 301                if (page->mapping && trylock_page(page)) {
 302                        lru_add_drain();  /* push cached pages to LRU */
 303                        /*
 304                         * Because we lock page here, and migration is
 305                         * blocked by the pte's page reference, and we
 306                         * know the page is still mapped, we don't even
 307                         * need to check for file-cache page truncation.
 308                         */
 309                        mlock_vma_page(page);
 310                        unlock_page(page);
 311                }
 312        }
 313out:
 314        pte_unmap_unlock(ptep, ptl);
 315        return page;
 316no_page:
 317        pte_unmap_unlock(ptep, ptl);
 318        if (!pte_none(pte))
 319                return NULL;
 320        return no_page_table(vma, flags);
 321}
 322
 323static struct page *follow_pmd_mask(struct vm_area_struct *vma,
 324                                    unsigned long address, pud_t *pudp,
 325                                    unsigned int flags,
 326                                    struct follow_page_context *ctx)
 327{
 328        pmd_t *pmd, pmdval;
 329        spinlock_t *ptl;
 330        struct page *page;
 331        struct mm_struct *mm = vma->vm_mm;
 332
 333        pmd = pmd_offset(pudp, address);
 334        /*
 335         * The READ_ONCE() will stabilize the pmdval in a register or
 336         * on the stack so that it will stop changing under the code.
 337         */
 338        pmdval = READ_ONCE(*pmd);
 339        if (pmd_none(pmdval))
 340                return no_page_table(vma, flags);
 341        if (pmd_huge(pmdval) && vma->vm_flags & VM_HUGETLB) {
 342                page = follow_huge_pmd(mm, address, pmd, flags);
 343                if (page)
 344                        return page;
 345                return no_page_table(vma, flags);
 346        }
 347        if (is_hugepd(__hugepd(pmd_val(pmdval)))) {
 348                page = follow_huge_pd(vma, address,
 349                                      __hugepd(pmd_val(pmdval)), flags,
 350                                      PMD_SHIFT);
 351                if (page)
 352                        return page;
 353                return no_page_table(vma, flags);
 354        }
 355retry:
 356        if (!pmd_present(pmdval)) {
 357                if (likely(!(flags & FOLL_MIGRATION)))
 358                        return no_page_table(vma, flags);
 359                VM_BUG_ON(thp_migration_supported() &&
 360                                  !is_pmd_migration_entry(pmdval));
 361                if (is_pmd_migration_entry(pmdval))
 362                        pmd_migration_entry_wait(mm, pmd);
 363                pmdval = READ_ONCE(*pmd);
 364                /*
 365                 * MADV_DONTNEED may convert the pmd to null because
 366                 * mmap_sem is held in read mode
 367                 */
 368                if (pmd_none(pmdval))
 369                        return no_page_table(vma, flags);
 370                goto retry;
 371        }
 372        if (pmd_devmap(pmdval)) {
 373                ptl = pmd_lock(mm, pmd);
 374                page = follow_devmap_pmd(vma, address, pmd, flags, &ctx->pgmap);
 375                spin_unlock(ptl);
 376                if (page)
 377                        return page;
 378        }
 379        if (likely(!pmd_trans_huge(pmdval)))
 380                return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
 381
 382        if ((flags & FOLL_NUMA) && pmd_protnone(pmdval))
 383                return no_page_table(vma, flags);
 384
 385retry_locked:
 386        ptl = pmd_lock(mm, pmd);
 387        if (unlikely(pmd_none(*pmd))) {
 388                spin_unlock(ptl);
 389                return no_page_table(vma, flags);
 390        }
 391        if (unlikely(!pmd_present(*pmd))) {
 392                spin_unlock(ptl);
 393                if (likely(!(flags & FOLL_MIGRATION)))
 394                        return no_page_table(vma, flags);
 395                pmd_migration_entry_wait(mm, pmd);
 396                goto retry_locked;
 397        }
 398        if (unlikely(!pmd_trans_huge(*pmd))) {
 399                spin_unlock(ptl);
 400                return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
 401        }
 402        if (flags & FOLL_SPLIT) {
 403                int ret;
 404                page = pmd_page(*pmd);
 405                if (is_huge_zero_page(page)) {
 406                        spin_unlock(ptl);
 407                        ret = 0;
 408                        split_huge_pmd(vma, pmd, address);
 409                        if (pmd_trans_unstable(pmd))
 410                                ret = -EBUSY;
 411                } else {
 412                        if (unlikely(!try_get_page(page))) {
 413                                spin_unlock(ptl);
 414                                return ERR_PTR(-ENOMEM);
 415                        }
 416                        spin_unlock(ptl);
 417                        lock_page(page);
 418                        ret = split_huge_page(page);
 419                        unlock_page(page);
 420                        put_page(page);
 421                        if (pmd_none(*pmd))
 422                                return no_page_table(vma, flags);
 423                }
 424
 425                return ret ? ERR_PTR(ret) :
 426                        follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
 427        }
 428        page = follow_trans_huge_pmd(vma, address, pmd, flags);
 429        spin_unlock(ptl);
 430        ctx->page_mask = HPAGE_PMD_NR - 1;
 431        return page;
 432}
 433
 434static struct page *follow_pud_mask(struct vm_area_struct *vma,
 435                                    unsigned long address, p4d_t *p4dp,
 436                                    unsigned int flags,
 437                                    struct follow_page_context *ctx)
 438{
 439        pud_t *pud;
 440        spinlock_t *ptl;
 441        struct page *page;
 442        struct mm_struct *mm = vma->vm_mm;
 443
 444        pud = pud_offset(p4dp, address);
 445        if (pud_none(*pud))
 446                return no_page_table(vma, flags);
 447        if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) {
 448                page = follow_huge_pud(mm, address, pud, flags);
 449                if (page)
 450                        return page;
 451                return no_page_table(vma, flags);
 452        }
 453        if (is_hugepd(__hugepd(pud_val(*pud)))) {
 454                page = follow_huge_pd(vma, address,
 455                                      __hugepd(pud_val(*pud)), flags,
 456                                      PUD_SHIFT);
 457                if (page)
 458                        return page;
 459                return no_page_table(vma, flags);
 460        }
 461        if (pud_devmap(*pud)) {
 462                ptl = pud_lock(mm, pud);
 463                page = follow_devmap_pud(vma, address, pud, flags, &ctx->pgmap);
 464                spin_unlock(ptl);
 465                if (page)
 466                        return page;
 467        }
 468        if (unlikely(pud_bad(*pud)))
 469                return no_page_table(vma, flags);
 470
 471        return follow_pmd_mask(vma, address, pud, flags, ctx);
 472}
 473
 474static struct page *follow_p4d_mask(struct vm_area_struct *vma,
 475                                    unsigned long address, pgd_t *pgdp,
 476                                    unsigned int flags,
 477                                    struct follow_page_context *ctx)
 478{
 479        p4d_t *p4d;
 480        struct page *page;
 481
 482        p4d = p4d_offset(pgdp, address);
 483        if (p4d_none(*p4d))
 484                return no_page_table(vma, flags);
 485        BUILD_BUG_ON(p4d_huge(*p4d));
 486        if (unlikely(p4d_bad(*p4d)))
 487                return no_page_table(vma, flags);
 488
 489        if (is_hugepd(__hugepd(p4d_val(*p4d)))) {
 490                page = follow_huge_pd(vma, address,
 491                                      __hugepd(p4d_val(*p4d)), flags,
 492                                      P4D_SHIFT);
 493                if (page)
 494                        return page;
 495                return no_page_table(vma, flags);
 496        }
 497        return follow_pud_mask(vma, address, p4d, flags, ctx);
 498}
 499
 500/**
 501 * follow_page_mask - look up a page descriptor from a user-virtual address
 502 * @vma: vm_area_struct mapping @address
 503 * @address: virtual address to look up
 504 * @flags: flags modifying lookup behaviour
 505 * @ctx: contains dev_pagemap for %ZONE_DEVICE memory pinning and a
 506 *       pointer to output page_mask
 507 *
 508 * @flags can have FOLL_ flags set, defined in <linux/mm.h>
 509 *
 510 * When getting pages from ZONE_DEVICE memory, the @ctx->pgmap caches
 511 * the device's dev_pagemap metadata to avoid repeating expensive lookups.
 512 *
 513 * On output, the @ctx->page_mask is set according to the size of the page.
 514 *
 515 * Return: the mapped (struct page *), %NULL if no mapping exists, or
 516 * an error pointer if there is a mapping to something not represented
 517 * by a page descriptor (see also vm_normal_page()).
 518 */
 519static struct page *follow_page_mask(struct vm_area_struct *vma,
 520                              unsigned long address, unsigned int flags,
 521                              struct follow_page_context *ctx)
 522{
 523        pgd_t *pgd;
 524        struct page *page;
 525        struct mm_struct *mm = vma->vm_mm;
 526
 527        ctx->page_mask = 0;
 528
 529        /* make this handle hugepd */
 530        page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
 531        if (!IS_ERR(page)) {
 532                BUG_ON(flags & FOLL_GET);
 533                return page;
 534        }
 535
 536        pgd = pgd_offset(mm, address);
 537
 538        if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
 539                return no_page_table(vma, flags);
 540
 541        if (pgd_huge(*pgd)) {
 542                page = follow_huge_pgd(mm, address, pgd, flags);
 543                if (page)
 544                        return page;
 545                return no_page_table(vma, flags);
 546        }
 547        if (is_hugepd(__hugepd(pgd_val(*pgd)))) {
 548                page = follow_huge_pd(vma, address,
 549                                      __hugepd(pgd_val(*pgd)), flags,
 550                                      PGDIR_SHIFT);
 551                if (page)
 552                        return page;
 553                return no_page_table(vma, flags);
 554        }
 555
 556        return follow_p4d_mask(vma, address, pgd, flags, ctx);
 557}
 558
 559struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
 560                         unsigned int foll_flags)
 561{
 562        struct follow_page_context ctx = { NULL };
 563        struct page *page;
 564
 565        page = follow_page_mask(vma, address, foll_flags, &ctx);
 566        if (ctx.pgmap)
 567                put_dev_pagemap(ctx.pgmap);
 568        return page;
 569}
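
/*
 * Illustrative sketch, not part of this file: using follow_page() to look up
 * the page backing a single user address.  The caller is assumed to hold
 * mmap_sem and to have located @vma (for example via find_vma()); FOLL_GET
 * makes follow_page() take a reference that the caller must later drop with
 * put_page().  The function name is hypothetical.
 */
static struct page * __maybe_unused follow_page_example(struct vm_area_struct *vma,
                                                        unsigned long address)
{
        struct page *page;

        page = follow_page(vma, address, FOLL_GET);
        if (IS_ERR_OR_NULL(page))
                return NULL;    /* nothing mapped, or no struct page behind the pte */

        return page;            /* caller put_page()s this when finished */
}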
 570
 571static int get_gate_page(struct mm_struct *mm, unsigned long address,
 572                unsigned int gup_flags, struct vm_area_struct **vma,
 573                struct page **page)
 574{
 575        pgd_t *pgd;
 576        p4d_t *p4d;
 577        pud_t *pud;
 578        pmd_t *pmd;
 579        pte_t *pte;
 580        int ret = -EFAULT;
 581
 582        /* user gate pages are read-only */
 583        if (gup_flags & FOLL_WRITE)
 584                return -EFAULT;
 585        if (address > TASK_SIZE)
 586                pgd = pgd_offset_k(address);
 587        else
 588                pgd = pgd_offset_gate(mm, address);
 589        if (pgd_none(*pgd))
 590                return -EFAULT;
 591        p4d = p4d_offset(pgd, address);
 592        if (p4d_none(*p4d))
 593                return -EFAULT;
 594        pud = pud_offset(p4d, address);
 595        if (pud_none(*pud))
 596                return -EFAULT;
 597        pmd = pmd_offset(pud, address);
 598        if (!pmd_present(*pmd))
 599                return -EFAULT;
 600        VM_BUG_ON(pmd_trans_huge(*pmd));
 601        pte = pte_offset_map(pmd, address);
 602        if (pte_none(*pte))
 603                goto unmap;
 604        *vma = get_gate_vma(mm);
 605        if (!page)
 606                goto out;
 607        *page = vm_normal_page(*vma, address, *pte);
 608        if (!*page) {
 609                if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(*pte)))
 610                        goto unmap;
 611                *page = pte_page(*pte);
 612        }
 613        if (unlikely(!try_get_page(*page))) {
 614                ret = -ENOMEM;
 615                goto unmap;
 616        }
 617out:
 618        ret = 0;
 619unmap:
 620        pte_unmap(pte);
 621        return ret;
 622}
 623
 624/*
 625 * mmap_sem must be held on entry.  If @nonblocking != NULL and
 626 * *@flags does not include FOLL_NOWAIT, the mmap_sem may be released.
 627 * If it is, *@nonblocking will be set to 0 and -EBUSY returned.
 628 */
 629static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
 630                unsigned long address, unsigned int *flags, int *nonblocking)
 631{
 632        unsigned int fault_flags = 0;
 633        vm_fault_t ret;
 634
 635        /* mlock all present pages, but do not fault in new pages */
 636        if ((*flags & (FOLL_POPULATE | FOLL_MLOCK)) == FOLL_MLOCK)
 637                return -ENOENT;
 638        if (*flags & FOLL_WRITE)
 639                fault_flags |= FAULT_FLAG_WRITE;
 640        if (*flags & FOLL_REMOTE)
 641                fault_flags |= FAULT_FLAG_REMOTE;
 642        if (nonblocking)
 643                fault_flags |= FAULT_FLAG_ALLOW_RETRY;
 644        if (*flags & FOLL_NOWAIT)
 645                fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT;
 646        if (*flags & FOLL_TRIED) {
 647                VM_WARN_ON_ONCE(fault_flags & FAULT_FLAG_ALLOW_RETRY);
 648                fault_flags |= FAULT_FLAG_TRIED;
 649        }
 650
 651        ret = handle_mm_fault(vma, address, fault_flags);
 652        if (ret & VM_FAULT_ERROR) {
 653                int err = vm_fault_to_errno(ret, *flags);
 654
 655                if (err)
 656                        return err;
 657                BUG();
 658        }
 659
 660        if (tsk) {
 661                if (ret & VM_FAULT_MAJOR)
 662                        tsk->maj_flt++;
 663                else
 664                        tsk->min_flt++;
 665        }
 666
 667        if (ret & VM_FAULT_RETRY) {
 668                if (nonblocking && !(fault_flags & FAULT_FLAG_RETRY_NOWAIT))
 669                        *nonblocking = 0;
 670                return -EBUSY;
 671        }
 672
 673        /*
 674         * The VM_FAULT_WRITE bit tells us that do_wp_page has broken COW when
 675         * necessary, even if maybe_mkwrite decided not to set pte_write. We
 676         * can thus safely do subsequent page lookups as if they were reads.
 677         * But only do so when looping for pte_write is futile: in some cases
 678         * userspace may also be wanting to write to the gotten user page,
 679         * which a read fault here might prevent (a readonly page might get
 680         * reCOWed by userspace write).
 681         */
 682        if ((ret & VM_FAULT_WRITE) && !(vma->vm_flags & VM_WRITE))
 683                *flags |= FOLL_COW;
 684        return 0;
 685}
 686
 687static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
 688{
 689        vm_flags_t vm_flags = vma->vm_flags;
 690        int write = (gup_flags & FOLL_WRITE);
 691        int foreign = (gup_flags & FOLL_REMOTE);
 692
 693        if (vm_flags & (VM_IO | VM_PFNMAP))
 694                return -EFAULT;
 695
 696        if (gup_flags & FOLL_ANON && !vma_is_anonymous(vma))
 697                return -EFAULT;
 698
 699        if (write) {
 700                if (!(vm_flags & VM_WRITE)) {
 701                        if (!(gup_flags & FOLL_FORCE))
 702                                return -EFAULT;
 703                        /*
 704                         * We used to let the write,force case do COW in a
 705                         * VM_MAYWRITE VM_SHARED !VM_WRITE vma, so ptrace could
 706                         * set a breakpoint in a read-only mapping of an
 707                         * executable, without corrupting the file (yet only
 708                         * when that file had been opened for writing!).
 709                         * Anon pages in shared mappings are surprising: now
 710                         * just reject it.
 711                         */
 712                        if (!is_cow_mapping(vm_flags))
 713                                return -EFAULT;
 714                }
 715        } else if (!(vm_flags & VM_READ)) {
 716                if (!(gup_flags & FOLL_FORCE))
 717                        return -EFAULT;
 718                /*
 719                 * Is there actually any vma we can reach here which does not
 720                 * have VM_MAYREAD set?
 721                 */
 722                if (!(vm_flags & VM_MAYREAD))
 723                        return -EFAULT;
 724        }
 725        /*
 726         * gups are always data accesses, not instruction
 727         * fetches, so execute=false here
 728         */
 729        if (!arch_vma_access_permitted(vma, write, false, foreign))
 730                return -EFAULT;
 731        return 0;
 732}
 733
 734/**
 735 * __get_user_pages() - pin user pages in memory
 736 * @tsk:        task_struct of target task
 737 * @mm:         mm_struct of target mm
 738 * @start:      starting user address
 739 * @nr_pages:   number of pages from start to pin
 740 * @gup_flags:  flags modifying pin behaviour
 741 * @pages:      array that receives pointers to the pages pinned.
 742 *              Should be at least nr_pages long. Or NULL, if caller
 743 *              only intends to ensure the pages are faulted in.
 744 * @vmas:       array of pointers to vmas corresponding to each page.
 745 *              Or NULL if the caller does not require them.
  746 * @nonblocking: if non-NULL, cleared to 0 instead of waiting for disk IO or mmap_sem contention; see below
 747 *
 748 * Returns number of pages pinned. This may be fewer than the number
 749 * requested. If nr_pages is 0 or negative, returns 0. If no pages
 750 * were pinned, returns -errno. Each page returned must be released
 751 * with a put_page() call when it is finished with. vmas will only
 752 * remain valid while mmap_sem is held.
 753 *
 754 * Must be called with mmap_sem held.  It may be released.  See below.
 755 *
 756 * __get_user_pages walks a process's page tables and takes a reference to
 757 * each struct page that each user address corresponds to at a given
 758 * instant. That is, it takes the page that would be accessed if a user
 759 * thread accesses the given user virtual address at that instant.
 760 *
 761 * This does not guarantee that the page exists in the user mappings when
 762 * __get_user_pages returns, and there may even be a completely different
 763 * page there in some cases (eg. if mmapped pagecache has been invalidated
  764 * and subsequently re-faulted). However, it does guarantee that the page
 765 * won't be freed completely. And mostly callers simply care that the page
 766 * contains data that was valid *at some point in time*. Typically, an IO
 767 * or similar operation cannot guarantee anything stronger anyway because
 768 * locks can't be held over the syscall boundary.
 769 *
 770 * If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If
 771 * the page is written to, set_page_dirty (or set_page_dirty_lock, as
 772 * appropriate) must be called after the page is finished with, and
 773 * before put_page is called.
 774 *
 775 * If @nonblocking != NULL, __get_user_pages will not wait for disk IO
 776 * or mmap_sem contention, and if waiting is needed to pin all pages,
 777 * *@nonblocking will be set to 0.  Further, if @gup_flags does not
 778 * include FOLL_NOWAIT, the mmap_sem will be released via up_read() in
 779 * this case.
 780 *
 781 * A caller using such a combination of @nonblocking and @gup_flags
 782 * must therefore hold the mmap_sem for reading only, and recognize
 783 * when it's been released.  Otherwise, it must be held for either
 784 * reading or writing and will not be released.
 785 *
 786 * In most cases, get_user_pages or get_user_pages_fast should be used
 787 * instead of __get_user_pages. __get_user_pages should be used only if
 788 * you need some special @gup_flags.
 789 */
 790static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 791                unsigned long start, unsigned long nr_pages,
 792                unsigned int gup_flags, struct page **pages,
 793                struct vm_area_struct **vmas, int *nonblocking)
 794{
 795        long ret = 0, i = 0;
 796        struct vm_area_struct *vma = NULL;
 797        struct follow_page_context ctx = { NULL };
 798
 799        if (!nr_pages)
 800                return 0;
 801
 802        VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET));
 803
 804        /*
 805         * If FOLL_FORCE is set then do not force a full fault as the hinting
 806         * fault information is unrelated to the reference behaviour of a task
 807         * using the address space
 808         */
 809        if (!(gup_flags & FOLL_FORCE))
 810                gup_flags |= FOLL_NUMA;
 811
 812        do {
 813                struct page *page;
 814                unsigned int foll_flags = gup_flags;
 815                unsigned int page_increm;
 816
  817                /* first iteration, or we crossed a vma boundary */
 818                if (!vma || start >= vma->vm_end) {
 819                        vma = find_extend_vma(mm, start);
 820                        if (!vma && in_gate_area(mm, start)) {
 821                                ret = get_gate_page(mm, start & PAGE_MASK,
 822                                                gup_flags, &vma,
 823                                                pages ? &pages[i] : NULL);
 824                                if (ret)
 825                                        goto out;
 826                                ctx.page_mask = 0;
 827                                goto next_page;
 828                        }
 829
 830                        if (!vma || check_vma_flags(vma, gup_flags)) {
 831                                ret = -EFAULT;
 832                                goto out;
 833                        }
 834                        if (is_vm_hugetlb_page(vma)) {
 835                                i = follow_hugetlb_page(mm, vma, pages, vmas,
 836                                                &start, &nr_pages, i,
 837                                                gup_flags, nonblocking);
 838                                continue;
 839                        }
 840                }
 841retry:
 842                /*
 843                 * If we have a pending SIGKILL, don't keep faulting pages and
 844                 * potentially allocating memory.
 845                 */
 846                if (fatal_signal_pending(current)) {
 847                        ret = -ERESTARTSYS;
 848                        goto out;
 849                }
 850                cond_resched();
 851
 852                page = follow_page_mask(vma, start, foll_flags, &ctx);
 853                if (!page) {
 854                        ret = faultin_page(tsk, vma, start, &foll_flags,
 855                                        nonblocking);
 856                        switch (ret) {
 857                        case 0:
 858                                goto retry;
 859                        case -EBUSY:
 860                                ret = 0;
 861                                /* FALLTHRU */
 862                        case -EFAULT:
 863                        case -ENOMEM:
 864                        case -EHWPOISON:
 865                                goto out;
 866                        case -ENOENT:
 867                                goto next_page;
 868                        }
 869                        BUG();
 870                } else if (PTR_ERR(page) == -EEXIST) {
 871                        /*
 872                         * Proper page table entry exists, but no corresponding
 873                         * struct page.
 874                         */
 875                        goto next_page;
 876                } else if (IS_ERR(page)) {
 877                        ret = PTR_ERR(page);
 878                        goto out;
 879                }
 880                if (pages) {
 881                        pages[i] = page;
 882                        flush_anon_page(vma, page, start);
 883                        flush_dcache_page(page);
 884                        ctx.page_mask = 0;
 885                }
 886next_page:
 887                if (vmas) {
 888                        vmas[i] = vma;
 889                        ctx.page_mask = 0;
 890                }
 891                page_increm = 1 + (~(start >> PAGE_SHIFT) & ctx.page_mask);
 892                if (page_increm > nr_pages)
 893                        page_increm = nr_pages;
 894                i += page_increm;
 895                start += page_increm * PAGE_SIZE;
 896                nr_pages -= page_increm;
 897        } while (nr_pages);
 898out:
 899        if (ctx.pgmap)
 900                put_dev_pagemap(ctx.pgmap);
 901        return i ? i : ret;
 902}
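
/*
 * Illustrative sketch, not part of this file: the pin/use/release contract
 * described in the documentation comment above, using the public
 * get_user_pages_fast() entry point that it recommends.  Pins one page for
 * writing, then dirties and releases it.  The function name is hypothetical.
 */
static int __maybe_unused gup_contract_example(unsigned long uaddr)
{
        struct page *page;
        int ret;

        /* Pin exactly one page; FOLL_WRITE because we intend to modify it. */
        ret = get_user_pages_fast(uaddr, 1, FOLL_WRITE, &page);
        if (ret != 1)
                return ret < 0 ? ret : -EFAULT;

        /* ... access the page here, e.g. map it or hand it to a device ... */

        /*
         * Per the rules above: the page was written to, so dirty it before
         * dropping the reference.
         */
        set_page_dirty_lock(page);
        put_page(page);
        return 0;
}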
 903
 904static bool vma_permits_fault(struct vm_area_struct *vma,
 905                              unsigned int fault_flags)
 906{
 907        bool write   = !!(fault_flags & FAULT_FLAG_WRITE);
 908        bool foreign = !!(fault_flags & FAULT_FLAG_REMOTE);
 909        vm_flags_t vm_flags = write ? VM_WRITE : VM_READ;
 910
 911        if (!(vm_flags & vma->vm_flags))
 912                return false;
 913
 914        /*
 915         * The architecture might have a hardware protection
 916         * mechanism other than read/write that can deny access.
 917         *
 918         * gup always represents data access, not instruction
 919         * fetches, so execute=false here:
 920         */
 921        if (!arch_vma_access_permitted(vma, write, false, foreign))
 922                return false;
 923
 924        return true;
 925}
 926
 927/*
 928 * fixup_user_fault() - manually resolve a user page fault
 929 * @tsk:        the task_struct to use for page fault accounting, or
 930 *              NULL if faults are not to be recorded.
 931 * @mm:         mm_struct of target mm
 932 * @address:    user address
 933 * @fault_flags:flags to pass down to handle_mm_fault()
 934 * @unlocked:   did we unlock the mmap_sem while retrying, maybe NULL if caller
 935 *              does not allow retry
 936 *
  937 * This is meant to be called in the specific scenario where, for locking
  938 * reasons, we try to access user memory in atomic context (within a
  939 * pagefault_disable() section), that access fails with -EFAULT, and we want
  940 * to resolve the user fault before trying again.
 941 *
 942 * Typically this is meant to be used by the futex code.
 943 *
 944 * The main difference with get_user_pages() is that this function will
 945 * unconditionally call handle_mm_fault() which will in turn perform all the
 946 * necessary SW fixup of the dirty and young bits in the PTE, while
 947 * get_user_pages() only guarantees to update these in the struct page.
 948 *
 949 * This is important for some architectures where those bits also gate the
 950 * access permission to the page because they are maintained in software.  On
 951 * such architectures, gup() will not be enough to make a subsequent access
 952 * succeed.
 953 *
  954 * This function will not return with an unlocked mmap_sem. So it does not
  955 * have the same semantics wrt the @mm->mmap_sem as filemap_fault() does.
 956 */
 957int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
 958                     unsigned long address, unsigned int fault_flags,
 959                     bool *unlocked)
 960{
 961        struct vm_area_struct *vma;
 962        vm_fault_t ret, major = 0;
 963
 964        if (unlocked)
 965                fault_flags |= FAULT_FLAG_ALLOW_RETRY;
 966
 967retry:
 968        vma = find_extend_vma(mm, address);
 969        if (!vma || address < vma->vm_start)
 970                return -EFAULT;
 971
 972        if (!vma_permits_fault(vma, fault_flags))
 973                return -EFAULT;
 974
 975        ret = handle_mm_fault(vma, address, fault_flags);
 976        major |= ret & VM_FAULT_MAJOR;
 977        if (ret & VM_FAULT_ERROR) {
 978                int err = vm_fault_to_errno(ret, 0);
 979
 980                if (err)
 981                        return err;
 982                BUG();
 983        }
 984
 985        if (ret & VM_FAULT_RETRY) {
 986                down_read(&mm->mmap_sem);
 987                if (!(fault_flags & FAULT_FLAG_TRIED)) {
 988                        *unlocked = true;
 989                        fault_flags &= ~FAULT_FLAG_ALLOW_RETRY;
 990                        fault_flags |= FAULT_FLAG_TRIED;
 991                        goto retry;
 992                }
 993        }
 994
 995        if (tsk) {
 996                if (major)
 997                        tsk->maj_flt++;
 998                else
 999                        tsk->min_flt++;
1000        }
1001        return 0;
1002}
1003EXPORT_SYMBOL_GPL(fixup_user_fault);
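
/*
 * Illustrative sketch, not part of this file: the futex-style pattern the
 * comment above describes.  Suppose an access performed under
 * pagefault_disable() has just failed with -EFAULT; the caller resolves the
 * fault here and then retries that access.  The function name is
 * hypothetical and @uaddr is a user address the caller intends to write to.
 */
static int __maybe_unused fixup_user_fault_example(unsigned long uaddr)
{
        struct mm_struct *mm = current->mm;
        int ret;

        down_read(&mm->mmap_sem);
        /* unlocked == NULL: do not allow the retry path that drops mmap_sem */
        ret = fixup_user_fault(current, mm, uaddr, FAULT_FLAG_WRITE, NULL);
        up_read(&mm->mmap_sem);

        /* 0 means the pte is now present and writable; the caller retries. */
        return ret;
}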
1004
1005static __always_inline long __get_user_pages_locked(struct task_struct *tsk,
1006                                                struct mm_struct *mm,
1007                                                unsigned long start,
1008                                                unsigned long nr_pages,
1009                                                struct page **pages,
1010                                                struct vm_area_struct **vmas,
1011                                                int *locked,
1012                                                unsigned int flags)
1013{
1014        long ret, pages_done;
1015        bool lock_dropped;
1016
1017        if (locked) {
1018                /* if VM_FAULT_RETRY can be returned, vmas become invalid */
1019                BUG_ON(vmas);
1020                /* check caller initialized locked */
1021                BUG_ON(*locked != 1);
1022        }
1023
1024        if (pages)
1025                flags |= FOLL_GET;
1026
1027        pages_done = 0;
1028        lock_dropped = false;
1029        for (;;) {
1030                ret = __get_user_pages(tsk, mm, start, nr_pages, flags, pages,
1031                                       vmas, locked);
1032                if (!locked)
1033                        /* VM_FAULT_RETRY couldn't trigger, bypass */
1034                        return ret;
1035
1036                /* VM_FAULT_RETRY cannot return errors */
1037                if (!*locked) {
1038                        BUG_ON(ret < 0);
1039                        BUG_ON(ret >= nr_pages);
1040                }
1041
1042                if (ret > 0) {
1043                        nr_pages -= ret;
1044                        pages_done += ret;
1045                        if (!nr_pages)
1046                                break;
1047                }
1048                if (*locked) {
1049                        /*
1050                         * VM_FAULT_RETRY didn't trigger or it was a
1051                         * FOLL_NOWAIT.
1052                         */
1053                        if (!pages_done)
1054                                pages_done = ret;
1055                        break;
1056                }
1057                /*
1058                 * VM_FAULT_RETRY triggered, so seek to the faulting offset.
1059                 * For the prefault case (!pages) we only update counts.
1060                 */
1061                if (likely(pages))
1062                        pages += ret;
1063                start += ret << PAGE_SHIFT;
1064
1065                /*
1066                 * Repeat on the address that fired VM_FAULT_RETRY
1067                 * without FAULT_FLAG_ALLOW_RETRY but with
1068                 * FAULT_FLAG_TRIED.
1069                 */
1070                *locked = 1;
1071                lock_dropped = true;
1072                down_read(&mm->mmap_sem);
1073                ret = __get_user_pages(tsk, mm, start, 1, flags | FOLL_TRIED,
1074                                       pages, NULL, NULL);
1075                if (ret != 1) {
1076                        BUG_ON(ret > 1);
1077                        if (!pages_done)
1078                                pages_done = ret;
1079                        break;
1080                }
1081                nr_pages--;
1082                pages_done++;
1083                if (!nr_pages)
1084                        break;
1085                if (likely(pages))
1086                        pages++;
1087                start += PAGE_SIZE;
1088        }
1089        if (lock_dropped && *locked) {
1090                /*
1091                 * We must let the caller know we temporarily dropped the lock
1092                 * and so the critical section protected by it was lost.
1093                 */
1094                up_read(&mm->mmap_sem);
1095                *locked = 0;
1096        }
1097        return pages_done;
1098}
1099
1100/*
1101 * get_user_pages_remote() - pin user pages in memory
1102 * @tsk:        the task_struct to use for page fault accounting, or
1103 *              NULL if faults are not to be recorded.
1104 * @mm:         mm_struct of target mm
1105 * @start:      starting user address
1106 * @nr_pages:   number of pages from start to pin
1107 * @gup_flags:  flags modifying lookup behaviour
1108 * @pages:      array that receives pointers to the pages pinned.
1109 *              Should be at least nr_pages long. Or NULL, if caller
1110 *              only intends to ensure the pages are faulted in.
1111 * @vmas:       array of pointers to vmas corresponding to each page.
1112 *              Or NULL if the caller does not require them.
1113 * @locked:     pointer to lock flag indicating whether lock is held and
1114 *              subsequently whether VM_FAULT_RETRY functionality can be
1115 *              utilised. Lock must initially be held.
1116 *
1117 * Returns number of pages pinned. This may be fewer than the number
1118 * requested. If nr_pages is 0 or negative, returns 0. If no pages
1119 * were pinned, returns -errno. Each page returned must be released
1120 * with a put_page() call when it is finished with. vmas will only
1121 * remain valid while mmap_sem is held.
1122 *
1123 * Must be called with mmap_sem held for read or write.
1124 *
1125 * get_user_pages walks a process's page tables and takes a reference to
1126 * each struct page that each user address corresponds to at a given
1127 * instant. That is, it takes the page that would be accessed if a user
1128 * thread accesses the given user virtual address at that instant.
1129 *
1130 * This does not guarantee that the page exists in the user mappings when
1131 * get_user_pages returns, and there may even be a completely different
1132 * page there in some cases (eg. if mmapped pagecache has been invalidated
 1133 * and subsequently re-faulted). However, it does guarantee that the page
1134 * won't be freed completely. And mostly callers simply care that the page
1135 * contains data that was valid *at some point in time*. Typically, an IO
1136 * or similar operation cannot guarantee anything stronger anyway because
1137 * locks can't be held over the syscall boundary.
1138 *
1139 * If gup_flags & FOLL_WRITE == 0, the page must not be written to. If the page
1140 * is written to, set_page_dirty (or set_page_dirty_lock, as appropriate) must
1141 * be called after the page is finished with, and before put_page is called.
1142 *
1143 * get_user_pages is typically used for fewer-copy IO operations, to get a
1144 * handle on the memory by some means other than accesses via the user virtual
1145 * addresses. The pages may be submitted for DMA to devices or accessed via
1146 * their kernel linear mapping (via the kmap APIs). Care should be taken to
1147 * use the correct cache flushing APIs.
1148 *
1149 * See also get_user_pages_fast, for performance critical applications.
1150 *
1151 * get_user_pages should be phased out in favor of
1152 * get_user_pages_locked|unlocked or get_user_pages_fast. Nothing
1153 * should use get_user_pages because it cannot pass
1154 * FAULT_FLAG_ALLOW_RETRY to handle_mm_fault.
1155 */
1156long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
1157                unsigned long start, unsigned long nr_pages,
1158                unsigned int gup_flags, struct page **pages,
1159                struct vm_area_struct **vmas, int *locked)
1160{
1161        /*
1162         * FIXME: Current FOLL_LONGTERM behavior is incompatible with
1163         * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
1164         * vmas.  As there are no users of this flag in this call we simply
1165         * disallow this option for now.
1166         */
1167        if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
1168                return -EINVAL;
1169
1170        return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas,
1171                                       locked,
1172                                       gup_flags | FOLL_TOUCH | FOLL_REMOTE);
1173}
1174EXPORT_SYMBOL(get_user_pages_remote);
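
/*
 * Illustrative sketch, not part of this file: pinning one page of another
 * process's address space (ptrace/process_vm_readv style), honouring the
 * @locked protocol documented above.  The function name is hypothetical.
 */
static int __maybe_unused gup_remote_example(struct task_struct *tsk,
                                             struct mm_struct *mm,
                                             unsigned long uaddr)
{
        struct page *page;
        int locked = 1;
        long got;

        down_read(&mm->mmap_sem);
        got = get_user_pages_remote(tsk, mm, uaddr, 1, FOLL_WRITE,
                                    &page, NULL, &locked);
        /* If *locked was cleared, mmap_sem has already been dropped for us. */
        if (locked)
                up_read(&mm->mmap_sem);
        if (got != 1)
                return got < 0 ? (int)got : -EFAULT;

        /* ... read or modify the page here ... */

        set_page_dirty_lock(page);
        put_page(page);
        return 0;
}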
1175
1176/**
1177 * populate_vma_page_range() -  populate a range of pages in the vma.
1178 * @vma:   target vma
1179 * @start: start address
1180 * @end:   end address
 1181 * @nonblocking: if non-NULL, cleared to 0 if mmap_sem is released; see below
1182 *
1183 * This takes care of mlocking the pages too if VM_LOCKED is set.
1184 *
 1185 * return: number of pages faulted in, or a negative error code on error.
1186 *
1187 * vma->vm_mm->mmap_sem must be held.
1188 *
1189 * If @nonblocking is NULL, it may be held for read or write and will
1190 * be unperturbed.
1191 *
 1192 * If @nonblocking is non-NULL, it must be held for read only and may be
1193 * released.  If it's released, *@nonblocking will be set to 0.
1194 */
1195long populate_vma_page_range(struct vm_area_struct *vma,
1196                unsigned long start, unsigned long end, int *nonblocking)
1197{
1198        struct mm_struct *mm = vma->vm_mm;
1199        unsigned long nr_pages = (end - start) / PAGE_SIZE;
1200        int gup_flags;
1201
1202        VM_BUG_ON(start & ~PAGE_MASK);
1203        VM_BUG_ON(end   & ~PAGE_MASK);
1204        VM_BUG_ON_VMA(start < vma->vm_start, vma);
1205        VM_BUG_ON_VMA(end   > vma->vm_end, vma);
1206        VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm);
1207
1208        gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK;
1209        if (vma->vm_flags & VM_LOCKONFAULT)
1210                gup_flags &= ~FOLL_POPULATE;
1211        /*
1212         * We want to touch writable mappings with a write fault in order
1213         * to break COW, except for shared mappings because these don't COW
1214         * and we would not want to dirty them for nothing.
1215         */
1216        if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
1217                gup_flags |= FOLL_WRITE;
1218
1219        /*
1220         * We want mlock to succeed for regions that have any permissions
1221         * other than PROT_NONE.
1222         */
1223        if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC))
1224                gup_flags |= FOLL_FORCE;
1225
1226        /*
1227         * We made sure addr is within a VMA, so the following will
1228         * not result in a stack expansion that recurses back here.
1229         */
1230        return __get_user_pages(current, mm, start, nr_pages, gup_flags,
1231                                NULL, NULL, nonblocking);
1232}
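
/*
 * Illustrative sketch, not part of this file: how an mlock-style caller,
 * already holding mmap_sem for read and having set the desired vm_flags,
 * might fault in one whole vma.  The function name is hypothetical.
 */
static long __maybe_unused populate_whole_vma_example(struct vm_area_struct *vma)
{
        /* NULL @nonblocking: mmap_sem is not dropped on our behalf. */
        return populate_vma_page_range(vma, vma->vm_start, vma->vm_end, NULL);
}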
1233
1234/*
1235 * __mm_populate - populate and/or mlock pages within a range of address space.
1236 *
1237 * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap
1238 * flags. VMAs must be already marked with the desired vm_flags, and
1239 * mmap_sem must not be held.
1240 */
1241int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
1242{
1243        struct mm_struct *mm = current->mm;
1244        unsigned long end, nstart, nend;
1245        struct vm_area_struct *vma = NULL;
1246        int locked = 0;
1247        long ret = 0;
1248
1249        end = start + len;
1250
1251        for (nstart = start; nstart < end; nstart = nend) {
1252                /*
1253                 * We want to fault in pages for [nstart; end) address range.
1254                 * Find first corresponding VMA.
1255                 */
1256                if (!locked) {
1257                        locked = 1;
1258                        down_read(&mm->mmap_sem);
1259                        vma = find_vma(mm, nstart);
1260                } else if (nstart >= vma->vm_end)
1261                        vma = vma->vm_next;
1262                if (!vma || vma->vm_start >= end)
1263                        break;
1264                /*
1265                 * Set [nstart; nend) to intersection of desired address
1266                 * range with the first VMA. Also, skip undesirable VMA types.
1267                 */
1268                nend = min(end, vma->vm_end);
1269                if (vma->vm_flags & (VM_IO | VM_PFNMAP))
1270                        continue;
1271                if (nstart < vma->vm_start)
1272                        nstart = vma->vm_start;
1273                /*
1274                 * Now fault in a range of pages. populate_vma_page_range()
1275                 * double checks the vma flags, so that it won't mlock pages
1276                 * if the vma was already munlocked.
1277                 */
1278                ret = populate_vma_page_range(vma, nstart, nend, &locked);
1279                if (ret < 0) {
1280                        if (ignore_errors) {
1281                                ret = 0;
1282                                continue;       /* continue at next VMA */
1283                        }
1284                        break;
1285                }
1286                nend = nstart + ret * PAGE_SIZE;
1287                ret = 0;
1288        }
1289        if (locked)
1290                up_read(&mm->mmap_sem);
1291        return ret;     /* 0 or negative error code */
1292}
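
/*
 * Illustrative sketch, not part of this file: the MAP_POPULATE-style use of
 * the helper above, via the mm_populate() wrapper from <linux/mm.h>, which
 * passes ignore_errors = 1.  @addr and @len are hypothetical values
 * describing an already established, suitably flagged mapping.
 */
static void __maybe_unused mm_populate_example(unsigned long addr,
                                               unsigned long len)
{
        /* Must be called without mmap_sem held; errors are ignored. */
        mm_populate(addr, len);
}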
1293
1294/**
1295 * get_dump_page() - pin user page in memory while writing it to core dump
1296 * @addr: user address
1297 *
1298 * Returns struct page pointer of user page pinned for dump,
1299 * to be freed afterwards by put_page().
1300 *
1301 * Returns NULL on any kind of failure - a hole must then be inserted into
1302 * the corefile, to preserve alignment with its headers; and also returns
1303 * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found -
1304 * allowing a hole to be left in the corefile to save diskspace.
1305 *
1306 * Called without mmap_sem, but after all other threads have been killed.
1307 */
1308#ifdef CONFIG_ELF_CORE
1309struct page *get_dump_page(unsigned long addr)
1310{
1311        struct vm_area_struct *vma;
1312        struct page *page;
1313
1314        if (__get_user_pages(current, current->mm, addr, 1,
1315                             FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma,
1316                             NULL) < 1)
1317                return NULL;
1318        flush_cache_page(vma, addr, page_to_pfn(page));
1319        return page;
1320}
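
/*
 * Illustrative sketch, not part of this file: how a coredump writer consumes
 * get_dump_page(), emitting either the page contents or a hole.  The
 * "emit_page" and "emit_hole" callbacks are hypothetical stand-ins for the
 * binfmt_elf dump helpers.
 */
static int __maybe_unused dump_one_page_example(unsigned long addr,
                                                int (*emit_page)(struct page *),
                                                int (*emit_hole)(void))
{
        struct page *page = get_dump_page(addr);
        int ret;

        if (page) {
                ret = emit_page(page); /* write the pinned page to the corefile */
                put_page(page);        /* drop the reference taken for the dump */
        } else {
                ret = emit_hole();     /* untouched/zero page: leave a hole */
        }
        return ret;
}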
1321#endif /* CONFIG_ELF_CORE */
1322#else /* CONFIG_MMU */
1323static long __get_user_pages_locked(struct task_struct *tsk,
1324                struct mm_struct *mm, unsigned long start,
1325                unsigned long nr_pages, struct page **pages,
1326                struct vm_area_struct **vmas, int *locked,
1327                unsigned int foll_flags)
1328{
1329        struct vm_area_struct *vma;
1330        unsigned long vm_flags;
1331        int i;
1332
1333        /* calculate required read or write permissions.
1334         * If FOLL_FORCE is set, we only require the "MAY" flags.
1335         */
1336        vm_flags  = (foll_flags & FOLL_WRITE) ?
1337                        (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
1338        vm_flags &= (foll_flags & FOLL_FORCE) ?
1339                        (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
1340
1341        for (i = 0; i < nr_pages; i++) {
1342                vma = find_vma(mm, start);
1343                if (!vma)
1344                        goto finish_or_fault;
1345
1346                /* protect what we can, including chardevs */
1347                if ((vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
1348                    !(vm_flags & vma->vm_flags))
1349                        goto finish_or_fault;
1350
1351                if (pages) {
1352                        pages[i] = virt_to_page(start);
1353                        if (pages[i])
1354                                get_page(pages[i]);
1355                }
1356                if (vmas)
1357                        vmas[i] = vma;
1358                start = (start + PAGE_SIZE) & PAGE_MASK;
1359        }
1360
1361        return i;
1362
1363finish_or_fault:
1364        return i ? : -EFAULT;
1365}
1366#endif /* !CONFIG_MMU */
1367
1368#if defined(CONFIG_FS_DAX) || defined (CONFIG_CMA)
1369static bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages)
1370{
1371        long i;
1372        struct vm_area_struct *vma_prev = NULL;
1373
1374        for (i = 0; i < nr_pages; i++) {
1375                struct vm_area_struct *vma = vmas[i];
1376
1377                if (vma == vma_prev)
1378                        continue;
1379
1380                vma_prev = vma;
1381
1382                if (vma_is_fsdax(vma))
1383                        return true;
1384        }
1385        return false;
1386}
1387
1388#ifdef CONFIG_CMA
1389static struct page *new_non_cma_page(struct page *page, unsigned long private)
1390{
1391        /*
1392         * We want to make sure we allocate the new page from the same node
1393         * as the source page.
1394         */
1395        int nid = page_to_nid(page);
1396        /*
1397         * Trying to allocate a page for migration. Ignore allocation
1398         * failure warnings. We don't force __GFP_THISNODE here because
1399         * this is the node holding the CMA reservation, and in some
1400         * cases such nodes have very little non-movable memory available
1401         * for allocation.
1402         */
1403        gfp_t gfp_mask = GFP_USER | __GFP_NOWARN;
1404
1405        if (PageHighMem(page))
1406                gfp_mask |= __GFP_HIGHMEM;
1407
1408#ifdef CONFIG_HUGETLB_PAGE
1409        if (PageHuge(page)) {
1410                struct hstate *h = page_hstate(page);
1411                /*
1412                 * We don't want to dequeue from the pool because pool pages will
1413                 * mostly be from the CMA region.
1414                 */
1415                return alloc_migrate_huge_page(h, gfp_mask, nid, NULL);
1416        }
1417#endif
1418        if (PageTransHuge(page)) {
1419                struct page *thp;
1420                /*
1421                 * ignore allocation failure warnings
1422                 */
1423                gfp_t thp_gfpmask = GFP_TRANSHUGE | __GFP_NOWARN;
1424
1425                /*
1426                 * Remove the movable mask so that we don't allocate from
1427                 * CMA area again.
1428                 */
1429                thp_gfpmask &= ~__GFP_MOVABLE;
1430                thp = __alloc_pages_node(nid, thp_gfpmask, HPAGE_PMD_ORDER);
1431                if (!thp)
1432                        return NULL;
1433                prep_transhuge_page(thp);
1434                return thp;
1435        }
1436
1437        return __alloc_pages_node(nid, gfp_mask, 0);
1438}
1439
1440static long check_and_migrate_cma_pages(struct task_struct *tsk,
1441                                        struct mm_struct *mm,
1442                                        unsigned long start,
1443                                        unsigned long nr_pages,
1444                                        struct page **pages,
1445                                        struct vm_area_struct **vmas,
1446                                        unsigned int gup_flags)
1447{
1448        unsigned long i;
1449        unsigned long step;
1450        bool drain_allow = true;
1451        bool migrate_allow = true;
1452        LIST_HEAD(cma_page_list);
1453
1454check_again:
1455        for (i = 0; i < nr_pages;) {
1456
1457                struct page *head = compound_head(pages[i]);
1458
1459                /*
1460                 * gup may start from a tail page. Advance the step by the
1461                 * remaining part of the compound page.
1462                 */
1463                step = (1 << compound_order(head)) - (pages[i] - head);
1464                /*
1465                 * If we get a page from the CMA zone, since we are going to
1466                 * be pinning these entries, we might as well move them out
1467                 * of the CMA zone if possible.
1468                 */
1469                if (is_migrate_cma_page(head)) {
1470                        if (PageHuge(head))
1471                                isolate_huge_page(head, &cma_page_list);
1472                        else {
1473                                if (!PageLRU(head) && drain_allow) {
1474                                        lru_add_drain_all();
1475                                        drain_allow = false;
1476                                }
1477
1478                                if (!isolate_lru_page(head)) {
1479                                        list_add_tail(&head->lru, &cma_page_list);
1480                                        mod_node_page_state(page_pgdat(head),
1481                                                            NR_ISOLATED_ANON +
1482                                                            page_is_file_cache(head),
1483                                                            hpage_nr_pages(head));
1484                                }
1485                        }
1486                }
1487
1488                i += step;
1489        }
1490
1491        if (!list_empty(&cma_page_list)) {
1492                /*
1493                 * drop the above get_user_pages reference.
1494                 */
1495                for (i = 0; i < nr_pages; i++)
1496                        put_page(pages[i]);
1497
1498                if (migrate_pages(&cma_page_list, new_non_cma_page,
1499                                  NULL, 0, MIGRATE_SYNC, MR_CONTIG_RANGE)) {
1500                        /*
1501                         * Some of the pages failed migration. Do get_user_pages
1502                         * again, without attempting further migration.
1503                         */
1504                        migrate_allow = false;
1505
1506                        if (!list_empty(&cma_page_list))
1507                                putback_movable_pages(&cma_page_list);
1508                }
1509                /*
1510                 * We did migrate all the pages. Try to get the page references
1511                 * again, migrating any new CMA pages which we failed to isolate
1512                 * earlier.
1513                 */
1514                nr_pages = __get_user_pages_locked(tsk, mm, start, nr_pages,
1515                                                   pages, vmas, NULL,
1516                                                   gup_flags);
1517
1518                if ((nr_pages > 0) && migrate_allow) {
1519                        drain_allow = true;
1520                        goto check_again;
1521                }
1522        }
1523
1524        return nr_pages;
1525}
1526#else
1527static long check_and_migrate_cma_pages(struct task_struct *tsk,
1528                                        struct mm_struct *mm,
1529                                        unsigned long start,
1530                                        unsigned long nr_pages,
1531                                        struct page **pages,
1532                                        struct vm_area_struct **vmas,
1533                                        unsigned int gup_flags)
1534{
1535        return nr_pages;
1536}
1537#endif /* CONFIG_CMA */
1538
1539/*
1540 * __gup_longterm_locked() is a wrapper for __get_user_pages_locked which
1541 * allows us to process the FOLL_LONGTERM flag.
1542 */
1543static long __gup_longterm_locked(struct task_struct *tsk,
1544                                  struct mm_struct *mm,
1545                                  unsigned long start,
1546                                  unsigned long nr_pages,
1547                                  struct page **pages,
1548                                  struct vm_area_struct **vmas,
1549                                  unsigned int gup_flags)
1550{
1551        struct vm_area_struct **vmas_tmp = vmas;
1552        unsigned long flags = 0;
1553        long rc, i;
1554
1555        if (gup_flags & FOLL_LONGTERM) {
1556                if (!pages)
1557                        return -EINVAL;
1558
1559                if (!vmas_tmp) {
1560                        vmas_tmp = kcalloc(nr_pages,
1561                                           sizeof(struct vm_area_struct *),
1562                                           GFP_KERNEL);
1563                        if (!vmas_tmp)
1564                                return -ENOMEM;
1565                }
1566                flags = memalloc_nocma_save();
1567        }
1568
1569        rc = __get_user_pages_locked(tsk, mm, start, nr_pages, pages,
1570                                     vmas_tmp, NULL, gup_flags);
1571
1572        if (gup_flags & FOLL_LONGTERM) {
1573                memalloc_nocma_restore(flags);
1574                if (rc < 0)
1575                        goto out;
1576
1577                if (check_dax_vmas(vmas_tmp, rc)) {
1578                        for (i = 0; i < rc; i++)
1579                                put_page(pages[i]);
1580                        rc = -EOPNOTSUPP;
1581                        goto out;
1582                }
1583
1584                rc = check_and_migrate_cma_pages(tsk, mm, start, rc, pages,
1585                                                 vmas_tmp, gup_flags);
1586        }
1587
1588out:
1589        if (vmas_tmp != vmas)
1590                kfree(vmas_tmp);
1591        return rc;
1592}
1593#else /* !CONFIG_FS_DAX && !CONFIG_CMA */
1594static __always_inline long __gup_longterm_locked(struct task_struct *tsk,
1595                                                  struct mm_struct *mm,
1596                                                  unsigned long start,
1597                                                  unsigned long nr_pages,
1598                                                  struct page **pages,
1599                                                  struct vm_area_struct **vmas,
1600                                                  unsigned int flags)
1601{
1602        return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas,
1603                                       NULL, flags);
1604}
1605#endif /* CONFIG_FS_DAX || CONFIG_CMA */
1606
1607/*
1608 * This is the same as get_user_pages_remote(), just with a
1609 * less-flexible calling convention where we assume that the task
1610 * and mm being operated on are the current task's and don't allow
1611 * passing of a locked parameter.  We also obviously don't pass
1612 * FOLL_REMOTE in here.
1613 */
1614long get_user_pages(unsigned long start, unsigned long nr_pages,
1615                unsigned int gup_flags, struct page **pages,
1616                struct vm_area_struct **vmas)
1617{
1618        return __gup_longterm_locked(current, current->mm, start, nr_pages,
1619                                     pages, vmas, gup_flags | FOLL_TOUCH);
1620}
1621EXPORT_SYMBOL(get_user_pages);
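    /*
     * Example usage (a minimal sketch; "uaddr" and "nr" are illustrative and
     * error handling is trimmed).  The caller must hold mmap_sem for read and
     * releases each page with put_page() when done.  Adding FOLL_LONGTERM to
     * gup_flags would additionally require a non-NULL pages array, reject
     * FS DAX mappings and migrate CMA pages before pinning, as implemented
     * in __gup_longterm_locked() above:
     *
     *      struct page **pages;
     *      long i, got;
     *
     *      pages = kmalloc_array(nr, sizeof(*pages), GFP_KERNEL);
     *      if (!pages)
     *              return -ENOMEM;
     *
     *      down_read(&current->mm->mmap_sem);
     *      got = get_user_pages(uaddr & PAGE_MASK, nr, FOLL_WRITE, pages, NULL);
     *      up_read(&current->mm->mmap_sem);
     *
     *      for (i = 0; i < got; i++) {
     *              set_page_dirty_lock(pages[i]);
     *              put_page(pages[i]);
     *      }
     *      kfree(pages);
     */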
1622
1623/*
1624 * We can leverage the VM_FAULT_RETRY functionality in the page fault
1625 * paths better by using either get_user_pages_locked() or
1626 * get_user_pages_unlocked().
1627 *
1628 * get_user_pages_locked() is suitable to replace the form:
1629 *
1630 *      down_read(&mm->mmap_sem);
1631 *      do_something()
1632 *      get_user_pages(start, nr_pages, gup_flags, pages, NULL);
1633 *      up_read(&mm->mmap_sem);
1634 *
1635 *  to:
1636 *
1637 *      int locked = 1;
1638 *      down_read(&mm->mmap_sem);
1639 *      do_something()
1640 *      get_user_pages_locked(start, nr_pages, gup_flags, pages, &locked);
1641 *      if (locked)
1642 *          up_read(&mm->mmap_sem);
1643 */
1644long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
1645                           unsigned int gup_flags, struct page **pages,
1646                           int *locked)
1647{
1648        /*
1649         * FIXME: Current FOLL_LONGTERM behavior is incompatible with
1650         * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
1651         * vmas.  As there are no users of this flag in this call we simply
1652         * disallow this option for now.
1653         */
1654        if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
1655                return -EINVAL;
1656
1657        return __get_user_pages_locked(current, current->mm, start, nr_pages,
1658                                       pages, NULL, locked,
1659                                       gup_flags | FOLL_TOUCH);
1660}
1661EXPORT_SYMBOL(get_user_pages_locked);
1662
1663/*
1664 * get_user_pages_unlocked() is suitable to replace the form:
1665 *
1666 *      down_read(&mm->mmap_sem);
1667 *      get_user_pages(start, nr_pages, gup_flags, pages, NULL);
1668 *      up_read(&mm->mmap_sem);
1669 *
1670 *  with:
1671 *
1672 *      get_user_pages_unlocked(start, nr_pages, pages, gup_flags);
1673 *
1674 * It is functionally equivalent to get_user_pages_fast(), so
1675 * get_user_pages_fast should be used instead if specific gup_flags
1676 * (e.g. FOLL_FORCE) are not required.
1677 */
1678long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
1679                             struct page **pages, unsigned int gup_flags)
1680{
1681        struct mm_struct *mm = current->mm;
1682        int locked = 1;
1683        long ret;
1684
1685        /*
1686         * FIXME: Current FOLL_LONGTERM behavior is incompatible with
1687         * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
1688         * vmas.  As there are no users of this flag in this call we simply
1689         * disallow this option for now.
1690         */
1691        if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
1692                return -EINVAL;
1693
1694        down_read(&mm->mmap_sem);
1695        ret = __get_user_pages_locked(current, mm, start, nr_pages, pages, NULL,
1696                                      &locked, gup_flags | FOLL_TOUCH);
1697        if (locked)
1698                up_read(&mm->mmap_sem);
1699        return ret;
1700}
1701EXPORT_SYMBOL(get_user_pages_unlocked);
1702
1703/*
1704 * Fast GUP
1705 *
1706 * get_user_pages_fast attempts to pin user pages by walking the page
1707 * tables directly and avoids taking locks. Thus the walker needs to be
1708 * protected from page table pages being freed from under it, and should
1709 * block any THP splits.
1710 *
1711 * One way to achieve this is to have the walker disable interrupts, and
1712 * rely on IPIs from the TLB flushing code blocking before the page table
1713 * pages are freed. This is unsuitable for architectures that do not need
1714 * to broadcast an IPI when invalidating TLBs.
1715 *
1716 * Another way to achieve this is to batch up the pages containing page tables
1717 * belonging to more than one mm_user, then rcu_sched a callback to free those
1718 * pages. Disabling interrupts will allow the fast_gup walker to both block
1719 * the rcu_sched callback, and an IPI that we broadcast for splitting THPs
1720 * (which is a relatively rare event). The code below adopts this strategy.
1721 *
1722 * Before activating this code, please be aware that the following assumptions
1723 * are currently made:
1724 *
1725 *  *) Either HAVE_RCU_TABLE_FREE is enabled, and tlb_remove_table() is used to
1726 *  free pages containing page tables, or TLB flushing requires IPI broadcast.
1727 *
1728 *  *) ptes can be read atomically by the architecture.
1729 *
1730 *  *) access_ok is sufficient to validate userspace address ranges.
1731 *
1732 * The last two assumptions can be relaxed by the addition of helper functions.
1733 *
1734 * This code is based heavily on the PowerPC implementation by Nick Piggin.
1735 */
1736#ifdef CONFIG_HAVE_FAST_GUP
1737#ifdef CONFIG_GUP_GET_PTE_LOW_HIGH
1738/*
1739 * WARNING: only to be used in the get_user_pages_fast() implementation.
1740 *
1741 * With get_user_pages_fast(), we walk down the pagetables without taking any
1742 * locks.  For this we would like to load the pointers atomically, but sometimes
1743 * that is not possible (e.g. without expensive cmpxchg8b on x86_32 PAE).  What
1744 * we do have is the guarantee that a PTE will only either go from not present
1745 * to present, or from present to not present (or both) -- it will not switch to a
1746 * completely different present page without a TLB flush in between; something
1747 * that we are blocking by holding interrupts off.
1748 *
1749 * Setting ptes from not present to present goes:
1750 *
1751 *   ptep->pte_high = h;
1752 *   smp_wmb();
1753 *   ptep->pte_low = l;
1754 *
1755 * And present to not present goes:
1756 *
1757 *   ptep->pte_low = 0;
1758 *   smp_wmb();
1759 *   ptep->pte_high = 0;
1760 *
1761 * We must ensure here that the load of pte_low sees 'l' IFF pte_high sees 'h'.
1762 * We load pte_high *after* loading pte_low, which ensures we don't see an older
1763 * value of pte_high.  *Then* we recheck pte_low, which ensures that we haven't
1764 * picked up a changed pte high. We might have gotten rubbish values from
1765 * pte_low and pte_high, but we are guaranteed that pte_low will not have the
1766 * present bit set *unless* it is 'l'. Because get_user_pages_fast() only
1767 * operates on present ptes we're safe.
1768 */
1769static inline pte_t gup_get_pte(pte_t *ptep)
1770{
1771        pte_t pte;
1772
1773        do {
1774                pte.pte_low = ptep->pte_low;
1775                smp_rmb();
1776                pte.pte_high = ptep->pte_high;
1777                smp_rmb();
1778        } while (unlikely(pte.pte_low != ptep->pte_low));
1779
1780        return pte;
1781}
1782#else /* CONFIG_GUP_GET_PTE_LOW_HIGH */
1783/*
1784 * We require that the PTE can be read atomically.
1785 */
1786static inline pte_t gup_get_pte(pte_t *ptep)
1787{
1788        return READ_ONCE(*ptep);
1789}
1790#endif /* CONFIG_GUP_GET_PTE_LOW_HIGH */
1791
1792static void __maybe_unused undo_dev_pagemap(int *nr, int nr_start,
1793                                            struct page **pages)
1794{
1795        while ((*nr) - nr_start) {
1796                struct page *page = pages[--(*nr)];
1797
1798                ClearPageReferenced(page);
1799                put_page(page);
1800        }
1801}
1802
1803/*
1804 * Return the compound head page with ref appropriately incremented,
1805 * or NULL if that failed.
1806 */
1807static inline struct page *try_get_compound_head(struct page *page, int refs)
1808{
1809        struct page *head = compound_head(page);
1810        if (WARN_ON_ONCE(page_ref_count(head) < 0))
1811                return NULL;
1812        if (unlikely(!page_cache_add_speculative(head, refs)))
1813                return NULL;
1814        return head;
1815}
1816
1817#ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
1818static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
1819                         unsigned int flags, struct page **pages, int *nr)
1820{
1821        struct dev_pagemap *pgmap = NULL;
1822        int nr_start = *nr, ret = 0;
1823        pte_t *ptep, *ptem;
1824
1825        ptem = ptep = pte_offset_map(&pmd, addr);
1826        do {
1827                pte_t pte = gup_get_pte(ptep);
1828                struct page *head, *page;
1829
1830                /*
1831                 * Similar to the PMD case below, NUMA hinting must take slow
1832                 * path using the pte_protnone check.
1833                 */
1834                if (pte_protnone(pte))
1835                        goto pte_unmap;
1836
1837                if (!pte_access_permitted(pte, flags & FOLL_WRITE))
1838                        goto pte_unmap;
1839
1840                if (pte_devmap(pte)) {
1841                        if (unlikely(flags & FOLL_LONGTERM))
1842                                goto pte_unmap;
1843
1844                        pgmap = get_dev_pagemap(pte_pfn(pte), pgmap);
1845                        if (unlikely(!pgmap)) {
1846                                undo_dev_pagemap(nr, nr_start, pages);
1847                                goto pte_unmap;
1848                        }
1849                } else if (pte_special(pte))
1850                        goto pte_unmap;
1851
1852                VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
1853                page = pte_page(pte);
1854
1855                head = try_get_compound_head(page, 1);
1856                if (!head)
1857                        goto pte_unmap;
1858
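                    /*
                     * Re-check the PTE now that we hold a reference on the
                     * page: if it changed under us (e.g. the page was
                     * unmapped or replaced concurrently), drop the reference
                     * and let the slow path handle this address.
                     */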
1859                if (unlikely(pte_val(pte) != pte_val(*ptep))) {
1860                        put_page(head);
1861                        goto pte_unmap;
1862                }
1863
1864                VM_BUG_ON_PAGE(compound_head(page) != head, page);
1865
1866                SetPageReferenced(page);
1867                pages[*nr] = page;
1868                (*nr)++;
1869
1870        } while (ptep++, addr += PAGE_SIZE, addr != end);
1871
1872        ret = 1;
1873
1874pte_unmap:
1875        if (pgmap)
1876                put_dev_pagemap(pgmap);
1877        pte_unmap(ptem);
1878        return ret;
1879}
1880#else
1881
1882/*
1883 * If we can't determine whether or not a pte is special, then fail immediately
1884 * for ptes. Note, we can still pin HugeTLB and THP as these are guaranteed not
1885 * to be special.
1886 *
1887 * For a futex to be placed on a THP tail page, get_futex_key requires a
1888 * __get_user_pages_fast implementation that can pin pages. Thus it's still
1889 * useful to have gup_huge_pmd even if we can't operate on ptes.
1890 */
1891static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
1892                         unsigned int flags, struct page **pages, int *nr)
1893{
1894        return 0;
1895}
1896#endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */
1897
1898#if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
1899static int __gup_device_huge(unsigned long pfn, unsigned long addr,
1900                unsigned long end, struct page **pages, int *nr)
1901{
1902        int nr_start = *nr;
1903        struct dev_pagemap *pgmap = NULL;
1904
1905        do {
1906                struct page *page = pfn_to_page(pfn);
1907
1908                pgmap = get_dev_pagemap(pfn, pgmap);
1909                if (unlikely(!pgmap)) {
1910                        undo_dev_pagemap(nr, nr_start, pages);
1911                        return 0;
1912                }
1913                SetPageReferenced(page);
1914                pages[*nr] = page;
1915                get_page(page);
1916                (*nr)++;
1917                pfn++;
1918        } while (addr += PAGE_SIZE, addr != end);
1919
1920        if (pgmap)
1921                put_dev_pagemap(pgmap);
1922        return 1;
1923}
1924
1925static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
1926                unsigned long end, struct page **pages, int *nr)
1927{
1928        unsigned long fault_pfn;
1929        int nr_start = *nr;
1930
1931        fault_pfn = pmd_pfn(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
1932        if (!__gup_device_huge(fault_pfn, addr, end, pages, nr))
1933                return 0;
1934
1935        if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
1936                undo_dev_pagemap(nr, nr_start, pages);
1937                return 0;
1938        }
1939        return 1;
1940}
1941
1942static int __gup_device_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
1943                unsigned long end, struct page **pages, int *nr)
1944{
1945        unsigned long fault_pfn;
1946        int nr_start = *nr;
1947
1948        fault_pfn = pud_pfn(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
1949        if (!__gup_device_huge(fault_pfn, addr, end, pages, nr))
1950                return 0;
1951
1952        if (unlikely(pud_val(orig) != pud_val(*pudp))) {
1953                undo_dev_pagemap(nr, nr_start, pages);
1954                return 0;
1955        }
1956        return 1;
1957}
1958#else
1959static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
1960                unsigned long end, struct page **pages, int *nr)
1961{
1962        BUILD_BUG();
1963        return 0;
1964}
1965
1966static int __gup_device_huge_pud(pud_t pud, pud_t *pudp, unsigned long addr,
1967                unsigned long end, struct page **pages, int *nr)
1968{
1969        BUILD_BUG();
1970        return 0;
1971}
1972#endif
1973
1974#ifdef CONFIG_ARCH_HAS_HUGEPD
1975static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
1976                                      unsigned long sz)
1977{
1978        unsigned long __boundary = (addr + sz) & ~(sz-1);
1979        return (__boundary - 1 < end - 1) ? __boundary : end;
1980}
1981
1982static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
1983                       unsigned long end, unsigned int flags, struct page **pages, int *nr)
1984{
1985        unsigned long pte_end;
1986        struct page *head, *page;
1987        pte_t pte;
1988        int refs;
1989
1990        pte_end = (addr + sz) & ~(sz-1);
1991        if (pte_end < end)
1992                end = pte_end;
1993
1994        pte = READ_ONCE(*ptep);
1995
1996        if (!pte_access_permitted(pte, flags & FOLL_WRITE))
1997                return 0;
1998
1999        /* hugepages are never "special" */
2000        VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
2001
2002        refs = 0;
2003        head = pte_page(pte);
2004
2005        page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
2006        do {
2007                VM_BUG_ON(compound_head(page) != head);
2008                pages[*nr] = page;
2009                (*nr)++;
2010                page++;
2011                refs++;
2012        } while (addr += PAGE_SIZE, addr != end);
2013
2014        head = try_get_compound_head(head, refs);
2015        if (!head) {
2016                *nr -= refs;
2017                return 0;
2018        }
2019
2020        if (unlikely(pte_val(pte) != pte_val(*ptep))) {
2021                /* Could be optimized better */
2022                *nr -= refs;
2023                while (refs--)
2024                        put_page(head);
2025                return 0;
2026        }
2027
2028        SetPageReferenced(head);
2029        return 1;
2030}
2031
2032static int gup_huge_pd(hugepd_t hugepd, unsigned long addr,
2033                unsigned int pdshift, unsigned long end, unsigned int flags,
2034                struct page **pages, int *nr)
2035{
2036        pte_t *ptep;
2037        unsigned long sz = 1UL << hugepd_shift(hugepd);
2038        unsigned long next;
2039
2040        ptep = hugepte_offset(hugepd, addr, pdshift);
2041        do {
2042                next = hugepte_addr_end(addr, end, sz);
2043                if (!gup_hugepte(ptep, sz, addr, end, flags, pages, nr))
2044                        return 0;
2045        } while (ptep++, addr = next, addr != end);
2046
2047        return 1;
2048}
2049#else
2050static inline int gup_huge_pd(hugepd_t hugepd, unsigned long addr,
2051                unsigned int pdshift, unsigned long end, unsigned int flags,
2052                struct page **pages, int *nr)
2053{
2054        return 0;
2055}
2056#endif /* CONFIG_ARCH_HAS_HUGEPD */
2057
2058static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
2059                unsigned long end, unsigned int flags, struct page **pages, int *nr)
2060{
2061        struct page *head, *page;
2062        int refs;
2063
2064        if (!pmd_access_permitted(orig, flags & FOLL_WRITE))
2065                return 0;
2066
2067        if (pmd_devmap(orig)) {
2068                if (unlikely(flags & FOLL_LONGTERM))
2069                        return 0;
2070                return __gup_device_huge_pmd(orig, pmdp, addr, end, pages, nr);
2071        }
2072
2073        refs = 0;
2074        page = pmd_page(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
2075        do {
2076                pages[*nr] = page;
2077                (*nr)++;
2078                page++;
2079                refs++;
2080        } while (addr += PAGE_SIZE, addr != end);
2081
2082        head = try_get_compound_head(pmd_page(orig), refs);
2083        if (!head) {
2084                *nr -= refs;
2085                return 0;
2086        }
2087
2088        if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
2089                *nr -= refs;
2090                while (refs--)
2091                        put_page(head);
2092                return 0;
2093        }
2094
2095        SetPageReferenced(head);
2096        return 1;
2097}
2098
2099static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
2100                unsigned long end, unsigned int flags, struct page **pages, int *nr)
2101{
2102        struct page *head, *page;
2103        int refs;
2104
2105        if (!pud_access_permitted(orig, flags & FOLL_WRITE))
2106                return 0;
2107
2108        if (pud_devmap(orig)) {
2109                if (unlikely(flags & FOLL_LONGTERM))
2110                        return 0;
2111                return __gup_device_huge_pud(orig, pudp, addr, end, pages, nr);
2112        }
2113
2114        refs = 0;
2115        page = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
2116        do {
2117                pages[*nr] = page;
2118                (*nr)++;
2119                page++;
2120                refs++;
2121        } while (addr += PAGE_SIZE, addr != end);
2122
2123        head = try_get_compound_head(pud_page(orig), refs);
2124        if (!head) {
2125                *nr -= refs;
2126                return 0;
2127        }
2128
2129        if (unlikely(pud_val(orig) != pud_val(*pudp))) {
2130                *nr -= refs;
2131                while (refs--)
2132                        put_page(head);
2133                return 0;
2134        }
2135
2136        SetPageReferenced(head);
2137        return 1;
2138}
2139
2140static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
2141                        unsigned long end, unsigned int flags,
2142                        struct page **pages, int *nr)
2143{
2144        int refs;
2145        struct page *head, *page;
2146
2147        if (!pgd_access_permitted(orig, flags & FOLL_WRITE))
2148                return 0;
2149
2150        BUILD_BUG_ON(pgd_devmap(orig));
2151        refs = 0;
2152        page = pgd_page(orig) + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT);
2153        do {
2154                pages[*nr] = page;
2155                (*nr)++;
2156                page++;
2157                refs++;
2158        } while (addr += PAGE_SIZE, addr != end);
2159
2160        head = try_get_compound_head(pgd_page(orig), refs);
2161        if (!head) {
2162                *nr -= refs;
2163                return 0;
2164        }
2165
2166        if (unlikely(pgd_val(orig) != pgd_val(*pgdp))) {
2167                *nr -= refs;
2168                while (refs--)
2169                        put_page(head);
2170                return 0;
2171        }
2172
2173        SetPageReferenced(head);
2174        return 1;
2175}
2176
2177static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
2178                unsigned int flags, struct page **pages, int *nr)
2179{
2180        unsigned long next;
2181        pmd_t *pmdp;
2182
2183        pmdp = pmd_offset(&pud, addr);
2184        do {
2185                pmd_t pmd = READ_ONCE(*pmdp);
2186
2187                next = pmd_addr_end(addr, end);
2188                if (!pmd_present(pmd))
2189                        return 0;
2190
2191                if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd) ||
2192                             pmd_devmap(pmd))) {
2193                        /*
2194                         * NUMA hinting faults need to be handled in the GUP
2195                         * slowpath for accounting purposes and so that they
2196                         * can be serialised against THP migration.
2197                         */
2198                        if (pmd_protnone(pmd))
2199                                return 0;
2200
2201                        if (!gup_huge_pmd(pmd, pmdp, addr, next, flags,
2202                                pages, nr))
2203                                return 0;
2204
2205                } else if (unlikely(is_hugepd(__hugepd(pmd_val(pmd))))) {
2206                        /*
2207                         * architectures have different formats for hugetlbfs
2208                         * pmds and THP pmds
2209                         */
2210                        if (!gup_huge_pd(__hugepd(pmd_val(pmd)), addr,
2211                                         PMD_SHIFT, next, flags, pages, nr))
2212                                return 0;
2213                } else if (!gup_pte_range(pmd, addr, next, flags, pages, nr))
2214                        return 0;
2215        } while (pmdp++, addr = next, addr != end);
2216
2217        return 1;
2218}
2219
2220static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end,
2221                         unsigned int flags, struct page **pages, int *nr)
2222{
2223        unsigned long next;
2224        pud_t *pudp;
2225
2226        pudp = pud_offset(&p4d, addr);
2227        do {
2228                pud_t pud = READ_ONCE(*pudp);
2229
2230                next = pud_addr_end(addr, end);
2231                if (pud_none(pud))
2232                        return 0;
2233                if (unlikely(pud_huge(pud))) {
2234                        if (!gup_huge_pud(pud, pudp, addr, next, flags,
2235                                          pages, nr))
2236                                return 0;
2237                } else if (unlikely(is_hugepd(__hugepd(pud_val(pud))))) {
2238                        if (!gup_huge_pd(__hugepd(pud_val(pud)), addr,
2239                                         PUD_SHIFT, next, flags, pages, nr))
2240                                return 0;
2241                } else if (!gup_pmd_range(pud, addr, next, flags, pages, nr))
2242                        return 0;
2243        } while (pudp++, addr = next, addr != end);
2244
2245        return 1;
2246}
2247
2248static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end,
2249                         unsigned int flags, struct page **pages, int *nr)
2250{
2251        unsigned long next;
2252        p4d_t *p4dp;
2253
2254        p4dp = p4d_offset(&pgd, addr);
2255        do {
2256                p4d_t p4d = READ_ONCE(*p4dp);
2257
2258                next = p4d_addr_end(addr, end);
2259                if (p4d_none(p4d))
2260                        return 0;
2261                BUILD_BUG_ON(p4d_huge(p4d));
2262                if (unlikely(is_hugepd(__hugepd(p4d_val(p4d))))) {
2263                        if (!gup_huge_pd(__hugepd(p4d_val(p4d)), addr,
2264                                         P4D_SHIFT, next, flags, pages, nr))
2265                                return 0;
2266                } else if (!gup_pud_range(p4d, addr, next, flags, pages, nr))
2267                        return 0;
2268        } while (p4dp++, addr = next, addr != end);
2269
2270        return 1;
2271}
2272
2273static void gup_pgd_range(unsigned long addr, unsigned long end,
2274                unsigned int flags, struct page **pages, int *nr)
2275{
2276        unsigned long next;
2277        pgd_t *pgdp;
2278
2279        pgdp = pgd_offset(current->mm, addr);
2280        do {
2281                pgd_t pgd = READ_ONCE(*pgdp);
2282
2283                next = pgd_addr_end(addr, end);
2284                if (pgd_none(pgd))
2285                        return;
2286                if (unlikely(pgd_huge(pgd))) {
2287                        if (!gup_huge_pgd(pgd, pgdp, addr, next, flags,
2288                                          pages, nr))
2289                                return;
2290                } else if (unlikely(is_hugepd(__hugepd(pgd_val(pgd))))) {
2291                        if (!gup_huge_pd(__hugepd(pgd_val(pgd)), addr,
2292                                         PGDIR_SHIFT, next, flags, pages, nr))
2293                                return;
2294                } else if (!gup_p4d_range(pgd, addr, next, flags, pages, nr))
2295                        return;
2296        } while (pgdp++, addr = next, addr != end);
2297}
2298#else
2299static inline void gup_pgd_range(unsigned long addr, unsigned long end,
2300                unsigned int flags, struct page **pages, int *nr)
2301{
2302}
2303#endif /* CONFIG_HAVE_FAST_GUP */
2304
2305#ifndef gup_fast_permitted
2306/*
2307 * Check if it's allowed to use __get_user_pages_fast() for the range, or
2308 * we need to fall back to the slow version:
2309 */
2310static bool gup_fast_permitted(unsigned long start, unsigned long end)
2311{
2312        return true;
2313}
2314#endif
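    /*
     * An architecture can override gup_fast_permitted() to reject ranges it
     * cannot safely walk locklessly.  A hypothetical override might simply
     * refuse anything beyond the user address space limit (a sketch only;
     * the real per-architecture checks vary):
     *
     *      static inline bool gup_fast_permitted(unsigned long start, unsigned long end)
     *      {
     *              return end <= TASK_SIZE_MAX;
     *      }
     *      #define gup_fast_permitted gup_fast_permitted
     */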
2315
2316/*
2317 * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall back to
2318 * the regular GUP.
2319 * Note a difference with get_user_pages_fast: this always returns the
2320 * number of pages pinned, 0 if no pages were pinned.
2321 *
2322 * If the architecture does not support this function, simply return with no
2323 * pages pinned.
2324 */
2325int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
2326                          struct page **pages)
2327{
2328        unsigned long len, end;
2329        unsigned long flags;
2330        int nr = 0;
2331
2332        start = untagged_addr(start) & PAGE_MASK;
2333        len = (unsigned long) nr_pages << PAGE_SHIFT;
2334        end = start + len;
2335
2336        if (end <= start)
2337                return 0;
2338        if (unlikely(!access_ok((void __user *)start, len)))
2339                return 0;
2340
2341        /*
2342         * Disable interrupts.  We use the nested form as we can already have
2343         * interrupts disabled by get_futex_key.
2344         *
2345         * With interrupts disabled, we block page table pages from being
2346         * freed from under us. See struct mmu_table_batch comments in
2347         * include/asm-generic/tlb.h for more details.
2348         *
2349         * We do not adopt an rcu_read_lock(.) here as we also want to
2350         * block IPIs that come from THPs splitting.
2351         */
2352
2353        if (IS_ENABLED(CONFIG_HAVE_FAST_GUP) &&
2354            gup_fast_permitted(start, end)) {
2355                local_irq_save(flags);
2356                gup_pgd_range(start, end, write ? FOLL_WRITE : 0, pages, &nr);
2357                local_irq_restore(flags);
2358        }
2359
2360        return nr;
2361}
2362EXPORT_SYMBOL_GPL(__get_user_pages_fast);
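    /*
     * Example: opportunistic pinning of a single page from a context that
     * must not fault or sleep (a minimal sketch; "uaddr" is illustrative).
     * A return value below the requested count means the caller itself must
     * fall back to the regular, sleeping GUP path:
     *
     *      struct page *page;
     *
     *      if (__get_user_pages_fast(uaddr, 1, 1, &page) != 1)
     *              return -EAGAIN;
     *
     *      ... use the page, then ...
     *      put_page(page);
     */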
2363
2364static int __gup_longterm_unlocked(unsigned long start, int nr_pages,
2365                                   unsigned int gup_flags, struct page **pages)
2366{
2367        int ret;
2368
2369        /*
2370         * FIXME: FOLL_LONGTERM does not work with
2371         * get_user_pages_unlocked() (see comments in that function)
2372         */
2373        if (gup_flags & FOLL_LONGTERM) {
2374                down_read(&current->mm->mmap_sem);
2375                ret = __gup_longterm_locked(current, current->mm,
2376                                            start, nr_pages,
2377                                            pages, NULL, gup_flags);
2378                up_read(&current->mm->mmap_sem);
2379        } else {
2380                ret = get_user_pages_unlocked(start, nr_pages,
2381                                              pages, gup_flags);
2382        }
2383
2384        return ret;
2385}
2386
2387/**
2388 * get_user_pages_fast() - pin user pages in memory
2389 * @start:      starting user address
2390 * @nr_pages:   number of pages from start to pin
2391 * @gup_flags:  flags modifying pin behaviour
2392 * @pages:      array that receives pointers to the pages pinned.
2393 *              Should be at least nr_pages long.
2394 *
2395 * Attempt to pin user pages in memory without taking mm->mmap_sem.
2396 * If not successful, it will fall back to taking the lock and
2397 * calling get_user_pages().
2398 *
2399 * Returns number of pages pinned. This may be fewer than the number
2400 * requested. If nr_pages is 0 or negative, returns 0. If no pages
2401 * were pinned, returns -errno.
2402 */
2403int get_user_pages_fast(unsigned long start, int nr_pages,
2404                        unsigned int gup_flags, struct page **pages)
2405{
2406        unsigned long addr, len, end;
2407        int nr = 0, ret = 0;
2408
2409        if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM)))
2410                return -EINVAL;
2411
2412        start = untagged_addr(start) & PAGE_MASK;
2413        addr = start;
2414        len = (unsigned long) nr_pages << PAGE_SHIFT;
2415        end = start + len;
2416
2417        if (end <= start)
2418                return 0;
2419        if (unlikely(!access_ok((void __user *)start, len)))
2420                return -EFAULT;
2421
2422        if (IS_ENABLED(CONFIG_HAVE_FAST_GUP) &&
2423            gup_fast_permitted(start, end)) {
2424                local_irq_disable();
2425                gup_pgd_range(addr, end, gup_flags, pages, &nr);
2426                local_irq_enable();
2427                ret = nr;
2428        }
2429
2430        if (nr < nr_pages) {
2431                /* Try to get the remaining pages with get_user_pages */
2432                start += nr << PAGE_SHIFT;
2433                pages += nr;
2434
2435                ret = __gup_longterm_unlocked(start, nr_pages - nr,
2436                                              gup_flags, pages);
2437
2438                /* Have to be a bit careful with return values */
2439                if (nr > 0) {
2440                        if (ret < 0)
2441                                ret = nr;
2442                        else
2443                                ret += nr;
2444                }
2445        }
2446
2447        return ret;
2448}
2449EXPORT_SYMBOL_GPL(get_user_pages_fast);
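    /*
     * Example (a minimal sketch; "uaddr", "nr" and "pages" are illustrative):
     * pin a user buffer for a short-lived read and cope with a partial pin by
     * releasing whatever was obtained:
     *
     *      int i, pinned;
     *
     *      pinned = get_user_pages_fast(uaddr & PAGE_MASK, nr, 0, pages);
     *      if (pinned < 0)
     *              return pinned;
     *      if (pinned < nr) {
     *              for (i = 0; i < pinned; i++)
     *                      put_page(pages[i]);
     *              return -EFAULT;
     *      }
     *
     *      ... read from the pages, then put_page() each one ...
     */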
2450