linux/mm/gup.c
   1// SPDX-License-Identifier: GPL-2.0-only
   2#include <linux/kernel.h>
   3#include <linux/errno.h>
   4#include <linux/err.h>
   5#include <linux/spinlock.h>
   6
   7#include <linux/mm.h>
   8#include <linux/memremap.h>
   9#include <linux/pagemap.h>
  10#include <linux/rmap.h>
  11#include <linux/swap.h>
  12#include <linux/swapops.h>
  13
  14#include <linux/sched/signal.h>
  15#include <linux/rwsem.h>
  16#include <linux/hugetlb.h>
  17#include <linux/migrate.h>
  18#include <linux/mm_inline.h>
  19#include <linux/sched/mm.h>
  20
  21#include <asm/mmu_context.h>
  22#include <asm/pgtable.h>
  23#include <asm/tlbflush.h>
  24
  25#include "internal.h"
  26
  27struct follow_page_context {
  28        struct dev_pagemap *pgmap;
  29        unsigned int page_mask;
  30};
  31
  32typedef int (*set_dirty_func_t)(struct page *page);
  33
  34static void __put_user_pages_dirty(struct page **pages,
  35                                   unsigned long npages,
  36                                   set_dirty_func_t sdf)
  37{
  38        unsigned long index;
  39
  40        for (index = 0; index < npages; index++) {
  41                struct page *page = compound_head(pages[index]);
  42
  43                /*
  44                 * Checking PageDirty at this point may race with
  45                 * clear_page_dirty_for_io(), but that's OK. Two key cases:
  46                 *
  47                 * 1) This code sees the page as already dirty, so it skips
  48                 * the call to sdf(). That could happen because
  49                 * clear_page_dirty_for_io() called page_mkclean(),
  50                 * followed by set_page_dirty(). However, now the page is
  51                 * going to get written back, which meets the original
  52                 * intention of setting it dirty, so all is well:
  53                 * clear_page_dirty_for_io() goes on to call
  54                 * TestClearPageDirty(), and write the page back.
  55                 *
  56                 * 2) This code sees the page as clean, so it calls sdf().
  57                 * The page stays dirty, despite being written back, so it
  58                 * gets written back again in the next writeback cycle.
  59                 * This is harmless.
  60                 */
  61                if (!PageDirty(page))
  62                        sdf(page);
  63
  64                put_user_page(page);
  65        }
  66}
  67
  68/**
  69 * put_user_pages_dirty() - release and dirty an array of gup-pinned pages
  70 * @pages:  array of pages to be marked dirty and released.
  71 * @npages: number of pages in the @pages array.
  72 *
  73 * "gup-pinned page" refers to a page that has had one of the get_user_pages()
  74 * variants called on that page.
  75 *
  76 * For each page in the @pages array, make that page (or its head page, if a
  77 * compound page) dirty, if it was previously listed as clean. Then, release
  78 * the page using put_user_page().
  79 *
  80 * Please see the put_user_page() documentation for details.
  81 *
  82 * set_page_dirty(), which does not lock the page, is used here.
  83 * Therefore, it is the caller's responsibility to ensure that this is
  84 * safe. If not, then put_user_pages_dirty_lock() should be called instead.
  85 *
  86 */
  87void put_user_pages_dirty(struct page **pages, unsigned long npages)
  88{
  89        __put_user_pages_dirty(pages, npages, set_page_dirty);
  90}
  91EXPORT_SYMBOL(put_user_pages_dirty);
  92
  93/**
  94 * put_user_pages_dirty_lock() - release and dirty an array of gup-pinned pages
  95 * @pages:  array of pages to be marked dirty and released.
  96 * @npages: number of pages in the @pages array.
  97 *
  98 * For each page in the @pages array, make that page (or its head page, if a
  99 * compound page) dirty, if it was previously listed as clean. Then, release
 100 * the page using put_user_page().
 101 *
 102 * Please see the put_user_page() documentation for details.
 103 *
 104 * This is just like put_user_pages_dirty(), except that it invokes
 105 * set_page_dirty_lock(), instead of set_page_dirty().
 106 *
 107 */
 108void put_user_pages_dirty_lock(struct page **pages, unsigned long npages)
 109{
 110        __put_user_pages_dirty(pages, npages, set_page_dirty_lock);
 111}
 112EXPORT_SYMBOL(put_user_pages_dirty_lock);
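/*
 * Usage sketch (illustrative only, not part of gup.c): a driver that pinned
 * pages with a get_user_pages() variant and then let a device DMA data into
 * them would release them with the helpers above rather than open-coding the
 * dirty+put sequence. "pages" and "npages" are assumed to come from the
 * earlier gup call:
 *
 *      put_user_pages_dirty_lock(pages, npages);
 *
 * or, when the caller knows the unlocked set_page_dirty() is safe:
 *
 *      put_user_pages_dirty(pages, npages);
 */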
 113
 114/**
 115 * put_user_pages() - release an array of gup-pinned pages.
  116 * @pages:  array of pages to be released.
 117 * @npages: number of pages in the @pages array.
 118 *
 119 * For each page in the @pages array, release the page using put_user_page().
 120 *
 121 * Please see the put_user_page() documentation for details.
 122 */
 123void put_user_pages(struct page **pages, unsigned long npages)
 124{
 125        unsigned long index;
 126
 127        /*
 128         * TODO: this can be optimized for huge pages: if a series of pages is
 129         * physically contiguous and part of the same compound page, then a
 130         * single operation to the head page should suffice.
 131         */
 132        for (index = 0; index < npages; index++)
 133                put_user_page(pages[index]);
 134}
 135EXPORT_SYMBOL(put_user_pages);
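/*
 * Usage sketch (illustrative only): for a read-only pin, where nothing wrote
 * to the pages, the plain variant is enough. This assumes the gup_flags-based
 * get_user_pages_fast() and caller-provided "start", "npages" and "pages":
 *
 *      ret = get_user_pages_fast(start, npages, 0, pages);
 *      if (ret > 0) {
 *              ... read from the pinned pages ...
 *              put_user_pages(pages, ret);
 *      }
 */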
 136
 137static struct page *no_page_table(struct vm_area_struct *vma,
 138                unsigned int flags)
 139{
 140        /*
 141         * When core dumping an enormous anonymous area that nobody
 142         * has touched so far, we don't want to allocate unnecessary pages or
 143         * page tables.  Return error instead of NULL to skip handle_mm_fault,
 144         * then get_dump_page() will return NULL to leave a hole in the dump.
 145         * But we can only make this optimization where a hole would surely
 146         * be zero-filled if handle_mm_fault() actually did handle it.
 147         */
 148        if ((flags & FOLL_DUMP) && (!vma->vm_ops || !vma->vm_ops->fault))
 149                return ERR_PTR(-EFAULT);
 150        return NULL;
 151}
 152
 153static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address,
 154                pte_t *pte, unsigned int flags)
 155{
 156        /* No page to get reference */
 157        if (flags & FOLL_GET)
 158                return -EFAULT;
 159
 160        if (flags & FOLL_TOUCH) {
 161                pte_t entry = *pte;
 162
 163                if (flags & FOLL_WRITE)
 164                        entry = pte_mkdirty(entry);
 165                entry = pte_mkyoung(entry);
 166
 167                if (!pte_same(*pte, entry)) {
 168                        set_pte_at(vma->vm_mm, address, pte, entry);
 169                        update_mmu_cache(vma, address, pte);
 170                }
 171        }
 172
 173        /* Proper page table entry exists, but no corresponding struct page */
 174        return -EEXIST;
 175}
 176
 177/*
 178 * FOLL_FORCE can write to even unwritable pte's, but only
 179 * after we've gone through a COW cycle and they are dirty.
 180 */
 181static inline bool can_follow_write_pte(pte_t pte, unsigned int flags)
 182{
 183        return pte_write(pte) ||
 184                ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pte_dirty(pte));
 185}
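/*
 * Illustrative note (not part of the original file): for a FOLL_FORCE write
 * to a read-only private mapping (e.g. ptrace poking a breakpoint), the first
 * follow_page_pte() fails this check because the pte is not writable,
 * faultin_page() then breaks COW and sets FOLL_COW, and the retried lookup
 * passes here because the freshly COWed pte is dirty.
 */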
 186
 187static struct page *follow_page_pte(struct vm_area_struct *vma,
 188                unsigned long address, pmd_t *pmd, unsigned int flags,
 189                struct dev_pagemap **pgmap)
 190{
 191        struct mm_struct *mm = vma->vm_mm;
 192        struct page *page;
 193        spinlock_t *ptl;
 194        pte_t *ptep, pte;
 195
 196retry:
 197        if (unlikely(pmd_bad(*pmd)))
 198                return no_page_table(vma, flags);
 199
 200        ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
 201        pte = *ptep;
 202        if (!pte_present(pte)) {
 203                swp_entry_t entry;
 204                /*
 205                 * KSM's break_ksm() relies upon recognizing a ksm page
 206                 * even while it is being migrated, so for that case we
 207                 * need migration_entry_wait().
 208                 */
 209                if (likely(!(flags & FOLL_MIGRATION)))
 210                        goto no_page;
 211                if (pte_none(pte))
 212                        goto no_page;
 213                entry = pte_to_swp_entry(pte);
 214                if (!is_migration_entry(entry))
 215                        goto no_page;
 216                pte_unmap_unlock(ptep, ptl);
 217                migration_entry_wait(mm, pmd, address);
 218                goto retry;
 219        }
 220        if ((flags & FOLL_NUMA) && pte_protnone(pte))
 221                goto no_page;
 222        if ((flags & FOLL_WRITE) && !can_follow_write_pte(pte, flags)) {
 223                pte_unmap_unlock(ptep, ptl);
 224                return NULL;
 225        }
 226
 227        page = vm_normal_page(vma, address, pte);
 228        if (!page && pte_devmap(pte) && (flags & FOLL_GET)) {
 229                /*
 230                 * Only return device mapping pages in the FOLL_GET case since
 231                 * they are only valid while holding the pgmap reference.
 232                 */
 233                *pgmap = get_dev_pagemap(pte_pfn(pte), *pgmap);
 234                if (*pgmap)
 235                        page = pte_page(pte);
 236                else
 237                        goto no_page;
 238        } else if (unlikely(!page)) {
 239                if (flags & FOLL_DUMP) {
 240                        /* Avoid special (like zero) pages in core dumps */
 241                        page = ERR_PTR(-EFAULT);
 242                        goto out;
 243                }
 244
 245                if (is_zero_pfn(pte_pfn(pte))) {
 246                        page = pte_page(pte);
 247                } else {
 248                        int ret;
 249
 250                        ret = follow_pfn_pte(vma, address, ptep, flags);
 251                        page = ERR_PTR(ret);
 252                        goto out;
 253                }
 254        }
 255
 256        if (flags & FOLL_SPLIT && PageTransCompound(page)) {
 257                int ret;
 258                get_page(page);
 259                pte_unmap_unlock(ptep, ptl);
 260                lock_page(page);
 261                ret = split_huge_page(page);
 262                unlock_page(page);
 263                put_page(page);
 264                if (ret)
 265                        return ERR_PTR(ret);
 266                goto retry;
 267        }
 268
 269        if (flags & FOLL_GET) {
 270                if (unlikely(!try_get_page(page))) {
 271                        page = ERR_PTR(-ENOMEM);
 272                        goto out;
 273                }
 274        }
 275        if (flags & FOLL_TOUCH) {
 276                if ((flags & FOLL_WRITE) &&
 277                    !pte_dirty(pte) && !PageDirty(page))
 278                        set_page_dirty(page);
 279                /*
 280                 * pte_mkyoung() would be more correct here, but atomic care
 281                 * is needed to avoid losing the dirty bit: it is easier to use
 282                 * mark_page_accessed().
 283                 */
 284                mark_page_accessed(page);
 285        }
 286        if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
 287                /* Do not mlock pte-mapped THP */
 288                if (PageTransCompound(page))
 289                        goto out;
 290
 291                /*
 292                 * The preliminary mapping check is mainly to avoid the
 293                 * pointless overhead of lock_page on the ZERO_PAGE
 294                 * which might bounce very badly if there is contention.
 295                 *
 296                 * If the page is already locked, we don't need to
 297                 * handle it now - vmscan will handle it later if and
 298                 * when it attempts to reclaim the page.
 299                 */
 300                if (page->mapping && trylock_page(page)) {
 301                        lru_add_drain();  /* push cached pages to LRU */
 302                        /*
 303                         * Because we lock page here, and migration is
 304                         * blocked by the pte's page reference, and we
 305                         * know the page is still mapped, we don't even
 306                         * need to check for file-cache page truncation.
 307                         */
 308                        mlock_vma_page(page);
 309                        unlock_page(page);
 310                }
 311        }
 312out:
 313        pte_unmap_unlock(ptep, ptl);
 314        return page;
 315no_page:
 316        pte_unmap_unlock(ptep, ptl);
 317        if (!pte_none(pte))
 318                return NULL;
 319        return no_page_table(vma, flags);
 320}
 321
 322static struct page *follow_pmd_mask(struct vm_area_struct *vma,
 323                                    unsigned long address, pud_t *pudp,
 324                                    unsigned int flags,
 325                                    struct follow_page_context *ctx)
 326{
 327        pmd_t *pmd, pmdval;
 328        spinlock_t *ptl;
 329        struct page *page;
 330        struct mm_struct *mm = vma->vm_mm;
 331
 332        pmd = pmd_offset(pudp, address);
 333        /*
 334         * The READ_ONCE() will stabilize the pmdval in a register or
 335         * on the stack so that it will stop changing under the code.
 336         */
 337        pmdval = READ_ONCE(*pmd);
 338        if (pmd_none(pmdval))
 339                return no_page_table(vma, flags);
 340        if (pmd_huge(pmdval) && vma->vm_flags & VM_HUGETLB) {
 341                page = follow_huge_pmd(mm, address, pmd, flags);
 342                if (page)
 343                        return page;
 344                return no_page_table(vma, flags);
 345        }
 346        if (is_hugepd(__hugepd(pmd_val(pmdval)))) {
 347                page = follow_huge_pd(vma, address,
 348                                      __hugepd(pmd_val(pmdval)), flags,
 349                                      PMD_SHIFT);
 350                if (page)
 351                        return page;
 352                return no_page_table(vma, flags);
 353        }
 354retry:
 355        if (!pmd_present(pmdval)) {
 356                if (likely(!(flags & FOLL_MIGRATION)))
 357                        return no_page_table(vma, flags);
 358                VM_BUG_ON(thp_migration_supported() &&
 359                                  !is_pmd_migration_entry(pmdval));
 360                if (is_pmd_migration_entry(pmdval))
 361                        pmd_migration_entry_wait(mm, pmd);
 362                pmdval = READ_ONCE(*pmd);
 363                /*
 364                 * MADV_DONTNEED may convert the pmd to null because
 365                 * mmap_sem is held in read mode
 366                 */
 367                if (pmd_none(pmdval))
 368                        return no_page_table(vma, flags);
 369                goto retry;
 370        }
 371        if (pmd_devmap(pmdval)) {
 372                ptl = pmd_lock(mm, pmd);
 373                page = follow_devmap_pmd(vma, address, pmd, flags, &ctx->pgmap);
 374                spin_unlock(ptl);
 375                if (page)
 376                        return page;
 377        }
 378        if (likely(!pmd_trans_huge(pmdval)))
 379                return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
 380
 381        if ((flags & FOLL_NUMA) && pmd_protnone(pmdval))
 382                return no_page_table(vma, flags);
 383
 384retry_locked:
 385        ptl = pmd_lock(mm, pmd);
 386        if (unlikely(pmd_none(*pmd))) {
 387                spin_unlock(ptl);
 388                return no_page_table(vma, flags);
 389        }
 390        if (unlikely(!pmd_present(*pmd))) {
 391                spin_unlock(ptl);
 392                if (likely(!(flags & FOLL_MIGRATION)))
 393                        return no_page_table(vma, flags);
 394                pmd_migration_entry_wait(mm, pmd);
 395                goto retry_locked;
 396        }
 397        if (unlikely(!pmd_trans_huge(*pmd))) {
 398                spin_unlock(ptl);
 399                return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
 400        }
 401        if (flags & FOLL_SPLIT) {
 402                int ret;
 403                page = pmd_page(*pmd);
 404                if (is_huge_zero_page(page)) {
 405                        spin_unlock(ptl);
 406                        ret = 0;
 407                        split_huge_pmd(vma, pmd, address);
 408                        if (pmd_trans_unstable(pmd))
 409                                ret = -EBUSY;
 410                } else {
 411                        if (unlikely(!try_get_page(page))) {
 412                                spin_unlock(ptl);
 413                                return ERR_PTR(-ENOMEM);
 414                        }
 415                        spin_unlock(ptl);
 416                        lock_page(page);
 417                        ret = split_huge_page(page);
 418                        unlock_page(page);
 419                        put_page(page);
 420                        if (pmd_none(*pmd))
 421                                return no_page_table(vma, flags);
 422                }
 423
 424                return ret ? ERR_PTR(ret) :
 425                        follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
 426        }
 427        page = follow_trans_huge_pmd(vma, address, pmd, flags);
 428        spin_unlock(ptl);
 429        ctx->page_mask = HPAGE_PMD_NR - 1;
 430        return page;
 431}
 432
 433static struct page *follow_pud_mask(struct vm_area_struct *vma,
 434                                    unsigned long address, p4d_t *p4dp,
 435                                    unsigned int flags,
 436                                    struct follow_page_context *ctx)
 437{
 438        pud_t *pud;
 439        spinlock_t *ptl;
 440        struct page *page;
 441        struct mm_struct *mm = vma->vm_mm;
 442
 443        pud = pud_offset(p4dp, address);
 444        if (pud_none(*pud))
 445                return no_page_table(vma, flags);
 446        if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) {
 447                page = follow_huge_pud(mm, address, pud, flags);
 448                if (page)
 449                        return page;
 450                return no_page_table(vma, flags);
 451        }
 452        if (is_hugepd(__hugepd(pud_val(*pud)))) {
 453                page = follow_huge_pd(vma, address,
 454                                      __hugepd(pud_val(*pud)), flags,
 455                                      PUD_SHIFT);
 456                if (page)
 457                        return page;
 458                return no_page_table(vma, flags);
 459        }
 460        if (pud_devmap(*pud)) {
 461                ptl = pud_lock(mm, pud);
 462                page = follow_devmap_pud(vma, address, pud, flags, &ctx->pgmap);
 463                spin_unlock(ptl);
 464                if (page)
 465                        return page;
 466        }
 467        if (unlikely(pud_bad(*pud)))
 468                return no_page_table(vma, flags);
 469
 470        return follow_pmd_mask(vma, address, pud, flags, ctx);
 471}
 472
 473static struct page *follow_p4d_mask(struct vm_area_struct *vma,
 474                                    unsigned long address, pgd_t *pgdp,
 475                                    unsigned int flags,
 476                                    struct follow_page_context *ctx)
 477{
 478        p4d_t *p4d;
 479        struct page *page;
 480
 481        p4d = p4d_offset(pgdp, address);
 482        if (p4d_none(*p4d))
 483                return no_page_table(vma, flags);
 484        BUILD_BUG_ON(p4d_huge(*p4d));
 485        if (unlikely(p4d_bad(*p4d)))
 486                return no_page_table(vma, flags);
 487
 488        if (is_hugepd(__hugepd(p4d_val(*p4d)))) {
 489                page = follow_huge_pd(vma, address,
 490                                      __hugepd(p4d_val(*p4d)), flags,
 491                                      P4D_SHIFT);
 492                if (page)
 493                        return page;
 494                return no_page_table(vma, flags);
 495        }
 496        return follow_pud_mask(vma, address, p4d, flags, ctx);
 497}
 498
 499/**
 500 * follow_page_mask - look up a page descriptor from a user-virtual address
 501 * @vma: vm_area_struct mapping @address
 502 * @address: virtual address to look up
 503 * @flags: flags modifying lookup behaviour
 504 * @ctx: contains dev_pagemap for %ZONE_DEVICE memory pinning and a
 505 *       pointer to output page_mask
 506 *
 507 * @flags can have FOLL_ flags set, defined in <linux/mm.h>
 508 *
 509 * When getting pages from ZONE_DEVICE memory, the @ctx->pgmap caches
 510 * the device's dev_pagemap metadata to avoid repeating expensive lookups.
 511 *
 512 * On output, the @ctx->page_mask is set according to the size of the page.
 513 *
 514 * Return: the mapped (struct page *), %NULL if no mapping exists, or
 515 * an error pointer if there is a mapping to something not represented
 516 * by a page descriptor (see also vm_normal_page()).
 517 */
 518struct page *follow_page_mask(struct vm_area_struct *vma,
 519                              unsigned long address, unsigned int flags,
 520                              struct follow_page_context *ctx)
 521{
 522        pgd_t *pgd;
 523        struct page *page;
 524        struct mm_struct *mm = vma->vm_mm;
 525
 526        ctx->page_mask = 0;
 527
 528        /* make this handle hugepd */
 529        page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
 530        if (!IS_ERR(page)) {
 531                BUG_ON(flags & FOLL_GET);
 532                return page;
 533        }
 534
 535        pgd = pgd_offset(mm, address);
 536
 537        if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
 538                return no_page_table(vma, flags);
 539
 540        if (pgd_huge(*pgd)) {
 541                page = follow_huge_pgd(mm, address, pgd, flags);
 542                if (page)
 543                        return page;
 544                return no_page_table(vma, flags);
 545        }
 546        if (is_hugepd(__hugepd(pgd_val(*pgd)))) {
 547                page = follow_huge_pd(vma, address,
 548                                      __hugepd(pgd_val(*pgd)), flags,
 549                                      PGDIR_SHIFT);
 550                if (page)
 551                        return page;
 552                return no_page_table(vma, flags);
 553        }
 554
 555        return follow_p4d_mask(vma, address, pgd, flags, ctx);
 556}
 557
 558struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
 559                         unsigned int foll_flags)
 560{
 561        struct follow_page_context ctx = { NULL };
 562        struct page *page;
 563
 564        page = follow_page_mask(vma, address, foll_flags, &ctx);
 565        if (ctx.pgmap)
 566                put_dev_pagemap(ctx.pgmap);
 567        return page;
 568}
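/*
 * Usage sketch (illustrative only): callers such as the page migration code
 * look up the page backing a single address roughly like this, with mmap_sem
 * held for read; "mm" and "addr" are assumed to be the caller's:
 *
 *      struct vm_area_struct *vma;
 *      struct page *page;
 *
 *      down_read(&mm->mmap_sem);
 *      vma = find_vma(mm, addr);
 *      if (vma && addr >= vma->vm_start) {
 *              page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
 *              if (!IS_ERR_OR_NULL(page)) {
 *                      ... inspect the page ...
 *                      put_page(page);
 *              }
 *      }
 *      up_read(&mm->mmap_sem);
 */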
 569
 570static int get_gate_page(struct mm_struct *mm, unsigned long address,
 571                unsigned int gup_flags, struct vm_area_struct **vma,
 572                struct page **page)
 573{
 574        pgd_t *pgd;
 575        p4d_t *p4d;
 576        pud_t *pud;
 577        pmd_t *pmd;
 578        pte_t *pte;
 579        int ret = -EFAULT;
 580
 581        /* user gate pages are read-only */
 582        if (gup_flags & FOLL_WRITE)
 583                return -EFAULT;
 584        if (address > TASK_SIZE)
 585                pgd = pgd_offset_k(address);
 586        else
 587                pgd = pgd_offset_gate(mm, address);
 588        BUG_ON(pgd_none(*pgd));
 589        p4d = p4d_offset(pgd, address);
 590        BUG_ON(p4d_none(*p4d));
 591        pud = pud_offset(p4d, address);
 592        BUG_ON(pud_none(*pud));
 593        pmd = pmd_offset(pud, address);
 594        if (!pmd_present(*pmd))
 595                return -EFAULT;
 596        VM_BUG_ON(pmd_trans_huge(*pmd));
 597        pte = pte_offset_map(pmd, address);
 598        if (pte_none(*pte))
 599                goto unmap;
 600        *vma = get_gate_vma(mm);
 601        if (!page)
 602                goto out;
 603        *page = vm_normal_page(*vma, address, *pte);
 604        if (!*page) {
 605                if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(*pte)))
 606                        goto unmap;
 607                *page = pte_page(*pte);
 608
 609                /*
 610                 * This should never happen (a device public page in the gate
 611                 * area).
 612                 */
 613                if (is_device_public_page(*page))
 614                        goto unmap;
 615        }
 616        if (unlikely(!try_get_page(*page))) {
 617                ret = -ENOMEM;
 618                goto unmap;
 619        }
 620out:
 621        ret = 0;
 622unmap:
 623        pte_unmap(pte);
 624        return ret;
 625}
 626
 627/*
 628 * mmap_sem must be held on entry.  If @nonblocking != NULL and
 629 * *@flags does not include FOLL_NOWAIT, the mmap_sem may be released.
 630 * If it is, *@nonblocking will be set to 0 and -EBUSY returned.
 631 */
 632static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
 633                unsigned long address, unsigned int *flags, int *nonblocking)
 634{
 635        unsigned int fault_flags = 0;
 636        vm_fault_t ret;
 637
 638        /* mlock all present pages, but do not fault in new pages */
 639        if ((*flags & (FOLL_POPULATE | FOLL_MLOCK)) == FOLL_MLOCK)
 640                return -ENOENT;
 641        if (*flags & FOLL_WRITE)
 642                fault_flags |= FAULT_FLAG_WRITE;
 643        if (*flags & FOLL_REMOTE)
 644                fault_flags |= FAULT_FLAG_REMOTE;
 645        if (nonblocking)
 646                fault_flags |= FAULT_FLAG_ALLOW_RETRY;
 647        if (*flags & FOLL_NOWAIT)
 648                fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT;
 649        if (*flags & FOLL_TRIED) {
 650                VM_WARN_ON_ONCE(fault_flags & FAULT_FLAG_ALLOW_RETRY);
 651                fault_flags |= FAULT_FLAG_TRIED;
 652        }
 653
 654        ret = handle_mm_fault(vma, address, fault_flags);
 655        if (ret & VM_FAULT_ERROR) {
 656                int err = vm_fault_to_errno(ret, *flags);
 657
 658                if (err)
 659                        return err;
 660                BUG();
 661        }
 662
 663        if (tsk) {
 664                if (ret & VM_FAULT_MAJOR)
 665                        tsk->maj_flt++;
 666                else
 667                        tsk->min_flt++;
 668        }
 669
 670        if (ret & VM_FAULT_RETRY) {
 671                if (nonblocking && !(fault_flags & FAULT_FLAG_RETRY_NOWAIT))
 672                        *nonblocking = 0;
 673                return -EBUSY;
 674        }
 675
 676        /*
 677         * The VM_FAULT_WRITE bit tells us that do_wp_page has broken COW when
 678         * necessary, even if maybe_mkwrite decided not to set pte_write. We
 679         * can thus safely do subsequent page lookups as if they were reads.
 680         * But only do so when looping for pte_write is futile: in some cases
 681         * userspace may also be wanting to write to the gotten user page,
 682         * which a read fault here might prevent (a readonly page might get
 683         * reCOWed by userspace write).
 684         */
 685        if ((ret & VM_FAULT_WRITE) && !(vma->vm_flags & VM_WRITE))
 686                *flags |= FOLL_COW;
 687        return 0;
 688}
 689
 690static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
 691{
 692        vm_flags_t vm_flags = vma->vm_flags;
 693        int write = (gup_flags & FOLL_WRITE);
 694        int foreign = (gup_flags & FOLL_REMOTE);
 695
 696        if (vm_flags & (VM_IO | VM_PFNMAP))
 697                return -EFAULT;
 698
 699        if (gup_flags & FOLL_ANON && !vma_is_anonymous(vma))
 700                return -EFAULT;
 701
 702        if (write) {
 703                if (!(vm_flags & VM_WRITE)) {
 704                        if (!(gup_flags & FOLL_FORCE))
 705                                return -EFAULT;
 706                        /*
 707                         * We used to let the write,force case do COW in a
 708                         * VM_MAYWRITE VM_SHARED !VM_WRITE vma, so ptrace could
 709                         * set a breakpoint in a read-only mapping of an
 710                         * executable, without corrupting the file (yet only
 711                         * when that file had been opened for writing!).
 712                         * Anon pages in shared mappings are surprising: now
 713                         * just reject it.
 714                         */
 715                        if (!is_cow_mapping(vm_flags))
 716                                return -EFAULT;
 717                }
 718        } else if (!(vm_flags & VM_READ)) {
 719                if (!(gup_flags & FOLL_FORCE))
 720                        return -EFAULT;
 721                /*
 722                 * Is there actually any vma we can reach here which does not
 723                 * have VM_MAYREAD set?
 724                 */
 725                if (!(vm_flags & VM_MAYREAD))
 726                        return -EFAULT;
 727        }
 728        /*
 729         * gups are always data accesses, not instruction
 730         * fetches, so execute=false here
 731         */
 732        if (!arch_vma_access_permitted(vma, write, false, foreign))
 733                return -EFAULT;
 734        return 0;
 735}
 736
 737/**
 738 * __get_user_pages() - pin user pages in memory
 739 * @tsk:        task_struct of target task
 740 * @mm:         mm_struct of target mm
 741 * @start:      starting user address
 742 * @nr_pages:   number of pages from start to pin
 743 * @gup_flags:  flags modifying pin behaviour
 744 * @pages:      array that receives pointers to the pages pinned.
 745 *              Should be at least nr_pages long. Or NULL, if caller
 746 *              only intends to ensure the pages are faulted in.
 747 * @vmas:       array of pointers to vmas corresponding to each page.
 748 *              Or NULL if the caller does not require them.
  749 * @nonblocking: if non-NULL, do not wait for disk IO or mmap_sem contention;
  749 *              see below for details
 750 *
 751 * Returns number of pages pinned. This may be fewer than the number
 752 * requested. If nr_pages is 0 or negative, returns 0. If no pages
 753 * were pinned, returns -errno. Each page returned must be released
 754 * with a put_page() call when it is finished with. vmas will only
 755 * remain valid while mmap_sem is held.
 756 *
 757 * Must be called with mmap_sem held.  It may be released.  See below.
 758 *
 759 * __get_user_pages walks a process's page tables and takes a reference to
 760 * each struct page that each user address corresponds to at a given
 761 * instant. That is, it takes the page that would be accessed if a user
 762 * thread accesses the given user virtual address at that instant.
 763 *
 764 * This does not guarantee that the page exists in the user mappings when
 765 * __get_user_pages returns, and there may even be a completely different
  766 * page there in some cases (e.g. if mmapped pagecache has been invalidated
  767 * and subsequently re-faulted). However, it does guarantee that the page
 768 * won't be freed completely. And mostly callers simply care that the page
 769 * contains data that was valid *at some point in time*. Typically, an IO
 770 * or similar operation cannot guarantee anything stronger anyway because
 771 * locks can't be held over the syscall boundary.
 772 *
 773 * If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If
 774 * the page is written to, set_page_dirty (or set_page_dirty_lock, as
 775 * appropriate) must be called after the page is finished with, and
 776 * before put_page is called.
 777 *
 778 * If @nonblocking != NULL, __get_user_pages will not wait for disk IO
 779 * or mmap_sem contention, and if waiting is needed to pin all pages,
 780 * *@nonblocking will be set to 0.  Further, if @gup_flags does not
 781 * include FOLL_NOWAIT, the mmap_sem will be released via up_read() in
 782 * this case.
 783 *
 784 * A caller using such a combination of @nonblocking and @gup_flags
 785 * must therefore hold the mmap_sem for reading only, and recognize
 786 * when it's been released.  Otherwise, it must be held for either
 787 * reading or writing and will not be released.
 788 *
 789 * In most cases, get_user_pages or get_user_pages_fast should be used
 790 * instead of __get_user_pages. __get_user_pages should be used only if
 791 * you need some special @gup_flags.
 792 */
 793static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 794                unsigned long start, unsigned long nr_pages,
 795                unsigned int gup_flags, struct page **pages,
 796                struct vm_area_struct **vmas, int *nonblocking)
 797{
 798        long ret = 0, i = 0;
 799        struct vm_area_struct *vma = NULL;
 800        struct follow_page_context ctx = { NULL };
 801
 802        if (!nr_pages)
 803                return 0;
 804
 805        VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET));
 806
 807        /*
 808         * If FOLL_FORCE is set then do not force a full fault as the hinting
 809         * fault information is unrelated to the reference behaviour of a task
 810         * using the address space
 811         */
 812        if (!(gup_flags & FOLL_FORCE))
 813                gup_flags |= FOLL_NUMA;
 814
 815        do {
 816                struct page *page;
 817                unsigned int foll_flags = gup_flags;
 818                unsigned int page_increm;
 819
  820                /* first iteration, or we crossed a vma boundary */
 821                if (!vma || start >= vma->vm_end) {
 822                        vma = find_extend_vma(mm, start);
 823                        if (!vma && in_gate_area(mm, start)) {
 824                                ret = get_gate_page(mm, start & PAGE_MASK,
 825                                                gup_flags, &vma,
 826                                                pages ? &pages[i] : NULL);
 827                                if (ret)
 828                                        goto out;
 829                                ctx.page_mask = 0;
 830                                goto next_page;
 831                        }
 832
 833                        if (!vma || check_vma_flags(vma, gup_flags)) {
 834                                ret = -EFAULT;
 835                                goto out;
 836                        }
 837                        if (is_vm_hugetlb_page(vma)) {
 838                                i = follow_hugetlb_page(mm, vma, pages, vmas,
 839                                                &start, &nr_pages, i,
 840                                                gup_flags, nonblocking);
 841                                continue;
 842                        }
 843                }
 844retry:
 845                /*
 846                 * If we have a pending SIGKILL, don't keep faulting pages and
 847                 * potentially allocating memory.
 848                 */
 849                if (fatal_signal_pending(current)) {
 850                        ret = -ERESTARTSYS;
 851                        goto out;
 852                }
 853                cond_resched();
 854
 855                page = follow_page_mask(vma, start, foll_flags, &ctx);
 856                if (!page) {
 857                        ret = faultin_page(tsk, vma, start, &foll_flags,
 858                                        nonblocking);
 859                        switch (ret) {
 860                        case 0:
 861                                goto retry;
 862                        case -EBUSY:
 863                                ret = 0;
 864                                /* FALLTHRU */
 865                        case -EFAULT:
 866                        case -ENOMEM:
 867                        case -EHWPOISON:
 868                                goto out;
 869                        case -ENOENT:
 870                                goto next_page;
 871                        }
 872                        BUG();
 873                } else if (PTR_ERR(page) == -EEXIST) {
 874                        /*
 875                         * Proper page table entry exists, but no corresponding
 876                         * struct page.
 877                         */
 878                        goto next_page;
 879                } else if (IS_ERR(page)) {
 880                        ret = PTR_ERR(page);
 881                        goto out;
 882                }
 883                if (pages) {
 884                        pages[i] = page;
 885                        flush_anon_page(vma, page, start);
 886                        flush_dcache_page(page);
 887                        ctx.page_mask = 0;
 888                }
 889next_page:
 890                if (vmas) {
 891                        vmas[i] = vma;
 892                        ctx.page_mask = 0;
 893                }
 894                page_increm = 1 + (~(start >> PAGE_SHIFT) & ctx.page_mask);
 895                if (page_increm > nr_pages)
 896                        page_increm = nr_pages;
 897                i += page_increm;
 898                start += page_increm * PAGE_SIZE;
 899                nr_pages -= page_increm;
 900        } while (nr_pages);
 901out:
 902        if (ctx.pgmap)
 903                put_dev_pagemap(ctx.pgmap);
 904        return i ? i : ret;
 905}
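/*
 * Illustrative sketch of the pin/dirty/release rules described in the comment
 * above, as seen from a caller of the public wrappers; "start", "nr" and
 * "pages" are assumed to be the caller's:
 *
 *      ret = get_user_pages(start, nr, FOLL_WRITE, pages, NULL);
 *      if (ret <= 0)
 *              return ret ? ret : -EFAULT;
 *      ... let a device or kernel code write into the pages ...
 *      for (i = 0; i < ret; i++) {
 *              set_page_dirty_lock(pages[i]);
 *              put_page(pages[i]);
 *      }
 */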
 906
 907static bool vma_permits_fault(struct vm_area_struct *vma,
 908                              unsigned int fault_flags)
 909{
 910        bool write   = !!(fault_flags & FAULT_FLAG_WRITE);
 911        bool foreign = !!(fault_flags & FAULT_FLAG_REMOTE);
 912        vm_flags_t vm_flags = write ? VM_WRITE : VM_READ;
 913
 914        if (!(vm_flags & vma->vm_flags))
 915                return false;
 916
 917        /*
 918         * The architecture might have a hardware protection
 919         * mechanism other than read/write that can deny access.
 920         *
 921         * gup always represents data access, not instruction
 922         * fetches, so execute=false here:
 923         */
 924        if (!arch_vma_access_permitted(vma, write, false, foreign))
 925                return false;
 926
 927        return true;
 928}
 929
 930/*
 931 * fixup_user_fault() - manually resolve a user page fault
 932 * @tsk:        the task_struct to use for page fault accounting, or
 933 *              NULL if faults are not to be recorded.
 934 * @mm:         mm_struct of target mm
 935 * @address:    user address
  936 * @fault_flags: flags to pass down to handle_mm_fault()
 937 * @unlocked:   did we unlock the mmap_sem while retrying, maybe NULL if caller
 938 *              does not allow retry
 939 *
  940 * This is meant to be called in the specific scenario where, for locking reasons,
  941 * we try to access user memory in atomic context (within a pagefault_disable()
  942 * section); that access returns -EFAULT, and we want to resolve the user fault
  943 * before trying again.
 944 *
 945 * Typically this is meant to be used by the futex code.
 946 *
 947 * The main difference with get_user_pages() is that this function will
 948 * unconditionally call handle_mm_fault() which will in turn perform all the
 949 * necessary SW fixup of the dirty and young bits in the PTE, while
 950 * get_user_pages() only guarantees to update these in the struct page.
 951 *
 952 * This is important for some architectures where those bits also gate the
 953 * access permission to the page because they are maintained in software.  On
 954 * such architectures, gup() will not be enough to make a subsequent access
 955 * succeed.
 956 *
  957 * This function will not return with an unlocked mmap_sem, so it does not have
  958 * the same semantics w.r.t. @mm->mmap_sem as filemap_fault() does.
 959 */
 960int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
 961                     unsigned long address, unsigned int fault_flags,
 962                     bool *unlocked)
 963{
 964        struct vm_area_struct *vma;
 965        vm_fault_t ret, major = 0;
 966
 967        if (unlocked)
 968                fault_flags |= FAULT_FLAG_ALLOW_RETRY;
 969
 970retry:
 971        vma = find_extend_vma(mm, address);
 972        if (!vma || address < vma->vm_start)
 973                return -EFAULT;
 974
 975        if (!vma_permits_fault(vma, fault_flags))
 976                return -EFAULT;
 977
 978        ret = handle_mm_fault(vma, address, fault_flags);
 979        major |= ret & VM_FAULT_MAJOR;
 980        if (ret & VM_FAULT_ERROR) {
 981                int err = vm_fault_to_errno(ret, 0);
 982
 983                if (err)
 984                        return err;
 985                BUG();
 986        }
 987
 988        if (ret & VM_FAULT_RETRY) {
 989                down_read(&mm->mmap_sem);
 990                if (!(fault_flags & FAULT_FLAG_TRIED)) {
 991                        *unlocked = true;
 992                        fault_flags &= ~FAULT_FLAG_ALLOW_RETRY;
 993                        fault_flags |= FAULT_FLAG_TRIED;
 994                        goto retry;
 995                }
 996        }
 997
 998        if (tsk) {
 999                if (major)
1000                        tsk->maj_flt++;
1001                else
1002                        tsk->min_flt++;
1003        }
1004        return 0;
1005}
1006EXPORT_SYMBOL_GPL(fixup_user_fault);
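/*
 * Usage sketch (illustrative only), modelled on the futex-style pattern the
 * comment above describes: an access under pagefault_disable() failed, so the
 * fault is resolved explicitly and the atomic access is retried; "uaddr" is
 * assumed to be the faulting user address and "mm" is current->mm:
 *
 *      down_read(&mm->mmap_sem);
 *      ret = fixup_user_fault(current, mm, (unsigned long)uaddr,
 *                             FAULT_FLAG_WRITE, NULL);
 *      up_read(&mm->mmap_sem);
 *      if (!ret)
 *              goto retry_atomic_access;
 */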
1007
1008static __always_inline long __get_user_pages_locked(struct task_struct *tsk,
1009                                                struct mm_struct *mm,
1010                                                unsigned long start,
1011                                                unsigned long nr_pages,
1012                                                struct page **pages,
1013                                                struct vm_area_struct **vmas,
1014                                                int *locked,
1015                                                unsigned int flags)
1016{
1017        long ret, pages_done;
1018        bool lock_dropped;
1019
1020        if (locked) {
1021                /* if VM_FAULT_RETRY can be returned, vmas become invalid */
1022                BUG_ON(vmas);
1023                /* check caller initialized locked */
1024                BUG_ON(*locked != 1);
1025        }
1026
1027        if (pages)
1028                flags |= FOLL_GET;
1029
1030        pages_done = 0;
1031        lock_dropped = false;
1032        for (;;) {
1033                ret = __get_user_pages(tsk, mm, start, nr_pages, flags, pages,
1034                                       vmas, locked);
1035                if (!locked)
1036                        /* VM_FAULT_RETRY couldn't trigger, bypass */
1037                        return ret;
1038
1039                /* VM_FAULT_RETRY cannot return errors */
1040                if (!*locked) {
1041                        BUG_ON(ret < 0);
1042                        BUG_ON(ret >= nr_pages);
1043                }
1044
1045                if (ret > 0) {
1046                        nr_pages -= ret;
1047                        pages_done += ret;
1048                        if (!nr_pages)
1049                                break;
1050                }
1051                if (*locked) {
1052                        /*
1053                         * VM_FAULT_RETRY didn't trigger or it was a
1054                         * FOLL_NOWAIT.
1055                         */
1056                        if (!pages_done)
1057                                pages_done = ret;
1058                        break;
1059                }
1060                /*
1061                 * VM_FAULT_RETRY triggered, so seek to the faulting offset.
1062                 * For the prefault case (!pages) we only update counts.
1063                 */
1064                if (likely(pages))
1065                        pages += ret;
1066                start += ret << PAGE_SHIFT;
1067
1068                /*
1069                 * Repeat on the address that fired VM_FAULT_RETRY
1070                 * without FAULT_FLAG_ALLOW_RETRY but with
1071                 * FAULT_FLAG_TRIED.
1072                 */
1073                *locked = 1;
1074                lock_dropped = true;
1075                down_read(&mm->mmap_sem);
1076                ret = __get_user_pages(tsk, mm, start, 1, flags | FOLL_TRIED,
1077                                       pages, NULL, NULL);
1078                if (ret != 1) {
1079                        BUG_ON(ret > 1);
1080                        if (!pages_done)
1081                                pages_done = ret;
1082                        break;
1083                }
1084                nr_pages--;
1085                pages_done++;
1086                if (!nr_pages)
1087                        break;
1088                if (likely(pages))
1089                        pages++;
1090                start += PAGE_SIZE;
1091        }
1092        if (lock_dropped && *locked) {
1093                /*
1094                 * We must let the caller know we temporarily dropped the lock
1095                 * and so the critical section protected by it was lost.
1096                 */
1097                up_read(&mm->mmap_sem);
1098                *locked = 0;
1099        }
1100        return pages_done;
1101}
1102
1103/*
1104 * We can leverage the VM_FAULT_RETRY functionality in the page fault
1105 * paths better by using either get_user_pages_locked() or
1106 * get_user_pages_unlocked().
1107 *
1108 * get_user_pages_locked() is suitable to replace the form:
1109 *
1110 *      down_read(&mm->mmap_sem);
1111 *      do_something()
 1112 *      get_user_pages(start, nr_pages, gup_flags, pages, NULL);
1113 *      up_read(&mm->mmap_sem);
1114 *
1115 *  to:
1116 *
1117 *      int locked = 1;
1118 *      down_read(&mm->mmap_sem);
1119 *      do_something()
 1120 *      get_user_pages_locked(start, nr_pages, gup_flags, pages, &locked);
1121 *      if (locked)
1122 *          up_read(&mm->mmap_sem);
1123 */
1124long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
1125                           unsigned int gup_flags, struct page **pages,
1126                           int *locked)
1127{
1128        /*
1129         * FIXME: Current FOLL_LONGTERM behavior is incompatible with
1130         * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
1131         * vmas.  As there are no users of this flag in this call we simply
1132         * disallow this option for now.
1133         */
1134        if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
1135                return -EINVAL;
1136
1137        return __get_user_pages_locked(current, current->mm, start, nr_pages,
1138                                       pages, NULL, locked,
1139                                       gup_flags | FOLL_TOUCH);
1140}
1141EXPORT_SYMBOL(get_user_pages_locked);
1142
1143/*
1144 * get_user_pages_unlocked() is suitable to replace the form:
1145 *
1146 *      down_read(&mm->mmap_sem);
 1147 *      get_user_pages(start, nr_pages, gup_flags, pages, NULL);
1148 *      up_read(&mm->mmap_sem);
1149 *
1150 *  with:
1151 *
 1152 *      get_user_pages_unlocked(start, nr_pages, pages, gup_flags);
1153 *
 1154 * It is functionally equivalent to get_user_pages_fast, so
1155 * get_user_pages_fast should be used instead if specific gup_flags
1156 * (e.g. FOLL_FORCE) are not required.
1157 */
1158long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
1159                             struct page **pages, unsigned int gup_flags)
1160{
1161        struct mm_struct *mm = current->mm;
1162        int locked = 1;
1163        long ret;
1164
1165        /*
1166         * FIXME: Current FOLL_LONGTERM behavior is incompatible with
1167         * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
1168         * vmas.  As there are no users of this flag in this call we simply
1169         * disallow this option for now.
1170         */
1171        if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
1172                return -EINVAL;
1173
1174        down_read(&mm->mmap_sem);
1175        ret = __get_user_pages_locked(current, mm, start, nr_pages, pages, NULL,
1176                                      &locked, gup_flags | FOLL_TOUCH);
1177        if (locked)
1178                up_read(&mm->mmap_sem);
1179        return ret;
1180}
1181EXPORT_SYMBOL(get_user_pages_unlocked);
1182
1183/*
1184 * get_user_pages_remote() - pin user pages in memory
1185 * @tsk:        the task_struct to use for page fault accounting, or
1186 *              NULL if faults are not to be recorded.
1187 * @mm:         mm_struct of target mm
1188 * @start:      starting user address
1189 * @nr_pages:   number of pages from start to pin
1190 * @gup_flags:  flags modifying lookup behaviour
1191 * @pages:      array that receives pointers to the pages pinned.
1192 *              Should be at least nr_pages long. Or NULL, if caller
1193 *              only intends to ensure the pages are faulted in.
1194 * @vmas:       array of pointers to vmas corresponding to each page.
1195 *              Or NULL if the caller does not require them.
1196 * @locked:     pointer to lock flag indicating whether lock is held and
1197 *              subsequently whether VM_FAULT_RETRY functionality can be
1198 *              utilised. Lock must initially be held.
1199 *
1200 * Returns number of pages pinned. This may be fewer than the number
1201 * requested. If nr_pages is 0 or negative, returns 0. If no pages
1202 * were pinned, returns -errno. Each page returned must be released
1203 * with a put_page() call when it is finished with. vmas will only
1204 * remain valid while mmap_sem is held.
1205 *
1206 * Must be called with mmap_sem held for read or write.
1207 *
1208 * get_user_pages walks a process's page tables and takes a reference to
1209 * each struct page that each user address corresponds to at a given
1210 * instant. That is, it takes the page that would be accessed if a user
1211 * thread accesses the given user virtual address at that instant.
1212 *
1213 * This does not guarantee that the page exists in the user mappings when
1214 * get_user_pages returns, and there may even be a completely different
 1215 * page there in some cases (e.g. if mmapped pagecache has been invalidated
 1216 * and subsequently re-faulted). However, it does guarantee that the page
1217 * won't be freed completely. And mostly callers simply care that the page
1218 * contains data that was valid *at some point in time*. Typically, an IO
1219 * or similar operation cannot guarantee anything stronger anyway because
1220 * locks can't be held over the syscall boundary.
1221 *
1222 * If gup_flags & FOLL_WRITE == 0, the page must not be written to. If the page
1223 * is written to, set_page_dirty (or set_page_dirty_lock, as appropriate) must
1224 * be called after the page is finished with, and before put_page is called.
1225 *
1226 * get_user_pages is typically used for fewer-copy IO operations, to get a
1227 * handle on the memory by some means other than accesses via the user virtual
1228 * addresses. The pages may be submitted for DMA to devices or accessed via
1229 * their kernel linear mapping (via the kmap APIs). Care should be taken to
1230 * use the correct cache flushing APIs.
1231 *
1232 * See also get_user_pages_fast, for performance critical applications.
1233 *
1234 * get_user_pages should be phased out in favor of
1235 * get_user_pages_locked|unlocked or get_user_pages_fast. Nothing
1236 * should use get_user_pages because it cannot pass
1237 * FAULT_FLAG_ALLOW_RETRY to handle_mm_fault.
1238 */
1239long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
1240                unsigned long start, unsigned long nr_pages,
1241                unsigned int gup_flags, struct page **pages,
1242                struct vm_area_struct **vmas, int *locked)
1243{
1244        /*
1245         * FIXME: Current FOLL_LONGTERM behavior is incompatible with
1246         * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
1247         * vmas.  As there are no users of this flag in this call we simply
1248         * disallow this option for now.
1249         */
1250        if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
1251                return -EINVAL;
1252
1253        return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas,
1254                                       locked,
1255                                       gup_flags | FOLL_TOUCH | FOLL_REMOTE);
1256}
1257EXPORT_SYMBOL(get_user_pages_remote);
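/*
 * Usage sketch (illustrative only): pinning one page of another process's
 * address space, in the style of ptrace/access_remote_vm(); "tsk", "mm"
 * (e.g. from get_task_mm()) and "addr" are assumed to be the caller's:
 *
 *      struct page *page;
 *      int locked = 1;
 *      long ret;
 *
 *      down_read(&mm->mmap_sem);
 *      ret = get_user_pages_remote(tsk, mm, addr & PAGE_MASK, 1,
 *                                  FOLL_FORCE, &page, NULL, &locked);
 *      if (locked)
 *              up_read(&mm->mmap_sem);
 *      if (ret == 1) {
 *              ... map the page with kmap(), copy the data, kunmap() ...
 *              put_page(page);
 *      }
 */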
1258
1259#if defined(CONFIG_FS_DAX) || defined (CONFIG_CMA)
1260static bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages)
1261{
1262        long i;
1263        struct vm_area_struct *vma_prev = NULL;
1264
1265        for (i = 0; i < nr_pages; i++) {
1266                struct vm_area_struct *vma = vmas[i];
1267
1268                if (vma == vma_prev)
1269                        continue;
1270
1271                vma_prev = vma;
1272
1273                if (vma_is_fsdax(vma))
1274                        return true;
1275        }
1276        return false;
1277}
1278
1279#ifdef CONFIG_CMA
1280static struct page *new_non_cma_page(struct page *page, unsigned long private)
1281{
1282        /*
1283         * We want to make sure we allocate the new page from the same node
1284         * as the source page.
1285         */
1286        int nid = page_to_nid(page);
1287        /*
1288         * Trying to allocate a page for migration. Ignore allocation
1289         * failure warnings. We don't force __GFP_THISNODE here because
1290         * this node here is the node where we have CMA reservation and
 1291         * in some cases these nodes will have very little non-movable
 1292         * allocation memory.
1293         */
1294        gfp_t gfp_mask = GFP_USER | __GFP_NOWARN;
1295
1296        if (PageHighMem(page))
1297                gfp_mask |= __GFP_HIGHMEM;
1298
1299#ifdef CONFIG_HUGETLB_PAGE
1300        if (PageHuge(page)) {
1301                struct hstate *h = page_hstate(page);
1302                /*
1303                 * We don't want to dequeue from the pool because pool pages will
1304                 * mostly be from the CMA region.
1305                 */
1306                return alloc_migrate_huge_page(h, gfp_mask, nid, NULL);
1307        }
1308#endif
1309        if (PageTransHuge(page)) {
1310                struct page *thp;
1311                /*
1312                 * ignore allocation failure warnings
1313                 */
1314                gfp_t thp_gfpmask = GFP_TRANSHUGE | __GFP_NOWARN;
1315
1316                /*
1317                 * Remove the movable mask so that we don't allocate from
1318                 * CMA area again.
1319                 */
1320                thp_gfpmask &= ~__GFP_MOVABLE;
1321                thp = __alloc_pages_node(nid, thp_gfpmask, HPAGE_PMD_ORDER);
1322                if (!thp)
1323                        return NULL;
1324                prep_transhuge_page(thp);
1325                return thp;
1326        }
1327
1328        return __alloc_pages_node(nid, gfp_mask, 0);
1329}
1330
1331static long check_and_migrate_cma_pages(struct task_struct *tsk,
1332                                        struct mm_struct *mm,
1333                                        unsigned long start,
1334                                        unsigned long nr_pages,
1335                                        struct page **pages,
1336                                        struct vm_area_struct **vmas,
1337                                        unsigned int gup_flags)
1338{
1339        long i;
1340        bool drain_allow = true;
1341        bool migrate_allow = true;
1342        LIST_HEAD(cma_page_list);
1343
1344check_again:
1345        for (i = 0; i < nr_pages; i++) {
1346                /*
1347                 * If we get a page from the CMA zone, since we are going to
1348                 * be pinning these entries, we might as well move them out
1349                 * of the CMA zone if possible.
1350                 */
1351                if (is_migrate_cma_page(pages[i])) {
1352
1353                        struct page *head = compound_head(pages[i]);
1354
1355                        if (PageHuge(head)) {
1356                                isolate_huge_page(head, &cma_page_list);
1357                        } else {
1358                                if (!PageLRU(head) && drain_allow) {
1359                                        lru_add_drain_all();
1360                                        drain_allow = false;
1361                                }
1362
1363                                if (!isolate_lru_page(head)) {
1364                                        list_add_tail(&head->lru, &cma_page_list);
1365                                        mod_node_page_state(page_pgdat(head),
1366                                                            NR_ISOLATED_ANON +
1367                                                            page_is_file_cache(head),
1368                                                            hpage_nr_pages(head));
1369                                }
1370                        }
1371                }
1372        }
1373
1374        if (!list_empty(&cma_page_list)) {
1375                /*
1376                 * drop the above get_user_pages reference.
1377                 */
1378                for (i = 0; i < nr_pages; i++)
1379                        put_page(pages[i]);
1380
1381                if (migrate_pages(&cma_page_list, new_non_cma_page,
1382                                  NULL, 0, MIGRATE_SYNC, MR_CONTIG_RANGE)) {
1383                        /*
1384                         * Some of the pages failed to migrate. Redo the
1385                         * get_user_pages() without attempting migration.
1386                         */
1387                        migrate_allow = false;
1388
1389                        if (!list_empty(&cma_page_list))
1390                                putback_movable_pages(&cma_page_list);
1391                }
1392                /*
1393                 * We dropped the page references above, so take them again,
1394                 * retrying the migration of any CMA pages that we failed to
1395                 * isolate earlier.
1396                 */
1397                nr_pages = __get_user_pages_locked(tsk, mm, start, nr_pages,
1398                                                   pages, vmas, NULL,
1399                                                   gup_flags);
1400
1401                if ((nr_pages > 0) && migrate_allow) {
1402                        drain_allow = true;
1403                        goto check_again;
1404                }
1405        }
1406
1407        return nr_pages;
1408}
1409#else
1410static long check_and_migrate_cma_pages(struct task_struct *tsk,
1411                                        struct mm_struct *mm,
1412                                        unsigned long start,
1413                                        unsigned long nr_pages,
1414                                        struct page **pages,
1415                                        struct vm_area_struct **vmas,
1416                                        unsigned int gup_flags)
1417{
1418        return nr_pages;
1419}
1420#endif
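
/*
 * Illustrative sketch (not part of this file's code): a caller requests a
 * long-term pin by adding FOLL_LONGTERM to the gup flags, which is what
 * routes the pinned pages through check_and_migrate_cma_pages() above on
 * CONFIG_CMA kernels (see __gup_longterm_locked() below). The address,
 * page count and arrays below are made-up names:
 *
 *	down_read(&current->mm->mmap_sem);
 *	pinned = get_user_pages(user_addr, npages,
 *				FOLL_WRITE | FOLL_LONGTERM, pages, NULL);
 *	up_read(&current->mm->mmap_sem);
 */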
1421
1422/*
1423 * __gup_longterm_locked() is a wrapper for __get_user_pages_locked which
1424 * allows us to process the FOLL_LONGTERM flag.
1425 */
1426static long __gup_longterm_locked(struct task_struct *tsk,
1427                                  struct mm_struct *mm,
1428                                  unsigned long start,
1429                                  unsigned long nr_pages,
1430                                  struct page **pages,
1431                                  struct vm_area_struct **vmas,
1432                                  unsigned int gup_flags)
1433{
1434        struct vm_area_struct **vmas_tmp = vmas;
1435        unsigned long flags = 0;
1436        long rc, i;
1437
1438        if (gup_flags & FOLL_LONGTERM) {
1439                if (!pages)
1440                        return -EINVAL;
1441
1442                if (!vmas_tmp) {
1443                        vmas_tmp = kcalloc(nr_pages,
1444                                           sizeof(struct vm_area_struct *),
1445                                           GFP_KERNEL);
1446                        if (!vmas_tmp)
1447                                return -ENOMEM;
1448                }
1449                flags = memalloc_nocma_save();
1450        }
1451
1452        rc = __get_user_pages_locked(tsk, mm, start, nr_pages, pages,
1453                                     vmas_tmp, NULL, gup_flags);
1454
1455        if (gup_flags & FOLL_LONGTERM) {
1456                memalloc_nocma_restore(flags);
1457                if (rc < 0)
1458                        goto out;
1459
1460                if (check_dax_vmas(vmas_tmp, rc)) {
1461                        for (i = 0; i < rc; i++)
1462                                put_page(pages[i]);
1463                        rc = -EOPNOTSUPP;
1464                        goto out;
1465                }
1466
1467                rc = check_and_migrate_cma_pages(tsk, mm, start, rc, pages,
1468                                                 vmas_tmp, gup_flags);
1469        }
1470
1471out:
1472        if (vmas_tmp != vmas)
1473                kfree(vmas_tmp);
1474        return rc;
1475}
1476#else /* !CONFIG_FS_DAX && !CONFIG_CMA */
1477static __always_inline long __gup_longterm_locked(struct task_struct *tsk,
1478                                                  struct mm_struct *mm,
1479                                                  unsigned long start,
1480                                                  unsigned long nr_pages,
1481                                                  struct page **pages,
1482                                                  struct vm_area_struct **vmas,
1483                                                  unsigned int flags)
1484{
1485        return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas,
1486                                       NULL, flags);
1487}
1488#endif /* CONFIG_FS_DAX || CONFIG_CMA */
1489
1490/*
1491 * This is the same as get_user_pages_remote(), just with a
1492 * less-flexible calling convention where we assume that the task
1493 * and mm being operated on are the current task's and don't allow
1494 * passing of a locked parameter.  We also obviously don't pass
1495 * FOLL_REMOTE in here.
1496 */
1497long get_user_pages(unsigned long start, unsigned long nr_pages,
1498                unsigned int gup_flags, struct page **pages,
1499                struct vm_area_struct **vmas)
1500{
1501        return __gup_longterm_locked(current, current->mm, start, nr_pages,
1502                                     pages, vmas, gup_flags | FOLL_TOUCH);
1503}
1504EXPORT_SYMBOL(get_user_pages);
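
/*
 * Example usage (an illustrative sketch, not taken from an in-tree caller;
 * the variable names are made up): pin a few pages of the current task's
 * address space with mmap_sem held, use them, then drop the references.
 *
 *	struct page *pages[4];
 *	long i, pinned;
 *
 *	down_read(&current->mm->mmap_sem);
 *	pinned = get_user_pages(user_addr, ARRAY_SIZE(pages), FOLL_WRITE,
 *				pages, NULL);
 *	up_read(&current->mm->mmap_sem);
 *	if (pinned < 0)
 *		return pinned;
 *
 *	... access pages[0..pinned-1] ...
 *
 *	for (i = 0; i < pinned; i++)
 *		put_page(pages[i]);
 */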
1505
1506/**
1507 * populate_vma_page_range() -  populate a range of pages in the vma.
1508 * @vma:   target vma
1509 * @start: start address
1510 * @end:   end address
1511 * @nonblocking: if non-NULL, *@nonblocking is set to 0 when mmap_sem is released
1512 *
1513 * This takes care of mlocking the pages too if VM_LOCKED is set.
1514 *
1515 * Returns the number of pages faulted in, or a negative error code on error.
1516 *
1517 * vma->vm_mm->mmap_sem must be held.
1518 *
1519 * If @nonblocking is NULL, mmap_sem may be held for read or write and will
1520 * be unperturbed.
1521 *
1522 * If @nonblocking is non-NULL, mmap_sem must be held for read only and may be
1523 * released.  If it is released, *@nonblocking will be set to 0.
1524 */
1525long populate_vma_page_range(struct vm_area_struct *vma,
1526                unsigned long start, unsigned long end, int *nonblocking)
1527{
1528        struct mm_struct *mm = vma->vm_mm;
1529        unsigned long nr_pages = (end - start) / PAGE_SIZE;
1530        int gup_flags;
1531
1532        VM_BUG_ON(start & ~PAGE_MASK);
1533        VM_BUG_ON(end   & ~PAGE_MASK);
1534        VM_BUG_ON_VMA(start < vma->vm_start, vma);
1535        VM_BUG_ON_VMA(end   > vma->vm_end, vma);
1536        VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm);
1537
1538        gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK;
1539        if (vma->vm_flags & VM_LOCKONFAULT)
1540                gup_flags &= ~FOLL_POPULATE;
1541        /*
1542         * We want to touch writable mappings with a write fault in order
1543         * to break COW, except for shared mappings because these don't COW
1544         * and we would not want to dirty them for nothing.
1545         */
1546        if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
1547                gup_flags |= FOLL_WRITE;
1548
1549        /*
1550         * We want mlock to succeed for regions that have any permissions
1551         * other than PROT_NONE.
1552         */
1553        if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC))
1554                gup_flags |= FOLL_FORCE;
1555
1556        /*
1557         * We made sure addr is within a VMA, so the following will
1558         * not result in a stack expansion that recurses back here.
1559         */
1560        return __get_user_pages(current, mm, start, nr_pages, gup_flags,
1561                                NULL, NULL, nonblocking);
1562}
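
/*
 * Example (a sketch only; the range and vma are assumed to satisfy the
 * VM_BUG_ON checks above): a caller holding mmap_sem for read that wants to
 * pre-fault a single page-aligned page inside a known vma could do
 *
 *	ret = populate_vma_page_range(vma, addr, addr + PAGE_SIZE, NULL);
 *
 * __mm_populate() below shows the full pattern, including how the
 * @nonblocking flag is used to notice that mmap_sem was dropped.
 */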
1563
1564/*
1565 * __mm_populate - populate and/or mlock pages within a range of address space.
1566 *
1567 * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap
1568 * flags. VMAs must be already marked with the desired vm_flags, and
1569 * mmap_sem must not be held.
1570 */
1571int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
1572{
1573        struct mm_struct *mm = current->mm;
1574        unsigned long end, nstart, nend;
1575        struct vm_area_struct *vma = NULL;
1576        int locked = 0;
1577        long ret = 0;
1578
1579        end = start + len;
1580
1581        for (nstart = start; nstart < end; nstart = nend) {
1582                /*
1583                 * We want to fault in pages for [nstart; end) address range.
1584                 * Find first corresponding VMA.
1585                 */
1586                if (!locked) {
1587                        locked = 1;
1588                        down_read(&mm->mmap_sem);
1589                        vma = find_vma(mm, nstart);
1590                } else if (nstart >= vma->vm_end)
1591                        vma = vma->vm_next;
1592                if (!vma || vma->vm_start >= end)
1593                        break;
1594                /*
1595                 * Set [nstart; nend) to intersection of desired address
1596                 * range with the first VMA. Also, skip undesirable VMA types.
1597                 */
1598                nend = min(end, vma->vm_end);
1599                if (vma->vm_flags & (VM_IO | VM_PFNMAP))
1600                        continue;
1601                if (nstart < vma->vm_start)
1602                        nstart = vma->vm_start;
1603                /*
1604                 * Now fault in a range of pages. populate_vma_page_range()
1605                 * double checks the vma flags, so that it won't mlock pages
1606                 * if the vma was already munlocked.
1607                 */
1608                ret = populate_vma_page_range(vma, nstart, nend, &locked);
1609                if (ret < 0) {
1610                        if (ignore_errors) {
1611                                ret = 0;
1612                                continue;       /* continue at next VMA */
1613                        }
1614                        break;
1615                }
1616                nend = nstart + ret * PAGE_SIZE;
1617                ret = 0;
1618        }
1619        if (locked)
1620                up_read(&mm->mmap_sem);
1621        return ret;     /* 0 or negative error code */
1622}
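
/*
 * Sketch of the caller side (illustrative only; the real callers live in
 * mm/util.c, mm/mlock.c and the mm_populate() wrapper in linux/mm.h):
 * mmap() with MAP_POPULATE ends up doing roughly
 *
 *	mm_populate(addr, len);			(errors ignored)
 *
 * while mlock() calls __mm_populate(start, len, 0) so that faulting errors
 * are reported back to userspace. In both cases mmap_sem is not held on
 * entry, as required above.
 */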
1623
1624/**
1625 * get_dump_page() - pin user page in memory while writing it to core dump
1626 * @addr: user address
1627 *
1628 * Returns struct page pointer of user page pinned for dump,
1629 * to be freed afterwards by put_page().
1630 *
1631 * Returns NULL on any kind of failure - a hole must then be inserted into
1632 * the corefile, to preserve alignment with its headers; and also returns
1633 * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found -
1634 * allowing a hole to be left in the corefile to save disk space.
1635 *
1636 * Called without mmap_sem, but after all other threads have been killed.
1637 */
1638#ifdef CONFIG_ELF_CORE
1639struct page *get_dump_page(unsigned long addr)
1640{
1641        struct vm_area_struct *vma;
1642        struct page *page;
1643
1644        if (__get_user_pages(current, current->mm, addr, 1,
1645                             FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma,
1646                             NULL) < 1)
1647                return NULL;
1648        flush_cache_page(vma, addr, page_to_pfn(page));
1649        return page;
1650}
1651#endif /* CONFIG_ELF_CORE */
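
/*
 * Sketch of the consumer (loosely based on the ELF core dumper in
 * fs/binfmt_elf.c; details and error handling omitted): each dumped address
 * either yields a pinned page or a hole in the core file.
 *
 *	struct page *page = get_dump_page(addr);
 *
 *	if (page) {
 *		void *kaddr = kmap(page);
 *
 *		dump_emit(cprm, kaddr, PAGE_SIZE);
 *		kunmap(page);
 *		put_page(page);
 *	} else {
 *		dump_skip(cprm, PAGE_SIZE);
 *	}
 */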
1652
1653/*
1654 * Generic Fast GUP
1655 *
1656 * get_user_pages_fast attempts to pin user pages by walking the page
1657 * tables directly and avoids taking locks. Thus the walker needs to be
1658 * protected from page table pages being freed from under it, and should
1659 * block any THP splits.
1660 *
1661 * One way to achieve this is to have the walker disable interrupts, and
1662 * rely on IPIs from the TLB flushing code blocking before the page table
1663 * pages are freed. This is unsuitable for architectures that do not need
1664 * to broadcast an IPI when invalidating TLBs.
1665 *
1666 * Another way to achieve this is to batch up the pages containing page tables
1667 * that belong to more than one mm_user, then schedule an rcu_sched callback to
1668 * free those pages. Disabling interrupts allows the fast_gup walker to block
1669 * both the rcu_sched callback and an IPI that we broadcast when splitting THPs
1670 * (which is a relatively rare event). The code below adopts this strategy.
1671 *
1672 * Before activating this code, please be aware that the following assumptions
1673 * are currently made:
1674 *
1675 *  *) Either HAVE_RCU_TABLE_FREE is enabled and tlb_remove_table() is used to
1676 *  free pages containing page tables, or TLB flushing requires an IPI broadcast.
1677 *
1678 *  *) ptes can be read atomically by the architecture.
1679 *
1680 *  *) access_ok is sufficient to validate userspace address ranges.
1681 *
1682 * The last two assumptions can be relaxed by the addition of helper functions.
1683 *
1684 * This code is based heavily on the PowerPC implementation by Nick Piggin.
1685 */
1686#ifdef CONFIG_HAVE_GENERIC_GUP
1687
1688#ifndef gup_get_pte
1689/*
1690 * We assume that the PTE can be read atomically. If this is not the case for
1691 * your architecture, please provide the helper.
1692 */
1693static inline pte_t gup_get_pte(pte_t *ptep)
1694{
1695        return READ_ONCE(*ptep);
1696}
1697#endif
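
/*
 * Sketch of an architecture-specific override for the case where a 64-bit
 * PTE cannot be loaded in a single instruction (e.g. a 32-bit kernel with
 * 64-bit PTEs). The pte_low/pte_high field names are an assumption for
 * illustration only; a real helper would live in the arch headers:
 *
 *	static inline pte_t gup_get_pte(pte_t *ptep)
 *	{
 *		pte_t pte;
 *
 *		do {
 *			pte.pte_low = ptep->pte_low;
 *			smp_rmb();
 *			pte.pte_high = ptep->pte_high;
 *			smp_rmb();
 *		} while (unlikely(pte.pte_low != ptep->pte_low));
 *
 *		return pte;
 *	}
 */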
1698
1699static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages)
1700{
1701        while ((*nr) - nr_start) {
1702                struct page *page = pages[--(*nr)];
1703
1704                ClearPageReferenced(page);
1705                put_page(page);
1706        }
1707}
1708
1709/*
1710 * Return the compound head page with its refcount appropriately incremented,
1711 * or NULL if that failed.
1712 */
1713static inline struct page *try_get_compound_head(struct page *page, int refs)
1714{
1715        struct page *head = compound_head(page);
1716        if (WARN_ON_ONCE(page_ref_count(head) < 0))
1717                return NULL;
1718        if (unlikely(!page_cache_add_speculative(head, refs)))
1719                return NULL;
1720        return head;
1721}
1722
1723#ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
1724static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
1725                         unsigned int flags, struct page **pages, int *nr)
1726{
1727        struct dev_pagemap *pgmap = NULL;
1728        int nr_start = *nr, ret = 0;
1729        pte_t *ptep, *ptem;
1730
1731        ptem = ptep = pte_offset_map(&pmd, addr);
1732        do {
1733                pte_t pte = gup_get_pte(ptep);
1734                struct page *head, *page;
1735
1736                /*
1737                 * Similar to the PMD case below, NUMA hinting must take the
1738                 * slow path, relying on the pte_protnone() check.
1739                 */
1740                if (pte_protnone(pte))
1741                        goto pte_unmap;
1742
1743                if (!pte_access_permitted(pte, flags & FOLL_WRITE))
1744                        goto pte_unmap;
1745
1746                if (pte_devmap(pte)) {
1747                        if (unlikely(flags & FOLL_LONGTERM))
1748                                goto pte_unmap;
1749
1750                        pgmap = get_dev_pagemap(pte_pfn(pte), pgmap);
1751                        if (unlikely(!pgmap)) {
1752                                undo_dev_pagemap(nr, nr_start, pages);
1753                                goto pte_unmap;
1754                        }
1755                } else if (pte_special(pte))
1756                        goto pte_unmap;
1757
1758                VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
1759                page = pte_page(pte);
1760
1761                head = try_get_compound_head(page, 1);
1762                if (!head)
1763                        goto pte_unmap;
1764
1765                if (unlikely(pte_val(pte) != pte_val(*ptep))) {
1766                        put_page(head);
1767                        goto pte_unmap;
1768                }
1769
1770                VM_BUG_ON_PAGE(compound_head(page) != head, page);
1771
1772                SetPageReferenced(page);
1773                pages[*nr] = page;
1774                (*nr)++;
1775
1776        } while (ptep++, addr += PAGE_SIZE, addr != end);
1777
1778        ret = 1;
1779
1780pte_unmap:
1781        if (pgmap)
1782                put_dev_pagemap(pgmap);
1783        pte_unmap(ptem);
1784        return ret;
1785}
1786#else
1787
1788/*
1789 * If we can't determine whether or not a pte is special, then fail immediately
1790 * for ptes. Note, we can still pin HugeTLB and THP as these are guaranteed not
1791 * to be special.
1792 *
1793 * For a futex to be placed on a THP tail page, get_futex_key requires a
1794 * __get_user_pages_fast implementation that can pin pages. Thus it's still
1795 * useful to have gup_huge_pmd even if we can't operate on ptes.
1796 */
1797static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
1798                         unsigned int flags, struct page **pages, int *nr)
1799{
1800        return 0;
1801}
1802#endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */
1803
1804#if defined(__HAVE_ARCH_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
1805static int __gup_device_huge(unsigned long pfn, unsigned long addr,
1806                unsigned long end, struct page **pages, int *nr)
1807{
1808        int nr_start = *nr;
1809        struct dev_pagemap *pgmap = NULL;
1810
1811        do {
1812                struct page *page = pfn_to_page(pfn);
1813
1814                pgmap = get_dev_pagemap(pfn, pgmap);
1815                if (unlikely(!pgmap)) {
1816                        undo_dev_pagemap(nr, nr_start, pages);
1817                        return 0;
1818                }
1819                SetPageReferenced(page);
1820                pages[*nr] = page;
1821                get_page(page);
1822                (*nr)++;
1823                pfn++;
1824        } while (addr += PAGE_SIZE, addr != end);
1825
1826        if (pgmap)
1827                put_dev_pagemap(pgmap);
1828        return 1;
1829}
1830
1831static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
1832                unsigned long end, struct page **pages, int *nr)
1833{
1834        unsigned long fault_pfn;
1835        int nr_start = *nr;
1836
1837        fault_pfn = pmd_pfn(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
1838        if (!__gup_device_huge(fault_pfn, addr, end, pages, nr))
1839                return 0;
1840
1841        if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
1842                undo_dev_pagemap(nr, nr_start, pages);
1843                return 0;
1844        }
1845        return 1;
1846}
1847
1848static int __gup_device_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
1849                unsigned long end, struct page **pages, int *nr)
1850{
1851        unsigned long fault_pfn;
1852        int nr_start = *nr;
1853
1854        fault_pfn = pud_pfn(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
1855        if (!__gup_device_huge(fault_pfn, addr, end, pages, nr))
1856                return 0;
1857
1858        if (unlikely(pud_val(orig) != pud_val(*pudp))) {
1859                undo_dev_pagemap(nr, nr_start, pages);
1860                return 0;
1861        }
1862        return 1;
1863}
1864#else
1865static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
1866                unsigned long end, struct page **pages, int *nr)
1867{
1868        BUILD_BUG();
1869        return 0;
1870}
1871
1872static int __gup_device_huge_pud(pud_t pud, pud_t *pudp, unsigned long addr,
1873                unsigned long end, struct page **pages, int *nr)
1874{
1875        BUILD_BUG();
1876        return 0;
1877}
1878#endif
1879
1880static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
1881                unsigned long end, unsigned int flags, struct page **pages, int *nr)
1882{
1883        struct page *head, *page;
1884        int refs;
1885
1886        if (!pmd_access_permitted(orig, flags & FOLL_WRITE))
1887                return 0;
1888
1889        if (pmd_devmap(orig)) {
1890                if (unlikely(flags & FOLL_LONGTERM))
1891                        return 0;
1892                return __gup_device_huge_pmd(orig, pmdp, addr, end, pages, nr);
1893        }
1894
1895        refs = 0;
1896        page = pmd_page(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
1897        do {
1898                pages[*nr] = page;
1899                (*nr)++;
1900                page++;
1901                refs++;
1902        } while (addr += PAGE_SIZE, addr != end);
1903
1904        head = try_get_compound_head(pmd_page(orig), refs);
1905        if (!head) {
1906                *nr -= refs;
1907                return 0;
1908        }
1909
1910        if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
1911                *nr -= refs;
1912                while (refs--)
1913                        put_page(head);
1914                return 0;
1915        }
1916
1917        SetPageReferenced(head);
1918        return 1;
1919}
1920
1921static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
1922                unsigned long end, unsigned int flags, struct page **pages, int *nr)
1923{
1924        struct page *head, *page;
1925        int refs;
1926
1927        if (!pud_access_permitted(orig, flags & FOLL_WRITE))
1928                return 0;
1929
1930        if (pud_devmap(orig)) {
1931                if (unlikely(flags & FOLL_LONGTERM))
1932                        return 0;
1933                return __gup_device_huge_pud(orig, pudp, addr, end, pages, nr);
1934        }
1935
1936        refs = 0;
1937        page = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
1938        do {
1939                pages[*nr] = page;
1940                (*nr)++;
1941                page++;
1942                refs++;
1943        } while (addr += PAGE_SIZE, addr != end);
1944
1945        head = try_get_compound_head(pud_page(orig), refs);
1946        if (!head) {
1947                *nr -= refs;
1948                return 0;
1949        }
1950
1951        if (unlikely(pud_val(orig) != pud_val(*pudp))) {
1952                *nr -= refs;
1953                while (refs--)
1954                        put_page(head);
1955                return 0;
1956        }
1957
1958        SetPageReferenced(head);
1959        return 1;
1960}
1961
1962static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
1963                        unsigned long end, unsigned int flags,
1964                        struct page **pages, int *nr)
1965{
1966        int refs;
1967        struct page *head, *page;
1968
1969        if (!pgd_access_permitted(orig, flags & FOLL_WRITE))
1970                return 0;
1971
1972        BUILD_BUG_ON(pgd_devmap(orig));
1973        refs = 0;
1974        page = pgd_page(orig) + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT);
1975        do {
1976                pages[*nr] = page;
1977                (*nr)++;
1978                page++;
1979                refs++;
1980        } while (addr += PAGE_SIZE, addr != end);
1981
1982        head = try_get_compound_head(pgd_page(orig), refs);
1983        if (!head) {
1984                *nr -= refs;
1985                return 0;
1986        }
1987
1988        if (unlikely(pgd_val(orig) != pgd_val(*pgdp))) {
1989                *nr -= refs;
1990                while (refs--)
1991                        put_page(head);
1992                return 0;
1993        }
1994
1995        SetPageReferenced(head);
1996        return 1;
1997}
1998
1999static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
2000                unsigned int flags, struct page **pages, int *nr)
2001{
2002        unsigned long next;
2003        pmd_t *pmdp;
2004
2005        pmdp = pmd_offset(&pud, addr);
2006        do {
2007                pmd_t pmd = READ_ONCE(*pmdp);
2008
2009                next = pmd_addr_end(addr, end);
2010                if (!pmd_present(pmd))
2011                        return 0;
2012
2013                if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd) ||
2014                             pmd_devmap(pmd))) {
2015                        /*
2016                         * NUMA hinting faults need to be handled in the GUP
2017                         * slowpath for accounting purposes and so that they
2018                         * can be serialised against THP migration.
2019                         */
2020                        if (pmd_protnone(pmd))
2021                                return 0;
2022
2023                        if (!gup_huge_pmd(pmd, pmdp, addr, next, flags,
2024                                pages, nr))
2025                                return 0;
2026
2027                } else if (unlikely(is_hugepd(__hugepd(pmd_val(pmd))))) {
2028                        /*
2029                         * Some architectures use a different PMD format for
2030                         * hugetlbfs entries than for THP entries.
2031                         */
2032                        if (!gup_huge_pd(__hugepd(pmd_val(pmd)), addr,
2033                                         PMD_SHIFT, next, flags, pages, nr))
2034                                return 0;
2035                } else if (!gup_pte_range(pmd, addr, next, flags, pages, nr))
2036                        return 0;
2037        } while (pmdp++, addr = next, addr != end);
2038
2039        return 1;
2040}
2041
2042static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end,
2043                         unsigned int flags, struct page **pages, int *nr)
2044{
2045        unsigned long next;
2046        pud_t *pudp;
2047
2048        pudp = pud_offset(&p4d, addr);
2049        do {
2050                pud_t pud = READ_ONCE(*pudp);
2051
2052                next = pud_addr_end(addr, end);
2053                if (pud_none(pud))
2054                        return 0;
2055                if (unlikely(pud_huge(pud))) {
2056                        if (!gup_huge_pud(pud, pudp, addr, next, flags,
2057                                          pages, nr))
2058                                return 0;
2059                } else if (unlikely(is_hugepd(__hugepd(pud_val(pud))))) {
2060                        if (!gup_huge_pd(__hugepd(pud_val(pud)), addr,
2061                                         PUD_SHIFT, next, flags, pages, nr))
2062                                return 0;
2063                } else if (!gup_pmd_range(pud, addr, next, flags, pages, nr))
2064                        return 0;
2065        } while (pudp++, addr = next, addr != end);
2066
2067        return 1;
2068}
2069
2070static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end,
2071                         unsigned int flags, struct page **pages, int *nr)
2072{
2073        unsigned long next;
2074        p4d_t *p4dp;
2075
2076        p4dp = p4d_offset(&pgd, addr);
2077        do {
2078                p4d_t p4d = READ_ONCE(*p4dp);
2079
2080                next = p4d_addr_end(addr, end);
2081                if (p4d_none(p4d))
2082                        return 0;
2083                BUILD_BUG_ON(p4d_huge(p4d));
2084                if (unlikely(is_hugepd(__hugepd(p4d_val(p4d))))) {
2085                        if (!gup_huge_pd(__hugepd(p4d_val(p4d)), addr,
2086                                         P4D_SHIFT, next, flags, pages, nr))
2087                                return 0;
2088                } else if (!gup_pud_range(p4d, addr, next, flags, pages, nr))
2089                        return 0;
2090        } while (p4dp++, addr = next, addr != end);
2091
2092        return 1;
2093}
2094
2095static void gup_pgd_range(unsigned long addr, unsigned long end,
2096                unsigned int flags, struct page **pages, int *nr)
2097{
2098        unsigned long next;
2099        pgd_t *pgdp;
2100
2101        pgdp = pgd_offset(current->mm, addr);
2102        do {
2103                pgd_t pgd = READ_ONCE(*pgdp);
2104
2105                next = pgd_addr_end(addr, end);
2106                if (pgd_none(pgd))
2107                        return;
2108                if (unlikely(pgd_huge(pgd))) {
2109                        if (!gup_huge_pgd(pgd, pgdp, addr, next, flags,
2110                                          pages, nr))
2111                                return;
2112                } else if (unlikely(is_hugepd(__hugepd(pgd_val(pgd))))) {
2113                        if (!gup_huge_pd(__hugepd(pgd_val(pgd)), addr,
2114                                         PGDIR_SHIFT, next, flags, pages, nr))
2115                                return;
2116                } else if (!gup_p4d_range(pgd, addr, next, flags, pages, nr))
2117                        return;
2118        } while (pgdp++, addr = next, addr != end);
2119}
2120
2121#ifndef gup_fast_permitted
2122/*
2123 * Check whether it's allowed to use __get_user_pages_fast() for the range,
2124 * or whether we need to fall back to the slow version:
2125 */
2126bool gup_fast_permitted(unsigned long start, int nr_pages)
2127{
2128        unsigned long len, end;
2129
2130        len = (unsigned long) nr_pages << PAGE_SHIFT;
2131        end = start + len;
2132        return end >= start;
2133}
2134#endif
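
/*
 * Sketch of a typical architecture override (illustrative; TASK_SIZE_MAX
 * stands in for whatever limit the architecture uses): besides guarding
 * against overflow, an architecture usually also rejects ranges that reach
 * into kernel space, so the lockless walk never touches kernel mappings.
 *
 *	static bool gup_fast_permitted(unsigned long start, int nr_pages)
 *	{
 *		unsigned long len = (unsigned long)nr_pages << PAGE_SHIFT;
 *		unsigned long end = start + len;
 *
 *		return end >= start && end <= TASK_SIZE_MAX;
 *	}
 */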
2135
2136/*
2137 * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall back to
2138 * the regular GUP.
2139 * Note a difference from get_user_pages_fast(): this always returns the
2140 * number of pages pinned (0 if no pages were pinned), never a negative error.
2141 */
2142int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
2143                          struct page **pages)
2144{
2145        unsigned long len, end;
2146        unsigned long flags;
2147        int nr = 0;
2148
2149        start &= PAGE_MASK;
2150        len = (unsigned long) nr_pages << PAGE_SHIFT;
2151        end = start + len;
2152
2153        if (unlikely(!access_ok((void __user *)start, len)))
2154                return 0;
2155
2156        /*
2157         * Disable interrupts.  We use the nested form as we can already have
2158         * interrupts disabled by get_futex_key.
2159         *
2160         * With interrupts disabled, we block page table pages from being
2161         * freed from under us. See struct mmu_table_batch comments in
2162         * include/asm-generic/tlb.h for more details.
2163         *
2164         * We do not adopt an rcu_read_lock() here, as we also want to
2165         * block IPIs that come from THPs splitting.
2166         */
2167
2168        if (gup_fast_permitted(start, nr_pages)) {
2169                local_irq_save(flags);
2170                gup_pgd_range(start, end, write ? FOLL_WRITE : 0, pages, &nr);
2171                local_irq_restore(flags);
2172        }
2173
2174        return nr;
2175}
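
/*
 * Example (illustrative only): a caller that may already have interrupts
 * disabled, such as the futex code mentioned above, can opportunistically
 * try the lockless path and fall back to the regular, sleeping GUP path
 * later if nothing was pinned:
 *
 *	if (__get_user_pages_fast(uaddr, 1, 1, &page) != 1) {
 *		... retry with mmap_sem held and get_user_pages() ...
 *	}
 */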
2176
2177static int __gup_longterm_unlocked(unsigned long start, int nr_pages,
2178                                   unsigned int gup_flags, struct page **pages)
2179{
2180        int ret;
2181
2182        /*
2183         * FIXME: FOLL_LONGTERM does not work with
2184         * get_user_pages_unlocked() (see comments in that function)
2185         */
2186        if (gup_flags & FOLL_LONGTERM) {
2187                down_read(&current->mm->mmap_sem);
2188                ret = __gup_longterm_locked(current, current->mm,
2189                                            start, nr_pages,
2190                                            pages, NULL, gup_flags);
2191                up_read(&current->mm->mmap_sem);
2192        } else {
2193                ret = get_user_pages_unlocked(start, nr_pages,
2194                                              pages, gup_flags);
2195        }
2196
2197        return ret;
2198}
2199
2200/**
2201 * get_user_pages_fast() - pin user pages in memory
2202 * @start:      starting user address
2203 * @nr_pages:   number of pages from start to pin
2204 * @gup_flags:  flags modifying pin behaviour
2205 * @pages:      array that receives pointers to the pages pinned.
2206 *              Should be at least nr_pages long.
2207 *
2208 * Attempt to pin user pages in memory without taking mm->mmap_sem.
2209 * If not successful, it will fall back to taking the lock and
2210 * calling get_user_pages().
2211 *
2212 * Returns number of pages pinned. This may be fewer than the number
2213 * requested. If nr_pages is 0 or negative, returns 0. If no pages
2214 * were pinned, returns -errno.
2215 */
2216int get_user_pages_fast(unsigned long start, int nr_pages,
2217                        unsigned int gup_flags, struct page **pages)
2218{
2219        unsigned long addr, len, end;
2220        int nr = 0, ret = 0;
2221
2222        start &= PAGE_MASK;
2223        addr = start;
2224        len = (unsigned long) nr_pages << PAGE_SHIFT;
2225        end = start + len;
2226
2227        if (nr_pages <= 0)
2228                return 0;
2229
2230        if (unlikely(!access_ok((void __user *)start, len)))
2231                return -EFAULT;
2232
2233        if (gup_fast_permitted(start, nr_pages)) {
2234                local_irq_disable();
2235                gup_pgd_range(addr, end, gup_flags, pages, &nr);
2236                local_irq_enable();
2237                ret = nr;
2238        }
2239
2240        if (nr < nr_pages) {
2241                /* Try to get the remaining pages with get_user_pages */
2242                start += nr << PAGE_SHIFT;
2243                pages += nr;
2244
2245                ret = __gup_longterm_unlocked(start, nr_pages - nr,
2246                                              gup_flags, pages);
2247
2248                /* Have to be a bit careful with return values */
2249                if (nr > 0) {
2250                        if (ret < 0)
2251                                ret = nr;
2252                        else
2253                                ret += nr;
2254                }
2255        }
2256
2257        return ret;
2258}
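
/*
 * Example usage (an illustrative sketch, not lifted from a real driver; the
 * variable names are made up): pin a user buffer for I/O without taking
 * mmap_sem in the caller, then drop the references once the I/O completes.
 * Fewer pages than requested may be pinned, so the caller must check.
 *
 *	int i, nr;
 *
 *	nr = get_user_pages_fast(user_addr, npages, FOLL_WRITE, pages);
 *	if (nr < 0)
 *		return nr;
 *
 *	... do I/O against pages[0..nr-1] ...
 *
 *	for (i = 0; i < nr; i++)
 *		put_page(pages[i]);
 */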
2259
2260#endif /* CONFIG_HAVE_GENERIC_GUP */
2261