// SPDX-License-Identifier: GPL-2.0-only
/*
 *  mm/userfaultfd.c
 *
 *  Copyright (C) 2015  Red Hat, Inc.
 */

#include <linux/mm.h>
#include <linux/sched/signal.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/userfaultfd_k.h>
#include <linux/mmu_notifier.h>
#include <linux/hugetlb.h>
#include <linux/shmem_fs.h>
#include <asm/tlbflush.h>
#include <asm/tlb.h>
#include "internal.h"
#include "swap.h"

static __always_inline
bool validate_dst_vma(struct vm_area_struct *dst_vma, unsigned long dst_end)
{
        /* Make sure that the dst range is fully within dst_vma. */
        if (dst_end > dst_vma->vm_end)
                return false;

        /*
         * Check that the vma is registered in uffd; this is required to
         * enforce the VM_MAYWRITE check done at uffd registration
         * time.
         */
        if (!dst_vma->vm_userfaultfd_ctx.ctx)
                return false;

        return true;
}

static __always_inline
struct vm_area_struct *find_vma_and_prepare_anon(struct mm_struct *mm,
                                                 unsigned long addr)
{
        struct vm_area_struct *vma;

        mmap_assert_locked(mm);
        vma = vma_lookup(mm, addr);
        if (!vma)
                vma = ERR_PTR(-ENOENT);
        else if (!(vma->vm_flags & VM_SHARED) &&
                 unlikely(anon_vma_prepare(vma)))
                vma = ERR_PTR(-ENOMEM);

        return vma;
}
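
/*
 * Sketch of the expected calling pattern for the helper above (the caller
 * shown here is hypothetical): the ERR_PTR() convention is used instead of
 * returning NULL, so callers must test with IS_ERR():
 *
 *	vma = find_vma_and_prepare_anon(mm, addr);
 *	if (IS_ERR(vma))
 *		return PTR_ERR(vma);	(-ENOENT or -ENOMEM)
 *	... use vma, still under mmap_lock ...
 */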

#ifdef CONFIG_PER_VMA_LOCK
/*
 * uffd_lock_vma() - Lookup and lock vma corresponding to @address.
 * @mm: mm to search vma in.
 * @address: address that the vma should contain.
 *
 * Should be called without holding mmap_lock.
 *
 * Return: A locked vma containing @address, -ENOENT if no vma is found, or
 * -ENOMEM if anon_vma couldn't be allocated.
 */
static struct vm_area_struct *uffd_lock_vma(struct mm_struct *mm,
                                       unsigned long address)
{
        struct vm_area_struct *vma;

        vma = lock_vma_under_rcu(mm, address);
        if (vma) {
                /*
                 * We know we're going to need to use anon_vma, so check
                 * that early.
                 */
                if (!(vma->vm_flags & VM_SHARED) && unlikely(!vma->anon_vma))
                        vma_end_read(vma);
                else
                        return vma;
        }

        mmap_read_lock(mm);
        vma = find_vma_and_prepare_anon(mm, address);
        if (!IS_ERR(vma)) {
                bool locked = vma_start_read_locked(vma);

                if (!locked)
                        vma = ERR_PTR(-EAGAIN);
        }

        mmap_read_unlock(mm);
        return vma;
}

static struct vm_area_struct *uffd_mfill_lock(struct mm_struct *dst_mm,
                                              unsigned long dst_start,
                                              unsigned long len)
{
        struct vm_area_struct *dst_vma;

        dst_vma = uffd_lock_vma(dst_mm, dst_start);
        if (IS_ERR(dst_vma) || validate_dst_vma(dst_vma, dst_start + len))
                return dst_vma;

        vma_end_read(dst_vma);
        return ERR_PTR(-ENOENT);
}

static void uffd_mfill_unlock(struct vm_area_struct *vma)
{
        vma_end_read(vma);
}

#else

static struct vm_area_struct *uffd_mfill_lock(struct mm_struct *dst_mm,
                                              unsigned long dst_start,
                                              unsigned long len)
{
        struct vm_area_struct *dst_vma;

        mmap_read_lock(dst_mm);
        dst_vma = find_vma_and_prepare_anon(dst_mm, dst_start);
        if (IS_ERR(dst_vma))
                goto out_unlock;

        if (validate_dst_vma(dst_vma, dst_start + len))
                return dst_vma;

        dst_vma = ERR_PTR(-ENOENT);
out_unlock:
        mmap_read_unlock(dst_mm);
        return dst_vma;
}

static void uffd_mfill_unlock(struct vm_area_struct *vma)
{
        mmap_read_unlock(vma->vm_mm);
}
#endif

/* Check if dst_addr is outside of file's size. Must be called with ptl held. */
static bool mfill_file_over_size(struct vm_area_struct *dst_vma,
                                 unsigned long dst_addr)
{
        struct inode *inode;
        pgoff_t offset, max_off;

        if (!dst_vma->vm_file)
                return false;

        inode = dst_vma->vm_file->f_inode;
        offset = linear_page_index(dst_vma, dst_addr);
        max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
        return offset >= max_off;
}
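
/*
 * Worked example for the check above, assuming PAGE_SIZE == 4096: with
 * i_size_read(inode) == 5000, max_off = DIV_ROUND_UP(5000, 4096) = 2, so
 * page offsets 0 and 1 are within the file and offset 2 is the first one
 * past EOF; a dst_addr that maps to offset >= 2 makes the callers below
 * fail with -EFAULT.
 */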

/*
 * Install PTEs, to map dst_addr (within dst_vma) to page.
 *
 * This function handles both MCOPY_ATOMIC_NORMAL and _CONTINUE for both shmem
 * and anon, and for both shared and private VMAs.
 */
int mfill_atomic_install_pte(pmd_t *dst_pmd,
                             struct vm_area_struct *dst_vma,
                             unsigned long dst_addr, struct page *page,
                             bool newly_allocated, uffd_flags_t flags)
{
        int ret;
        struct mm_struct *dst_mm = dst_vma->vm_mm;
        pte_t _dst_pte, *dst_pte;
        bool writable = dst_vma->vm_flags & VM_WRITE;
        bool vm_shared = dst_vma->vm_flags & VM_SHARED;
        spinlock_t *ptl;
        struct folio *folio = page_folio(page);
        bool page_in_cache = folio_mapping(folio);

        _dst_pte = mk_pte(page, dst_vma->vm_page_prot);
        _dst_pte = pte_mkdirty(_dst_pte);
        if (page_in_cache && !vm_shared)
                writable = false;
        if (writable)
                _dst_pte = pte_mkwrite(_dst_pte, dst_vma);
        if (flags & MFILL_ATOMIC_WP)
                _dst_pte = pte_mkuffd_wp(_dst_pte);

        ret = -EAGAIN;
        dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
        if (!dst_pte)
                goto out;

        if (mfill_file_over_size(dst_vma, dst_addr)) {
                ret = -EFAULT;
                goto out_unlock;
        }

        ret = -EEXIST;
        /*
         * We allow overwriting a pte marker: consider a vma registered
         * with both MISSING|WP, where we first wr-protect a none pte
         * (which has no page cache page backing it) and then access the
         * page.
         */
        if (!pte_none_mostly(ptep_get(dst_pte)))
                goto out_unlock;

        if (page_in_cache) {
                /* Usually, cache pages are already added to LRU */
                if (newly_allocated)
                        folio_add_lru(folio);
                folio_add_file_rmap_pte(folio, page, dst_vma);
        } else {
                folio_add_new_anon_rmap(folio, dst_vma, dst_addr, RMAP_EXCLUSIVE);
                folio_add_lru_vma(folio, dst_vma);
        }

        /*
         * Must happen after rmap, as mm_counter() checks mapping (via
         * PageAnon()), which is set by __page_set_anon_rmap().
         */
        inc_mm_counter(dst_mm, mm_counter(folio));

        set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);

        /* No need to invalidate - it was non-present before */
        update_mmu_cache(dst_vma, dst_addr, dst_pte);
        ret = 0;
out_unlock:
        pte_unmap_unlock(dst_pte, ptl);
out:
        return ret;
}
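
/*
 * Sketch of the caller contract for mfill_atomic_install_pte(): the page
 * must be fully initialized (and, for anon, charged) before it is mapped,
 * because the PTE becomes visible to concurrently faulting threads as soon
 * as set_pte_at() runs. A minimal hypothetical caller:
 *
 *	__folio_mark_uptodate(folio);		(contents visible first)
 *	err = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
 *				       &folio->page, true, flags);
 *	if (err)
 *		folio_put(folio);		(caller still owns it on error)
 */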

static int mfill_atomic_pte_copy(pmd_t *dst_pmd,
                                 struct vm_area_struct *dst_vma,
                                 unsigned long dst_addr,
                                 unsigned long src_addr,
                                 uffd_flags_t flags,
                                 struct folio **foliop)
{
        void *kaddr;
        int ret;
        struct folio *folio;

        if (!*foliop) {
                ret = -ENOMEM;
                folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, dst_vma,
                                        dst_addr);
                if (!folio)
                        goto out;

                kaddr = kmap_local_folio(folio, 0);
                /*
                 * The read mmap_lock is held here.  Even though taking
                 * the mmap_lock for reading is recursive, a deadlock is
                 * still possible if a writer is queued between two
                 * readers.  For example:
                 *
                 * process A thread 1 takes read lock on own mmap_lock
                 * process A thread 2 calls mmap, blocks taking write lock
                 * process B thread 1 takes page fault, read lock on own mmap lock
                 * process B thread 2 calls mmap, blocks taking write lock
                 * process A thread 1 blocks taking read lock on process B
                 * process B thread 1 blocks taking read lock on process A
                 *
                 * Disable page faults to prevent this potential deadlock
                 * and retry the copy outside the mmap_lock.
                 */
                pagefault_disable();
                ret = copy_from_user(kaddr, (const void __user *) src_addr,
                                     PAGE_SIZE);
                pagefault_enable();
                kunmap_local(kaddr);

                /* fallback to copy_from_user outside mmap_lock */
                if (unlikely(ret)) {
                        ret = -ENOENT;
                        *foliop = folio;
                        /* don't free the page */
                        goto out;
                }

                flush_dcache_folio(folio);
        } else {
                folio = *foliop;
                *foliop = NULL;
        }

        /*
         * The memory barrier inside __folio_mark_uptodate makes sure that
         * preceding stores to the page contents become visible before
         * the set_pte_at() write.
         */
        __folio_mark_uptodate(folio);

        ret = -ENOMEM;
        if (mem_cgroup_charge(folio, dst_vma->vm_mm, GFP_KERNEL))
                goto out_release;

        ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
                                       &folio->page, true, flags);
        if (ret)
                goto out_release;
out:
        return ret;
out_release:
        folio_put(folio);
        goto out;
}

static int mfill_atomic_pte_zeroed_folio(pmd_t *dst_pmd,
                                         struct vm_area_struct *dst_vma,
                                         unsigned long dst_addr)
{
        struct folio *folio;
        int ret = -ENOMEM;

        folio = vma_alloc_zeroed_movable_folio(dst_vma, dst_addr);
        if (!folio)
                return ret;

        if (mem_cgroup_charge(folio, dst_vma->vm_mm, GFP_KERNEL))
                goto out_put;

        /*
         * The memory barrier inside __folio_mark_uptodate makes sure that
         * zeroing out the folio becomes visible before mapping the page
         * using set_pte_at(). See do_anonymous_page().
         */
        __folio_mark_uptodate(folio);

        ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
                                       &folio->page, true, 0);
        if (ret)
                goto out_put;

        return 0;
out_put:
        folio_put(folio);
        return ret;
}

static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd,
                                     struct vm_area_struct *dst_vma,
                                     unsigned long dst_addr)
{
        pte_t _dst_pte, *dst_pte;
        spinlock_t *ptl;
        int ret;

        if (mm_forbids_zeropage(dst_vma->vm_mm))
                return mfill_atomic_pte_zeroed_folio(dst_pmd, dst_vma, dst_addr);

        _dst_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr),
                                         dst_vma->vm_page_prot));
        ret = -EAGAIN;
        dst_pte = pte_offset_map_lock(dst_vma->vm_mm, dst_pmd, dst_addr, &ptl);
        if (!dst_pte)
                goto out;
        if (mfill_file_over_size(dst_vma, dst_addr)) {
                ret = -EFAULT;
                goto out_unlock;
        }
        ret = -EEXIST;
        if (!pte_none(ptep_get(dst_pte)))
                goto out_unlock;
        set_pte_at(dst_vma->vm_mm, dst_addr, dst_pte, _dst_pte);
        /* No need to invalidate - it was non-present before */
        update_mmu_cache(dst_vma, dst_addr, dst_pte);
        ret = 0;
out_unlock:
        pte_unmap_unlock(dst_pte, ptl);
out:
        return ret;
}
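
/*
 * Note on the two zeroing strategies above: when the architecture permits
 * it, UFFDIO_ZEROPAGE maps the global zero page read-only via a special
 * pte, consuming no memory; when mm_forbids_zeropage() is true (e.g. on
 * s390 with KVM guests), a private zeroed folio is allocated and installed
 * instead via mfill_atomic_pte_zeroed_folio().
 */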

/* Handles UFFDIO_CONTINUE for all shmem VMAs (shared or private). */
static int mfill_atomic_pte_continue(pmd_t *dst_pmd,
                                     struct vm_area_struct *dst_vma,
                                     unsigned long dst_addr,
                                     uffd_flags_t flags)
{
        struct inode *inode = file_inode(dst_vma->vm_file);
        pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
        struct folio *folio;
        struct page *page;
        int ret;

        ret = shmem_get_folio(inode, pgoff, 0, &folio, SGP_NOALLOC);
        /* Our caller expects us to return -EFAULT if we failed to find the folio */
        if (ret == -ENOENT)
                ret = -EFAULT;
        if (ret)
                goto out;
        if (!folio) {
                ret = -EFAULT;
                goto out;
        }

        page = folio_file_page(folio, pgoff);
        if (PageHWPoison(page)) {
                ret = -EIO;
                goto out_release;
        }

        ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
                                       page, false, flags);
        if (ret)
                goto out_release;

        folio_unlock(folio);
        ret = 0;
out:
        return ret;
out_release:
        folio_unlock(folio);
        folio_put(folio);
        goto out;
}

/* Handles UFFDIO_POISON for all non-hugetlb VMAs. */
static int mfill_atomic_pte_poison(pmd_t *dst_pmd,
                                   struct vm_area_struct *dst_vma,
                                   unsigned long dst_addr,
                                   uffd_flags_t flags)
{
        int ret;
        struct mm_struct *dst_mm = dst_vma->vm_mm;
        pte_t _dst_pte, *dst_pte;
        spinlock_t *ptl;

        _dst_pte = make_pte_marker(PTE_MARKER_POISONED);
        ret = -EAGAIN;
        dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
        if (!dst_pte)
                goto out;

        if (mfill_file_over_size(dst_vma, dst_addr)) {
                ret = -EFAULT;
                goto out_unlock;
        }

        ret = -EEXIST;
        /* Refuse to overwrite any PTE, even a PTE marker (e.g. UFFD WP). */
        if (!pte_none(ptep_get(dst_pte)))
                goto out_unlock;

        set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);

        /* No need to invalidate - it was non-present before */
        update_mmu_cache(dst_vma, dst_addr, dst_pte);
        ret = 0;
out_unlock:
        pte_unmap_unlock(dst_pte, ptl);
out:
        return ret;
}
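
/*
 * Semantics sketch for the marker installed above: a later access to
 * dst_addr is treated like a hardware-poisoned page, and the faulting
 * thread gets SIGBUS instead of a userfault event. This lets e.g. a VM
 * migration source propagate known-bad pages without copying contents.
 */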

static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
{
        pgd_t *pgd;
        p4d_t *p4d;
        pud_t *pud;

        pgd = pgd_offset(mm, address);
        p4d = p4d_alloc(mm, pgd, address);
        if (!p4d)
                return NULL;
        pud = pud_alloc(mm, p4d, address);
        if (!pud)
                return NULL;
        /*
         * Note that we aren't necessarily called because the pmd was
         * missing: the *pmd may already be established, and it may even
         * be a trans_huge_pmd.
         */
        return pmd_alloc(mm, pud, address);
}

#ifdef CONFIG_HUGETLB_PAGE
/*
 * mfill_atomic processing for HUGETLB vmas.  Note that this routine is
 * called with either vma-lock or mmap_lock held; it will release the lock
 * before returning.
 */
static __always_inline ssize_t mfill_atomic_hugetlb(
                                              struct userfaultfd_ctx *ctx,
                                              struct vm_area_struct *dst_vma,
                                              unsigned long dst_start,
                                              unsigned long src_start,
                                              unsigned long len,
                                              uffd_flags_t flags)
{
        struct mm_struct *dst_mm = dst_vma->vm_mm;
        ssize_t err;
        pte_t *dst_pte;
        unsigned long src_addr, dst_addr;
        long copied;
        struct folio *folio;
        unsigned long vma_hpagesize;
        pgoff_t idx;
        u32 hash;
        struct address_space *mapping;

        /*
         * There is no default zero huge page for all the huge page sizes
         * supported by hugetlb.  A PMD_SIZE huge page may exist, as used
         * by THP.  Since we cannot reliably insert a zero page, this
         * feature is not supported.
         */
        if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) {
                up_read(&ctx->map_changing_lock);
                uffd_mfill_unlock(dst_vma);
                return -EINVAL;
        }

        src_addr = src_start;
        dst_addr = dst_start;
        copied = 0;
        folio = NULL;
        vma_hpagesize = vma_kernel_pagesize(dst_vma);

        /*
         * Validate alignment based on huge page size
         */
        err = -EINVAL;
        if (dst_start & (vma_hpagesize - 1) || len & (vma_hpagesize - 1))
                goto out_unlock;

retry:
        /*
         * On routine entry dst_vma is set.  If we had to drop mmap_lock and
         * retry, dst_vma will be set to NULL and we must lookup again.
         */
        if (!dst_vma) {
                dst_vma = uffd_mfill_lock(dst_mm, dst_start, len);
                if (IS_ERR(dst_vma)) {
                        err = PTR_ERR(dst_vma);
                        goto out;
                }

                err = -ENOENT;
                if (!is_vm_hugetlb_page(dst_vma))
                        goto out_unlock_vma;

                err = -EINVAL;
                if (vma_hpagesize != vma_kernel_pagesize(dst_vma))
                        goto out_unlock_vma;

                /*
                 * If memory mappings are changing because of non-cooperative
                 * operation (e.g. mremap) running in parallel, bail out and
                 * request the user to retry later
                 */
                down_read(&ctx->map_changing_lock);
                err = -EAGAIN;
                if (atomic_read(&ctx->mmap_changing))
                        goto out_unlock;
        }

        while (src_addr < src_start + len) {
                VM_WARN_ON_ONCE(dst_addr >= dst_start + len);

                /*
                 * Serialize via vma_lock and hugetlb_fault_mutex.
                 * vma_lock ensures the dst_pte remains valid even
                 * in the case of shared pmds.  fault mutex prevents
                 * races with other faulting threads.
                 */
                idx = linear_page_index(dst_vma, dst_addr);
                mapping = dst_vma->vm_file->f_mapping;
                hash = hugetlb_fault_mutex_hash(mapping, idx);
                mutex_lock(&hugetlb_fault_mutex_table[hash]);
                hugetlb_vma_lock_read(dst_vma);

                err = -ENOMEM;
                dst_pte = huge_pte_alloc(dst_mm, dst_vma, dst_addr, vma_hpagesize);
                if (!dst_pte) {
                        hugetlb_vma_unlock_read(dst_vma);
                        mutex_unlock(&hugetlb_fault_mutex_table[hash]);
                        goto out_unlock;
                }

                if (!uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE) &&
                    !huge_pte_none_mostly(huge_ptep_get(dst_mm, dst_addr, dst_pte))) {
                        err = -EEXIST;
                        hugetlb_vma_unlock_read(dst_vma);
                        mutex_unlock(&hugetlb_fault_mutex_table[hash]);
                        goto out_unlock;
                }

                err = hugetlb_mfill_atomic_pte(dst_pte, dst_vma, dst_addr,
                                               src_addr, flags, &folio);

                hugetlb_vma_unlock_read(dst_vma);
                mutex_unlock(&hugetlb_fault_mutex_table[hash]);

                cond_resched();

                if (unlikely(err == -ENOENT)) {
                        up_read(&ctx->map_changing_lock);
                        uffd_mfill_unlock(dst_vma);
                        VM_WARN_ON_ONCE(!folio);

                        err = copy_folio_from_user(folio,
                                                   (const void __user *)src_addr, true);
                        if (unlikely(err)) {
                                err = -EFAULT;
                                goto out;
                        }

                        dst_vma = NULL;
                        goto retry;
                } else
                        VM_WARN_ON_ONCE(folio);

                if (!err) {
                        dst_addr += vma_hpagesize;
                        src_addr += vma_hpagesize;
                        copied += vma_hpagesize;

                        if (fatal_signal_pending(current))
                                err = -EINTR;
                }
                if (err)
                        break;
        }

out_unlock:
        up_read(&ctx->map_changing_lock);
out_unlock_vma:
        uffd_mfill_unlock(dst_vma);
out:
        if (folio)
                folio_put(folio);
        VM_WARN_ON_ONCE(copied < 0);
        VM_WARN_ON_ONCE(err > 0);
        VM_WARN_ON_ONCE(!copied && !err);
        return copied ? copied : err;
}
#else /* !CONFIG_HUGETLB_PAGE */
/* fail at build time if gcc attempts to use this */
extern ssize_t mfill_atomic_hugetlb(struct userfaultfd_ctx *ctx,
                                    struct vm_area_struct *dst_vma,
                                    unsigned long dst_start,
                                    unsigned long src_start,
                                    unsigned long len,
                                    uffd_flags_t flags);
#endif /* CONFIG_HUGETLB_PAGE */

static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd,
                                                struct vm_area_struct *dst_vma,
                                                unsigned long dst_addr,
                                                unsigned long src_addr,
                                                uffd_flags_t flags,
                                                struct folio **foliop)
{
        ssize_t err;

        if (uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) {
                return mfill_atomic_pte_continue(dst_pmd, dst_vma,
                                                 dst_addr, flags);
        } else if (uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON)) {
                return mfill_atomic_pte_poison(dst_pmd, dst_vma,
                                               dst_addr, flags);
        }

        /*
         * The normal page fault path for a shmem mapping will invoke the
         * fault, fill the hole in the file and COW it right away. The
         * result generates plain anonymous memory. So when we are
         * asked to fill a hole in a MAP_PRIVATE shmem mapping, we'll
         * generate anonymous memory directly without actually filling
         * the hole. For the MAP_PRIVATE case the robustness check
         * only happens in the pagetable (to verify it's still none)
         * and not in the radix tree.
         */
        if (!(dst_vma->vm_flags & VM_SHARED)) {
                if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY))
                        err = mfill_atomic_pte_copy(dst_pmd, dst_vma,
                                                    dst_addr, src_addr,
                                                    flags, foliop);
                else
                        err = mfill_atomic_pte_zeropage(dst_pmd,
                                                 dst_vma, dst_addr);
        } else {
                err = shmem_mfill_atomic_pte(dst_pmd, dst_vma,
                                             dst_addr, src_addr,
                                             flags, foliop);
        }

        return err;
}
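
/*
 * Summary of the dispatch above: CONTINUE always goes through the shmem
 * helper (private or shared) and POISON installs a marker regardless of
 * VMA type, while COPY and ZEROPAGE use the anon helpers for private
 * mappings and shmem_mfill_atomic_pte() for shared shmem, where the new
 * page must also be inserted into the page cache.
 */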

static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
                                            unsigned long dst_start,
                                            unsigned long src_start,
                                            unsigned long len,
                                            uffd_flags_t flags)
{
        struct mm_struct *dst_mm = ctx->mm;
        struct vm_area_struct *dst_vma;
        ssize_t err;
        pmd_t *dst_pmd;
        unsigned long src_addr, dst_addr;
        long copied;
        struct folio *folio;

        /*
         * Sanitize the command parameters:
         */
        VM_WARN_ON_ONCE(dst_start & ~PAGE_MASK);
        VM_WARN_ON_ONCE(len & ~PAGE_MASK);

        /* Does the address range wrap, or is the span zero-sized? */
        VM_WARN_ON_ONCE(src_start + len <= src_start);
        VM_WARN_ON_ONCE(dst_start + len <= dst_start);

        src_addr = src_start;
        dst_addr = dst_start;
        copied = 0;
        folio = NULL;
retry:
        /*
         * Make sure the dst range is both valid and fully within a
         * single existing vma.
         */
        dst_vma = uffd_mfill_lock(dst_mm, dst_start, len);
        if (IS_ERR(dst_vma)) {
                err = PTR_ERR(dst_vma);
                goto out;
        }

        /*
         * If memory mappings are changing because of non-cooperative
         * operation (e.g. mremap) running in parallel, bail out and
         * request the user to retry later
         */
        down_read(&ctx->map_changing_lock);
        err = -EAGAIN;
        if (atomic_read(&ctx->mmap_changing))
                goto out_unlock;

        err = -EINVAL;
        /*
         * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but
         * it will overwrite vm_ops, so vma_is_anonymous must return false.
         */
        if (WARN_ON_ONCE(vma_is_anonymous(dst_vma) &&
            dst_vma->vm_flags & VM_SHARED))
                goto out_unlock;

        /*
         * validate 'mode' now that we know the dst_vma: don't allow
         * a wrprotect copy if the userfaultfd didn't register as WP.
         */
        if ((flags & MFILL_ATOMIC_WP) && !(dst_vma->vm_flags & VM_UFFD_WP))
                goto out_unlock;

        /*
         * If this is a HUGETLB vma, pass off to appropriate routine
         */
        if (is_vm_hugetlb_page(dst_vma))
                return mfill_atomic_hugetlb(ctx, dst_vma, dst_start,
                                            src_start, len, flags);

        if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
                goto out_unlock;
        if (!vma_is_shmem(dst_vma) &&
            uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE))
                goto out_unlock;

        while (src_addr < src_start + len) {
                pmd_t dst_pmdval;

                VM_WARN_ON_ONCE(dst_addr >= dst_start + len);

                dst_pmd = mm_alloc_pmd(dst_mm, dst_addr);
                if (unlikely(!dst_pmd)) {
                        err = -ENOMEM;
                        break;
                }

                dst_pmdval = pmdp_get_lockless(dst_pmd);
                if (unlikely(pmd_none(dst_pmdval)) &&
                    unlikely(__pte_alloc(dst_mm, dst_pmd))) {
                        err = -ENOMEM;
                        break;
                }
                dst_pmdval = pmdp_get_lockless(dst_pmd);
                /*
                 * If the dst_pmd is THP don't override it and just be strict.
                 * (This includes the case where the PMD used to be THP and
                 * changed back to none after __pte_alloc().)
                 */
                if (unlikely(!pmd_present(dst_pmdval) ||
                                pmd_trans_huge(dst_pmdval))) {
                        err = -EEXIST;
                        break;
                }
                if (unlikely(pmd_bad(dst_pmdval))) {
                        err = -EFAULT;
                        break;
                }
                /*
                 * For shmem mappings, khugepaged is allowed to remove page
                 * tables under us; pte_offset_map_lock() will deal with that.
                 */

                err = mfill_atomic_pte(dst_pmd, dst_vma, dst_addr,
                                       src_addr, flags, &folio);
                cond_resched();

                if (unlikely(err == -ENOENT)) {
                        void *kaddr;

                        up_read(&ctx->map_changing_lock);
                        uffd_mfill_unlock(dst_vma);
                        VM_WARN_ON_ONCE(!folio);

                        kaddr = kmap_local_folio(folio, 0);
                        err = copy_from_user(kaddr,
                                             (const void __user *) src_addr,
                                             PAGE_SIZE);
                        kunmap_local(kaddr);
                        if (unlikely(err)) {
                                err = -EFAULT;
                                goto out;
                        }
                        flush_dcache_folio(folio);
                        goto retry;
                } else
                        VM_WARN_ON_ONCE(folio);

                if (!err) {
                        dst_addr += PAGE_SIZE;
                        src_addr += PAGE_SIZE;
                        copied += PAGE_SIZE;

                        if (fatal_signal_pending(current))
                                err = -EINTR;
                }
                if (err)
                        break;
        }

out_unlock:
        up_read(&ctx->map_changing_lock);
        uffd_mfill_unlock(dst_vma);
out:
        if (folio)
                folio_put(folio);
        VM_WARN_ON_ONCE(copied < 0);
        VM_WARN_ON_ONCE(err > 0);
        VM_WARN_ON_ONCE(!copied && !err);
        return copied ? copied : err;
}

ssize_t mfill_atomic_copy(struct userfaultfd_ctx *ctx, unsigned long dst_start,
                          unsigned long src_start, unsigned long len,
                          uffd_flags_t flags)
{
        return mfill_atomic(ctx, dst_start, src_start, len,
                            uffd_flags_set_mode(flags, MFILL_ATOMIC_COPY));
}
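
/*
 * Illustrative userspace counterpart of the path above (a sketch, not
 * part of this file; fault_addr, local_page and page_size are hypothetical
 * names): a fault-handling thread typically resolves a missing-page event
 * with UFFDIO_COPY, retrying when the kernel requests it with -EAGAIN
 * (e.g. while mappings are changing):
 *
 *	struct uffdio_copy copy = {
 *		.dst = fault_addr & ~(page_size - 1),
 *		.src = (unsigned long)local_page,
 *		.len = page_size,
 *		.mode = 0,
 *	};
 *	while (ioctl(uffd, UFFDIO_COPY, &copy) == -1 && errno == EAGAIN)
 *		copy.copy = 0;	(reset the out field before retrying)
 */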

ssize_t mfill_atomic_zeropage(struct userfaultfd_ctx *ctx,
                              unsigned long start,
                              unsigned long len)
{
        return mfill_atomic(ctx, start, 0, len,
                            uffd_flags_set_mode(0, MFILL_ATOMIC_ZEROPAGE));
}

ssize_t mfill_atomic_continue(struct userfaultfd_ctx *ctx, unsigned long start,
                              unsigned long len, uffd_flags_t flags)
{
        /*
         * A caller might reasonably assume that UFFDIO_CONTINUE contains an
         * smp_wmb() to ensure that any writes to the about-to-be-mapped page by
         * the thread doing the UFFDIO_CONTINUE are guaranteed to be visible to
         * subsequent loads from the page through the newly mapped address range.
         */
        smp_wmb();

        return mfill_atomic(ctx, start, 0, len,
                            uffd_flags_set_mode(flags, MFILL_ATOMIC_CONTINUE));
}
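
/*
 * Example of the ordering that the smp_wmb() above provides (a sketch;
 * shmem_alias and fault_addr are hypothetical userspace names): the
 * monitor thread fills the page through its own mapping of the shmem
 * file, then issues UFFDIO_CONTINUE; once the faulting thread's retried
 * access succeeds, it must observe those writes:
 *
 *	monitor thread                     faulting thread
 *	memcpy(shmem_alias, data, len);    (blocked in handle_userfault)
 *	ioctl(uffd, UFFDIO_CONTINUE, &c);  (pte installed, fault retried)
 *	                                   c = *(volatile char *)fault_addr;
 */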

ssize_t mfill_atomic_poison(struct userfaultfd_ctx *ctx, unsigned long start,
                            unsigned long len, uffd_flags_t flags)
{
        return mfill_atomic(ctx, start, 0, len,
                            uffd_flags_set_mode(flags, MFILL_ATOMIC_POISON));
}

long uffd_wp_range(struct vm_area_struct *dst_vma,
                   unsigned long start, unsigned long len, bool enable_wp)
{
        unsigned int mm_cp_flags;
        struct mmu_gather tlb;
        long ret;

        VM_WARN_ONCE(start < dst_vma->vm_start || start + len > dst_vma->vm_end,
                        "The address range exceeds VMA boundary.\n");
        if (enable_wp)
                mm_cp_flags = MM_CP_UFFD_WP;
        else
                mm_cp_flags = MM_CP_UFFD_WP_RESOLVE;

        /*
         * vma->vm_page_prot already reflects that uffd-wp is enabled for this
         * VMA (see userfaultfd_set_vm_flags()) and that all PTEs are supposed
         * to be write-protected by default whenever protection changes.
         * Try upgrading write permissions manually.
         */
        if (!enable_wp && vma_wants_manual_pte_write_upgrade(dst_vma))
                mm_cp_flags |= MM_CP_TRY_CHANGE_WRITABLE;
        tlb_gather_mmu(&tlb, dst_vma->vm_mm);
        ret = change_protection(&tlb, dst_vma, start, start + len, mm_cp_flags);
        tlb_finish_mmu(&tlb);

        return ret;
}
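
/*
 * Illustrative userspace trigger for uffd_wp_range() (a sketch): write
 * protection is switched on and off over a registered range through
 * UFFDIO_WRITEPROTECT; resolving a write fault repeats the call with
 * mode cleared:
 *
 *	struct uffdio_writeprotect wp = {
 *		.range = { .start = addr, .len = len },
 *		.mode = UFFDIO_WRITEPROTECT_MODE_WP,
 *	};
 *	ioctl(uffd, UFFDIO_WRITEPROTECT, &wp);
 */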

int mwriteprotect_range(struct userfaultfd_ctx *ctx, unsigned long start,
                        unsigned long len, bool enable_wp)
{
        struct mm_struct *dst_mm = ctx->mm;
        unsigned long end = start + len;
        unsigned long _start, _end;
        struct vm_area_struct *dst_vma;
        unsigned long page_mask;
        long err;
        VMA_ITERATOR(vmi, dst_mm, start);

        /*
         * Sanitize the command parameters:
         */
        VM_WARN_ON_ONCE(start & ~PAGE_MASK);
        VM_WARN_ON_ONCE(len & ~PAGE_MASK);

        /* Does the address range wrap, or is the span zero-sized? */
        VM_WARN_ON_ONCE(start + len <= start);

        mmap_read_lock(dst_mm);

        /*
         * If memory mappings are changing because of non-cooperative
         * operation (e.g. mremap) running in parallel, bail out and
         * request the user to retry later
         */
        down_read(&ctx->map_changing_lock);
        err = -EAGAIN;
        if (atomic_read(&ctx->mmap_changing))
                goto out_unlock;

        err = -ENOENT;
        for_each_vma_range(vmi, dst_vma, end) {
                if (!userfaultfd_wp(dst_vma)) {
                        err = -ENOENT;
                        break;
                }

                if (is_vm_hugetlb_page(dst_vma)) {
                        err = -EINVAL;
                        page_mask = vma_kernel_pagesize(dst_vma) - 1;
                        if ((start & page_mask) || (len & page_mask))
                                break;
                }

                _start = max(dst_vma->vm_start, start);
                _end = min(dst_vma->vm_end, end);

                err = uffd_wp_range(dst_vma, _start, _end - _start, enable_wp);

                /* Return 0 on success, <0 on failures */
                if (err < 0)
                        break;
                err = 0;
        }
out_unlock:
        up_read(&ctx->map_changing_lock);
        mmap_read_unlock(dst_mm);
        return err;
}

void double_pt_lock(spinlock_t *ptl1,
                    spinlock_t *ptl2)
        __acquires(ptl1)
        __acquires(ptl2)
{
        /* lock in virtual address order to avoid lock inversion */
        if (ptl1 > ptl2)
                swap(ptl1, ptl2);
        spin_lock(ptl1);
        if (ptl1 != ptl2)
                spin_lock_nested(ptl2, SINGLE_DEPTH_NESTING);
        else
                __acquire(ptl2);
}

void double_pt_unlock(spinlock_t *ptl1,
                      spinlock_t *ptl2)
        __releases(ptl1)
        __releases(ptl2)
{
        spin_unlock(ptl1);
        if (ptl1 != ptl2)
                spin_unlock(ptl2);
        else
                __release(ptl2);
}
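
/*
 * Example of why double_pt_lock() sorts the two locks: if one thread
 * moves src -> dst while another moves dst -> src, unordered locking
 * could deadlock (thread A holds src_ptl and wants dst_ptl while thread
 * B holds dst_ptl and wants src_ptl). Sorting by address means both
 * threads try the lower-addressed ptl first, so one of them always wins
 * and the other simply waits.
 */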

static inline bool is_pte_pages_stable(pte_t *dst_pte, pte_t *src_pte,
                                       pte_t orig_dst_pte, pte_t orig_src_pte,
                                       pmd_t *dst_pmd, pmd_t dst_pmdval)
{
        return pte_same(ptep_get(src_pte), orig_src_pte) &&
               pte_same(ptep_get(dst_pte), orig_dst_pte) &&
               pmd_same(dst_pmdval, pmdp_get_lockless(dst_pmd));
}
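
/*
 * What is_pte_pages_stable() guards against (a sketch): between sampling
 * orig_src_pte/orig_dst_pte without the page-table locks and re-checking
 * here with both ptls held, a concurrent fault may have populated either
 * pte, or the dst page-table page may have been freed and reused (hence
 * the pmd_same() check against the dst_pmdval the caller sampled). Any
 * change forces the caller to back out and retry with -EAGAIN.
 */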

static int move_present_pte(struct mm_struct *mm,
                            struct vm_area_struct *dst_vma,
                            struct vm_area_struct *src_vma,
                            unsigned long dst_addr, unsigned long src_addr,
                            pte_t *dst_pte, pte_t *src_pte,
                            pte_t orig_dst_pte, pte_t orig_src_pte,
                            pmd_t *dst_pmd, pmd_t dst_pmdval,
                            spinlock_t *dst_ptl, spinlock_t *src_ptl,
                            struct folio *src_folio)
{
        int err = 0;

        double_pt_lock(dst_ptl, src_ptl);

        if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte,
                                 dst_pmd, dst_pmdval)) {
                err = -EAGAIN;
                goto out;
        }
        if (folio_test_large(src_folio) ||
            folio_maybe_dma_pinned(src_folio) ||
            !PageAnonExclusive(&src_folio->page)) {
                err = -EBUSY;
                goto out;
        }

        orig_src_pte = ptep_clear_flush(src_vma, src_addr, src_pte);
        /* Folio got pinned from under us. Put it back and fail the move. */
        if (folio_maybe_dma_pinned(src_folio)) {
                set_pte_at(mm, src_addr, src_pte, orig_src_pte);
                err = -EBUSY;
                goto out;
        }

        folio_move_anon_rmap(src_folio, dst_vma);
        src_folio->index = linear_page_index(dst_vma, dst_addr);

        orig_dst_pte = folio_mk_pte(src_folio, dst_vma->vm_page_prot);
        /* Set soft dirty bit so userspace can notice the pte was moved */
#ifdef CONFIG_MEM_SOFT_DIRTY
        orig_dst_pte = pte_mksoft_dirty(orig_dst_pte);
#endif
        if (pte_dirty(orig_src_pte))
                orig_dst_pte = pte_mkdirty(orig_dst_pte);
        orig_dst_pte = pte_mkwrite(orig_dst_pte, dst_vma);

        set_pte_at(mm, dst_addr, dst_pte, orig_dst_pte);
out:
        double_pt_unlock(dst_ptl, src_ptl);
        return err;
}

static int move_swap_pte(struct mm_struct *mm, struct vm_area_struct *dst_vma,
                         unsigned long dst_addr, unsigned long src_addr,
                         pte_t *dst_pte, pte_t *src_pte,
                         pte_t orig_dst_pte, pte_t orig_src_pte,
                         pmd_t *dst_pmd, pmd_t dst_pmdval,
                         spinlock_t *dst_ptl, spinlock_t *src_ptl,
                         struct folio *src_folio,
                         struct swap_info_struct *si, swp_entry_t entry)
{
        /*
         * Check if the folio still belongs to the target swap entry after
         * acquiring the lock. The folio can be freed from the swap cache
         * while it is not locked.
         */
        if (src_folio && unlikely(!folio_test_swapcache(src_folio) ||
                                  entry.val != src_folio->swap.val))
                return -EAGAIN;

        double_pt_lock(dst_ptl, src_ptl);

        if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte,
                                 dst_pmd, dst_pmdval)) {
                double_pt_unlock(dst_ptl, src_ptl);
                return -EAGAIN;
        }

        /*
         * The src_folio resides in the swapcache, requiring an update to its
         * index and mapping to align with the dst_vma, where a swap-in may
         * occur and hit the swapcache after moving the PTE.
         */
        if (src_folio) {
                folio_move_anon_rmap(src_folio, dst_vma);
                src_folio->index = linear_page_index(dst_vma, dst_addr);
        } else {
                /*
                 * Check if the swap entry is cached after acquiring the src_pte
                 * lock. Otherwise, we might miss a newly loaded swap cache folio.
                 *
                 * Check swap_map directly to minimize overhead; READ_ONCE is
                 * sufficient. We are trying to catch a newly added swap cache
                 * entry: the only possible case is a folio that was swapped in
                 * and out again, staying in the swap cache and reusing the same
                 * entry, before the PTE check above. The PTL is acquired and
                 * released twice, each time after updating the swap_map's flag,
                 * so holding the PTL here ensures we see the updated value.
                 * A false positive is possible, e.g. a SWP_SYNCHRONOUS_IO
                 * swap-in may set the flag without touching the cache, or
                 * during the tiny synchronization window between the swap
                 * cache and swap_map, but it will be gone very quickly; the
                 * worst result is retry jitter.
                 */
                if (READ_ONCE(si->swap_map[swp_offset(entry)]) & SWAP_HAS_CACHE) {
                        double_pt_unlock(dst_ptl, src_ptl);
                        return -EAGAIN;
                }
        }

        orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte);
#ifdef CONFIG_MEM_SOFT_DIRTY
        orig_src_pte = pte_swp_mksoft_dirty(orig_src_pte);
#endif
        set_pte_at(mm, dst_addr, dst_pte, orig_src_pte);
        double_pt_unlock(dst_ptl, src_ptl);

        return 0;
}

static int move_zeropage_pte(struct mm_struct *mm,
                             struct vm_area_struct *dst_vma,
                             struct vm_area_struct *src_vma,
                             unsigned long dst_addr, unsigned long src_addr,
                             pte_t *dst_pte, pte_t *src_pte,
                             pte_t orig_dst_pte, pte_t orig_src_pte,
                             pmd_t *dst_pmd, pmd_t dst_pmdval,
                             spinlock_t *dst_ptl, spinlock_t *src_ptl)
{
        pte_t zero_pte;

        double_pt_lock(dst_ptl, src_ptl);
        if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte,
                                 dst_pmd, dst_pmdval)) {
                double_pt_unlock(dst_ptl, src_ptl);
                return -EAGAIN;
        }

        zero_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr),
                                         dst_vma->vm_page_prot));
        ptep_clear_flush(src_vma, src_addr, src_pte);
        set_pte_at(mm, dst_addr, dst_pte, zero_pte);
        double_pt_unlock(dst_ptl, src_ptl);

        return 0;
}

/*
 * The mmap_lock for reading is held by the caller. Just move the page
 * from src_pmd to dst_pmd if possible, and return 0 if the page was
 * moved (or a negative error code otherwise).
 */
static int move_pages_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd,
                          struct vm_area_struct *dst_vma,
                          struct vm_area_struct *src_vma,
                          unsigned long dst_addr, unsigned long src_addr,
                          __u64 mode)
{
        swp_entry_t entry;
        struct swap_info_struct *si = NULL;
        pte_t orig_src_pte, orig_dst_pte;
        pte_t src_folio_pte;
        spinlock_t *src_ptl, *dst_ptl;
        pte_t *src_pte = NULL;
        pte_t *dst_pte = NULL;
        pmd_t dummy_pmdval;
        pmd_t dst_pmdval;
        struct folio *src_folio = NULL;
        struct anon_vma *src_anon_vma = NULL;
        struct mmu_notifier_range range;
        int err = 0;

        flush_cache_range(src_vma, src_addr, src_addr + PAGE_SIZE);
        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
                                src_addr, src_addr + PAGE_SIZE);
        mmu_notifier_invalidate_range_start(&range);
retry:
        /*
         * Use the maywrite version to indicate that dst_pte will be modified.
         * Since dst_pte needs to be none, the subsequent pte_same() check
         * cannot prevent the dst_pte page from being freed concurrently, so we
         * also need to obtain dst_pmdval and recheck pmd_same() later.
         */
        dst_pte = pte_offset_map_rw_nolock(mm, dst_pmd, dst_addr, &dst_pmdval,
                                           &dst_ptl);

        /* Retry if a huge pmd materialized from under us */
        if (unlikely(!dst_pte)) {
                err = -EAGAIN;
                goto out;
        }

        /*
         * Unlike dst_pte, the subsequent pte_same() check can ensure the
         * stability of the src_pte page, so there is no need to get pmdval;
         * just pass a dummy variable to it.
         */
        src_pte = pte_offset_map_rw_nolock(mm, src_pmd, src_addr, &dummy_pmdval,
                                           &src_ptl);

        /*
         * We hold the mmap_lock for reading, so MADV_DONTNEED
         * can zap transparent huge pages under us, or the
         * transparent huge page fault can establish new
         * transparent huge pages under us.
         */
        if (unlikely(!src_pte)) {
                err = -EAGAIN;
                goto out;
        }
1237
1238        /* Sanity checks before the operation */
1239        if (pmd_none(*dst_pmd) || pmd_none(*src_pmd) ||
1240            pmd_trans_huge(*dst_pmd) || pmd_trans_huge(*src_pmd)) {
1241                err = -EINVAL;
1242                goto out;
1243        }
1244
1245        spin_lock(dst_ptl);
1246        orig_dst_pte = ptep_get(dst_pte);
1247        spin_unlock(dst_ptl);
1248        if (!pte_none(orig_dst_pte)) {
1249                err = -EEXIST;
1250                goto out;
1251        }
1252
1253        spin_lock(src_ptl);
1254        orig_src_pte = ptep_get(src_pte);
1255        spin_unlock(src_ptl);
1256        if (pte_none(orig_src_pte)) {
1257                if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES))
1258                        err = -ENOENT;
1259                else /* nothing to do to move a hole */
1260                        err = 0;
1261                goto out;
1262        }
1263
1264        /* If PTE changed after we locked the folio them start over */
1265        if (src_folio && unlikely(!pte_same(src_folio_pte, orig_src_pte))) {
1266                err = -EAGAIN;
1267                goto out;
1268        }
1269
1270        if (pte_present(orig_src_pte)) {
1271                if (is_zero_pfn(pte_pfn(orig_src_pte))) {
1272                        err = move_zeropage_pte(mm, dst_vma, src_vma,
1273                                               dst_addr, src_addr, dst_pte, src_pte,
1274                                               orig_dst_pte, orig_src_pte,
1275                                               dst_pmd, dst_pmdval, dst_ptl, src_ptl);
1276                        goto out;
1277                }
1278
1279                /*
1280                 * Pin and lock both source folio and anon_vma. Since we are in
1281                 * RCU read section, we can't block, so on contention have to
1282                 * unmap the ptes, obtain the lock and retry.
1283                 */
1284                if (!src_folio) {
1285                        struct folio *folio;
1286                        bool locked;
1287
1288                        /*
1289                         * Pin the page while holding the lock to be sure the
1290                         * page isn't freed under us
1291                         */
1292                        spin_lock(src_ptl);
1293                        if (!pte_same(orig_src_pte, ptep_get(src_pte))) {
1294                                spin_unlock(src_ptl);
1295                                err = -EAGAIN;
1296                                goto out;
1297                        }
1298
1299                        folio = vm_normal_folio(src_vma, src_addr, orig_src_pte);
1300                        if (!folio || !PageAnonExclusive(&folio->page)) {
1301                                spin_unlock(src_ptl);
1302                                err = -EBUSY;
1303                                goto out;
1304                        }
1305
1306                        locked = folio_trylock(folio);
1307                        /*
1308                         * We avoid waiting for folio lock with a raised
1309                         * refcount for large folios because extra refcounts
1310                         * will result in split_folio() failing later and
1311                         * retrying.  If multiple tasks are trying to move a
1312                         * large folio we can end up livelocking.
1313                         */
1314                        if (!locked && folio_test_large(folio)) {
1315                                spin_unlock(src_ptl);
1316                                err = -EAGAIN;
1317                                goto out;
1318                        }
1319
1320                        folio_get(folio);
1321                        src_folio = folio;
1322                        src_folio_pte = orig_src_pte;
1323                        spin_unlock(src_ptl);
1324
1325                        if (!locked) {
1326                                pte_unmap(src_pte);
1327                                pte_unmap(dst_pte);
1328                                src_pte = dst_pte = NULL;
1329                                /* now we can block and wait */
1330                                folio_lock(src_folio);
1331                                goto retry;
1332                        }
1333
1334                        if (WARN_ON_ONCE(!folio_test_anon(src_folio))) {
1335                                err = -EBUSY;
1336                                goto out;
1337                        }
1338                }
1339
1340                /* at this point we have src_folio locked */
1341                if (folio_test_large(src_folio)) {
1342                        /* split_folio() can block */
1343                        pte_unmap(src_pte);
1344                        pte_unmap(dst_pte);
1345                        src_pte = dst_pte = NULL;
1346                        err = split_folio(src_folio);
1347                        if (err)
1348                                goto out;
1349                        /* have to reacquire the folio after it got split */
1350                        folio_unlock(src_folio);
1351                        folio_put(src_folio);
1352                        src_folio = NULL;
1353                        goto retry;
1354                }
1355
1356                if (!src_anon_vma) {
1357                        /*
1358                         * folio_referenced walks the anon_vma chain
1359                         * without the folio lock. Serialize against it with
1360                         * the anon_vma lock, the folio lock is not enough.
1361                         */
1362                        src_anon_vma = folio_get_anon_vma(src_folio);
1363                        if (!src_anon_vma) {
1364                                /* page was unmapped from under us */
1365                                err = -EAGAIN;
1366                                goto out;
1367                        }
1368                        if (!anon_vma_trylock_write(src_anon_vma)) {
1369                                pte_unmap(src_pte);
1370                                pte_unmap(dst_pte);
1371                                src_pte = dst_pte = NULL;
1372                                /* now we can block and wait */
1373                                anon_vma_lock_write(src_anon_vma);
1374                                goto retry;
1375                        }
1376                }
1377
1378                err = move_present_pte(mm,  dst_vma, src_vma,
1379                                       dst_addr, src_addr, dst_pte, src_pte,
1380                                       orig_dst_pte, orig_src_pte, dst_pmd,
1381                                       dst_pmdval, dst_ptl, src_ptl, src_folio);
1382        } else {
1383                struct folio *folio = NULL;
1384
1385                entry = pte_to_swp_entry(orig_src_pte);
1386                if (non_swap_entry(entry)) {
1387                        if (is_migration_entry(entry)) {
1388                                pte_unmap(src_pte);
1389                                pte_unmap(dst_pte);
1390                                src_pte = dst_pte = NULL;
1391                                migration_entry_wait(mm, src_pmd, src_addr);
1392                                err = -EAGAIN;
1393                        } else
1394                                err = -EFAULT;
1395                        goto out;
1396                }
1397
1398                if (!pte_swp_exclusive(orig_src_pte)) {
1399                        err = -EBUSY;
1400                        goto out;
1401                }
1402
1403                si = get_swap_device(entry);
1404                if (unlikely(!si)) {
1405                        err = -EAGAIN;
1406                        goto out;
1407                }
1408                /*
1409                 * Check for a folio in the swapcache. If present, the folio's
1410                 * index and mapping must be updated even when the PTE is a swap
1411                 * entry. The anon_vma lock is not taken during this process since
1412                 * the folio has already been unmapped, and the swap entry is
1413                 * exclusive, preventing rmap walks.
1414                 *
1415                 * For large folios, return -EBUSY immediately, as split_folio()
1416                 * also returns -EBUSY when attempting to split unmapped large
1417                 * folios in the swapcache. This issue needs to be resolved
1418                 * separately to allow proper handling.
1419                 */
1420                if (!src_folio)
1421                        folio = filemap_get_folio(swap_address_space(entry),
1422                                        swap_cache_index(entry));
1423                if (!IS_ERR_OR_NULL(folio)) {
1424                        if (folio_test_large(folio)) {
1425                                err = -EBUSY;
1426                                folio_put(folio);
1427                                goto out;
1428                        }
1429                        src_folio = folio;
1430                        src_folio_pte = orig_src_pte;
1431                        if (!folio_trylock(src_folio)) {
1432                                pte_unmap(src_pte);
1433                                pte_unmap(dst_pte);
1434                                src_pte = dst_pte = NULL;
1435                                put_swap_device(si);
1436                                si = NULL;
1437                                /* now we can block and wait */
1438                                folio_lock(src_folio);
1439                                goto retry;
1440                        }
1441                }
1442                err = move_swap_pte(mm, dst_vma, dst_addr, src_addr, dst_pte, src_pte,
1443                                orig_dst_pte, orig_src_pte, dst_pmd, dst_pmdval,
1444                                dst_ptl, src_ptl, src_folio, si, entry);
1445        }
1446
1447out:
1448        if (src_anon_vma) {
1449                anon_vma_unlock_write(src_anon_vma);
1450                put_anon_vma(src_anon_vma);
1451        }
1452        if (src_folio) {
1453                folio_unlock(src_folio);
1454                folio_put(src_folio);
1455        }
1456        /*
1457         * Unmap in reverse order (LIFO) to maintain proper kmap_local
1458         * index ordering when CONFIG_HIGHPTE is enabled. We mapped dst_pte
1459         * first, then src_pte, so we must unmap src_pte first, then dst_pte.
1460         */
1461        if (src_pte)
1462                pte_unmap(src_pte);
1463        if (dst_pte)
1464                pte_unmap(dst_pte);
1465        mmu_notifier_invalidate_range_end(&range);
1466        if (si)
1467                put_swap_device(si);
1468
1469        return err;
1470}
1471
1472#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1473static inline bool move_splits_huge_pmd(unsigned long dst_addr,
1474                                        unsigned long src_addr,
1475                                        unsigned long src_end)
1476{
1477        return (src_addr & ~HPAGE_PMD_MASK) || (dst_addr & ~HPAGE_PMD_MASK) ||
1478                src_end - src_addr < HPAGE_PMD_SIZE;
1479}
1480#else
1481static inline bool move_splits_huge_pmd(unsigned long dst_addr,
1482                                        unsigned long src_addr,
1483                                        unsigned long src_end)
1484{
1485        /* This is unreachable anyway, just to avoid warnings when HPAGE_PMD_SIZE==0 */
1486        return false;
1487}
1488#endif
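
    /*
     * Worked example (illustrative only, assuming a 2MiB HPAGE_PMD_SIZE):
     * moving 2MiB from src 0x40000000 to dst 0x40100000 leaves dst
     * misaligned inside the huge pmd (0x40100000 & ~HPAGE_PMD_MASK is
     * non-zero), so the move has to split it; src 0x40000000 to dst
     * 0x80000000 with len >= 2MiB keeps both addresses aligned and the
     * pmd can be moved whole.
     */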
1489
1490static inline bool vma_move_compatible(struct vm_area_struct *vma)
1491{
1492        return !(vma->vm_flags & (VM_PFNMAP | VM_IO | VM_HUGETLB |
1493                                  VM_MIXEDMAP | VM_SHADOW_STACK));
1494}
1495
1496static int validate_move_areas(struct userfaultfd_ctx *ctx,
1497                               struct vm_area_struct *src_vma,
1498                               struct vm_area_struct *dst_vma)
1499{
1500        /* Only allow moving if both have the same access and protection */
1501        if ((src_vma->vm_flags & VM_ACCESS_FLAGS) != (dst_vma->vm_flags & VM_ACCESS_FLAGS) ||
1502            pgprot_val(src_vma->vm_page_prot) != pgprot_val(dst_vma->vm_page_prot))
1503                return -EINVAL;
1504
1505        /* Only allow moving if both are mlocked or both aren't */
1506        if ((src_vma->vm_flags & VM_LOCKED) != (dst_vma->vm_flags & VM_LOCKED))
1507                return -EINVAL;
1508
1509        /*
1510         * For now, we keep it simple and only move between writable VMAs.
1511         * Access flags are equal, therefore checking only the source is enough.
1512         */
1513        if (!(src_vma->vm_flags & VM_WRITE))
1514                return -EINVAL;
1515
1516        /* Check if vma flags indicate content which can be moved */
1517        if (!vma_move_compatible(src_vma) || !vma_move_compatible(dst_vma))
1518                return -EINVAL;
1519
1520        /* Ensure dst_vma is registered in uffd we are operating on */
1521        if (!dst_vma->vm_userfaultfd_ctx.ctx ||
1522            dst_vma->vm_userfaultfd_ctx.ctx != ctx)
1523                return -EINVAL;
1524
1525        /* Only allow moving across anonymous vmas */
1526        if (!vma_is_anonymous(src_vma) || !vma_is_anonymous(dst_vma))
1527                return -EINVAL;
1528
1529        return 0;
1530}
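
    /*
     * For instance (a hypothetical pair of mappings, not taken from this
     * file): a src_vma mapped PROT_READ|PROT_WRITE and a dst_vma mapped
     * PROT_READ differ in VM_ACCESS_FLAGS, so validate_move_areas()
     * rejects the move with -EINVAL before any page is touched.
     */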
1531
1532static __always_inline
1533int find_vmas_mm_locked(struct mm_struct *mm,
1534                        unsigned long dst_start,
1535                        unsigned long src_start,
1536                        struct vm_area_struct **dst_vmap,
1537                        struct vm_area_struct **src_vmap)
1538{
1539        struct vm_area_struct *vma;
1540
1541        mmap_assert_locked(mm);
1542        vma = find_vma_and_prepare_anon(mm, dst_start);
1543        if (IS_ERR(vma))
1544                return PTR_ERR(vma);
1545
1546        *dst_vmap = vma;
1547        /* Skip finding src_vma if src_start is in dst_vma */
1548        if (src_start >= vma->vm_start && src_start < vma->vm_end)
1549                goto out_success;
1550
1551        vma = vma_lookup(mm, src_start);
1552        if (!vma)
1553                return -ENOENT;
1554out_success:
1555        *src_vmap = vma;
1556        return 0;
1557}
1558
1559#ifdef CONFIG_PER_VMA_LOCK
1560static int uffd_move_lock(struct mm_struct *mm,
1561                          unsigned long dst_start,
1562                          unsigned long src_start,
1563                          struct vm_area_struct **dst_vmap,
1564                          struct vm_area_struct **src_vmap)
1565{
1566        struct vm_area_struct *vma;
1567        int err;
1568
1569        vma = uffd_lock_vma(mm, dst_start);
1570        if (IS_ERR(vma))
1571                return PTR_ERR(vma);
1572
1573        *dst_vmap = vma;
1574        /*
1575         * Skip finding src_vma if src_start is in dst_vma. This also ensures
1576         * that we don't lock the same vma twice.
1577         */
1578        if (src_start >= vma->vm_start && src_start < vma->vm_end) {
1579                *src_vmap = vma;
1580                return 0;
1581        }
1582
1583        /*
1584         * Using uffd_lock_vma() to get src_vma can lead to following deadlock:
1585         *
1586         * Thread1                              Thread2
1587         * -------                              -------
1588         * vma_start_read(dst_vma)
1589         *                                      mmap_write_lock(mm)
1590         *                                      vma_start_write(src_vma)
1591         * vma_start_read(src_vma)
1592         * mmap_read_lock(mm)
1593         *                                      vma_start_write(dst_vma)
1594         */
1595        *src_vmap = lock_vma_under_rcu(mm, src_start);
1596        if (likely(*src_vmap))
1597                return 0;
1598
1599        /* Undo any locking and retry in mmap_lock critical section */
1600        vma_end_read(*dst_vmap);
1601
1602        mmap_read_lock(mm);
1603        err = find_vmas_mm_locked(mm, dst_start, src_start, dst_vmap, src_vmap);
1604        if (err)
1605                goto out;
1606
1607        if (!vma_start_read_locked(*dst_vmap)) {
1608                err = -EAGAIN;
1609                goto out;
1610        }
1611
1612        /* Nothing further to do if both vmas are locked. */
1613        if (*dst_vmap == *src_vmap)
1614                goto out;
1615
1616        if (!vma_start_read_locked_nested(*src_vmap, SINGLE_DEPTH_NESTING)) {
1617                /* Undo dst_vmap locking if src_vmap failed to lock */
1618                vma_end_read(*dst_vmap);
1619                err = -EAGAIN;
1620        }
1621out:
1622        mmap_read_unlock(mm);
1623        return err;
1624}
1625
1626static void uffd_move_unlock(struct vm_area_struct *dst_vma,
1627                             struct vm_area_struct *src_vma)
1628{
1629        vma_end_read(src_vma);
1630        if (src_vma != dst_vma)
1631                vma_end_read(dst_vma);
1632}
1633
1634#else
1635
1636static int uffd_move_lock(struct mm_struct *mm,
1637                          unsigned long dst_start,
1638                          unsigned long src_start,
1639                          struct vm_area_struct **dst_vmap,
1640                          struct vm_area_struct **src_vmap)
1641{
1642        int err;
1643
1644        mmap_read_lock(mm);
1645        err = find_vmas_mm_locked(mm, dst_start, src_start, dst_vmap, src_vmap);
1646        if (err)
1647                mmap_read_unlock(mm);
1648        return err;
1649}
1650
1651static void uffd_move_unlock(struct vm_area_struct *dst_vma,
1652                             struct vm_area_struct *src_vma)
1653{
1654        mmap_assert_locked(src_vma->vm_mm);
1655        mmap_read_unlock(dst_vma->vm_mm);
1656}
1657#endif
1658
1659/**
1660 * move_pages - move arbitrary anonymous pages of an existing vma
1661 * @ctx: pointer to the userfaultfd context
1662 * @dst_start: start of the destination virtual memory range
1663 * @src_start: start of the source virtual memory range
1664 * @len: length of the virtual memory range
1665 * @mode: flags from uffdio_move.mode
1666 *
1667 * It will take either the mmap_lock in read mode or per-vma locks.
1668 *
1669 * move_pages() remaps arbitrary anonymous pages atomically in zero
1670 * copy. It only works on non shared anonymous pages because those can
1671 * be relocated without generating non linear anon_vmas in the rmap
1672 * code.
1673 *
1674 * It provides a zero copy mechanism to handle userspace page faults.
1675 * The source vma pages should have mapcount == 1, which can be
1676 * enforced by using madvise(MADV_DONTFORK) on src vma.
1677 *
1678 * The thread receiving the page during the userland page fault
1679 * will receive the faulting page in the source vma through the network,
1680 * storage or any other I/O device (MADV_DONTFORK in the source vma
1681 * prevents move_pages() from failing with -EBUSY if the process forks before
1682 * move_pages() is called), then it will call move_pages() to map the
1683 * page in the faulting address in the destination vma.
1684 *
1685 * This userfaultfd command works purely via pagetables, so it's the
1686 * most efficient way to move physical non shared anonymous pages
1687 * across different virtual addresses. Unlike mremap()/mmap()/munmap()
1688 * it does not create any new vmas. The mapping in the destination
1689 * address is atomic.
1690 *
1691 * It only works if the vma protection bits are identical in the
1692 * source and destination vma.
1693 *
1694 * It can remap non shared anonymous pages within the same vma too.
1695 *
1696 * If the source virtual memory range has any unmapped holes, or if
1697 * the destination virtual memory range is not a whole unmapped hole,
1698 * move_pages() will fail respectively with -ENOENT or -EEXIST. This
1699 * provides a very strict behavior to avoid any chance of memory
1700 * corruption going unnoticed if there are userland race conditions.
1701 * Only one thread should resolve the userland page fault at any given
1702 * time for any given faulting address. This means that if two threads
1703 * try to both call move_pages() on the same destination address at the
1704 * same time, the second thread will get an explicit error from this
1705 * command.
1706 *
1707 * The command retval will be "len" if successful. The command
1708 * however can be interrupted by fatal signals or errors. If
1709 * interrupted it will return the number of bytes successfully
1710 * remapped before the interruption if any, or the negative error if
1711 * none. It will never return zero. Either it will return an error or
1712 * an amount of bytes successfully moved. If the retval reports a
1713 * "short" remap, the move_pages() command should be repeated by
1714 * userland with src+retval, dst+retval, len-retval if it wants to know
1715 * about the error that interrupted it.
1716 *
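     * As a minimal userspace sketch of that protocol (illustrative, not
     * part of this file; it only assumes the uffdio_move layout from the
     * uAPI <linux/userfaultfd.h>, with uffd/dst/src/len set up by the
     * caller):
     *
     *        struct uffdio_move move = {
     *                .dst = dst, .src = src, .len = len, .mode = 0,
     *        };
     *        while (ioctl(uffd, UFFDIO_MOVE, &move)) {
     *                if (move.move == -EAGAIN) {
     *                        move.move = 0;
     *                        continue;
     *                }
     *                if (move.move < 0)
     *                        err(1, "UFFDIO_MOVE");
     *                move.dst += move.move;
     *                move.src += move.move;
     *                move.len -= move.move;
     *                move.move = 0;
     *        }
     *
     * A zero ioctl() return means the whole range was moved; a failing
     * ioctl() with a positive move.move is the "short" remap described
     * above, and a negative move.move is the error that interrupted it.
     *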
1717 * The UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES flag can be specified to
1718 * prevent -ENOENT errors from materializing if there are holes in the
1719 * source virtual range that is being remapped. The holes will be
1720 * accounted as successfully remapped in the retval of the
1721 * command. This is mostly useful to remap hugepage-aligned virtual
1722 * regions without knowing whether they contain transparent hugepages,
1723 * while avoiding the risk of having to split the huge pmd during the
1724 * remap.
1725 *
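     * For example, with the same userspace sketch as above, a caller that
     * knows the region is hugepage aligned would simply set:
     *
     *        move.mode = UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES;
     *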
1726 * If there's any rmap walk that is taking the anon_vma locks without
1727 * first obtaining the folio lock (the only current instance is
1728 * folio_referenced), they will have to verify if the folio->mapping
1729 * has changed after taking the anon_vma lock. If it changed they
1730 * should release the lock and retry obtaining a new anon_vma, because
1731 * it means the anon_vma was changed by move_pages() before the lock
1732 * could be obtained. This is the only additional complexity added to
1733 * the rmap code to provide this anonymous page remapping functionality.
1734 */
1735ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
1736                   unsigned long src_start, unsigned long len, __u64 mode)
1737{
1738        struct mm_struct *mm = ctx->mm;
1739        struct vm_area_struct *src_vma, *dst_vma;
1740        unsigned long src_addr, dst_addr;
1741        pmd_t *src_pmd, *dst_pmd;
1742        long err = -EINVAL;
1743        ssize_t moved = 0;
1744
1745        /* Sanitize the command parameters. */
1746        VM_WARN_ON_ONCE(src_start & ~PAGE_MASK);
1747        VM_WARN_ON_ONCE(dst_start & ~PAGE_MASK);
1748        VM_WARN_ON_ONCE(len & ~PAGE_MASK);
1749
1750        /* Does the address range wrap, or is the span zero-sized? */
1751        VM_WARN_ON_ONCE(src_start + len < src_start);
1752        VM_WARN_ON_ONCE(dst_start + len < dst_start);
1753
1754        err = uffd_move_lock(mm, dst_start, src_start, &dst_vma, &src_vma);
1755        if (err)
1756                goto out;
1757
1758        /* Re-check after taking map_changing_lock */
1759        err = -EAGAIN;
1760        down_read(&ctx->map_changing_lock);
1761        if (unlikely(atomic_read(&ctx->mmap_changing)))
1762                goto out_unlock;
1763        /*
1764         * Make sure the vma is not shared, that the src and dst remap
1765         * ranges are both valid and fully within a single existing
1766         * vma.
1767         */
1768        err = -EINVAL;
1769        if (src_vma->vm_flags & VM_SHARED)
1770                goto out_unlock;
1771        if (src_start + len > src_vma->vm_end)
1772                goto out_unlock;
1773
1774        if (dst_vma->vm_flags & VM_SHARED)
1775                goto out_unlock;
1776        if (dst_start + len > dst_vma->vm_end)
1777                goto out_unlock;
1778
1779        err = validate_move_areas(ctx, src_vma, dst_vma);
1780        if (err)
1781                goto out_unlock;
1782
1783        for (src_addr = src_start, dst_addr = dst_start;
1784             src_addr < src_start + len;) {
1785                spinlock_t *ptl;
1786                pmd_t dst_pmdval;
1787                unsigned long step_size;
1788
1789                /*
1790                 * The below works because an anonymous area cannot have a
1791                 * transparent huge PUD. If file-backed support is added,
1792                 * that case would need to be handled here.
1793                 */
1794                src_pmd = mm_find_pmd(mm, src_addr);
1795                if (unlikely(!src_pmd)) {
1796                        if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES)) {
1797                                err = -ENOENT;
1798                                break;
1799                        }
1800                        src_pmd = mm_alloc_pmd(mm, src_addr);
1801                        if (unlikely(!src_pmd)) {
1802                                err = -ENOMEM;
1803                                break;
1804                        }
1805                }
1806                dst_pmd = mm_alloc_pmd(mm, dst_addr);
1807                if (unlikely(!dst_pmd)) {
1808                        err = -ENOMEM;
1809                        break;
1810                }
1811
1812                dst_pmdval = pmdp_get_lockless(dst_pmd);
1813                /*
1814                 * If the dst_pmd is mapped as THP, don't override it and just
1815                 * be strict. If dst_pmd changes into THP after this check,
1816                 * move_pages_huge_pmd() will detect the change and retry
1817                 * while move_pages_pte() will detect the change and fail.
1818                 */
1819                if (unlikely(pmd_trans_huge(dst_pmdval))) {
1820                        err = -EEXIST;
1821                        break;
1822                }
1823
1824                ptl = pmd_trans_huge_lock(src_pmd, src_vma);
1825                if (ptl) {
1826                        /* Check if we can move the pmd without splitting it. */
1827                        if (move_splits_huge_pmd(dst_addr, src_addr, src_start + len) ||
1828                            !pmd_none(dst_pmdval)) {
1829                                /* Can be a migration entry */
1830                                if (pmd_present(*src_pmd)) {
1831                                        struct folio *folio = pmd_folio(*src_pmd);
1832
1833                                        if (!is_huge_zero_folio(folio) &&
1834                                            !PageAnonExclusive(&folio->page)) {
1835                                                spin_unlock(ptl);
1836                                                err = -EBUSY;
1837                                                break;
1838                                        }
1839                                }
1840
1841                                spin_unlock(ptl);
1842                                split_huge_pmd(src_vma, src_pmd, src_addr);
1843                                /* The folio will be split by move_pages_pte() */
1844                                continue;
1845                        }
1846
1847                        err = move_pages_huge_pmd(mm, dst_pmd, src_pmd,
1848                                                  dst_pmdval, dst_vma, src_vma,
1849                                                  dst_addr, src_addr);
1850                        step_size = HPAGE_PMD_SIZE;
1851                } else {
1852                        if (pmd_none(*src_pmd)) {
1853                                if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES)) {
1854                                        err = -ENOENT;
1855                                        break;
1856                                }
1857                                if (unlikely(__pte_alloc(mm, src_pmd))) {
1858                                        err = -ENOMEM;
1859                                        break;
1860                                }
1861                        }
1862
1863                        if (unlikely(pte_alloc(mm, dst_pmd))) {
1864                                err = -ENOMEM;
1865                                break;
1866                        }
1867
1868                        err = move_pages_pte(mm, dst_pmd, src_pmd,
1869                                             dst_vma, src_vma,
1870                                             dst_addr, src_addr, mode);
1871                        step_size = PAGE_SIZE;
1872                }
1873
1874                cond_resched();
1875
1876                if (fatal_signal_pending(current)) {
1877                        /* Do not override an error */
1878                        if (!err || err == -EAGAIN)
1879                                err = -EINTR;
1880                        break;
1881                }
1882
1883                if (err) {
1884                        if (err == -EAGAIN)
1885                                continue;
1886                        break;
1887                }
1888
1889                /* Proceed to the next page */
1890                dst_addr += step_size;
1891                src_addr += step_size;
1892                moved += step_size;
1893        }
1894
1895out_unlock:
1896        up_read(&ctx->map_changing_lock);
1897        uffd_move_unlock(dst_vma, src_vma);
1898out:
1899        VM_WARN_ON_ONCE(moved < 0);
1900        VM_WARN_ON_ONCE(err > 0);
1901        VM_WARN_ON_ONCE(!moved && !err);
1902        return moved ? moved : err;
1903}
1904
1905static void userfaultfd_set_vm_flags(struct vm_area_struct *vma,
1906                                     vm_flags_t vm_flags)
1907{
1908        const bool uffd_wp_changed = (vma->vm_flags ^ vm_flags) & VM_UFFD_WP;
1909
1910        vm_flags_reset(vma, vm_flags);
1911        /*
1912         * For shared mappings, we want to enable writenotify while
1913         * userfaultfd-wp is enabled (see vma_wants_writenotify()). We'll simply
1914         * recalculate vma->vm_page_prot whenever userfaultfd-wp changes.
1915         */
1916        if ((vma->vm_flags & VM_SHARED) && uffd_wp_changed)
1917                vma_set_page_prot(vma);
1918}
1919
1920static void userfaultfd_set_ctx(struct vm_area_struct *vma,
1921                                struct userfaultfd_ctx *ctx,
1922                                vm_flags_t vm_flags)
1923{
1924        vma_start_write(vma);
1925        vma->vm_userfaultfd_ctx = (struct vm_userfaultfd_ctx){ctx};
1926        userfaultfd_set_vm_flags(vma,
1927                                 (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags);
1928}
1929
1930void userfaultfd_reset_ctx(struct vm_area_struct *vma)
1931{
1932        userfaultfd_set_ctx(vma, NULL, 0);
1933}
1934
1935struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi,
1936                                             struct vm_area_struct *prev,
1937                                             struct vm_area_struct *vma,
1938                                             unsigned long start,
1939                                             unsigned long end)
1940{
1941        struct vm_area_struct *ret;
1942        bool give_up_on_oom = false;
1943
1944        /*
1945         * If we are modifying only and not splitting, just give up on the merge
1946         * if OOM prevents us from merging successfully.
1947         */
1948        if (start == vma->vm_start && end == vma->vm_end)
1949                give_up_on_oom = true;
1950
1951        /* Reset ptes for the whole vma range if wr-protected */
1952        if (userfaultfd_wp(vma))
1953                uffd_wp_range(vma, start, end - start, false);
1954
1955        ret = vma_modify_flags_uffd(vmi, prev, vma, start, end,
1956                                    vma->vm_flags & ~__VM_UFFD_FLAGS,
1957                                    NULL_VM_UFFD_CTX, give_up_on_oom);
1958
1959        /*
1960         * In the vma_merge() successful mprotect-like case 8:
1961         * the next vma was merged into the current one and
1962         * the current one has not been updated yet.
1963         */
1964        if (!IS_ERR(ret))
1965                userfaultfd_reset_ctx(ret);
1966
1967        return ret;
1968}
1969
1970/* Assumes mmap write lock taken, and mm_struct pinned. */
1971int userfaultfd_register_range(struct userfaultfd_ctx *ctx,
1972                               struct vm_area_struct *vma,
1973                               vm_flags_t vm_flags,
1974                               unsigned long start, unsigned long end,
1975                               bool wp_async)
1976{
1977        VMA_ITERATOR(vmi, ctx->mm, start);
1978        struct vm_area_struct *prev = vma_prev(&vmi);
1979        unsigned long vma_end;
1980        vm_flags_t new_flags;
1981
1982        if (vma->vm_start < start)
1983                prev = vma;
1984
1985        for_each_vma_range(vmi, vma, end) {
1986                cond_resched();
1987
1988                VM_WARN_ON_ONCE(!vma_can_userfault(vma, vm_flags, wp_async));
1989                VM_WARN_ON_ONCE(vma->vm_userfaultfd_ctx.ctx &&
1990                                vma->vm_userfaultfd_ctx.ctx != ctx);
1991                VM_WARN_ON_ONCE(!(vma->vm_flags & VM_MAYWRITE));
1992
1993                /*
1994                 * Nothing to do: this vma is already registered into this
1995                 * userfaultfd and with the right tracking mode too.
1996                 */
1997                if (vma->vm_userfaultfd_ctx.ctx == ctx &&
1998                    (vma->vm_flags & vm_flags) == vm_flags)
1999                        goto skip;
2000
2001                if (vma->vm_start > start)
2002                        start = vma->vm_start;
2003                vma_end = min(end, vma->vm_end);
2004
2005                new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags;
2006                vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end,
2007                                            new_flags,
2008                                            (struct vm_userfaultfd_ctx){ctx},
2009                                            /* give_up_on_oom = */false);
2010                if (IS_ERR(vma))
2011                        return PTR_ERR(vma);
2012
2013                /*
2014                 * In the vma_merge() successful mprotect-like case 8:
2015                 * the next vma was merged into the current one and
2016                 * the current one has not been updated yet.
2017                 */
2018                userfaultfd_set_ctx(vma, ctx, vm_flags);
2019
2020                if (is_vm_hugetlb_page(vma) && uffd_disable_huge_pmd_share(vma))
2021                        hugetlb_unshare_all_pmds(vma);
2022
2023skip:
2024                prev = vma;
2025                start = vma->vm_end;
2026        }
2027
2028        return 0;
2029}
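
    /*
     * For reference, the userspace side that reaches the range loop above
     * goes through the UFFDIO_REGISTER ioctl. A hedged sketch (only the
     * uffdio_register uAPI from <linux/userfaultfd.h> is assumed;
     * uffd/addr/len are the caller's):
     *
     *        struct uffdio_register reg = {
     *                .range = { .start = (__u64)(uintptr_t)addr, .len = len },
     *                .mode  = UFFDIO_REGISTER_MODE_MISSING,
     *        };
     *        if (ioctl(uffd, UFFDIO_REGISTER, &reg))
     *                err(1, "UFFDIO_REGISTER");
     *
     * On success, reg.ioctls advertises which ioctls (UFFDIO_COPY and so
     * on) are valid on the registered range.
     */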
2030
2031void userfaultfd_release_new(struct userfaultfd_ctx *ctx)
2032{
2033        struct mm_struct *mm = ctx->mm;
2034        struct vm_area_struct *vma;
2035        VMA_ITERATOR(vmi, mm, 0);
2036
2037        /* the various vma->vm_userfaultfd_ctx fields still point to it */
2038        mmap_write_lock(mm);
2039        for_each_vma(vmi, vma) {
2040                if (vma->vm_userfaultfd_ctx.ctx == ctx)
2041                        userfaultfd_reset_ctx(vma);
2042        }
2043        mmap_write_unlock(mm);
2044}
2045
2046void userfaultfd_release_all(struct mm_struct *mm,
2047                             struct userfaultfd_ctx *ctx)
2048{
2049        struct vm_area_struct *vma, *prev;
2050        VMA_ITERATOR(vmi, mm, 0);
2051
2052        if (!mmget_not_zero(mm))
2053                return;
2054
2055        /*
2056         * Flush page faults out of all CPUs. NOTE: all page faults
2057         * must be retried without returning VM_FAULT_SIGBUS if
2058         * userfaultfd_ctx_get() succeeds but vma->vm_userfaultfd_ctx
2059         * changes while handle_userfault released the mmap_lock. So
2060         * it's critical that released is set to true (above), before
2061         * taking the mmap_lock for writing.
2062         */
2063        mmap_write_lock(mm);
2064        prev = NULL;
2065        for_each_vma(vmi, vma) {
2066                cond_resched();
2067                VM_WARN_ON_ONCE(!!vma->vm_userfaultfd_ctx.ctx ^
2068                                !!(vma->vm_flags & __VM_UFFD_FLAGS));
2069                if (vma->vm_userfaultfd_ctx.ctx != ctx) {
2070                        prev = vma;
2071                        continue;
2072                }
2073
2074                vma = userfaultfd_clear_vma(&vmi, prev, vma,
2075                                            vma->vm_start, vma->vm_end);
2076                prev = vma;
2077        }
2078        mmap_write_unlock(mm);
2079        mmput(mm);
2080}
2081