linux/mm/rmap.c
   1/*
   2 * mm/rmap.c - physical to virtual reverse mappings
   3 *
   4 * Copyright 2001, Rik van Riel <riel@conectiva.com.br>
   5 * Released under the General Public License (GPL).
   6 *
   7 * Simple, low overhead reverse mapping scheme.
   8 * Please try to keep this thing as modular as possible.
   9 *
  10 * Provides methods for unmapping each kind of mapped page:
  11 * the anon methods track anonymous pages, and
  12 * the file methods track pages belonging to an inode.
  13 *
  14 * Original design by Rik van Riel <riel@conectiva.com.br> 2001
  15 * File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004
  16 * Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004
  17 * Contributions by Hugh Dickins 2003, 2004
  18 */
  19
  20/*
  21 * Lock ordering in mm:
  22 *
  23 * inode->i_rwsem       (while writing or truncating, not reading or faulting)
  24 *   mm->mmap_lock
  25 *     mapping->invalidate_lock (in filemap_fault)
  26 *       folio_lock
  27 *         hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share, see hugetlbfs below)
  28 *           vma_start_write
  29 *             mapping->i_mmap_rwsem
  30 *               anon_vma->rwsem
  31 *                 mm->page_table_lock or pte_lock
  32 *                   swap_lock (in swap_duplicate, swap_info_get)
  33 *                     mmlist_lock (in mmput, drain_mmlist and others)
  34 *                     mapping->private_lock (in block_dirty_folio)
  35 *                         i_pages lock (widely used)
  36 *                           lruvec->lru_lock (in folio_lruvec_lock_irq)
  37 *                     inode->i_lock (in set_page_dirty's __mark_inode_dirty)
  38 *                     bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
  39 *                       sb_lock (within inode_lock in fs/fs-writeback.c)
  40 *                       i_pages lock (widely used, in set_page_dirty,
  41 *                                 in arch-dependent flush_dcache_mmap_lock,
  42 *                                 within bdi.wb->list_lock in __sync_single_inode)
  43 *
  44 * anon_vma->rwsem,mapping->i_mmap_rwsem   (memory_failure, collect_procs_anon)
  45 *   ->tasklist_lock
  46 *     pte map lock
  47 *
  48 * hugetlbfs PageHuge() takes locks in this order:
  49 *   hugetlb_fault_mutex (hugetlbfs specific page fault mutex)
  50 *     vma_lock (hugetlb specific lock for pmd_sharing)
  51 *       mapping->i_mmap_rwsem (also used for hugetlb pmd sharing)
  52 *         folio_lock
  53 */
  54
  55#include <linux/mm.h>
  56#include <linux/sched/mm.h>
  57#include <linux/sched/task.h>
  58#include <linux/pagemap.h>
  59#include <linux/swap.h>
  60#include <linux/swapops.h>
  61#include <linux/slab.h>
  62#include <linux/init.h>
  63#include <linux/ksm.h>
  64#include <linux/rmap.h>
  65#include <linux/rcupdate.h>
  66#include <linux/export.h>
  67#include <linux/memcontrol.h>
  68#include <linux/mmu_notifier.h>
  69#include <linux/migrate.h>
  70#include <linux/hugetlb.h>
  71#include <linux/huge_mm.h>
  72#include <linux/backing-dev.h>
  73#include <linux/page_idle.h>
  74#include <linux/memremap.h>
  75#include <linux/userfaultfd_k.h>
  76#include <linux/mm_inline.h>
  77#include <linux/oom.h>
  78
  79#include <asm/tlbflush.h>
  80
  81#define CREATE_TRACE_POINTS
  82#include <trace/events/tlb.h>
  83#include <trace/events/migrate.h>
  84
  85#include "internal.h"
  86
  87static struct kmem_cache *anon_vma_cachep;
  88static struct kmem_cache *anon_vma_chain_cachep;
  89
  90static inline struct anon_vma *anon_vma_alloc(void)
  91{
  92        struct anon_vma *anon_vma;
  93
  94        anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
  95        if (anon_vma) {
  96                atomic_set(&anon_vma->refcount, 1);
  97                anon_vma->num_children = 0;
  98                anon_vma->num_active_vmas = 0;
  99                anon_vma->parent = anon_vma;
 100                /*
 101                 * Initialise the anon_vma root to point to itself. If called
 102                 * from fork, the root will be reset to the parent's anon_vma.
 103                 */
 104                anon_vma->root = anon_vma;
 105        }
 106
 107        return anon_vma;
 108}
 109
 110static inline void anon_vma_free(struct anon_vma *anon_vma)
 111{
 112        VM_BUG_ON(atomic_read(&anon_vma->refcount));
 113
 114        /*
 115         * Synchronize against folio_lock_anon_vma_read() such that
 116         * we can safely hold the lock without the anon_vma getting
 117         * freed.
 118         *
 119         * Relies on the full mb implied by the atomic_dec_and_test() from
 120         * put_anon_vma() against the acquire barrier implied by
 121         * down_read_trylock() from folio_lock_anon_vma_read(). This orders:
 122         *
 123         * folio_lock_anon_vma_read()   VS      put_anon_vma()
 124         *   down_read_trylock()                  atomic_dec_and_test()
 125         *   LOCK                                 MB
 126         *   atomic_read()                        rwsem_is_locked()
 127         *
 128         * LOCK should suffice since the actual taking of the lock must
 129         * happen _before_ what follows.
 130         */
 131        might_sleep();
 132        if (rwsem_is_locked(&anon_vma->root->rwsem)) {
 133                anon_vma_lock_write(anon_vma);
 134                anon_vma_unlock_write(anon_vma);
 135        }
 136
 137        kmem_cache_free(anon_vma_cachep, anon_vma);
 138}
 139
 140static inline struct anon_vma_chain *anon_vma_chain_alloc(gfp_t gfp)
 141{
 142        return kmem_cache_alloc(anon_vma_chain_cachep, gfp);
 143}
 144
 145static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
 146{
 147        kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain);
 148}
 149
 150static void anon_vma_chain_link(struct vm_area_struct *vma,
 151                                struct anon_vma_chain *avc,
 152                                struct anon_vma *anon_vma)
 153{
 154        avc->vma = vma;
 155        avc->anon_vma = anon_vma;
 156        list_add(&avc->same_vma, &vma->anon_vma_chain);
 157        anon_vma_interval_tree_insert(avc, &anon_vma->rb_root);
 158}
 159
 160/**
 161 * __anon_vma_prepare - attach an anon_vma to a memory region
 162 * @vma: the memory region in question
 163 *
 164 * This makes sure the memory mapping described by 'vma' has
 165 * an 'anon_vma' attached to it, so that we can associate the
 166 * anonymous pages mapped into it with that anon_vma.
 167 *
 168 * The common case will be that we already have one, which
 169 * is handled inline by anon_vma_prepare(). But if
 170 * not, we either need to find an adjacent mapping that we
 171 * can re-use the anon_vma from (very common when the only
 172 * reason for splitting a vma has been mprotect()), or we
 173 * allocate a new one.
 174 *
 175 * Anon-vma allocations are very subtle, because we may have
 176 * optimistically looked up an anon_vma in folio_lock_anon_vma_read()
 177 * and that may actually touch the rwsem even in the newly
 178 * allocated anon_vma (it depends on RCU to make sure that the
 179 * anon_vma isn't actually destroyed).
 180 *
 181 * As a result, we need to do proper anon_vma locking even
 182 * for the new allocation. At the same time, we do not want
 183 * to do any locking for the common case of already having
 184 * an anon_vma.
 185 */
 186int __anon_vma_prepare(struct vm_area_struct *vma)
 187{
 188        struct mm_struct *mm = vma->vm_mm;
 189        struct anon_vma *anon_vma, *allocated;
 190        struct anon_vma_chain *avc;
 191
 192        mmap_assert_locked(mm);
 193        might_sleep();
 194
 195        avc = anon_vma_chain_alloc(GFP_KERNEL);
 196        if (!avc)
 197                goto out_enomem;
 198
 199        anon_vma = find_mergeable_anon_vma(vma);
 200        allocated = NULL;
 201        if (!anon_vma) {
 202                anon_vma = anon_vma_alloc();
 203                if (unlikely(!anon_vma))
 204                        goto out_enomem_free_avc;
 205                anon_vma->num_children++; /* self-parent link for new root */
 206                allocated = anon_vma;
 207        }
 208
 209        anon_vma_lock_write(anon_vma);
 210        /* page_table_lock to protect against threads */
 211        spin_lock(&mm->page_table_lock);
 212        if (likely(!vma->anon_vma)) {
 213                vma->anon_vma = anon_vma;
 214                anon_vma_chain_link(vma, avc, anon_vma);
 215                anon_vma->num_active_vmas++;
 216                allocated = NULL;
 217                avc = NULL;
 218        }
 219        spin_unlock(&mm->page_table_lock);
 220        anon_vma_unlock_write(anon_vma);
 221
 222        if (unlikely(allocated))
 223                put_anon_vma(allocated);
 224        if (unlikely(avc))
 225                anon_vma_chain_free(avc);
 226
 227        return 0;
 228
 229 out_enomem_free_avc:
 230        anon_vma_chain_free(avc);
 231 out_enomem:
 232        return -ENOMEM;
 233}
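
/*
 * Illustrative sketch (not built here): the fast path lives in the
 * anon_vma_prepare() inline in include/linux/rmap.h, which only falls back
 * to __anon_vma_prepare() when the vma has no anon_vma yet, roughly:
 */
#if 0
static inline int anon_vma_prepare(struct vm_area_struct *vma)
{
	if (likely(vma->anon_vma))
		return 0;

	return __anon_vma_prepare(vma);
}
#endif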
 234
 235/*
 236 * This is a useful helper function for locking the anon_vma root as
 237 * we traverse the vma->anon_vma_chain, looping over anon_vma's that
 238 * have the same vma.
 239 *
 240 * Such anon_vma's should have the same root, so you'd expect to see
 241 * just a single down_write of the root rwsem for the whole traversal.
 242 */
 243static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct anon_vma *anon_vma)
 244{
 245        struct anon_vma *new_root = anon_vma->root;
 246        if (new_root != root) {
 247                if (WARN_ON_ONCE(root))
 248                        up_write(&root->rwsem);
 249                root = new_root;
 250                down_write(&root->rwsem);
 251        }
 252        return root;
 253}
 254
 255static inline void unlock_anon_vma_root(struct anon_vma *root)
 256{
 257        if (root)
 258                up_write(&root->rwsem);
 259}
 260
 261/*
 262 * Attach the anon_vmas from src to dst.
 263 * Returns 0 on success, -ENOMEM on failure.
 264 *
 265 * anon_vma_clone() is called by vma_expand(), vma_merge(), __split_vma(),
 266 * copy_vma() and anon_vma_fork(). The first four want an exact copy of src,
 267 * while the last one, anon_vma_fork(), may try to reuse an existing anon_vma to
 268 * prevent endless growth of anon_vma. Since dst->anon_vma is set to NULL before
 269 * the call, we can identify this case by checking (!dst->anon_vma &&
 270 * src->anon_vma).
 271 *
 272 * If (!dst->anon_vma && src->anon_vma) is true, this function tries to find
 273 * and reuse an existing anon_vma which has no vmas and only one child anon_vma.
 274 * This prevents the anon_vma hierarchy from degrading into an endless linear
 275 * chain when a task forks constantly. On the other hand, an anon_vma with more
 276 * than one child isn't reused even if it has no live vma attached, so the rmap
 277 * walker has a good chance of avoiding scanning the whole hierarchy when it
 278 * searches for where a page is mapped.
 279 */
 280int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
 281{
 282        struct anon_vma_chain *avc, *pavc;
 283        struct anon_vma *root = NULL;
 284
 285        list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) {
 286                struct anon_vma *anon_vma;
 287
 288                avc = anon_vma_chain_alloc(GFP_NOWAIT | __GFP_NOWARN);
 289                if (unlikely(!avc)) {
 290                        unlock_anon_vma_root(root);
 291                        root = NULL;
 292                        avc = anon_vma_chain_alloc(GFP_KERNEL);
 293                        if (!avc)
 294                                goto enomem_failure;
 295                }
 296                anon_vma = pavc->anon_vma;
 297                root = lock_anon_vma_root(root, anon_vma);
 298                anon_vma_chain_link(dst, avc, anon_vma);
 299
 300                /*
 301                 * Reuse existing anon_vma if it has no vma and only one
 302                 * anon_vma child.
 303                 *
 304                 * Root anon_vma is never reused:
 305                 * it has self-parent reference and at least one child.
 306                 */
 307                if (!dst->anon_vma && src->anon_vma &&
 308                    anon_vma->num_children < 2 &&
 309                    anon_vma->num_active_vmas == 0)
 310                        dst->anon_vma = anon_vma;
 311        }
 312        if (dst->anon_vma)
 313                dst->anon_vma->num_active_vmas++;
 314        unlock_anon_vma_root(root);
 315        return 0;
 316
 317 enomem_failure:
 318        /*
 319         * dst->anon_vma is dropped here, otherwise its num_active_vmas can
 320         * be incorrectly decremented in unlink_anon_vmas().
 321         * We can safely do this because callers of anon_vma_clone() don't care
 322         * about dst->anon_vma if anon_vma_clone() failed.
 323         */
 324        dst->anon_vma = NULL;
 325        unlink_anon_vmas(dst);
 326        return -ENOMEM;
 327}
 328
 329/*
 330 * Attach vma to its own anon_vma, as well as to the anon_vmas that
 331 * the corresponding VMA in the parent process is attached to.
 332 * Returns 0 on success, non-zero on failure.
 333 */
 334int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
 335{
 336        struct anon_vma_chain *avc;
 337        struct anon_vma *anon_vma;
 338        int error;
 339
 340        /* Don't bother if the parent process has no anon_vma here. */
 341        if (!pvma->anon_vma)
 342                return 0;
 343
 344        /* Drop inherited anon_vma, we'll reuse existing or allocate new. */
 345        vma->anon_vma = NULL;
 346
 347        /*
 348         * First, attach the new VMA to the parent VMA's anon_vmas,
 349         * so rmap can find non-COWed pages in child processes.
 350         */
 351        error = anon_vma_clone(vma, pvma);
 352        if (error)
 353                return error;
 354
 355        /* An existing anon_vma has been reused, all done then. */
 356        if (vma->anon_vma)
 357                return 0;
 358
 359        /* Then add our own anon_vma. */
 360        anon_vma = anon_vma_alloc();
 361        if (!anon_vma)
 362                goto out_error;
 363        anon_vma->num_active_vmas++;
 364        avc = anon_vma_chain_alloc(GFP_KERNEL);
 365        if (!avc)
 366                goto out_error_free_anon_vma;
 367
 368        /*
 369         * The root anon_vma's rwsem is the lock actually used when we
 370         * lock any of the anon_vmas in this anon_vma tree.
 371         */
 372        anon_vma->root = pvma->anon_vma->root;
 373        anon_vma->parent = pvma->anon_vma;
 374        /*
 375         * With refcounts, an anon_vma can stay around longer than the
 376         * process it belongs to. The root anon_vma needs to be pinned until
 377         * this anon_vma is freed, because the lock lives in the root.
 378         */
 379        get_anon_vma(anon_vma->root);
 380        /* Mark this anon_vma as the one where our new (COWed) pages go. */
 381        vma->anon_vma = anon_vma;
 382        anon_vma_lock_write(anon_vma);
 383        anon_vma_chain_link(vma, avc, anon_vma);
 384        anon_vma->parent->num_children++;
 385        anon_vma_unlock_write(anon_vma);
 386
 387        return 0;
 388
 389 out_error_free_anon_vma:
 390        put_anon_vma(anon_vma);
 391 out_error:
 392        unlink_anon_vmas(vma);
 393        return -ENOMEM;
 394}
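
/*
 * Illustrative sketch (not built): the fork path calls anon_vma_fork() once
 * per copied VMA. The function name below other than anon_vma_fork() is a
 * hypothetical name used only for illustration.
 */
#if 0
static int example_copy_one_vma(struct vm_area_struct *new_vma,
				struct vm_area_struct *old_vma)
{
	/* Attach new_vma to old_vma's anon_vmas and give it its own. */
	if (anon_vma_fork(new_vma, old_vma))
		return -ENOMEM;	/* anon_vma_fork() already unlinked new_vma */

	return 0;
}
#endif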
 395
 396void unlink_anon_vmas(struct vm_area_struct *vma)
 397{
 398        struct anon_vma_chain *avc, *next;
 399        struct anon_vma *root = NULL;
 400
 401        /*
 402         * Unlink each anon_vma chained to the VMA.  This list is ordered
 403         * from newest to oldest, ensuring the root anon_vma gets freed last.
 404         */
 405        list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
 406                struct anon_vma *anon_vma = avc->anon_vma;
 407
 408                root = lock_anon_vma_root(root, anon_vma);
 409                anon_vma_interval_tree_remove(avc, &anon_vma->rb_root);
 410
 411                /*
 412                 * Leave empty anon_vmas on the list - we'll need
 413                 * to free them outside the lock.
 414                 */
 415                if (RB_EMPTY_ROOT(&anon_vma->rb_root.rb_root)) {
 416                        anon_vma->parent->num_children--;
 417                        continue;
 418                }
 419
 420                list_del(&avc->same_vma);
 421                anon_vma_chain_free(avc);
 422        }
 423        if (vma->anon_vma) {
 424                vma->anon_vma->num_active_vmas--;
 425
 426                /*
 427                 * The vma may still be used after unlink, and a fresh anon_vma
 428                 * will be prepared when a fault is handled.
 429                 */
 430                vma->anon_vma = NULL;
 431        }
 432        unlock_anon_vma_root(root);
 433
 434        /*
 435         * Iterate the list once more; it now contains only empty and unlinked
 436         * anon_vmas, so destroy them. This could not be done earlier because
 437         * __put_anon_vma() needs to write-acquire the anon_vma->root->rwsem.
 438         */
 439        list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
 440                struct anon_vma *anon_vma = avc->anon_vma;
 441
 442                VM_WARN_ON(anon_vma->num_children);
 443                VM_WARN_ON(anon_vma->num_active_vmas);
 444                put_anon_vma(anon_vma);
 445
 446                list_del(&avc->same_vma);
 447                anon_vma_chain_free(avc);
 448        }
 449}
 450
 451static void anon_vma_ctor(void *data)
 452{
 453        struct anon_vma *anon_vma = data;
 454
 455        init_rwsem(&anon_vma->rwsem);
 456        atomic_set(&anon_vma->refcount, 0);
 457        anon_vma->rb_root = RB_ROOT_CACHED;
 458}
 459
 460void __init anon_vma_init(void)
 461{
 462        anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma),
 463                        0, SLAB_TYPESAFE_BY_RCU|SLAB_PANIC|SLAB_ACCOUNT,
 464                        anon_vma_ctor);
 465        anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain,
 466                        SLAB_PANIC|SLAB_ACCOUNT);
 467}
 468
 469/*
 470 * Getting a lock on a stable anon_vma from a page off the LRU is tricky!
 471 *
 472 * Since there is no serialization whatsoever against folio_remove_rmap_*(),
 473 * the best this function can do is return a refcount-increased anon_vma
 474 * that might have been relevant to this page.
 475 *
 476 * The page might have been remapped to a different anon_vma or the anon_vma
 477 * returned may already be freed (and even reused).
 478 *
 479 * In case it was remapped to a different anon_vma, the new anon_vma will be a
 480 * child of the old anon_vma, and the anon_vma lifetime rules will therefore
 481 * ensure that any anon_vma obtained from the page will still be valid for as
 482 * long as we observe page_mapped() [ hence all those page_mapped() tests ].
 483 *
 484 * All users of this function must be very careful when walking the anon_vma
 485 * chain and verify that the page in question is indeed mapped in it
 486 * [ something equivalent to page_mapped_in_vma() ].
 487 *
 488 * Since anon_vma's slab is SLAB_TYPESAFE_BY_RCU and we know from
 489 * folio_remove_rmap_*() that the anon_vma pointer from page->mapping is valid
 490 * if there is a mapcount, we can dereference the anon_vma after observing
 491 * those.
 492 *
 493 * NOTE: the caller should normally hold folio lock when calling this.  If
 494 * not, the caller needs to double check the anon_vma didn't change after
 495 * taking the anon_vma lock for either read or write (UFFDIO_MOVE can modify it
 496 * concurrently without folio lock protection). See folio_lock_anon_vma_read()
 497 * which has already covered that, and comment above remap_pages().
 498 */
 499struct anon_vma *folio_get_anon_vma(const struct folio *folio)
 500{
 501        struct anon_vma *anon_vma = NULL;
 502        unsigned long anon_mapping;
 503
 504        rcu_read_lock();
 505        anon_mapping = (unsigned long)READ_ONCE(folio->mapping);
 506        if ((anon_mapping & FOLIO_MAPPING_FLAGS) != FOLIO_MAPPING_ANON)
 507                goto out;
 508        if (!folio_mapped(folio))
 509                goto out;
 510
 511        anon_vma = (struct anon_vma *) (anon_mapping - FOLIO_MAPPING_ANON);
 512        if (!atomic_inc_not_zero(&anon_vma->refcount)) {
 513                anon_vma = NULL;
 514                goto out;
 515        }
 516
 517        /*
 518         * If this folio is still mapped, then its anon_vma cannot have been
 519         * freed.  But if it has been unmapped, we have no security against the
 520         * anon_vma structure being freed and reused (for another anon_vma:
 521         * SLAB_TYPESAFE_BY_RCU guarantees that - so the atomic_inc_not_zero()
 522         * above cannot corrupt).
 523         */
 524        if (!folio_mapped(folio)) {
 525                rcu_read_unlock();
 526                put_anon_vma(anon_vma);
 527                return NULL;
 528        }
 529out:
 530        rcu_read_unlock();
 531
 532        return anon_vma;
 533}
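
/*
 * Illustrative sketch (not built) of the expected caller pattern: pin the
 * anon_vma, do work that must not outlive it, then drop the reference.
 * example_pin_anon_vma() is a hypothetical name used only for illustration.
 */
#if 0
static void example_pin_anon_vma(const struct folio *folio)
{
	struct anon_vma *anon_vma = NULL;

	if (folio_test_anon(folio))
		anon_vma = folio_get_anon_vma(folio);

	/* ... work that relies on the anon_vma staying valid ... */

	if (anon_vma)
		put_anon_vma(anon_vma);
}
#endif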
 534
 535/*
 536 * Similar to folio_get_anon_vma() except it locks the anon_vma.
 537 *
 538 * It's a little more complex as it tries to keep the fast path to a single
 539 * atomic op -- the trylock. If we fail the trylock, we fall back to getting a
 540 * reference like with folio_get_anon_vma() and then block on the rwsem
 541 * in the !rwc->try_lock case.
 542 */
 543struct anon_vma *folio_lock_anon_vma_read(const struct folio *folio,
 544                                          struct rmap_walk_control *rwc)
 545{
 546        struct anon_vma *anon_vma = NULL;
 547        struct anon_vma *root_anon_vma;
 548        unsigned long anon_mapping;
 549
 550retry:
 551        rcu_read_lock();
 552        anon_mapping = (unsigned long)READ_ONCE(folio->mapping);
 553        if ((anon_mapping & FOLIO_MAPPING_FLAGS) != FOLIO_MAPPING_ANON)
 554                goto out;
 555        if (!folio_mapped(folio))
 556                goto out;
 557
 558        anon_vma = (struct anon_vma *) (anon_mapping - FOLIO_MAPPING_ANON);
 559        root_anon_vma = READ_ONCE(anon_vma->root);
 560        if (down_read_trylock(&root_anon_vma->rwsem)) {
 561                /*
 562                 * folio_move_anon_rmap() might have changed the anon_vma as we
 563                 * might not hold the folio lock here.
 564                 */
 565                if (unlikely((unsigned long)READ_ONCE(folio->mapping) !=
 566                             anon_mapping)) {
 567                        up_read(&root_anon_vma->rwsem);
 568                        rcu_read_unlock();
 569                        goto retry;
 570                }
 571
 572                /*
 573                 * If the folio is still mapped, then this anon_vma is still
 574                 * its anon_vma, and holding the mutex ensures that it will
 575                 * not go away, see anon_vma_free().
 576                 */
 577                if (!folio_mapped(folio)) {
 578                        up_read(&root_anon_vma->rwsem);
 579                        anon_vma = NULL;
 580                }
 581                goto out;
 582        }
 583
 584        if (rwc && rwc->try_lock) {
 585                anon_vma = NULL;
 586                rwc->contended = true;
 587                goto out;
 588        }
 589
 590        /* trylock failed, we need to sleep */
 591        if (!atomic_inc_not_zero(&anon_vma->refcount)) {
 592                anon_vma = NULL;
 593                goto out;
 594        }
 595
 596        if (!folio_mapped(folio)) {
 597                rcu_read_unlock();
 598                put_anon_vma(anon_vma);
 599                return NULL;
 600        }
 601
 602        /* we pinned the anon_vma, it's safe to sleep */
 603        rcu_read_unlock();
 604        anon_vma_lock_read(anon_vma);
 605
 606        /*
 607         * folio_move_anon_rmap() might have changed the anon_vma as we might
 608         * not hold the folio lock here.
 609         */
 610        if (unlikely((unsigned long)READ_ONCE(folio->mapping) !=
 611                     anon_mapping)) {
 612                anon_vma_unlock_read(anon_vma);
 613                put_anon_vma(anon_vma);
 614                anon_vma = NULL;
 615                goto retry;
 616        }
 617
 618        if (atomic_dec_and_test(&anon_vma->refcount)) {
 619                /*
 620                 * Oops, we held the last refcount, release the lock
 621                 * and bail -- can't simply use put_anon_vma() because
 622                 * we'll deadlock on the anon_vma_lock_write() recursion.
 623                 */
 624                anon_vma_unlock_read(anon_vma);
 625                __put_anon_vma(anon_vma);
 626                anon_vma = NULL;
 627        }
 628
 629        return anon_vma;
 630
 631out:
 632        rcu_read_unlock();
 633        return anon_vma;
 634}
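
/*
 * Illustrative sketch (not built) of how an rmap walk uses this helper:
 * take the shared root lock for reading, walk the interval tree, then
 * unlock. example_walk_anon() is a hypothetical name for illustration.
 */
#if 0
static void example_walk_anon(const struct folio *folio,
			      struct rmap_walk_control *rwc)
{
	struct anon_vma *anon_vma = folio_lock_anon_vma_read(folio, rwc);

	if (!anon_vma)
		return;	/* unmapped, or contended with rwc->try_lock set */

	/* ... anon_vma_interval_tree_foreach() over anon_vma->rb_root ... */

	anon_vma_unlock_read(anon_vma);
}
#endif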
 635
 636#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
 637/*
 638 * Flush TLB entries for recently unmapped pages from remote CPUs. If a PTE
 639 * was dirty when it was unmapped, it is important that it be flushed before
 640 * any IO is initiated on the page, to prevent lost writes. Similarly, it
 641 * must be flushed before the page is freed, to prevent data leakage.
 642 */
 643void try_to_unmap_flush(void)
 644{
 645        struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
 646
 647        if (!tlb_ubc->flush_required)
 648                return;
 649
 650        arch_tlbbatch_flush(&tlb_ubc->arch);
 651        tlb_ubc->flush_required = false;
 652        tlb_ubc->writable = false;
 653}
 654
 655/* Flush iff there are potentially writable TLB entries that can race with IO */
 656void try_to_unmap_flush_dirty(void)
 657{
 658        struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
 659
 660        if (tlb_ubc->writable)
 661                try_to_unmap_flush();
 662}
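
/*
 * Illustrative sketch (not built): reclaim is expected to flush any
 * potentially-writable batched entries before starting writeback on a dirty
 * folio, so no CPU can still write through a stale TLB entry while the page
 * is under I/O. example_flush_before_io() is a hypothetical name.
 */
#if 0
static void example_flush_before_io(struct folio *folio)
{
	if (folio_test_dirty(folio))
		try_to_unmap_flush_dirty();

	/* ... now safe to start I/O on the folio ... */
}
#endif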
 663
 664/*
 665 * Bits 0-14 of mm->tlb_flush_batched record pending generations.
 666 * Bits 16-30 of mm->tlb_flush_batched record flushed generations.
 667 */
 668#define TLB_FLUSH_BATCH_FLUSHED_SHIFT   16
 669#define TLB_FLUSH_BATCH_PENDING_MASK                    \
 670        ((1 << (TLB_FLUSH_BATCH_FLUSHED_SHIFT - 1)) - 1)
 671#define TLB_FLUSH_BATCH_PENDING_LARGE                   \
 672        (TLB_FLUSH_BATCH_PENDING_MASK / 2)
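
/*
 * Illustrative sketch (not built): how the two generation counters are
 * unpacked from the single atomic value; see flush_tlb_batched_pending()
 * below for the real user. example_flush_owed() is a hypothetical name.
 */
#if 0
static bool example_flush_owed(struct mm_struct *mm)
{
	int batch   = atomic_read(&mm->tlb_flush_batched);
	int pending = batch & TLB_FLUSH_BATCH_PENDING_MASK;	/* bits 0-14 */
	int flushed = batch >> TLB_FLUSH_BATCH_FLUSHED_SHIFT;	/* bits 16-30 */

	return pending != flushed;	/* a TLB flush is still owed */
}
#endif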
 673
 674static void set_tlb_ubc_flush_pending(struct mm_struct *mm, pte_t pteval,
 675                unsigned long start, unsigned long end)
 676{
 677        struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
 678        int batch;
 679        bool writable = pte_dirty(pteval);
 680
 681        if (!pte_accessible(mm, pteval))
 682                return;
 683
 684        arch_tlbbatch_add_pending(&tlb_ubc->arch, mm, start, end);
 685        tlb_ubc->flush_required = true;
 686
 687        /*
 688         * Ensure compiler does not re-order the setting of tlb_flush_batched
 689         * before the PTE is cleared.
 690         */
 691        barrier();
 692        batch = atomic_read(&mm->tlb_flush_batched);
 693retry:
 694        if ((batch & TLB_FLUSH_BATCH_PENDING_MASK) > TLB_FLUSH_BATCH_PENDING_LARGE) {
 695                /*
 696                 * Prevent `pending' from catching up with `flushed' because of
 697                 * overflow.  Reset `pending' and `flushed' to be 1 and 0 if
 698                 * `pending' becomes large.
 699                 */
 700                if (!atomic_try_cmpxchg(&mm->tlb_flush_batched, &batch, 1))
 701                        goto retry;
 702        } else {
 703                atomic_inc(&mm->tlb_flush_batched);
 704        }
 705
 706        /*
 707         * If the PTE was dirty then it's best to assume it's writable. The
 708         * caller must use try_to_unmap_flush_dirty() or try_to_unmap_flush()
 709         * before the page is queued for IO.
 710         */
 711        if (writable)
 712                tlb_ubc->writable = true;
 713}
 714
 715/*
 716 * Returns true if the TLB flush should be deferred to the end of a batch of
 717 * unmap operations to reduce IPIs.
 718 */
 719static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
 720{
 721        if (!(flags & TTU_BATCH_FLUSH))
 722                return false;
 723
 724        return arch_tlbbatch_should_defer(mm);
 725}
 726
 727/*
 728 * Reclaim unmaps pages under the PTL but does not flush the TLB prior to
 729 * releasing the PTL if TLB flushes are batched. It's possible for a parallel
 730 * operation such as mprotect or munmap to race between reclaim unmapping
 731 * the page and flushing the stale TLB entry. If this race occurs, it
 732 * potentially allows access to data via a stale TLB entry. Tracking all mm's
 733 * that have TLB batching in flight would be expensive during reclaim, so
 734 * instead track whether TLB batching occurred in the past and if so do a
 735 * flush here if required. This costs one additional flush per reclaim cycle,
 736 * paid by the first operation at risk such as mprotect or munmap.
 737 *
 738 * This must be called under the PTL so that an access to tlb_flush_batched
 739 * that is potentially a "reclaim vs mprotect/munmap/etc" race will synchronise
 740 * via the PTL.
 741 */
 742void flush_tlb_batched_pending(struct mm_struct *mm)
 743{
 744        int batch = atomic_read(&mm->tlb_flush_batched);
 745        int pending = batch & TLB_FLUSH_BATCH_PENDING_MASK;
 746        int flushed = batch >> TLB_FLUSH_BATCH_FLUSHED_SHIFT;
 747
 748        if (pending != flushed) {
 749                flush_tlb_mm(mm);
 750                /*
 751                 * If new TLB flushes became pending while we were flushing,
 752                 * leave mm->tlb_flush_batched as is so those flushes are not lost.
 753                 */
 754                atomic_cmpxchg(&mm->tlb_flush_batched, batch,
 755                               pending | (pending << TLB_FLUSH_BATCH_FLUSHED_SHIFT));
 756        }
 757}
 758#else
 759static void set_tlb_ubc_flush_pending(struct mm_struct *mm, pte_t pteval,
 760                unsigned long start, unsigned long end)
 761{
 762}
 763
 764static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
 765{
 766        return false;
 767}
 768#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
 769
 770/**
 771 * page_address_in_vma - The virtual address of a page in this VMA.
 772 * @folio: The folio containing the page.
 773 * @page: The page within the folio.
 774 * @vma: The VMA we need to know the address in.
 775 *
 776 * Calculates the user virtual address of this page in the specified VMA.
 777 * It is the caller's responsibility to check the page is actually
 778 * within the VMA.  There may not currently be a PTE pointing at this
 779 * page, but if a page fault occurs at this address, this is the page
 780 * which will be accessed.
 781 *
 782 * Context: Caller should hold a reference to the folio.  Caller should
 783 * hold a lock (eg the i_mmap_lock or the mmap_lock) which keeps the
 784 * VMA from being altered.
 785 *
 786 * Return: The virtual address corresponding to this page in the VMA.
 787 */
 788unsigned long page_address_in_vma(const struct folio *folio,
 789                const struct page *page, const struct vm_area_struct *vma)
 790{
 791        if (folio_test_anon(folio)) {
 792                struct anon_vma *anon_vma = folio_anon_vma(folio);
 793                /*
 794                 * Note: swapoff's unuse_vma() is more efficient with this
 795                 * check, and needs it to match anon_vma when KSM is active.
 796                 */
 797                if (!vma->anon_vma || !anon_vma ||
 798                    vma->anon_vma->root != anon_vma->root)
 799                        return -EFAULT;
 800        } else if (!vma->vm_file) {
 801                return -EFAULT;
 802        } else if (vma->vm_file->f_mapping != folio->mapping) {
 803                return -EFAULT;
 804        }
 805
 806        /* KSM folios don't reach here because of the !anon_vma check */
 807        return vma_address(vma, page_pgoff(folio, page), 1);
 808}
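
/*
 * Illustrative sketch (not built): errors are returned as -EFAULT cast to an
 * unsigned long, so callers compare against -EFAULT directly.
 * example_addr_of() is a hypothetical name used only for illustration.
 */
#if 0
static unsigned long example_addr_of(const struct folio *folio,
				     const struct page *page,
				     const struct vm_area_struct *vma)
{
	unsigned long address = page_address_in_vma(folio, page, vma);

	if (address == -EFAULT)
		return 0;	/* page is not mapped in this vma */

	return address;
}
#endif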
 809
 810/*
 811 * Returns the actual pmd_t* where we expect 'address' to be mapped from, or
 812 * NULL if it doesn't exist.  No guarantees / checks on what the pmd_t*
 813 * represents.
 814 */
 815pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
 816{
 817        pgd_t *pgd;
 818        p4d_t *p4d;
 819        pud_t *pud;
 820        pmd_t *pmd = NULL;
 821
 822        pgd = pgd_offset(mm, address);
 823        if (!pgd_present(*pgd))
 824                goto out;
 825
 826        p4d = p4d_offset(pgd, address);
 827        if (!p4d_present(*p4d))
 828                goto out;
 829
 830        pud = pud_offset(p4d, address);
 831        if (!pud_present(*pud))
 832                goto out;
 833
 834        pmd = pmd_offset(pud, address);
 835out:
 836        return pmd;
 837}
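
/*
 * Illustrative sketch (not built) of the usual pattern around mm_find_pmd():
 * look up the pmd, then map and lock the pte before examining it.
 * example_probe_pte() is a hypothetical name used only for illustration.
 */
#if 0
static void example_probe_pte(struct mm_struct *mm, unsigned long address)
{
	pmd_t *pmd = mm_find_pmd(mm, address);
	spinlock_t *ptl;
	pte_t *pte;

	if (!pmd)
		return;

	pte = pte_offset_map_lock(mm, pmd, address, &ptl);
	if (!pte)
		return;

	/* ... examine ptep_get(pte) under the pte lock ... */

	pte_unmap_unlock(pte, ptl);
}
#endif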
 838
 839struct folio_referenced_arg {
 840        int mapcount;
 841        int referenced;
 842        vm_flags_t vm_flags;
 843        struct mem_cgroup *memcg;
 844};
 845
 846/*
 847 * arg: folio_referenced_arg will be passed
 848 */
 849static bool folio_referenced_one(struct folio *folio,
 850                struct vm_area_struct *vma, unsigned long address, void *arg)
 851{
 852        struct folio_referenced_arg *pra = arg;
 853        DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
 854        int referenced = 0;
 855        unsigned long start = address, ptes = 0;
 856
 857        while (page_vma_mapped_walk(&pvmw)) {
 858                address = pvmw.address;
 859
 860                if (vma->vm_flags & VM_LOCKED) {
 861                        if (!folio_test_large(folio) || !pvmw.pte) {
 862                                /* Restore the mlock which got missed */
 863                                mlock_vma_folio(folio, vma);
 864                                page_vma_mapped_walk_done(&pvmw);
 865                                pra->vm_flags |= VM_LOCKED;
 866                                return false; /* To break the loop */
 867                        }
 868                        /*
 869                         * A large folio that is fully mapped to the VMA
 870                         * is handled after the pvmw loop.
 871                         *
 872                         * A large folio that crosses VMA boundaries is
 873                         * expected to be picked up by page reclaim, but
 874                         * references to pages within this VM_LOCKED vma
 875                         * should be skipped, as page reclaim should only
 876                         * count references to pages outside the range of
 877                         * the VM_LOCKED vma.
 878                         */
 879                        ptes++;
 880                        pra->mapcount--;
 881                        continue;
 882                }
 883
 884                /*
 885                 * Skip the non-shared swapbacked folio mapped solely by
 886                 * the exiting or OOM-reaped process. This avoids redundant
 887                 * swap-out followed by an immediate unmap.
 888                 */
 889                if ((!atomic_read(&vma->vm_mm->mm_users) ||
 890                    check_stable_address_space(vma->vm_mm)) &&
 891                    folio_test_anon(folio) && folio_test_swapbacked(folio) &&
 892                    !folio_maybe_mapped_shared(folio)) {
 893                        pra->referenced = -1;
 894                        page_vma_mapped_walk_done(&pvmw);
 895                        return false;
 896                }
 897
 898                if (lru_gen_enabled() && pvmw.pte) {
 899                        if (lru_gen_look_around(&pvmw))
 900                                referenced++;
 901                } else if (pvmw.pte) {
 902                        if (ptep_clear_flush_young_notify(vma, address,
 903                                                pvmw.pte))
 904                                referenced++;
 905                } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
 906                        if (pmdp_clear_flush_young_notify(vma, address,
 907                                                pvmw.pmd))
 908                                referenced++;
 909                } else {
 910                        /* unexpected pmd-mapped folio? */
 911                        WARN_ON_ONCE(1);
 912                }
 913
 914                pra->mapcount--;
 915        }
 916
 917        if ((vma->vm_flags & VM_LOCKED) &&
 918                        folio_test_large(folio) &&
 919                        folio_within_vma(folio, vma)) {
 920                unsigned long s_align, e_align;
 921
 922                s_align = ALIGN_DOWN(start, PMD_SIZE);
 923                e_align = ALIGN_DOWN(start + folio_size(folio) - 1, PMD_SIZE);
 924
 925                /* folio doesn't cross a page table boundary and is fully mapped */
 926                if ((s_align == e_align) && (ptes == folio_nr_pages(folio))) {
 927                        /* Restore the mlock which got missed */
 928                        mlock_vma_folio(folio, vma);
 929                        pra->vm_flags |= VM_LOCKED;
 930                        return false; /* To break the loop */
 931                }
 932        }
 933
 934        if (referenced)
 935                folio_clear_idle(folio);
 936        if (folio_test_clear_young(folio))
 937                referenced++;
 938
 939        if (referenced) {
 940                pra->referenced++;
 941                pra->vm_flags |= vma->vm_flags & ~VM_LOCKED;
 942        }
 943
 944        if (!pra->mapcount)
 945                return false; /* To break the loop */
 946
 947        return true;
 948}
 949
 950static bool invalid_folio_referenced_vma(struct vm_area_struct *vma, void *arg)
 951{
 952        struct folio_referenced_arg *pra = arg;
 953        struct mem_cgroup *memcg = pra->memcg;
 954
 955        /*
 956         * Ignore references from this mapping if it has no recency. If the
 957         * folio has been used in another mapping, we will catch it; if this
 958         * other mapping is already gone, the unmap path will have set the
 959         * referenced flag or activated the folio in zap_pte_range().
 960         */
 961        if (!vma_has_recency(vma))
 962                return true;
 963
 964        /*
 965         * If we are reclaiming on behalf of a cgroup, skip counting on behalf
 966         * of references from different cgroups.
 967         */
 968        if (memcg && !mm_match_cgroup(vma->vm_mm, memcg))
 969                return true;
 970
 971        return false;
 972}
 973
 974/**
 975 * folio_referenced() - Test if the folio was referenced.
 976 * @folio: The folio to test.
 977 * @is_locked: Caller holds lock on the folio.
 978 * @memcg: target memory cgroup
 979 * @vm_flags: A combination of all the vma->vm_flags which referenced the folio.
 980 *
 981 * Quick test_and_clear_referenced for all mappings of a folio.
 982 *
 983 * Return: The number of mappings which referenced the folio. Return -1 if
 984 * the function bailed out due to rmap lock contention.
 985 */
 986int folio_referenced(struct folio *folio, int is_locked,
 987                     struct mem_cgroup *memcg, vm_flags_t *vm_flags)
 988{
 989        bool we_locked = false;
 990        struct folio_referenced_arg pra = {
 991                .mapcount = folio_mapcount(folio),
 992                .memcg = memcg,
 993        };
 994        struct rmap_walk_control rwc = {
 995                .rmap_one = folio_referenced_one,
 996                .arg = (void *)&pra,
 997                .anon_lock = folio_lock_anon_vma_read,
 998                .try_lock = true,
 999                .invalid_vma = invalid_folio_referenced_vma,
1000        };
1001
1002        *vm_flags = 0;
1003        if (!pra.mapcount)
1004                return 0;
1005
1006        if (!folio_raw_mapping(folio))
1007                return 0;
1008
1009        if (!is_locked && (!folio_test_anon(folio) || folio_test_ksm(folio))) {
1010                we_locked = folio_trylock(folio);
1011                if (!we_locked)
1012                        return 1;
1013        }
1014
1015        rmap_walk(folio, &rwc);
1016        *vm_flags = pra.vm_flags;
1017
1018        if (we_locked)
1019                folio_unlock(folio);
1020
1021        return rwc.contended ? -1 : pra.referenced;
1022}
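
/*
 * Illustrative sketch (not built) of a reclaim-side caller, in the spirit of
 * folio_check_references(): a -1 return means the rmap lock was contended
 * and the folio should be treated as recently used.
 * example_was_referenced() is a hypothetical name used only for illustration.
 */
#if 0
static int example_was_referenced(struct folio *folio,
				  struct mem_cgroup *memcg)
{
	vm_flags_t vm_flags;
	int referenced = folio_referenced(folio, folio_test_locked(folio),
					  memcg, &vm_flags);

	if (referenced == -1 || (vm_flags & VM_LOCKED))
		return 1;	/* keep it: contended lock or mlocked vma */

	return referenced > 0;
}
#endif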
1023
1024static int page_vma_mkclean_one(struct page_vma_mapped_walk *pvmw)
1025{
1026        int cleaned = 0;
1027        struct vm_area_struct *vma = pvmw->vma;
1028        struct mmu_notifier_range range;
1029        unsigned long address = pvmw->address;
1030
1031        /*
1032         * We have to assume the worst case, i.e. pmd, for invalidation. Note
1033         * that the folio cannot be freed from this function.
1034         */
1035        mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE, 0,
1036                                vma->vm_mm, address, vma_address_end(pvmw));
1037        mmu_notifier_invalidate_range_start(&range);
1038
1039        while (page_vma_mapped_walk(pvmw)) {
1040                int ret = 0;
1041
1042                address = pvmw->address;
1043                if (pvmw->pte) {
1044                        pte_t *pte = pvmw->pte;
1045                        pte_t entry = ptep_get(pte);
1046
1047                        /*
1048                         * PFN swap PTEs, such as device-exclusive ones, that
1049                         * actually map pages are clean and not writable from a
1050                         * CPU perspective. The MMU notifier takes care of any
1051                         * device aspects.
1052                         */
1053                        if (!pte_present(entry))
1054                                continue;
1055                        if (!pte_dirty(entry) && !pte_write(entry))
1056                                continue;
1057
1058                        flush_cache_page(vma, address, pte_pfn(entry));
1059                        entry = ptep_clear_flush(vma, address, pte);
1060                        entry = pte_wrprotect(entry);
1061                        entry = pte_mkclean(entry);
1062                        set_pte_at(vma->vm_mm, address, pte, entry);
1063                        ret = 1;
1064                } else {
1065#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1066                        pmd_t *pmd = pvmw->pmd;
1067                        pmd_t entry;
1068
1069                        if (!pmd_dirty(*pmd) && !pmd_write(*pmd))
1070                                continue;
1071
1072                        flush_cache_range(vma, address,
1073                                          address + HPAGE_PMD_SIZE);
1074                        entry = pmdp_invalidate(vma, address, pmd);
1075                        entry = pmd_wrprotect(entry);
1076                        entry = pmd_mkclean(entry);
1077                        set_pmd_at(vma->vm_mm, address, pmd, entry);
1078                        ret = 1;
1079#else
1080                        /* unexpected pmd-mapped folio? */
1081                        WARN_ON_ONCE(1);
1082#endif
1083                }
1084
1085                if (ret)
1086                        cleaned++;
1087        }
1088
1089        mmu_notifier_invalidate_range_end(&range);
1090
1091        return cleaned;
1092}
1093
1094static bool page_mkclean_one(struct folio *folio, struct vm_area_struct *vma,
1095                             unsigned long address, void *arg)
1096{
1097        DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, PVMW_SYNC);
1098        int *cleaned = arg;
1099
1100        *cleaned += page_vma_mkclean_one(&pvmw);
1101
1102        return true;
1103}
1104
1105static bool invalid_mkclean_vma(struct vm_area_struct *vma, void *arg)
1106{
1107        if (vma->vm_flags & VM_SHARED)
1108                return false;
1109
1110        return true;
1111}
1112
1113int folio_mkclean(struct folio *folio)
1114{
1115        int cleaned = 0;
1116        struct address_space *mapping;
1117        struct rmap_walk_control rwc = {
1118                .arg = (void *)&cleaned,
1119                .rmap_one = page_mkclean_one,
1120                .invalid_vma = invalid_mkclean_vma,
1121        };
1122
1123        BUG_ON(!folio_test_locked(folio));
1124
1125        if (!folio_mapped(folio))
1126                return 0;
1127
1128        mapping = folio_mapping(folio);
1129        if (!mapping)
1130                return 0;
1131
1132        rmap_walk(folio, &rwc);
1133
1134        return cleaned;
1135}
1136EXPORT_SYMBOL_GPL(folio_mkclean);
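
/*
 * Illustrative sketch (not built): write-protecting all mappings before
 * writeback, in the spirit of folio_clear_dirty_for_io(). If any PTE was
 * dirty, the folio must be redirtied so the data is not lost.
 * example_clean_before_io() is a hypothetical name for illustration only.
 */
#if 0
static void example_clean_before_io(struct folio *folio)
{
	if (folio_mkclean(folio))
		folio_mark_dirty(folio);
}
#endif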
1137
1138struct wrprotect_file_state {
1139        int cleaned;
1140        pgoff_t pgoff;
1141        unsigned long pfn;
1142        unsigned long nr_pages;
1143};
1144
1145static bool mapping_wrprotect_range_one(struct folio *folio,
1146                struct vm_area_struct *vma, unsigned long address, void *arg)
1147{
1148        struct wrprotect_file_state *state = (struct wrprotect_file_state *)arg;
1149        struct page_vma_mapped_walk pvmw = {
1150                .pfn            = state->pfn,
1151                .nr_pages       = state->nr_pages,
1152                .pgoff          = state->pgoff,
1153                .vma            = vma,
1154                .address        = address,
1155                .flags          = PVMW_SYNC,
1156        };
1157
1158        state->cleaned += page_vma_mkclean_one(&pvmw);
1159
1160        return true;
1161}
1162
1163static void __rmap_walk_file(struct folio *folio, struct address_space *mapping,
1164                             pgoff_t pgoff_start, unsigned long nr_pages,
1165                             struct rmap_walk_control *rwc, bool locked);
1166
1167/**
1168 * mapping_wrprotect_range() - Write-protect all mappings in a specified range.
1169 *
1170 * @mapping:    The mapping whose reverse mapping should be traversed.
1171 * @pgoff:      The page offset at which @pfn is mapped within @mapping.
1172 * @pfn:        The PFN of the page mapped in @mapping at @pgoff.
1173 * @nr_pages:   The number of physically contiguous base pages spanned.
1174 *
1175 * Traverses the reverse mapping, finding all VMAs which contain a shared
1176 * mapping of the pages in the specified range in @mapping, and write-protects
1177 * them (that is, updates the page tables to mark the mappings read-only such
1178 * that a write protection fault arises when the mappings are written to).
1179 *
1180 * The @pfn value need not refer to a folio, but rather can reference a kernel
1181 * allocation which is mapped into userland. We therefore do not require that
1182 * the page maps to a folio with a valid mapping or index field, rather the
1183 * caller specifies these in @mapping and @pgoff.
1184 *
1185 * Return: the number of write-protected PTEs, or an error.
1186 */
1187int mapping_wrprotect_range(struct address_space *mapping, pgoff_t pgoff,
1188                unsigned long pfn, unsigned long nr_pages)
1189{
1190        struct wrprotect_file_state state = {
1191                .cleaned = 0,
1192                .pgoff = pgoff,
1193                .pfn = pfn,
1194                .nr_pages = nr_pages,
1195        };
1196        struct rmap_walk_control rwc = {
1197                .arg = (void *)&state,
1198                .rmap_one = mapping_wrprotect_range_one,
1199                .invalid_vma = invalid_mkclean_vma,
1200        };
1201
1202        if (!mapping)
1203                return 0;
1204
1205        __rmap_walk_file(/* folio = */NULL, mapping, pgoff, nr_pages, &rwc,
1206                         /* locked = */false);
1207
1208        return state.cleaned;
1209}
1210EXPORT_SYMBOL_GPL(mapping_wrprotect_range);
1211
1212/**
1213 * pfn_mkclean_range - Cleans the PTEs (including PMDs) that map the range
1214 *                     [@pfn, @pfn + @nr_pages) at the specified offset (@pgoff)
1215 *                     within @vma for shared mappings. Since clean PTEs
1216 *                     should also be readonly, write-protects them too.
1217 * @pfn: start pfn.
1218 * @nr_pages: number of physically contiguous pages starting with @pfn.
1219 * @pgoff: page offset at which @pfn is mapped.
1220 * @vma: vma that @pfn is mapped within.
1221 *
1222 * Returns the number of cleaned PTEs (including PMDs).
1223 */
1224int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff,
1225                      struct vm_area_struct *vma)
1226{
1227        struct page_vma_mapped_walk pvmw = {
1228                .pfn            = pfn,
1229                .nr_pages       = nr_pages,
1230                .pgoff          = pgoff,
1231                .vma            = vma,
1232                .flags          = PVMW_SYNC,
1233        };
1234
1235        if (invalid_mkclean_vma(vma, NULL))
1236                return 0;
1237
1238        pvmw.address = vma_address(vma, pgoff, nr_pages);
1239        VM_BUG_ON_VMA(pvmw.address == -EFAULT, vma);
1240
1241        return page_vma_mkclean_one(&pvmw);
1242}
1243
1244static __always_inline unsigned int __folio_add_rmap(struct folio *folio,
1245                struct page *page, int nr_pages, struct vm_area_struct *vma,
1246                enum rmap_level level, int *nr_pmdmapped)
1247{
1248        atomic_t *mapped = &folio->_nr_pages_mapped;
1249        const int orig_nr_pages = nr_pages;
1250        int first = 0, nr = 0;
1251
1252        __folio_rmap_sanity_checks(folio, page, nr_pages, level);
1253
1254        switch (level) {
1255        case RMAP_LEVEL_PTE:
1256                if (!folio_test_large(folio)) {
1257                        nr = atomic_inc_and_test(&folio->_mapcount);
1258                        break;
1259                }
1260
1261                if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) {
1262                        nr = folio_add_return_large_mapcount(folio, orig_nr_pages, vma);
1263                        if (nr == orig_nr_pages)
1264                                /* Was completely unmapped. */
1265                                nr = folio_large_nr_pages(folio);
1266                        else
1267                                nr = 0;
1268                        break;
1269                }
1270
1271                do {
1272                        first += atomic_inc_and_test(&page->_mapcount);
1273                } while (page++, --nr_pages > 0);
1274
1275                if (first &&
1276                    atomic_add_return_relaxed(first, mapped) < ENTIRELY_MAPPED)
1277                        nr = first;
1278
1279                folio_add_large_mapcount(folio, orig_nr_pages, vma);
1280                break;
1281        case RMAP_LEVEL_PMD:
1282        case RMAP_LEVEL_PUD:
1283                first = atomic_inc_and_test(&folio->_entire_mapcount);
1284                if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) {
1285                        if (level == RMAP_LEVEL_PMD && first)
1286                                *nr_pmdmapped = folio_large_nr_pages(folio);
1287                        nr = folio_inc_return_large_mapcount(folio, vma);
1288                        if (nr == 1)
1289                                /* Was completely unmapped. */
1290                                nr = folio_large_nr_pages(folio);
1291                        else
1292                                nr = 0;
1293                        break;
1294                }
1295
1296                if (first) {
1297                        nr = atomic_add_return_relaxed(ENTIRELY_MAPPED, mapped);
1298                        if (likely(nr < ENTIRELY_MAPPED + ENTIRELY_MAPPED)) {
1299                                nr_pages = folio_large_nr_pages(folio);
1300                                /*
1301                                 * We only track PMD mappings of PMD-sized
1302                                 * folios separately.
1303                                 */
1304                                if (level == RMAP_LEVEL_PMD)
1305                                        *nr_pmdmapped = nr_pages;
1306                                nr = nr_pages - (nr & FOLIO_PAGES_MAPPED);
1307                                /* Raced ahead of a remove and another add? */
1308                                if (unlikely(nr < 0))
1309                                        nr = 0;
1310                        } else {
1311                                /* Raced ahead of a remove of ENTIRELY_MAPPED */
1312                                nr = 0;
1313                        }
1314                }
1315                folio_inc_large_mapcount(folio, vma);
1316                break;
1317        }
1318        return nr;
1319}
1320
1321/**
1322 * folio_move_anon_rmap - move a folio to our anon_vma
1323 * @folio:      The folio to move to our anon_vma
1324 * @vma:        The vma the folio belongs to
1325 *
1326 * When a folio belongs exclusively to one process after a COW event,
1327 * that folio can be moved into the anon_vma that belongs to just that
1328 * process, so the rmap code will not search the parent or sibling processes.
1329 */
1330void folio_move_anon_rmap(struct folio *folio, struct vm_area_struct *vma)
1331{
1332        void *anon_vma = vma->anon_vma;
1333
1334        VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
1335        VM_BUG_ON_VMA(!anon_vma, vma);
1336
1337        anon_vma += FOLIO_MAPPING_ANON;
1338        /*
1339         * Ensure that anon_vma and the FOLIO_MAPPING_ANON bit are written
1340         * simultaneously, so a concurrent reader (eg folio_referenced()'s
1341         * folio_test_anon()) will not see one without the other.
1342         */
1343        WRITE_ONCE(folio->mapping, anon_vma);
1344}
1345
1346/**
1347 * __folio_set_anon - set up a new anonymous rmap for a folio
1348 * @folio:      The folio to set up the new anonymous rmap for.
1349 * @vma:        VM area to add the folio to.
1350 * @address:    User virtual address of the mapping
1351 * @exclusive:  Whether the folio is exclusive to the process.
1352 */
1353static void __folio_set_anon(struct folio *folio, struct vm_area_struct *vma,
1354                             unsigned long address, bool exclusive)
1355{
1356        struct anon_vma *anon_vma = vma->anon_vma;
1357
1358        BUG_ON(!anon_vma);
1359
1360        /*
1361         * If the folio isn't exclusive to this vma, we must use the _oldest_
1362         * possible anon_vma for the folio mapping!
1363         */
1364        if (!exclusive)
1365                anon_vma = anon_vma->root;
1366
1367        /*
1368         * page_idle does a lockless/optimistic rmap scan on folio->mapping.
1369         * Make sure the compiler doesn't split the stores of anon_vma and
1370         * the FOLIO_MAPPING_ANON type identifier, otherwise the rmap code
1371         * could mistake the mapping for a struct address_space and crash.
1372         */
1373        anon_vma = (void *) anon_vma + FOLIO_MAPPING_ANON;
1374        WRITE_ONCE(folio->mapping, (struct address_space *) anon_vma);
1375        folio->index = linear_page_index(vma, address);
1376}
1377
1378/**
1379 * __page_check_anon_rmap - sanity check anonymous rmap addition
1380 * @folio:      The folio containing @page.
1381 * @page:       the page to check the mapping of
1382 * @vma:        the vm area in which the mapping is added
1383 * @address:    the user virtual address mapped
1384 */
1385static void __page_check_anon_rmap(const struct folio *folio,
1386                const struct page *page, struct vm_area_struct *vma,
1387                unsigned long address)
1388{
1389        /*
1390         * The page's anon-rmap details (mapping and index) are guaranteed to
1391         * be set up correctly at this point.
1392         *
1393         * We have exclusion against folio_add_anon_rmap_*() because the caller
1394         * always holds the page locked.
1395         *
1396         * We have exclusion against folio_add_new_anon_rmap because those pages
1397         * are initially only visible via the pagetables, and the pte is locked
1398         * over the call to folio_add_new_anon_rmap.
1399         */
1400        VM_BUG_ON_FOLIO(folio_anon_vma(folio)->root != vma->anon_vma->root,
1401                        folio);
1402        VM_BUG_ON_PAGE(page_pgoff(folio, page) != linear_page_index(vma, address),
1403                       page);
1404}
1405
1406static void __folio_mod_stat(struct folio *folio, int nr, int nr_pmdmapped)
1407{
1408        int idx;
1409
1410        if (nr) {
1411                idx = folio_test_anon(folio) ? NR_ANON_MAPPED : NR_FILE_MAPPED;
1412                __lruvec_stat_mod_folio(folio, idx, nr);
1413        }
1414        if (nr_pmdmapped) {
1415                if (folio_test_anon(folio)) {
1416                        idx = NR_ANON_THPS;
1417                        __lruvec_stat_mod_folio(folio, idx, nr_pmdmapped);
1418                } else {
1419                        /* NR_*_PMDMAPPED are not maintained per-memcg */
1420                        idx = folio_test_swapbacked(folio) ?
1421                                NR_SHMEM_PMDMAPPED : NR_FILE_PMDMAPPED;
1422                        __mod_node_page_state(folio_pgdat(folio), idx,
1423                                              nr_pmdmapped);
1424                }
1425        }
1426}
1427
1428static __always_inline void __folio_add_anon_rmap(struct folio *folio,
1429                struct page *page, int nr_pages, struct vm_area_struct *vma,
1430                unsigned long address, rmap_t flags, enum rmap_level level)
1431{
1432        int i, nr, nr_pmdmapped = 0;
1433
1434        VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
1435
1436        nr = __folio_add_rmap(folio, page, nr_pages, vma, level, &nr_pmdmapped);
1437
1438        if (likely(!folio_test_ksm(folio)))
1439                __page_check_anon_rmap(folio, page, vma, address);
1440
1441        __folio_mod_stat(folio, nr, nr_pmdmapped);
1442
1443        if (flags & RMAP_EXCLUSIVE) {
1444                switch (level) {
1445                case RMAP_LEVEL_PTE:
1446                        for (i = 0; i < nr_pages; i++)
1447                                SetPageAnonExclusive(page + i);
1448                        break;
1449                case RMAP_LEVEL_PMD:
1450                        SetPageAnonExclusive(page);
1451                        break;
1452                case RMAP_LEVEL_PUD:
1453                        /*
1454                         * Keep the compiler happy, we don't support anonymous
1455                         * PUD mappings.
1456                         */
1457                        WARN_ON_ONCE(1);
1458                        break;
1459                }
1460        }
1461
1462        VM_WARN_ON_FOLIO(!folio_test_large(folio) && PageAnonExclusive(page) &&
1463                         atomic_read(&folio->_mapcount) > 0, folio);
1464        for (i = 0; i < nr_pages; i++) {
1465                struct page *cur_page = page + i;
1466
1467                VM_WARN_ON_FOLIO(folio_test_large(folio) &&
1468                                 folio_entire_mapcount(folio) > 1 &&
1469                                 PageAnonExclusive(cur_page), folio);
1470                if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT))
1471                        continue;
1472
1473                /*
1474                 * While PTE-mapping a THP we have a PMD and a PTE
1475                 * mapping.
1476                 */
1477                VM_WARN_ON_FOLIO(atomic_read(&cur_page->_mapcount) > 0 &&
1478                                 PageAnonExclusive(cur_page), folio);
1479        }
1480
1481        /*
1482         * For a large folio, only mlock it if it's fully mapped to the VMA.
1483         * It's not easy to check here whether the large folio is fully
1484         * mapped, so only mlock normal 4K folios and leave page reclaim to
1485         * handle large folios.
1486         */
1487        if (!folio_test_large(folio))
1488                mlock_vma_folio(folio, vma);
1489}
1490
1491/**
1492 * folio_add_anon_rmap_ptes - add PTE mappings to a page range of an anon folio
1493 * @folio:      The folio to add the mappings to
1494 * @page:       The first page to add
1495 * @nr_pages:   The number of pages which will be mapped
1496 * @vma:        The vm area in which the mappings are added
1497 * @address:    The user virtual address of the first page to map
1498 * @flags:      The rmap flags
1499 *
1500 * The page range of the folio is defined by [page, page + nr_pages)
1501 *
1502 * The caller needs to hold the page table lock, and the page must be locked in
1503 * the anon_vma case: to serialize checking of mapping and index after they are
1504 * set, and to ensure that an anon folio is not being upgraded racily to a KSM
1505 * folio (but KSM folios are never downgraded).
1506 */
1507void folio_add_anon_rmap_ptes(struct folio *folio, struct page *page,
1508                int nr_pages, struct vm_area_struct *vma, unsigned long address,
1509                rmap_t flags)
1510{
1511        __folio_add_anon_rmap(folio, page, nr_pages, vma, address, flags,
1512                              RMAP_LEVEL_PTE);
1513}
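
/*
 * Illustrative sketch (editor's addition, not part of rmap.c): a caller that
 * installs nr contiguous PTEs for an already-anon folio, holding the page
 * table lock and the folio lock, pairs the rmap update with the PTE
 * installation roughly as follows (addr/ptep/entry/nr are placeholders):
 *
 *	folio_add_anon_rmap_ptes(folio, page, nr, vma, addr,
 *				 exclusive ? RMAP_EXCLUSIVE : RMAP_NONE);
 *	set_ptes(vma->vm_mm, addr, ptep, entry, nr);
 *
 * The single-page wrapper folio_add_anon_rmap_pte() in <linux/rmap.h> simply
 * forwards here with nr == 1.
 */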
1514
1515/**
1516 * folio_add_anon_rmap_pmd - add a PMD mapping to a page range of an anon folio
1517 * @folio:      The folio to add the mapping to
1518 * @page:       The first page to add
1519 * @vma:        The vm area in which the mapping is added
1520 * @address:    The user virtual address of the first page to map
1521 * @flags:      The rmap flags
1522 *
1523 * The page range of the folio is defined by [page, page + HPAGE_PMD_NR)
1524 *
1525 * The caller needs to hold the page table lock, and the page must be locked in
1526 * the anon_vma case: to serialize checking of mapping and index after they are set.
1527 */
1528void folio_add_anon_rmap_pmd(struct folio *folio, struct page *page,
1529                struct vm_area_struct *vma, unsigned long address, rmap_t flags)
1530{
1531#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1532        __folio_add_anon_rmap(folio, page, HPAGE_PMD_NR, vma, address, flags,
1533                              RMAP_LEVEL_PMD);
1534#else
1535        WARN_ON_ONCE(true);
1536#endif
1537}
1538
1539/**
1540 * folio_add_new_anon_rmap - Add mapping to a new anonymous folio.
1541 * @folio:      The folio to add the mapping to.
1542 * @vma:        the vm area in which the mapping is added
1543 * @address:    the user virtual address mapped
1544 * @flags:      The rmap flags
1545 *
1546 * Like folio_add_anon_rmap_*() but must only be called on *new* folios.
1547 * This means the inc-and-test can be bypassed.
1548 * The folio doesn't necessarily need to be locked while it's exclusive
1549 * unless two threads map it concurrently. However, the folio must be
1550 * locked if it's shared.
1551 *
1552 * If the folio is pmd-mappable, it is accounted as a THP.
1553 */
1554void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma,
1555                unsigned long address, rmap_t flags)
1556{
1557        const bool exclusive = flags & RMAP_EXCLUSIVE;
1558        int nr = 1, nr_pmdmapped = 0;
1559
1560        VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);
1561        VM_WARN_ON_FOLIO(!exclusive && !folio_test_locked(folio), folio);
1562
1563        /*
1564         * VM_DROPPABLE mappings don't swap; instead they're just dropped when
1565         * under memory pressure.
1566         */
1567        if (!folio_test_swapbacked(folio) && !(vma->vm_flags & VM_DROPPABLE))
1568                __folio_set_swapbacked(folio);
1569        __folio_set_anon(folio, vma, address, exclusive);
1570
1571        if (likely(!folio_test_large(folio))) {
1572                /* increment count (starts at -1) */
1573                atomic_set(&folio->_mapcount, 0);
1574                if (exclusive)
1575                        SetPageAnonExclusive(&folio->page);
1576        } else if (!folio_test_pmd_mappable(folio)) {
1577                int i;
1578
1579                nr = folio_large_nr_pages(folio);
1580                for (i = 0; i < nr; i++) {
1581                        struct page *page = folio_page(folio, i);
1582
1583                        if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT))
1584                                /* increment count (starts at -1) */
1585                                atomic_set(&page->_mapcount, 0);
1586                        if (exclusive)
1587                                SetPageAnonExclusive(page);
1588                }
1589
1590                folio_set_large_mapcount(folio, nr, vma);
1591                if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT))
1592                        atomic_set(&folio->_nr_pages_mapped, nr);
1593        } else {
1594                nr = folio_large_nr_pages(folio);
1595                /* increment count (starts at -1) */
1596                atomic_set(&folio->_entire_mapcount, 0);
1597                folio_set_large_mapcount(folio, 1, vma);
1598                if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT))
1599                        atomic_set(&folio->_nr_pages_mapped, ENTIRELY_MAPPED);
1600                if (exclusive)
1601                        SetPageAnonExclusive(&folio->page);
1602                nr_pmdmapped = nr;
1603        }
1604
1605        VM_WARN_ON_ONCE(address < vma->vm_start ||
1606                        address + (nr << PAGE_SHIFT) > vma->vm_end);
1607
1608        __folio_mod_stat(folio, nr, nr_pmdmapped);
1609        mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON, 1);
1610}
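
/*
 * Illustrative sketch (editor's addition, not part of rmap.c): a simplified
 * anonymous-fault sequence allocates a fresh folio and only then exposes it
 * via the page tables (memcg charging and error handling elided; addr, ptep
 * and entry are placeholders):
 *
 *	folio = vma_alloc_zeroed_movable_folio(vma, addr);
 *	__folio_mark_uptodate(folio);
 *	... build the pte value, take the PTE lock, recheck pte_none() ...
 *	folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE);
 *	folio_add_lru_vma(folio, vma);
 *	set_pte_at(vma->vm_mm, addr, ptep, entry);
 *
 * Because the folio is not yet visible to rmap or the page cache, the
 * exclusive case needs no folio lock, as documented above.
 */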
1611
1612static __always_inline void __folio_add_file_rmap(struct folio *folio,
1613                struct page *page, int nr_pages, struct vm_area_struct *vma,
1614                enum rmap_level level)
1615{
1616        int nr, nr_pmdmapped = 0;
1617
1618        VM_WARN_ON_FOLIO(folio_test_anon(folio), folio);
1619
1620        nr = __folio_add_rmap(folio, page, nr_pages, vma, level, &nr_pmdmapped);
1621        __folio_mod_stat(folio, nr, nr_pmdmapped);
1622
1623        /* See comments in folio_add_anon_rmap_*() */
1624        if (!folio_test_large(folio))
1625                mlock_vma_folio(folio, vma);
1626}
1627
1628/**
1629 * folio_add_file_rmap_ptes - add PTE mappings to a page range of a folio
1630 * @folio:      The folio to add the mappings to
1631 * @page:       The first page to add
1632 * @nr_pages:   The number of pages that will be mapped using PTEs
1633 * @vma:        The vm area in which the mappings are added
1634 *
1635 * The page range of the folio is defined by [page, page + nr_pages)
1636 *
1637 * The caller needs to hold the page table lock.
1638 */
1639void folio_add_file_rmap_ptes(struct folio *folio, struct page *page,
1640                int nr_pages, struct vm_area_struct *vma)
1641{
1642        __folio_add_file_rmap(folio, page, nr_pages, vma, RMAP_LEVEL_PTE);
1643}
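
/*
 * Illustrative sketch (editor's addition, not part of rmap.c): a file fault
 * that maps nr consecutive pages of a pagecache folio performs the rmap and
 * PTE updates together under the page table lock, roughly (addr/ptep/entry
 * are placeholders):
 *
 *	folio_add_file_rmap_ptes(folio, page, nr, vma);
 *	set_ptes(vma->vm_mm, addr, ptep, entry, nr);
 *
 * Only the page table lock is required, per the kernel-doc above; the folio
 * lock rules of the anon variants do not apply here.
 */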
1644
1645/**
1646 * folio_add_file_rmap_pmd - add a PMD mapping to a page range of a folio
1647 * @folio:      The folio to add the mapping to
1648 * @page:       The first page to add
1649 * @vma:        The vm area in which the mapping is added
1650 *
1651 * The page range of the folio is defined by [page, page + HPAGE_PMD_NR)
1652 *
1653 * The caller needs to hold the page table lock.
1654 */
1655void folio_add_file_rmap_pmd(struct folio *folio, struct page *page,
1656                struct vm_area_struct *vma)
1657{
1658#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1659        __folio_add_file_rmap(folio, page, HPAGE_PMD_NR, vma, RMAP_LEVEL_PMD);
1660#else
1661        WARN_ON_ONCE(true);
1662#endif
1663}
1664
1665/**
1666 * folio_add_file_rmap_pud - add a PUD mapping to a page range of a folio
1667 * @folio:      The folio to add the mapping to
1668 * @page:       The first page to add
1669 * @vma:        The vm area in which the mapping is added
1670 *
1671 * The page range of the folio is defined by [page, page + HPAGE_PUD_NR)
1672 *
1673 * The caller needs to hold the page table lock.
1674 */
1675void folio_add_file_rmap_pud(struct folio *folio, struct page *page,
1676                struct vm_area_struct *vma)
1677{
1678#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
1679        defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
1680        __folio_add_file_rmap(folio, page, HPAGE_PUD_NR, vma, RMAP_LEVEL_PUD);
1681#else
1682        WARN_ON_ONCE(true);
1683#endif
1684}
1685
1686static __always_inline void __folio_remove_rmap(struct folio *folio,
1687                struct page *page, int nr_pages, struct vm_area_struct *vma,
1688                enum rmap_level level)
1689{
1690        atomic_t *mapped = &folio->_nr_pages_mapped;
1691        int last = 0, nr = 0, nr_pmdmapped = 0;
1692        bool partially_mapped = false;
1693
1694        __folio_rmap_sanity_checks(folio, page, nr_pages, level);
1695
1696        switch (level) {
1697        case RMAP_LEVEL_PTE:
1698                if (!folio_test_large(folio)) {
1699                        nr = atomic_add_negative(-1, &folio->_mapcount);
1700                        break;
1701                }
1702
1703                if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) {
1704                        nr = folio_sub_return_large_mapcount(folio, nr_pages, vma);
1705                        if (!nr) {
1706                                /* Now completely unmapped. */
1707                                nr = folio_nr_pages(folio);
1708                        } else {
1709                                partially_mapped = nr < folio_large_nr_pages(folio) &&
1710                                                   !folio_entire_mapcount(folio);
1711                                nr = 0;
1712                        }
1713                        break;
1714                }
1715
1716                folio_sub_large_mapcount(folio, nr_pages, vma);
1717                do {
1718                        last += atomic_add_negative(-1, &page->_mapcount);
1719                } while (page++, --nr_pages > 0);
1720
1721                if (last &&
1722                    atomic_sub_return_relaxed(last, mapped) < ENTIRELY_MAPPED)
1723                        nr = last;
1724
1725                partially_mapped = nr && atomic_read(mapped);
1726                break;
1727        case RMAP_LEVEL_PMD:
1728        case RMAP_LEVEL_PUD:
1729                if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) {
1730                        last = atomic_add_negative(-1, &folio->_entire_mapcount);
1731                        if (level == RMAP_LEVEL_PMD && last)
1732                                nr_pmdmapped = folio_large_nr_pages(folio);
1733                        nr = folio_dec_return_large_mapcount(folio, vma);
1734                        if (!nr) {
1735                                /* Now completely unmapped. */
1736                                nr = folio_large_nr_pages(folio);
1737                        } else {
1738                                partially_mapped = last &&
1739                                                   nr < folio_large_nr_pages(folio);
1740                                nr = 0;
1741                        }
1742                        break;
1743                }
1744
1745                folio_dec_large_mapcount(folio, vma);
1746                last = atomic_add_negative(-1, &folio->_entire_mapcount);
1747                if (last) {
1748                        nr = atomic_sub_return_relaxed(ENTIRELY_MAPPED, mapped);
1749                        if (likely(nr < ENTIRELY_MAPPED)) {
1750                                nr_pages = folio_large_nr_pages(folio);
1751                                if (level == RMAP_LEVEL_PMD)
1752                                        nr_pmdmapped = nr_pages;
1753                                nr = nr_pages - (nr & FOLIO_PAGES_MAPPED);
1754                                /* Raced ahead of another remove and an add? */
1755                                if (unlikely(nr < 0))
1756                                        nr = 0;
1757                        } else {
1758                                /* An add of ENTIRELY_MAPPED raced ahead */
1759                                nr = 0;
1760                        }
1761                }
1762
1763                partially_mapped = nr && nr < nr_pmdmapped;
1764                break;
1765        }
1766
1767        /*
1768         * Queue an anon large folio for deferred split if at least one page of
1769         * the folio is unmapped and at least one page is still mapped.
1770         *
1771         * Check partially_mapped first to ensure it is a large folio.
1772         */
1773        if (partially_mapped && folio_test_anon(folio) &&
1774            !folio_test_partially_mapped(folio))
1775                deferred_split_folio(folio, true);
1776
1777        __folio_mod_stat(folio, -nr, -nr_pmdmapped);
1778
1779        /*
1780         * It would be tidy to reset folio_test_anon mapping when fully
1781         * unmapped, but that might overwrite a racing folio_add_anon_rmap_*()
1782         * which increments mapcount after us but sets mapping before us:
1783         * so leave the reset to free_pages_prepare, and remember that
1784         * it's only reliable while mapped.
1785         */
1786
1787        munlock_vma_folio(folio, vma);
1788}
1789
1790/**
1791 * folio_remove_rmap_ptes - remove PTE mappings from a page range of a folio
1792 * @folio:      The folio to remove the mappings from
1793 * @page:       The first page to remove
1794 * @nr_pages:   The number of pages that will be removed from the mapping
1795 * @vma:        The vm area from which the mappings are removed
1796 *
1797 * The page range of the folio is defined by [page, page + nr_pages)
1798 *
1799 * The caller needs to hold the page table lock.
1800 */
1801void folio_remove_rmap_ptes(struct folio *folio, struct page *page,
1802                int nr_pages, struct vm_area_struct *vma)
1803{
1804        __folio_remove_rmap(folio, page, nr_pages, vma, RMAP_LEVEL_PTE);
1805}
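
/*
 * Illustrative sketch (editor's addition, not part of rmap.c): the unmap side
 * mirrors the add side: clear the PTEs, then drop the rmap references and the
 * corresponding folio references, all under the page table lock, roughly
 * (addr/ptep/nr are placeholders):
 *
 *	pteval = get_and_clear_ptes(mm, addr, ptep, nr);
 *	... flush the TLB and propagate dirty/accessed bits as needed ...
 *	folio_remove_rmap_ptes(folio, page, nr, vma);
 *	folio_put_refs(folio, nr);
 *
 * try_to_unmap_one() below uses exactly this sequence for its batched
 * lazyfree discard path.
 */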
1806
1807/**
1808 * folio_remove_rmap_pmd - remove a PMD mapping from a page range of a folio
1809 * @folio:      The folio to remove the mapping from
1810 * @page:       The first page to remove
1811 * @vma:        The vm area from which the mapping is removed
1812 *
1813 * The page range of the folio is defined by [page, page + HPAGE_PMD_NR)
1814 *
1815 * The caller needs to hold the page table lock.
1816 */
1817void folio_remove_rmap_pmd(struct folio *folio, struct page *page,
1818                struct vm_area_struct *vma)
1819{
1820#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1821        __folio_remove_rmap(folio, page, HPAGE_PMD_NR, vma, RMAP_LEVEL_PMD);
1822#else
1823        WARN_ON_ONCE(true);
1824#endif
1825}
1826
1827/**
1828 * folio_remove_rmap_pud - remove a PUD mapping from a page range of a folio
1829 * @folio:      The folio to remove the mapping from
1830 * @page:       The first page to remove
1831 * @vma:        The vm area from which the mapping is removed
1832 *
1833 * The page range of the folio is defined by [page, page + HPAGE_PUD_NR)
1834 *
1835 * The caller needs to hold the page table lock.
1836 */
1837void folio_remove_rmap_pud(struct folio *folio, struct page *page,
1838                struct vm_area_struct *vma)
1839{
1840#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
1841        defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
1842        __folio_remove_rmap(folio, page, HPAGE_PUD_NR, vma, RMAP_LEVEL_PUD);
1843#else
1844        WARN_ON_ONCE(true);
1845#endif
1846}
1847
1848static inline unsigned int folio_unmap_pte_batch(struct folio *folio,
1849                        struct page_vma_mapped_walk *pvmw,
1850                        enum ttu_flags flags, pte_t pte)
1851{
1852        unsigned long end_addr, addr = pvmw->address;
1853        struct vm_area_struct *vma = pvmw->vma;
1854        unsigned int max_nr;
1855
1856        if (flags & TTU_HWPOISON)
1857                return 1;
1858        if (!folio_test_large(folio))
1859                return 1;
1860
1861        /* We may only batch within a single VMA and a single page table. */
1862        end_addr = pmd_addr_end(addr, vma->vm_end);
1863        max_nr = (end_addr - addr) >> PAGE_SHIFT;
1864
1865        /* We only support lazyfree batching for now ... */
1866        if (!folio_test_anon(folio) || folio_test_swapbacked(folio))
1867                return 1;
1868        if (pte_unused(pte))
1869                return 1;
1870
1871        return folio_pte_batch(folio, pvmw->pte, pte, max_nr);
1872}
1873
1874/*
1875 * @arg: enum ttu_flags will be passed to this argument
1876 */
1877static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
1878                     unsigned long address, void *arg)
1879{
1880        struct mm_struct *mm = vma->vm_mm;
1881        DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
1882        bool anon_exclusive, ret = true;
1883        pte_t pteval;
1884        struct page *subpage;
1885        struct mmu_notifier_range range;
1886        enum ttu_flags flags = (enum ttu_flags)(long)arg;
1887        unsigned long nr_pages = 1, end_addr;
1888        unsigned long pfn;
1889        unsigned long hsz = 0;
1890
1891        /*
1892         * When racing against e.g. zap_pte_range() on another cpu,
1893         * in between its ptep_get_and_clear_full() and folio_remove_rmap_*(),
1894         * try_to_unmap() may return before page_mapped() has become false,
1895         * if page table locking is skipped: use TTU_SYNC to wait for that.
1896         */
1897        if (flags & TTU_SYNC)
1898                pvmw.flags = PVMW_SYNC;
1899
1900        /*
1901         * For THP, we have to assume the worst case, i.e. pmd invalidation.
1902         * For hugetlb, it could be much worse if we need to do pud
1903         * invalidation in the case of pmd sharing.
1904         *
1905         * Note that the folio cannot be freed in this function, as the caller
1906         * of try_to_unmap() must hold a reference on the folio.
1907         */
1908        range.end = vma_address_end(&pvmw);
1909        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
1910                                address, range.end);
1911        if (folio_test_hugetlb(folio)) {
1912                /*
1913                 * If sharing is possible, start and end will be adjusted
1914                 * accordingly.
1915                 */
1916                adjust_range_if_pmd_sharing_possible(vma, &range.start,
1917                                                     &range.end);
1918
1919                /* We need the huge page size for set_huge_pte_at() */
1920                hsz = huge_page_size(hstate_vma(vma));
1921        }
1922        mmu_notifier_invalidate_range_start(&range);
1923
1924        while (page_vma_mapped_walk(&pvmw)) {
1925                /*
1926                 * If the folio is in an mlock()d vma, we must not swap it out.
1927                 */
1928                if (!(flags & TTU_IGNORE_MLOCK) &&
1929                    (vma->vm_flags & VM_LOCKED)) {
1930                        /* Restore the mlock which got missed */
1931                        if (!folio_test_large(folio))
1932                                mlock_vma_folio(folio, vma);
1933                        goto walk_abort;
1934                }
1935
1936                if (!pvmw.pte) {
1937                        if (folio_test_anon(folio) && !folio_test_swapbacked(folio)) {
1938                                if (unmap_huge_pmd_locked(vma, pvmw.address, pvmw.pmd, folio))
1939                                        goto walk_done;
1940                                /*
1941                                 * unmap_huge_pmd_locked has either already marked
1942                                 * the folio as swap-backed or decided to retain it
1943                                 * due to GUP or speculative references.
1944                                 */
1945                                goto walk_abort;
1946                        }
1947
1948                        if (flags & TTU_SPLIT_HUGE_PMD) {
1949                                /*
1950                                 * We temporarily have to drop the PTL and
1951                                 * restart so we can process the PTE-mapped THP.
1952                                 */
1953                                split_huge_pmd_locked(vma, pvmw.address,
1954                                                      pvmw.pmd, false);
1955                                flags &= ~TTU_SPLIT_HUGE_PMD;
1956                                page_vma_mapped_walk_restart(&pvmw);
1957                                continue;
1958                        }
1959                }
1960
1961                /* Unexpected PMD-mapped THP? */
1962                VM_BUG_ON_FOLIO(!pvmw.pte, folio);
1963
1964                /*
1965                 * Handle PFN swap PTEs, such as device-exclusive ones, that
1966                 * actually map pages.
1967                 */
1968                pteval = ptep_get(pvmw.pte);
1969                if (likely(pte_present(pteval))) {
1970                        pfn = pte_pfn(pteval);
1971                } else {
1972                        pfn = swp_offset_pfn(pte_to_swp_entry(pteval));
1973                        VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);
1974                }
1975
1976                subpage = folio_page(folio, pfn - folio_pfn(folio));
1977                address = pvmw.address;
1978                anon_exclusive = folio_test_anon(folio) &&
1979                                 PageAnonExclusive(subpage);
1980
1981                if (folio_test_hugetlb(folio)) {
1982                        bool anon = folio_test_anon(folio);
1983
1984                        /*
1985                         * try_to_unmap() is only passed a hugetlb page
1986                         * in the case where the hugetlb page is poisoned.
1987                         */
1988                        VM_BUG_ON_PAGE(!PageHWPoison(subpage), subpage);
1989                        /*
1990                         * huge_pmd_unshare may unmap an entire PMD page.
1991                         * There is no way of knowing exactly which PMDs may
1992                         * be cached for this mm, so we must flush them all.
1993                         * start/end were already adjusted above to cover this
1994                         * range.
1995                         */
1996                        flush_cache_range(vma, range.start, range.end);
1997
1998                        /*
1999                         * To call huge_pmd_unshare, i_mmap_rwsem must be
2000                         * held in write mode.  Caller needs to explicitly
2001                         * do this outside rmap routines.
2002                         *
2003                         * We also must hold hugetlb vma_lock in write mode.
2004                         * Lock order dictates acquiring vma_lock BEFORE
2005                         * i_mmap_rwsem.  We can only try lock here and fail
2006                         * if unsuccessful.
2007                         */
2008                        if (!anon) {
2009                                VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
2010                                if (!hugetlb_vma_trylock_write(vma))
2011                                        goto walk_abort;
2012                                if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
2013                                        hugetlb_vma_unlock_write(vma);
2014                                        flush_tlb_range(vma,
2015                                                range.start, range.end);
2016                                        /*
2017                                         * The ref count of the PMD page was
2018                                         * dropped which is part of the way map
2019                                         * counting is done for shared PMDs.
2020                                         * Return 'true' here.  When there is
2021                                         * no other sharing, huge_pmd_unshare
2022                                         * returns false and we will unmap the
2023                                         * actual page and drop map count
2024                                         * to zero.
2025                                         */
2026                                        goto walk_done;
2027                                }
2028                                hugetlb_vma_unlock_write(vma);
2029                        }
2030                        pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
2031                        if (pte_dirty(pteval))
2032                                folio_mark_dirty(folio);
2033                } else if (likely(pte_present(pteval))) {
2034                        nr_pages = folio_unmap_pte_batch(folio, &pvmw, flags, pteval);
2035                        end_addr = address + nr_pages * PAGE_SIZE;
2036                        flush_cache_range(vma, address, end_addr);
2037
2038                        /* Nuke the page table entry. */
2039                        pteval = get_and_clear_ptes(mm, address, pvmw.pte, nr_pages);
2040                        /*
2041                         * We clear the PTE but do not flush so potentially
2042                         * a remote CPU could still be writing to the folio.
2043                         * If the entry was previously clean then the
2044                         * architecture must guarantee that a clear->dirty
2045                         * transition on a cached TLB entry is written through
2046                         * and traps if the PTE is unmapped.
2047                         */
2048                        if (should_defer_flush(mm, flags))
2049                                set_tlb_ubc_flush_pending(mm, pteval, address, end_addr);
2050                        else
2051                                flush_tlb_range(vma, address, end_addr);
2052                        if (pte_dirty(pteval))
2053                                folio_mark_dirty(folio);
2054                } else {
2055                        pte_clear(mm, address, pvmw.pte);
2056                }
2057
2058                /*
2059                 * Now the pte is cleared. If this pte was uffd-wp armed,
2060                 * we may want to replace a none pte with a marker pte if
2061                 * it's file-backed, so we don't lose the tracking info.
2062                 */
2063                pte_install_uffd_wp_if_needed(vma, address, pvmw.pte, pteval);
2064
2065                /* Update high watermark before we lower rss */
2066                update_hiwater_rss(mm);
2067
2068                if (PageHWPoison(subpage) && (flags & TTU_HWPOISON)) {
2069                        pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
2070                        if (folio_test_hugetlb(folio)) {
2071                                hugetlb_count_sub(folio_nr_pages(folio), mm);
2072                                set_huge_pte_at(mm, address, pvmw.pte, pteval,
2073                                                hsz);
2074                        } else {
2075                                dec_mm_counter(mm, mm_counter(folio));
2076                                set_pte_at(mm, address, pvmw.pte, pteval);
2077                        }
2078                } else if (likely(pte_present(pteval)) && pte_unused(pteval) &&
2079                           !userfaultfd_armed(vma)) {
2080                        /*
2081                         * The guest indicated that the page content is of no
2082                         * interest anymore. Simply discard the pte, vmscan
2083                         * will take care of the rest.
2084                         * A future reference will then fault in a new zero
2085                         * page. When userfaultfd is active, we must not drop
2086                         * this page though, as its main user (postcopy
2087                         * migration) will not expect userfaults on already
2088                         * copied pages.
2089                         */
2090                        dec_mm_counter(mm, mm_counter(folio));
2091                } else if (folio_test_anon(folio)) {
2092                        swp_entry_t entry = page_swap_entry(subpage);
2093                        pte_t swp_pte;
2094                        /*
2095                         * Store the swap location in the pte.
2096                         * See handle_pte_fault() ...
2097                         */
2098                        if (unlikely(folio_test_swapbacked(folio) !=
2099                                        folio_test_swapcache(folio))) {
2100                                WARN_ON_ONCE(1);
2101                                goto walk_abort;
2102                        }
2103
2104                        /* MADV_FREE page check */
2105                        if (!folio_test_swapbacked(folio)) {
2106                                int ref_count, map_count;
2107
2108                                /*
2109                                 * Synchronize with gup_pte_range():
2110                                 * - clear PTE; barrier; read refcount
2111                                 * - inc refcount; barrier; read PTE
2112                                 */
2113                                smp_mb();
2114
2115                                ref_count = folio_ref_count(folio);
2116                                map_count = folio_mapcount(folio);
2117
2118                                /*
2119                                 * Order reads for page refcount and dirty flag
2120                                 * (see comments in __remove_mapping()).
2121                                 */
2122                                smp_rmb();
2123
2124                                if (folio_test_dirty(folio) && !(vma->vm_flags & VM_DROPPABLE)) {
2125                                        /*
2126                                         * redirtied either using the page table or a previously
2127                                         * obtained GUP reference.
2128                                         */
2129                                        set_ptes(mm, address, pvmw.pte, pteval, nr_pages);
2130                                        folio_set_swapbacked(folio);
2131                                        goto walk_abort;
2132                                } else if (ref_count != 1 + map_count) {
2133                                        /*
2134                                         * Additional reference. Could be a GUP reference or any
2135                                         * speculative reference. GUP users must mark the folio
2136                                         * dirty if there was a modification. This folio cannot be
2137                                         * reclaimed right now either way, so act just like nothing
2138                                         * happened.
2139                                         * We'll come back here later and detect if the folio was
2140                                         * dirtied when the additional reference is gone.
2141                                         */
2142                                        set_ptes(mm, address, pvmw.pte, pteval, nr_pages);
2143                                        goto walk_abort;
2144                                }
2145                                add_mm_counter(mm, MM_ANONPAGES, -nr_pages);
2146                                goto discard;
2147                        }
2148
2149                        if (swap_duplicate(entry) < 0) {
2150                                set_pte_at(mm, address, pvmw.pte, pteval);
2151                                goto walk_abort;
2152                        }
2153
2154                        /*
2155                         * arch_unmap_one() is expected to be a NOP on
2156                         * architectures where we could have PFN swap PTEs,
2157                         * so we'll not check/care.
2158                         */
2159                        if (arch_unmap_one(mm, vma, address, pteval) < 0) {
2160                                swap_free(entry);
2161                                set_pte_at(mm, address, pvmw.pte, pteval);
2162                                goto walk_abort;
2163                        }
2164
2165                        /* See folio_try_share_anon_rmap(): clear PTE first. */
2166                        if (anon_exclusive &&
2167                            folio_try_share_anon_rmap_pte(folio, subpage)) {
2168                                swap_free(entry);
2169                                set_pte_at(mm, address, pvmw.pte, pteval);
2170                                goto walk_abort;
2171                        }
2172                        if (list_empty(&mm->mmlist)) {
2173                                spin_lock(&mmlist_lock);
2174                                if (list_empty(&mm->mmlist))
2175                                        list_add(&mm->mmlist, &init_mm.mmlist);
2176                                spin_unlock(&mmlist_lock);
2177                        }
2178                        dec_mm_counter(mm, MM_ANONPAGES);
2179                        inc_mm_counter(mm, MM_SWAPENTS);
2180                        swp_pte = swp_entry_to_pte(entry);
2181                        if (anon_exclusive)
2182                                swp_pte = pte_swp_mkexclusive(swp_pte);
2183                        if (likely(pte_present(pteval))) {
2184                                if (pte_soft_dirty(pteval))
2185                                        swp_pte = pte_swp_mksoft_dirty(swp_pte);
2186                                if (pte_uffd_wp(pteval))
2187                                        swp_pte = pte_swp_mkuffd_wp(swp_pte);
2188                        } else {
2189                                if (pte_swp_soft_dirty(pteval))
2190                                        swp_pte = pte_swp_mksoft_dirty(swp_pte);
2191                                if (pte_swp_uffd_wp(pteval))
2192                                        swp_pte = pte_swp_mkuffd_wp(swp_pte);
2193                        }
2194                        set_pte_at(mm, address, pvmw.pte, swp_pte);
2195                } else {
2196                        /*
2197                         * This is a locked file-backed folio,
2198                         * so it cannot be removed from the page
2199                         * cache and replaced by a new folio before
2200                         * concurrent thread can update its page table
2201                         * concurrent thread might update its page table
2202                         * to point at a new folio while a device is
2203                         * still using this folio.
2204                         *
2205                         * See Documentation/mm/mmu_notifier.rst
2206                         */
2207                        dec_mm_counter(mm, mm_counter_file(folio));
2208                }
2209discard:
2210                if (unlikely(folio_test_hugetlb(folio))) {
2211                        hugetlb_remove_rmap(folio);
2212                } else {
2213                        folio_remove_rmap_ptes(folio, subpage, nr_pages, vma);
2214                }
2215                if (vma->vm_flags & VM_LOCKED)
2216                        mlock_drain_local();
2217                folio_put_refs(folio, nr_pages);
2218
2219                /*
2220                 * If we are sure that we batched the entire folio and cleared
2221                 * all PTEs, we can just optimize and stop right here.
2222                 */
2223                if (nr_pages == folio_nr_pages(folio))
2224                        goto walk_done;
2225                continue;
2226walk_abort:
2227                ret = false;
2228walk_done:
2229                page_vma_mapped_walk_done(&pvmw);
2230                break;
2231        }
2232
2233        mmu_notifier_invalidate_range_end(&range);
2234
2235        return ret;
2236}
2237
2238static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg)
2239{
2240        return vma_is_temporary_stack(vma);
2241}
2242
2243static int folio_not_mapped(struct folio *folio)
2244{
2245        return !folio_mapped(folio);
2246}
2247
2248/**
2249 * try_to_unmap - Try to remove all page table mappings to a folio.
2250 * @folio: The folio to unmap.
2251 * @flags: action and flags
2252 *
2253 * Tries to remove all the page table entries which are mapping this
2254 * folio.  It is the caller's responsibility to check if the folio is
2255 * still mapped if needed (use TTU_SYNC to prevent accounting races).
2256 *
2257 * Context: Caller must hold the folio lock.
2258 */
2259void try_to_unmap(struct folio *folio, enum ttu_flags flags)
2260{
2261        struct rmap_walk_control rwc = {
2262                .rmap_one = try_to_unmap_one,
2263                .arg = (void *)flags,
2264                .done = folio_not_mapped,
2265                .anon_lock = folio_lock_anon_vma_read,
2266        };
2267
2268        if (flags & TTU_RMAP_LOCKED)
2269                rmap_walk_locked(folio, &rwc);
2270        else
2271                rmap_walk(folio, &rwc);
2272}
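
/*
 * Illustrative sketch (editor's addition, not part of rmap.c): memory reclaim
 * is the main user. It unmaps a locked folio and then re-checks whether any
 * mappings remain before trying to free it, roughly:
 *
 *	if (folio_mapped(folio)) {
 *		try_to_unmap(folio, TTU_BATCH_FLUSH);
 *		if (folio_mapped(folio))
 *			... still mapped, skip reclaiming this folio ...
 *	}
 *
 * TTU_SYNC makes the later folio_mapped() check reliable against racing
 * unmaps, as noted in the kernel-doc above.
 */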
2273
2274/*
2275 * @arg: enum ttu_flags will be passed to this argument.
2276 *
2277 * If TTU_SPLIT_HUGE_PMD is specified any PMD mappings will be split into PTEs
2278 * containing migration entries.
2279 */
2280static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
2281                     unsigned long address, void *arg)
2282{
2283        struct mm_struct *mm = vma->vm_mm;
2284        DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
2285        bool anon_exclusive, writable, ret = true;
2286        pte_t pteval;
2287        struct page *subpage;
2288        struct mmu_notifier_range range;
2289        enum ttu_flags flags = (enum ttu_flags)(long)arg;
2290        unsigned long pfn;
2291        unsigned long hsz = 0;
2292
2293        /*
2294         * When racing against e.g. zap_pte_range() on another cpu,
2295         * in between its ptep_get_and_clear_full() and folio_remove_rmap_*(),
2296         * try_to_migrate() may return before page_mapped() has become false,
2297         * if page table locking is skipped: use TTU_SYNC to wait for that.
2298         */
2299        if (flags & TTU_SYNC)
2300                pvmw.flags = PVMW_SYNC;
2301
2302        /*
2303         * For THP, we have to assume the worst case, i.e. pmd invalidation.
2304         * For hugetlb, it could be much worse if we need to do pud
2305         * invalidation in the case of pmd sharing.
2306         *
2307         * Note that the folio cannot be freed in this function, as the caller
2308         * of try_to_migrate() must hold a reference on the folio.
2309         */
2310        range.end = vma_address_end(&pvmw);
2311        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
2312                                address, range.end);
2313        if (folio_test_hugetlb(folio)) {
2314                /*
2315                 * If sharing is possible, start and end will be adjusted
2316                 * accordingly.
2317                 */
2318                adjust_range_if_pmd_sharing_possible(vma, &range.start,
2319                                                     &range.end);
2320
2321                /* We need the huge page size for set_huge_pte_at() */
2322                hsz = huge_page_size(hstate_vma(vma));
2323        }
2324        mmu_notifier_invalidate_range_start(&range);
2325
2326        while (page_vma_mapped_walk(&pvmw)) {
2327                /* PMD-mapped THP migration entry */
2328                if (!pvmw.pte) {
2329                        if (flags & TTU_SPLIT_HUGE_PMD) {
2330                                split_huge_pmd_locked(vma, pvmw.address,
2331                                                      pvmw.pmd, true);
2332                                ret = false;
2333                                page_vma_mapped_walk_done(&pvmw);
2334                                break;
2335                        }
2336#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
2337                        subpage = folio_page(folio,
2338                                pmd_pfn(*pvmw.pmd) - folio_pfn(folio));
2339                        VM_BUG_ON_FOLIO(folio_test_hugetlb(folio) ||
2340                                        !folio_test_pmd_mappable(folio), folio);
2341
2342                        if (set_pmd_migration_entry(&pvmw, subpage)) {
2343                                ret = false;
2344                                page_vma_mapped_walk_done(&pvmw);
2345                                break;
2346                        }
2347                        continue;
2348#endif
2349                }
2350
2351                /* Unexpected PMD-mapped THP? */
2352                VM_BUG_ON_FOLIO(!pvmw.pte, folio);
2353
2354                /*
2355                 * Handle PFN swap PTEs, such as device-exclusive ones, that
2356                 * actually map pages.
2357                 */
2358                pteval = ptep_get(pvmw.pte);
2359                if (likely(pte_present(pteval))) {
2360                        pfn = pte_pfn(pteval);
2361                } else {
2362                        pfn = swp_offset_pfn(pte_to_swp_entry(pteval));
2363                        VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);
2364                }
2365
2366                subpage = folio_page(folio, pfn - folio_pfn(folio));
2367                address = pvmw.address;
2368                anon_exclusive = folio_test_anon(folio) &&
2369                                 PageAnonExclusive(subpage);
2370
2371                if (folio_test_hugetlb(folio)) {
2372                        bool anon = folio_test_anon(folio);
2373
2374                        /*
2375                         * huge_pmd_unshare may unmap an entire PMD page.
2376                         * There is no way of knowing exactly which PMDs may
2377                         * be cached for this mm, so we must flush them all.
2378                         * start/end were already adjusted above to cover this
2379                         * range.
2380                         */
2381                        flush_cache_range(vma, range.start, range.end);
2382
2383                        /*
2384                         * To call huge_pmd_unshare, i_mmap_rwsem must be
2385                         * held in write mode.  Caller needs to explicitly
2386                         * do this outside rmap routines.
2387                         *
2388                         * We also must hold hugetlb vma_lock in write mode.
2389                         * Lock order dictates acquiring vma_lock BEFORE
2390                         * i_mmap_rwsem.  We can only try lock here and
2391                         * fail if unsuccessful.
2392                         */
2393                        if (!anon) {
2394                                VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
2395                                if (!hugetlb_vma_trylock_write(vma)) {
2396                                        page_vma_mapped_walk_done(&pvmw);
2397                                        ret = false;
2398                                        break;
2399                                }
2400                                if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
2401                                        hugetlb_vma_unlock_write(vma);
2402                                        flush_tlb_range(vma,
2403                                                range.start, range.end);
2404
2405                                        /*
2406                                         * The ref count of the PMD page was
2407                                         * dropped which is part of the way map
2408                                         * counting is done for shared PMDs.
2409                                         * Return 'true' here.  When there is
2410                                         * no other sharing, huge_pmd_unshare
2411                                         * returns false and we will unmap the
2412                                         * actual page and drop map count
2413                                         * to zero.
2414                                         */
2415                                        page_vma_mapped_walk_done(&pvmw);
2416                                        break;
2417                                }
2418                                hugetlb_vma_unlock_write(vma);
2419                        }
2420                        /* Nuke the hugetlb page table entry */
2421                        pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
2422                        if (pte_dirty(pteval))
2423                                folio_mark_dirty(folio);
2424                        writable = pte_write(pteval);
2425                } else if (likely(pte_present(pteval))) {
2426                        flush_cache_page(vma, address, pfn);
2427                        /* Nuke the page table entry. */
2428                        if (should_defer_flush(mm, flags)) {
2429                                /*
2430                                 * We clear the PTE but do not flush so potentially
2431                                 * a remote CPU could still be writing to the folio.
2432                                 * If the entry was previously clean then the
2433                                 * architecture must guarantee that a clear->dirty
2434                                 * transition on a cached TLB entry is written through
2435                                 * and traps if the PTE is unmapped.
2436                                 */
2437                                pteval = ptep_get_and_clear(mm, address, pvmw.pte);
2438
2439                                set_tlb_ubc_flush_pending(mm, pteval, address, address + PAGE_SIZE);
2440                        } else {
2441                                pteval = ptep_clear_flush(vma, address, pvmw.pte);
2442                        }
2443                        if (pte_dirty(pteval))
2444                                folio_mark_dirty(folio);
2445                        writable = pte_write(pteval);
2446                } else {
2447                        pte_clear(mm, address, pvmw.pte);
2448                        writable = is_writable_device_private_entry(pte_to_swp_entry(pteval));
2449                }
2450
2451                VM_WARN_ON_FOLIO(writable && folio_test_anon(folio) &&
2452                                !anon_exclusive, folio);
2453
2454                /* Update high watermark before we lower rss */
2455                update_hiwater_rss(mm);
2456
2457                if (PageHWPoison(subpage)) {
2458                        VM_WARN_ON_FOLIO(folio_is_device_private(folio), folio);
2459
2460                        pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
2461                        if (folio_test_hugetlb(folio)) {
2462                                hugetlb_count_sub(folio_nr_pages(folio), mm);
2463                                set_huge_pte_at(mm, address, pvmw.pte, pteval,
2464                                                hsz);
2465                        } else {
2466                                dec_mm_counter(mm, mm_counter(folio));
2467                                set_pte_at(mm, address, pvmw.pte, pteval);
2468                        }
2469                } else if (likely(pte_present(pteval)) && pte_unused(pteval) &&
2470                           !userfaultfd_armed(vma)) {
2471                        /*
2472                         * The guest indicated that the page content is of no
2473                         * interest anymore. Simply discard the pte, vmscan
2474                         * will take care of the rest.
2475                         * A future reference will then fault in a new zero
2476                         * page. When userfaultfd is active, we must not drop
2477                         * this page though, as its main user (postcopy
2478                         * migration) will not expect userfaults on already
2479                         * copied pages.
2480                         */
2481                        dec_mm_counter(mm, mm_counter(folio));
2482                } else {
2483                        swp_entry_t entry;
2484                        pte_t swp_pte;
2485
2486                        /*
2487                         * arch_unmap_one() is expected to be a NOP on
2488                         * architectures where we could have PFN swap PTEs,
2489                         * so we'll not check/care.
2490                         */
2491                        if (arch_unmap_one(mm, vma, address, pteval) < 0) {
2492                                if (folio_test_hugetlb(folio))
2493                                        set_huge_pte_at(mm, address, pvmw.pte,
2494                                                        pteval, hsz);
2495                                else
2496                                        set_pte_at(mm, address, pvmw.pte, pteval);
2497                                ret = false;
2498                                page_vma_mapped_walk_done(&pvmw);
2499                                break;
2500                        }
2501
2502                        /* See folio_try_share_anon_rmap_pte(): clear PTE first. */
2503                        if (folio_test_hugetlb(folio)) {
2504                                if (anon_exclusive &&
2505                                    hugetlb_try_share_anon_rmap(folio)) {
2506                                        set_huge_pte_at(mm, address, pvmw.pte,
2507                                                        pteval, hsz);
2508                                        ret = false;
2509                                        page_vma_mapped_walk_done(&pvmw);
2510                                        break;
2511                                }
2512                        } else if (anon_exclusive &&
2513                                   folio_try_share_anon_rmap_pte(folio, subpage)) {
2514                                set_pte_at(mm, address, pvmw.pte, pteval);
2515                                ret = false;
2516                                page_vma_mapped_walk_done(&pvmw);
2517                                break;
2518                        }
2519
2520                        /*
2521                         * Store the pfn of the page in a special migration
2522                         * pte. do_swap_page() will wait until the migration
2523                         * pte is removed and then restart fault handling.
2524                         */
2525                        if (writable)
2526                                entry = make_writable_migration_entry(
2527                                                        page_to_pfn(subpage));
2528                        else if (anon_exclusive)
2529                                entry = make_readable_exclusive_migration_entry(
2530                                                        page_to_pfn(subpage));
2531                        else
2532                                entry = make_readable_migration_entry(
2533                                                        page_to_pfn(subpage));
2534                        if (likely(pte_present(pteval))) {
2535                                if (pte_young(pteval))
2536                                        entry = make_migration_entry_young(entry);
2537                                if (pte_dirty(pteval))
2538                                        entry = make_migration_entry_dirty(entry);
2539                                swp_pte = swp_entry_to_pte(entry);
2540                                if (pte_soft_dirty(pteval))
2541                                        swp_pte = pte_swp_mksoft_dirty(swp_pte);
2542                                if (pte_uffd_wp(pteval))
2543                                        swp_pte = pte_swp_mkuffd_wp(swp_pte);
2544                        } else {
2545                                swp_pte = swp_entry_to_pte(entry);
2546                                if (pte_swp_soft_dirty(pteval))
2547                                        swp_pte = pte_swp_mksoft_dirty(swp_pte);
2548                                if (pte_swp_uffd_wp(pteval))
2549                                        swp_pte = pte_swp_mkuffd_wp(swp_pte);
2550                        }
2551                        if (folio_test_hugetlb(folio))
2552                                set_huge_pte_at(mm, address, pvmw.pte, swp_pte,
2553                                                hsz);
2554                        else
2555                                set_pte_at(mm, address, pvmw.pte, swp_pte);
2556                        trace_set_migration_pte(address, pte_val(swp_pte),
2557                                                folio_order(folio));
2558                        /*
2559                         * No need to invalidate here; it will be synchronized
2560                         * against the special swap migration pte.
2561                         */
2562                }
2563
2564                if (unlikely(folio_test_hugetlb(folio)))
2565                        hugetlb_remove_rmap(folio);
2566                else
2567                        folio_remove_rmap_pte(folio, subpage, vma);
2568                if (vma->vm_flags & VM_LOCKED)
2569                        mlock_drain_local();
2570                folio_put(folio);
2571        }
2572
2573        mmu_notifier_invalidate_range_end(&range);
2574
2575        return ret;
2576}
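
/*
 * Illustrative sketch, not part of this file: the fault side recognizes the
 * migration entry installed above and waits for the migration to finish
 * before retrying the fault, roughly what do_swap_page() does via
 * migration_entry_wait(); vmf is the usual struct vm_fault and error
 * handling is omitted.
 *
 *        entry = pte_to_swp_entry(vmf->orig_pte);
 *        if (is_migration_entry(entry)) {
 *                migration_entry_wait(vmf->vma->vm_mm, vmf->pmd, vmf->address);
 *                return 0;
 *        }
 */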
2577
2578/**
2579 * try_to_migrate - try to replace all page table mappings with swap entries
2580 * @folio: the folio to replace page table entries for
2581 * @flags: action and flags
2582 *
2583 * Tries to remove all the page table entries which are mapping this folio and
2584 * replace them with special swap entries. Caller must hold the folio lock.
2585 */
2586void try_to_migrate(struct folio *folio, enum ttu_flags flags)
2587{
2588        struct rmap_walk_control rwc = {
2589                .rmap_one = try_to_migrate_one,
2590                .arg = (void *)flags,
2591                .done = folio_not_mapped,
2592                .anon_lock = folio_lock_anon_vma_read,
2593        };
2594
2595        /*
2596         * Migration always ignores mlock and only supports the TTU_RMAP_LOCKED,
2597         * TTU_SPLIT_HUGE_PMD, TTU_SYNC, and TTU_BATCH_FLUSH flags.
2598         */
2599        if (WARN_ON_ONCE(flags & ~(TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD |
2600                                        TTU_SYNC | TTU_BATCH_FLUSH)))
2601                return;
2602
2603        if (folio_is_zone_device(folio) &&
2604            (!folio_is_device_private(folio) && !folio_is_device_coherent(folio)))
2605                return;
2606
2607        /*
2608         * During exec, a temporary VMA is set up and later moved.
2609         * The VMA is moved under the anon_vma lock but not the
2610         * page tables, leading to a race where migration cannot
2611         * find the migration ptes. Rather than increasing the
2612         * locking requirements of exec(), migration skips
2613         * temporary VMAs until after exec() completes.
2614         */
2615        if (!folio_test_ksm(folio) && folio_test_anon(folio))
2616                rwc.invalid_vma = invalid_migration_vma;
2617
2618        if (flags & TTU_RMAP_LOCKED)
2619                rmap_walk_locked(folio, &rwc);
2620        else
2621                rmap_walk(folio, &rwc);
2622}
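
/*
 * Illustrative sketch, not part of this file: a migration caller holds the
 * folio lock, replaces all mappings with migration entries and only copies
 * the data once no mappings remain. copy_and_restore() below is a
 * hypothetical placeholder for copying to the destination folio and
 * restoring the entries via remove_migration_ptes(); this is not the actual
 * migrate_pages() implementation.
 *
 *        folio_lock(src);
 *        if (folio_mapped(src))
 *                try_to_migrate(src, 0);
 *        if (!folio_mapped(src))
 *                copy_and_restore(src, dst);
 *        folio_unlock(src);
 */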
2623
2624#ifdef CONFIG_DEVICE_PRIVATE
2625/**
2626 * make_device_exclusive() - Mark a page for exclusive use by a device
2627 * @mm: mm_struct of associated target process
2628 * @addr: the virtual address to mark for exclusive device access
2629 * @owner: passed to MMU_NOTIFY_EXCLUSIVE range notifier to allow filtering
2630 * @foliop: folio pointer will be stored here on success.
2631 *
2632 * This function looks up the page mapped at the given address, grabs a
2633 * folio reference, locks the folio and replaces the PTE with a special
2634 * device-exclusive PFN swap entry, preventing access through the process
2635 * page tables. The function will return with the folio locked and referenced.
2636 *
2637 * On fault, the device-exclusive entries are replaced with the original PTE
2638 * under folio lock, after calling MMU notifiers.
2639 *
2640 * Only anonymous non-hugetlb folios are supported and the VMA must have
2641 * write permissions such that we can fault in the anonymous page writable
2642 * in order to mark it exclusive. The caller must hold the mmap_lock in read
2643 * mode.
2644 *
2645 * A driver using this to program access from a device must use an mmu notifier
2646 * critical section to hold a device-specific lock during programming. Once
2647 * programming is complete it should drop the folio lock and reference, after
2648 * which point CPU access to the page will revoke the exclusive access.
2649 *
2650 * Notes:
2651 *   #. This function always operates on individual PTEs mapping individual
2652 *      pages. PMD-sized THPs are first remapped to be mapped by PTEs before
2653 *      the conversion happens on a single PTE corresponding to @addr.
2654 *   #. While concurrent access through the process page tables is prevented,
2655 *      concurrent access through other page references (e.g., earlier GUP
2656 *      invocation) is not handled and not supported.
2657 *   #. Device-exclusive entries are considered "clean" and "old" by core-mm.
2658 *      Device drivers must update the folio state when informed by MMU
2659 *      notifiers.
2660 *
2661 * Returns: pointer to mapped page on success, otherwise a negative error.
2662 */
2663struct page *make_device_exclusive(struct mm_struct *mm, unsigned long addr,
2664                void *owner, struct folio **foliop)
2665{
2666        struct mmu_notifier_range range;
2667        struct folio *folio, *fw_folio;
2668        struct vm_area_struct *vma;
2669        struct folio_walk fw;
2670        struct page *page;
2671        swp_entry_t entry;
2672        pte_t swp_pte;
2673        int ret;
2674
2675        mmap_assert_locked(mm);
2676        addr = PAGE_ALIGN_DOWN(addr);
2677
2678        /*
2679         * Fault in the page writable and try to lock it; note that if the
2680         * address is already marked for exclusive use by a device,
2681         * the GUP call will undo that first by triggering a fault.
2682         *
2683         * If another device already maps this page exclusively, the
2684         * fault will trigger a conversion to an ordinary
2685         * (non-device-exclusive) PTE and issue an MMU_NOTIFY_EXCLUSIVE event.
2686         */
2687retry:
2688        page = get_user_page_vma_remote(mm, addr,
2689                                        FOLL_GET | FOLL_WRITE | FOLL_SPLIT_PMD,
2690                                        &vma);
2691        if (IS_ERR(page))
2692                return page;
2693        folio = page_folio(page);
2694
2695        if (!folio_test_anon(folio) || folio_test_hugetlb(folio)) {
2696                folio_put(folio);
2697                return ERR_PTR(-EOPNOTSUPP);
2698        }
2699
2700        ret = folio_lock_killable(folio);
2701        if (ret) {
2702                folio_put(folio);
2703                return ERR_PTR(ret);
2704        }
2705
2706        /*
2707         * Inform secondary MMUs that we are going to convert this PTE to
2708         * device-exclusive, such that they unmap it now. Note that the
2709         * caller must filter this event out to prevent livelocks.
2710         */
2711        mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0,
2712                                      mm, addr, addr + PAGE_SIZE, owner);
2713        mmu_notifier_invalidate_range_start(&range);
2714
2715        /*
2716         * Let's do a second walk and make sure we still find the same page
2717         * mapped writable. Note that any page of an anonymous folio can
2718         * only be mapped writable using exactly one PTE ("exclusive"), so
2719         * there cannot be other mappings.
2720         */
2721        fw_folio = folio_walk_start(&fw, vma, addr, 0);
2722        if (fw_folio != folio || fw.page != page ||
2723            fw.level != FW_LEVEL_PTE || !pte_write(fw.pte)) {
2724                if (fw_folio)
2725                        folio_walk_end(&fw, vma);
2726                mmu_notifier_invalidate_range_end(&range);
2727                folio_unlock(folio);
2728                folio_put(folio);
2729                goto retry;
2730        }
2731
2732        /* Nuke the page table entry so we get the up-to-date dirty bit. */
2733        flush_cache_page(vma, addr, page_to_pfn(page));
2734        fw.pte = ptep_clear_flush(vma, addr, fw.ptep);
2735
2736        /* Set the dirty flag on the folio now the PTE is gone. */
2737        if (pte_dirty(fw.pte))
2738                folio_mark_dirty(folio);
2739
2740        /*
2741         * Store the pfn of the page in a special device-exclusive PFN swap PTE.
2742         * do_swap_page() will trigger the conversion back while holding the
2743         * folio lock.
2744         */
2745        entry = make_device_exclusive_entry(page_to_pfn(page));
2746        swp_pte = swp_entry_to_pte(entry);
2747        if (pte_soft_dirty(fw.pte))
2748                swp_pte = pte_swp_mksoft_dirty(swp_pte);
2749        /* The pte is writable, uffd-wp does not apply. */
2750        set_pte_at(mm, addr, fw.ptep, swp_pte);
2751
2752        folio_walk_end(&fw, vma);
2753        mmu_notifier_invalidate_range_end(&range);
2754        *foliop = folio;
2755        return page;
2756}
2757EXPORT_SYMBOL_GPL(make_device_exclusive);
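
/*
 * Illustrative driver-side sketch, not part of this file: the caller holds
 * the mmap_lock in read mode, converts the PTE, programs the device while
 * serialized against MMU notifiers, and finally drops the folio lock and
 * reference so that a later CPU fault can revoke the exclusive access again.
 * program_device_mapping() is a hypothetical driver helper.
 *
 *        struct folio *folio;
 *        struct page *page;
 *
 *        mmap_read_lock(mm);
 *        page = make_device_exclusive(mm, addr, driver, &folio);
 *        if (IS_ERR(page)) {
 *                mmap_read_unlock(mm);
 *                return PTR_ERR(page);
 *        }
 *        program_device_mapping(driver, addr, page);
 *        folio_unlock(folio);
 *        folio_put(folio);
 *        mmap_read_unlock(mm);
 */
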
2758#endif
2759
2760void __put_anon_vma(struct anon_vma *anon_vma)
2761{
2762        struct anon_vma *root = anon_vma->root;
2763
2764        anon_vma_free(anon_vma);
2765        if (root != anon_vma && atomic_dec_and_test(&root->refcount))
2766                anon_vma_free(root);
2767}
2768
2769static struct anon_vma *rmap_walk_anon_lock(const struct folio *folio,
2770                                            struct rmap_walk_control *rwc)
2771{
2772        struct anon_vma *anon_vma;
2773
2774        if (rwc->anon_lock)
2775                return rwc->anon_lock(folio, rwc);
2776
2777        /*
2778         * Note: remove_migration_ptes() cannot use folio_lock_anon_vma_read()
2779         * because that depends on page_mapped(); but not all its usages
2780         * are holding mmap_lock. Users without mmap_lock are required to
2781         * take a reference count to prevent the anon_vma from disappearing.
2782         */
2783        anon_vma = folio_anon_vma(folio);
2784        if (!anon_vma)
2785                return NULL;
2786
2787        if (anon_vma_trylock_read(anon_vma))
2788                goto out;
2789
2790        if (rwc->try_lock) {
2791                anon_vma = NULL;
2792                rwc->contended = true;
2793                goto out;
2794        }
2795
2796        anon_vma_lock_read(anon_vma);
2797out:
2798        return anon_vma;
2799}
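
/*
 * Illustrative sketch, not part of this file: a walk that must not sleep on
 * the anon_vma lock sets ->try_lock and checks ->contended afterwards, which
 * mirrors how folio_referenced() earlier in this file uses these fields;
 * my_rmap_one() and the -EAGAIN return are hypothetical.
 *
 *        struct rmap_walk_control rwc = {
 *                .rmap_one = my_rmap_one,
 *                .try_lock = true,
 *        };
 *
 *        rmap_walk(folio, &rwc);
 *        if (rwc.contended)
 *                return -EAGAIN;
 */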
2800
2801/*
2802 * rmap_walk_anon - do something to an anonymous page using the object-based
2803 * rmap method
2804 * @folio: the folio to be handled
2805 * @rwc: control variable according to each walk type
2806 * @locked: caller holds relevant rmap lock
2807 *
2808 * Find all the mappings of a folio using the mapping pointer and the vma
2809 * chains contained in the anon_vma struct it points to.
2810 */
2811static void rmap_walk_anon(struct folio *folio,
2812                struct rmap_walk_control *rwc, bool locked)
2813{
2814        struct anon_vma *anon_vma;
2815        pgoff_t pgoff_start, pgoff_end;
2816        struct anon_vma_chain *avc;
2817
2818        if (locked) {
2819                anon_vma = folio_anon_vma(folio);
2820                /* did the anon_vma disappear under us? */
2821                VM_BUG_ON_FOLIO(!anon_vma, folio);
2822        } else {
2823                anon_vma = rmap_walk_anon_lock(folio, rwc);
2824        }
2825        if (!anon_vma)
2826                return;
2827
2828        pgoff_start = folio_pgoff(folio);
2829        pgoff_end = pgoff_start + folio_nr_pages(folio) - 1;
2830        anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root,
2831                        pgoff_start, pgoff_end) {
2832                struct vm_area_struct *vma = avc->vma;
2833                unsigned long address = vma_address(vma, pgoff_start,
2834                                folio_nr_pages(folio));
2835
2836                VM_BUG_ON_VMA(address == -EFAULT, vma);
2837                cond_resched();
2838
2839                if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
2840                        continue;
2841
2842                if (!rwc->rmap_one(folio, vma, address, rwc->arg))
2843                        break;
2844                if (rwc->done && rwc->done(folio))
2845                        break;
2846        }
2847
2848        if (!locked)
2849                anon_vma_unlock_read(anon_vma);
2850}
2851
2852/**
2853 * __rmap_walk_file() - Traverse the reverse mapping for a file-backed mapping
2854 * of a page mapped within a specified page cache object at a specified offset.
2855 *
2856 * @folio:              Either the folio whose mappings to traverse, or NULL, in
2857 *                      which case the callbacks specified in @rwc must be
2858 *                      configured so that mappings can be looked up correctly.
2859 * @mapping:            The page cache object whose mapping VMAs we intend to
2860 *                      traverse. If @folio is non-NULL, this should be equal to
2861 *                      folio_mapping(folio).
2862 * @pgoff_start:        The offset within @mapping of the page which we are
2863 *                      looking up. If @folio is non-NULL, this should be equal
2864 *                      to folio_pgoff(folio).
2865 * @nr_pages:           The number of pages in the range to traverse. If @folio
2866 *                      is non-NULL, this should be equal to folio_nr_pages(folio).
2867 * @rwc:                The reverse mapping walk control object describing how
2868 *                      the traversal should proceed.
2869 * @locked:             Is the @mapping already locked? If not, we acquire the
2870 *                      lock.
2871 */
2872static void __rmap_walk_file(struct folio *folio, struct address_space *mapping,
2873                             pgoff_t pgoff_start, unsigned long nr_pages,
2874                             struct rmap_walk_control *rwc, bool locked)
2875{
2876        pgoff_t pgoff_end = pgoff_start + nr_pages - 1;
2877        struct vm_area_struct *vma;
2878
2879        VM_WARN_ON_FOLIO(folio && mapping != folio_mapping(folio), folio);
2880        VM_WARN_ON_FOLIO(folio && pgoff_start != folio_pgoff(folio), folio);
2881        VM_WARN_ON_FOLIO(folio && nr_pages != folio_nr_pages(folio), folio);
2882
2883        if (!locked) {
2884                if (i_mmap_trylock_read(mapping))
2885                        goto lookup;
2886
2887                if (rwc->try_lock) {
2888                        rwc->contended = true;
2889                        return;
2890                }
2891
2892                i_mmap_lock_read(mapping);
2893        }
2894lookup:
2895        vma_interval_tree_foreach(vma, &mapping->i_mmap,
2896                        pgoff_start, pgoff_end) {
2897                unsigned long address = vma_address(vma, pgoff_start, nr_pages);
2898
2899                VM_BUG_ON_VMA(address == -EFAULT, vma);
2900                cond_resched();
2901
2902                if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
2903                        continue;
2904
2905                if (!rwc->rmap_one(folio, vma, address, rwc->arg))
2906                        goto done;
2907                if (rwc->done && rwc->done(folio))
2908                        goto done;
2909        }
2910done:
2911        if (!locked)
2912                i_mmap_unlock_read(mapping);
2913}
2914
2915/*
2916 * rmap_walk_file - do something to a file page using the object-based rmap method
2917 * @folio: the folio to be handled
2918 * @rwc: control variable according to each walk type
2919 * @locked: caller holds relevant rmap lock
2920 *
2921 * Find all the mappings of a folio using the mapping pointer and the vma chains
2922 * contained in the address_space struct it points to.
2923 */
2924static void rmap_walk_file(struct folio *folio,
2925                struct rmap_walk_control *rwc, bool locked)
2926{
2927        /*
2928         * The folio lock not only makes sure that folio->mapping cannot
2929         * suddenly be NULLified by truncation, it also makes sure that the structure
2930         * at mapping cannot be freed and reused yet, so we can safely take
2931         * mapping->i_mmap_rwsem.
2932         */
2933        VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
2934
2935        if (!folio->mapping)
2936                return;
2937
2938        __rmap_walk_file(folio, folio->mapping, folio->index,
2939                         folio_nr_pages(folio), rwc, locked);
2940}
2941
2942void rmap_walk(struct folio *folio, struct rmap_walk_control *rwc)
2943{
2944        if (unlikely(folio_test_ksm(folio)))
2945                rmap_walk_ksm(folio, rwc);
2946        else if (folio_test_anon(folio))
2947                rmap_walk_anon(folio, rwc, false);
2948        else
2949                rmap_walk_file(folio, rwc, false);
2950}
2951
2952/* Like rmap_walk, but caller holds relevant rmap lock */
2953void rmap_walk_locked(struct folio *folio, struct rmap_walk_control *rwc)
2954{
2955        /* no ksm support for now */
2956        VM_BUG_ON_FOLIO(folio_test_ksm(folio), folio);
2957        if (folio_test_anon(folio))
2958                rmap_walk_anon(folio, rwc, true);
2959        else
2960                rmap_walk_file(folio, rwc, true);
2961}
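
/*
 * Illustrative sketch, not part of this file: callers that already hold the
 * relevant rmap lock (anon_vma->rwsem or i_mmap_rwsem), for instance because
 * they reached the folio via an rmap walk in the first place, use the _locked
 * variant; everyone else lets the walk take and drop the lock itself, as
 * try_to_migrate() above does for TTU_RMAP_LOCKED. holds_rmap_lock is a
 * hypothetical condition.
 *
 *        if (holds_rmap_lock)
 *                rmap_walk_locked(folio, &rwc);
 *        else
 *                rmap_walk(folio, &rwc);
 */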
2962
2963#ifdef CONFIG_HUGETLB_PAGE
2964/*
2965 * The following two functions are for anonymous (privately mapped) hugepages.
2966 * Unlike common anonymous pages, anonymous hugepages have no accounting code
2967 * and no lru code, because we handle hugepages differently from common pages.
2968 */
2969void hugetlb_add_anon_rmap(struct folio *folio, struct vm_area_struct *vma,
2970                unsigned long address, rmap_t flags)
2971{
2972        VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);
2973        VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
2974
2975        atomic_inc(&folio->_entire_mapcount);
2976        atomic_inc(&folio->_large_mapcount);
2977        if (flags & RMAP_EXCLUSIVE)
2978                SetPageAnonExclusive(&folio->page);
2979        VM_WARN_ON_FOLIO(folio_entire_mapcount(folio) > 1 &&
2980                         PageAnonExclusive(&folio->page), folio);
2981}
2982
2983void hugetlb_add_new_anon_rmap(struct folio *folio,
2984                struct vm_area_struct *vma, unsigned long address)
2985{
2986        VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);
2987
2988        BUG_ON(address < vma->vm_start || address >= vma->vm_end);
2989        /* increment count (starts at -1) */
2990        atomic_set(&folio->_entire_mapcount, 0);
2991        atomic_set(&folio->_large_mapcount, 0);
2992        folio_clear_hugetlb_restore_reserve(folio);
2993        __folio_set_anon(folio, vma, address, true);
2994        SetPageAnonExclusive(&folio->page);
2995}
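
/*
 * Illustrative sketch, not part of this file: the "new" variant installs the
 * first mapping of a freshly allocated private hugetlb folio, while
 * hugetlb_add_anon_rmap() re-establishes a mapping of an already-anonymous
 * folio (e.g. when a migration entry is restored), passing RMAP_EXCLUSIVE
 * only if the page was exclusive before; "exclusive" is a hypothetical local.
 *
 *        hugetlb_add_new_anon_rmap(folio, vma, address);
 *
 *        hugetlb_add_anon_rmap(folio, vma, address,
 *                              exclusive ? RMAP_EXCLUSIVE : RMAP_NONE);
 */
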
2996#endif /* CONFIG_HUGETLB_PAGE */
2997