linux/mm/huge_memory.c
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 *  Copyright (C) 2009  Red Hat, Inc.
   4 */
   5
   6#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
   7
   8#include <linux/mm.h>
   9#include <linux/sched.h>
  10#include <linux/sched/coredump.h>
  11#include <linux/sched/numa_balancing.h>
  12#include <linux/highmem.h>
  13#include <linux/hugetlb.h>
  14#include <linux/mmu_notifier.h>
  15#include <linux/rmap.h>
  16#include <linux/swap.h>
  17#include <linux/shrinker.h>
  18#include <linux/mm_inline.h>
  19#include <linux/swapops.h>
  20#include <linux/dax.h>
  21#include <linux/khugepaged.h>
  22#include <linux/freezer.h>
  23#include <linux/pfn_t.h>
  24#include <linux/mman.h>
  25#include <linux/memremap.h>
  26#include <linux/pagemap.h>
  27#include <linux/debugfs.h>
  28#include <linux/migrate.h>
  29#include <linux/hashtable.h>
  30#include <linux/userfaultfd_k.h>
  31#include <linux/page_idle.h>
  32#include <linux/shmem_fs.h>
  33#include <linux/oom.h>
  34#include <linux/numa.h>
  35
  36#include <asm/tlb.h>
  37#include <asm/pgalloc.h>
  38#include "internal.h"
  39
  40/*
  41 * By default, transparent hugepage support is disabled in order to avoid
  42 * risking an increased memory footprint for applications that are not
  43 * guaranteed to benefit from it. When transparent hugepage support is
  44 * enabled, it is for all mappings, and khugepaged scans all mappings.
  45 * Defrag is invoked by khugepaged hugepage allocations and by page faults
  46 * for all hugepage allocations.
  47 */
  48unsigned long transparent_hugepage_flags __read_mostly =
  49#ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS
  50        (1<<TRANSPARENT_HUGEPAGE_FLAG)|
  51#endif
  52#ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE
  53        (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
  54#endif
  55        (1<<TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG)|
  56        (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
  57        (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
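/*
 * With the defaults above, the sysfs "enabled" policy comes up as "always" or
 * "madvise" depending on CONFIG_TRANSPARENT_HUGEPAGE_{ALWAYS,MADVISE}, the
 * "defrag" policy comes up as "madvise", khugepaged defrag is on, and the
 * huge zero page is enabled.
 */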
  58
  59static struct shrinker deferred_split_shrinker;
  60
  61static atomic_t huge_zero_refcount;
  62struct page *huge_zero_page __read_mostly;
  63
  64bool transparent_hugepage_enabled(struct vm_area_struct *vma)
  65{
  66        if (vma_is_anonymous(vma))
  67                return __transparent_hugepage_enabled(vma);
  68        if (vma_is_shmem(vma) && shmem_huge_enabled(vma))
  69                return __transparent_hugepage_enabled(vma);
  70
  71        return false;
  72}
  73
  74static struct page *get_huge_zero_page(void)
  75{
  76        struct page *zero_page;
  77retry:
  78        if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
  79                return READ_ONCE(huge_zero_page);
  80
  81        zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
  82                        HPAGE_PMD_ORDER);
  83        if (!zero_page) {
  84                count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED);
  85                return NULL;
  86        }
  87        count_vm_event(THP_ZERO_PAGE_ALLOC);
  88        preempt_disable();
  89        if (cmpxchg(&huge_zero_page, NULL, zero_page)) {
  90                preempt_enable();
  91                __free_pages(zero_page, compound_order(zero_page));
  92                goto retry;
  93        }
  94
   95        /* We take an additional reference here. It will be put back by the shrinker. */
  96        atomic_set(&huge_zero_refcount, 2);
  97        preempt_enable();
  98        return READ_ONCE(huge_zero_page);
  99}
 100
 101static void put_huge_zero_page(void)
 102{
 103        /*
 104         * Counter should never go to zero here. Only shrinker can put
 105         * last reference.
 106         */
 107        BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
 108}
 109
 110struct page *mm_get_huge_zero_page(struct mm_struct *mm)
 111{
 112        if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
 113                return READ_ONCE(huge_zero_page);
 114
 115        if (!get_huge_zero_page())
 116                return NULL;
 117
 118        if (test_and_set_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
 119                put_huge_zero_page();
 120
 121        return READ_ONCE(huge_zero_page);
 122}
 123
 124void mm_put_huge_zero_page(struct mm_struct *mm)
 125{
 126        if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
 127                put_huge_zero_page();
 128}
 129
 130static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink,
 131                                        struct shrink_control *sc)
 132{
 133        /* we can free zero page only if last reference remains */
 134        return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
 135}
 136
 137static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
 138                                       struct shrink_control *sc)
 139{
 140        if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
 141                struct page *zero_page = xchg(&huge_zero_page, NULL);
 142                BUG_ON(zero_page == NULL);
 143                __free_pages(zero_page, compound_order(zero_page));
 144                return HPAGE_PMD_NR;
 145        }
 146
 147        return 0;
 148}
 149
 150static struct shrinker huge_zero_page_shrinker = {
 151        .count_objects = shrink_huge_zero_page_count,
 152        .scan_objects = shrink_huge_zero_page_scan,
 153        .seeks = DEFAULT_SEEKS,
 154};
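/*
 * Summary of the huge zero page lifecycle implemented above: the first caller
 * of get_huge_zero_page() allocates the page and sets the refcount to 2 (one
 * reference for the caller plus one implicit reference that only the shrinker
 * drops). Later users take and drop plain references; once only the implicit
 * reference remains (refcount == 1), the shrinker may free the page under
 * memory pressure.
 */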
 155
 156#ifdef CONFIG_SYSFS
 157static ssize_t enabled_show(struct kobject *kobj,
 158                            struct kobj_attribute *attr, char *buf)
 159{
 160        if (test_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags))
 161                return sprintf(buf, "[always] madvise never\n");
 162        else if (test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags))
 163                return sprintf(buf, "always [madvise] never\n");
 164        else
 165                return sprintf(buf, "always madvise [never]\n");
 166}
 167
 168static ssize_t enabled_store(struct kobject *kobj,
 169                             struct kobj_attribute *attr,
 170                             const char *buf, size_t count)
 171{
 172        ssize_t ret = count;
 173
 174        if (!memcmp("always", buf,
 175                    min(sizeof("always")-1, count))) {
 176                clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
 177                set_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
 178        } else if (!memcmp("madvise", buf,
 179                           min(sizeof("madvise")-1, count))) {
 180                clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
 181                set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
 182        } else if (!memcmp("never", buf,
 183                           min(sizeof("never")-1, count))) {
 184                clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
 185                clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
 186        } else
 187                ret = -EINVAL;
 188
 189        if (ret > 0) {
 190                int err = start_stop_khugepaged();
 191                if (err)
 192                        ret = err;
 193        }
 194        return ret;
 195}
 196static struct kobj_attribute enabled_attr =
 197        __ATTR(enabled, 0644, enabled_show, enabled_store);
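/*
 * Example usage: the policy above is switched at runtime via sysfs, e.g.
 *
 *	echo madvise > /sys/kernel/mm/transparent_hugepage/enabled
 *
 * and, under the "madvise" policy, an application opts a mapping in with
 *
 *	madvise(addr, length, MADV_HUGEPAGE);
 */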
 198
 199ssize_t single_hugepage_flag_show(struct kobject *kobj,
 200                                struct kobj_attribute *attr, char *buf,
 201                                enum transparent_hugepage_flag flag)
 202{
 203        return sprintf(buf, "%d\n",
 204                       !!test_bit(flag, &transparent_hugepage_flags));
 205}
 206
 207ssize_t single_hugepage_flag_store(struct kobject *kobj,
 208                                 struct kobj_attribute *attr,
 209                                 const char *buf, size_t count,
 210                                 enum transparent_hugepage_flag flag)
 211{
 212        unsigned long value;
 213        int ret;
 214
 215        ret = kstrtoul(buf, 10, &value);
 216        if (ret < 0)
 217                return ret;
 218        if (value > 1)
 219                return -EINVAL;
 220
 221        if (value)
 222                set_bit(flag, &transparent_hugepage_flags);
 223        else
 224                clear_bit(flag, &transparent_hugepage_flags);
 225
 226        return count;
 227}
 228
 229static ssize_t defrag_show(struct kobject *kobj,
 230                           struct kobj_attribute *attr, char *buf)
 231{
 232        if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
 233                return sprintf(buf, "[always] defer defer+madvise madvise never\n");
 234        if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
 235                return sprintf(buf, "always [defer] defer+madvise madvise never\n");
 236        if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
 237                return sprintf(buf, "always defer [defer+madvise] madvise never\n");
 238        if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
 239                return sprintf(buf, "always defer defer+madvise [madvise] never\n");
 240        return sprintf(buf, "always defer defer+madvise madvise [never]\n");
 241}
 242
 243static ssize_t defrag_store(struct kobject *kobj,
 244                            struct kobj_attribute *attr,
 245                            const char *buf, size_t count)
 246{
 247        if (!memcmp("always", buf,
 248                    min(sizeof("always")-1, count))) {
 249                clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
 250                clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
 251                clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
 252                set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
 253        } else if (!memcmp("defer+madvise", buf,
 254                    min(sizeof("defer+madvise")-1, count))) {
 255                clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
 256                clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
 257                clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
 258                set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
 259        } else if (!memcmp("defer", buf,
 260                    min(sizeof("defer")-1, count))) {
 261                clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
 262                clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
 263                clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
 264                set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
 265        } else if (!memcmp("madvise", buf,
 266                           min(sizeof("madvise")-1, count))) {
 267                clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
 268                clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
 269                clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
 270                set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
 271        } else if (!memcmp("never", buf,
 272                           min(sizeof("never")-1, count))) {
 273                clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
 274                clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
 275                clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
 276                clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
 277        } else
 278                return -EINVAL;
 279
 280        return count;
 281}
 282static struct kobj_attribute defrag_attr =
 283        __ATTR(defrag, 0644, defrag_show, defrag_store);
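/*
 * Example usage: the defrag policy is selected the same way, e.g.
 *
 *	echo defer+madvise > /sys/kernel/mm/transparent_hugepage/defrag
 */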
 284
 285static ssize_t use_zero_page_show(struct kobject *kobj,
 286                struct kobj_attribute *attr, char *buf)
 287{
 288        return single_hugepage_flag_show(kobj, attr, buf,
 289                                TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
 290}
 291static ssize_t use_zero_page_store(struct kobject *kobj,
 292                struct kobj_attribute *attr, const char *buf, size_t count)
 293{
 294        return single_hugepage_flag_store(kobj, attr, buf, count,
 295                                 TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
 296}
 297static struct kobj_attribute use_zero_page_attr =
 298        __ATTR(use_zero_page, 0644, use_zero_page_show, use_zero_page_store);
 299
 300static ssize_t hpage_pmd_size_show(struct kobject *kobj,
 301                struct kobj_attribute *attr, char *buf)
 302{
 303        return sprintf(buf, "%lu\n", HPAGE_PMD_SIZE);
 304}
 305static struct kobj_attribute hpage_pmd_size_attr =
 306        __ATTR_RO(hpage_pmd_size);
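/*
 * For example, reading hpage_pmd_size typically returns 2097152 (2 MiB) on
 * x86-64 with 4 KiB base pages, since HPAGE_PMD_SIZE covers one PMD's worth
 * of address space.
 */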
 307
 308#ifdef CONFIG_DEBUG_VM
 309static ssize_t debug_cow_show(struct kobject *kobj,
 310                                struct kobj_attribute *attr, char *buf)
 311{
 312        return single_hugepage_flag_show(kobj, attr, buf,
 313                                TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);
 314}
 315static ssize_t debug_cow_store(struct kobject *kobj,
 316                               struct kobj_attribute *attr,
 317                               const char *buf, size_t count)
 318{
 319        return single_hugepage_flag_store(kobj, attr, buf, count,
 320                                 TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);
 321}
 322static struct kobj_attribute debug_cow_attr =
 323        __ATTR(debug_cow, 0644, debug_cow_show, debug_cow_store);
 324#endif /* CONFIG_DEBUG_VM */
 325
 326static struct attribute *hugepage_attr[] = {
 327        &enabled_attr.attr,
 328        &defrag_attr.attr,
 329        &use_zero_page_attr.attr,
 330        &hpage_pmd_size_attr.attr,
 331#if defined(CONFIG_SHMEM) && defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE)
 332        &shmem_enabled_attr.attr,
 333#endif
 334#ifdef CONFIG_DEBUG_VM
 335        &debug_cow_attr.attr,
 336#endif
 337        NULL,
 338};
 339
 340static const struct attribute_group hugepage_attr_group = {
 341        .attrs = hugepage_attr,
 342};
 343
 344static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
 345{
 346        int err;
 347
 348        *hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
 349        if (unlikely(!*hugepage_kobj)) {
 350                pr_err("failed to create transparent hugepage kobject\n");
 351                return -ENOMEM;
 352        }
 353
 354        err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group);
 355        if (err) {
 356                pr_err("failed to register transparent hugepage group\n");
 357                goto delete_obj;
 358        }
 359
 360        err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group);
 361        if (err) {
 362                pr_err("failed to register transparent hugepage group\n");
 363                goto remove_hp_group;
 364        }
 365
 366        return 0;
 367
 368remove_hp_group:
 369        sysfs_remove_group(*hugepage_kobj, &hugepage_attr_group);
 370delete_obj:
 371        kobject_put(*hugepage_kobj);
 372        return err;
 373}
 374
 375static void __init hugepage_exit_sysfs(struct kobject *hugepage_kobj)
 376{
 377        sysfs_remove_group(hugepage_kobj, &khugepaged_attr_group);
 378        sysfs_remove_group(hugepage_kobj, &hugepage_attr_group);
 379        kobject_put(hugepage_kobj);
 380}
 381#else
 382static inline int hugepage_init_sysfs(struct kobject **hugepage_kobj)
 383{
 384        return 0;
 385}
 386
 387static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj)
 388{
 389}
 390#endif /* CONFIG_SYSFS */
 391
 392static int __init hugepage_init(void)
 393{
 394        int err;
 395        struct kobject *hugepage_kobj;
 396
 397        if (!has_transparent_hugepage()) {
 398                transparent_hugepage_flags = 0;
 399                return -EINVAL;
 400        }
 401
 402        /*
  403         * hugepages of order >= MAX_ORDER can't be allocated by the buddy allocator
 404         */
 405        MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER >= MAX_ORDER);
 406        /*
 407         * we use page->mapping and page->index in second tail page
 408         * as list_head: assuming THP order >= 2
 409         */
 410        MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER < 2);
 411
 412        err = hugepage_init_sysfs(&hugepage_kobj);
 413        if (err)
 414                goto err_sysfs;
 415
 416        err = khugepaged_init();
 417        if (err)
 418                goto err_slab;
 419
 420        err = register_shrinker(&huge_zero_page_shrinker);
 421        if (err)
 422                goto err_hzp_shrinker;
 423        err = register_shrinker(&deferred_split_shrinker);
 424        if (err)
 425                goto err_split_shrinker;
 426
 427        /*
 428         * By default disable transparent hugepages on smaller systems,
 429         * where the extra memory used could hurt more than TLB overhead
 430         * is likely to save.  The admin can still enable it through /sys.
 431         */
 432        if (totalram_pages() < (512 << (20 - PAGE_SHIFT))) {
 433                transparent_hugepage_flags = 0;
 434                return 0;
 435        }
 436
 437        err = start_stop_khugepaged();
 438        if (err)
 439                goto err_khugepaged;
 440
 441        return 0;
 442err_khugepaged:
 443        unregister_shrinker(&deferred_split_shrinker);
 444err_split_shrinker:
 445        unregister_shrinker(&huge_zero_page_shrinker);
 446err_hzp_shrinker:
 447        khugepaged_destroy();
 448err_slab:
 449        hugepage_exit_sysfs(hugepage_kobj);
 450err_sysfs:
 451        return err;
 452}
 453subsys_initcall(hugepage_init);
 454
 455static int __init setup_transparent_hugepage(char *str)
 456{
 457        int ret = 0;
 458        if (!str)
 459                goto out;
 460        if (!strcmp(str, "always")) {
 461                set_bit(TRANSPARENT_HUGEPAGE_FLAG,
 462                        &transparent_hugepage_flags);
 463                clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
 464                          &transparent_hugepage_flags);
 465                ret = 1;
 466        } else if (!strcmp(str, "madvise")) {
 467                clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
 468                          &transparent_hugepage_flags);
 469                set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
 470                        &transparent_hugepage_flags);
 471                ret = 1;
 472        } else if (!strcmp(str, "never")) {
 473                clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
 474                          &transparent_hugepage_flags);
 475                clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
 476                          &transparent_hugepage_flags);
 477                ret = 1;
 478        }
 479out:
 480        if (!ret)
 481                pr_warn("transparent_hugepage= cannot parse, ignored\n");
 482        return ret;
 483}
 484__setup("transparent_hugepage=", setup_transparent_hugepage);
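/*
 * Example: the same three policies can be selected at boot with
 * "transparent_hugepage=always", "transparent_hugepage=madvise" or
 * "transparent_hugepage=never" on the kernel command line.
 */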
 485
 486pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
 487{
 488        if (likely(vma->vm_flags & VM_WRITE))
 489                pmd = pmd_mkwrite(pmd);
 490        return pmd;
 491}
 492
 493static inline struct list_head *page_deferred_list(struct page *page)
 494{
 495        /* ->lru in the tail pages is occupied by compound_head. */
 496        return &page[2].deferred_list;
 497}
 498
 499void prep_transhuge_page(struct page *page)
 500{
 501        /*
  502         * we use page->mapping and page->index in second tail page
 503         * as list_head: assuming THP order >= 2
 504         */
 505
 506        INIT_LIST_HEAD(page_deferred_list(page));
 507        set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR);
 508}
 509
 510static unsigned long __thp_get_unmapped_area(struct file *filp, unsigned long len,
 511                loff_t off, unsigned long flags, unsigned long size)
 512{
 513        unsigned long addr;
 514        loff_t off_end = off + len;
 515        loff_t off_align = round_up(off, size);
 516        unsigned long len_pad;
 517
 518        if (off_end <= off_align || (off_end - off_align) < size)
 519                return 0;
 520
 521        len_pad = len + size;
 522        if (len_pad < len || (off + len_pad) < off)
 523                return 0;
 524
 525        addr = current->mm->get_unmapped_area(filp, 0, len_pad,
 526                                              off >> PAGE_SHIFT, flags);
 527        if (IS_ERR_VALUE(addr))
 528                return 0;
 529
 530        addr += (off - addr) & (size - 1);
 531        return addr;
 532}
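/*
 * Worked example (illustrative numbers): with size == 2 MiB, asking for
 * len + 2 MiB of address space guarantees that the returned range contains a
 * subrange of length len whose start is congruent to "off" modulo 2 MiB; the
 * final "addr += (off - addr) & (size - 1)" rounds addr up to that start, so
 * file offsets and virtual addresses share alignment and the mapping can use
 * PMD entries.
 */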
 533
 534unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
 535                unsigned long len, unsigned long pgoff, unsigned long flags)
 536{
 537        loff_t off = (loff_t)pgoff << PAGE_SHIFT;
 538
 539        if (addr)
 540                goto out;
 541        if (!IS_DAX(filp->f_mapping->host) || !IS_ENABLED(CONFIG_FS_DAX_PMD))
 542                goto out;
 543
 544        addr = __thp_get_unmapped_area(filp, len, off, flags, PMD_SIZE);
 545        if (addr)
 546                return addr;
 547
 548 out:
 549        return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags);
 550}
 551EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
 552
 553static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
 554                        struct page *page, gfp_t gfp)
 555{
 556        struct vm_area_struct *vma = vmf->vma;
 557        struct mem_cgroup *memcg;
 558        pgtable_t pgtable;
 559        unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
 560        vm_fault_t ret = 0;
 561
 562        VM_BUG_ON_PAGE(!PageCompound(page), page);
 563
 564        if (mem_cgroup_try_charge_delay(page, vma->vm_mm, gfp, &memcg, true)) {
 565                put_page(page);
 566                count_vm_event(THP_FAULT_FALLBACK);
 567                return VM_FAULT_FALLBACK;
 568        }
 569
 570        pgtable = pte_alloc_one(vma->vm_mm);
 571        if (unlikely(!pgtable)) {
 572                ret = VM_FAULT_OOM;
 573                goto release;
 574        }
 575
 576        clear_huge_page(page, vmf->address, HPAGE_PMD_NR);
 577        /*
 578         * The memory barrier inside __SetPageUptodate makes sure that
 579         * clear_huge_page writes become visible before the set_pmd_at()
 580         * write.
 581         */
 582        __SetPageUptodate(page);
 583
 584        vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
 585        if (unlikely(!pmd_none(*vmf->pmd))) {
 586                goto unlock_release;
 587        } else {
 588                pmd_t entry;
 589
 590                ret = check_stable_address_space(vma->vm_mm);
 591                if (ret)
 592                        goto unlock_release;
 593
 594                /* Deliver the page fault to userland */
 595                if (userfaultfd_missing(vma)) {
 596                        vm_fault_t ret2;
 597
 598                        spin_unlock(vmf->ptl);
 599                        mem_cgroup_cancel_charge(page, memcg, true);
 600                        put_page(page);
 601                        pte_free(vma->vm_mm, pgtable);
 602                        ret2 = handle_userfault(vmf, VM_UFFD_MISSING);
 603                        VM_BUG_ON(ret2 & VM_FAULT_FALLBACK);
 604                        return ret2;
 605                }
 606
 607                entry = mk_huge_pmd(page, vma->vm_page_prot);
 608                entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
 609                page_add_new_anon_rmap(page, vma, haddr, true);
 610                mem_cgroup_commit_charge(page, memcg, false, true);
 611                lru_cache_add_active_or_unevictable(page, vma);
 612                pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
 613                set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
 614                add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
 615                mm_inc_nr_ptes(vma->vm_mm);
 616                spin_unlock(vmf->ptl);
 617                count_vm_event(THP_FAULT_ALLOC);
 618                count_memcg_events(memcg, THP_FAULT_ALLOC, 1);
 619        }
 620
 621        return 0;
 622unlock_release:
 623        spin_unlock(vmf->ptl);
 624release:
 625        if (pgtable)
 626                pte_free(vma->vm_mm, pgtable);
 627        mem_cgroup_cancel_charge(page, memcg, true);
 628        put_page(page);
 629        return ret;
 630
 631}
 632
 633/*
 634 * always: directly stall for all thp allocations
 635 * defer: wake kswapd and fail if not immediately available
 636 * defer+madvise: wake kswapd and directly stall for MADV_HUGEPAGE, otherwise
 637 *                fail if not immediately available
 638 * madvise: directly stall for MADV_HUGEPAGE, otherwise fail if not immediately
 639 *          available
 640 * never: never stall for any thp allocation
 641 */
 642static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma)
 643{
 644        const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE);
 645
 646        /* Always do synchronous compaction */
 647        if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
 648                return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY);
 649
 650        /* Kick kcompactd and fail quickly */
 651        if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
 652                return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;
 653
 654        /* Synchronous compaction if madvised, otherwise kick kcompactd */
 655        if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
 656                return GFP_TRANSHUGE_LIGHT |
 657                        (vma_madvised ? __GFP_DIRECT_RECLAIM :
 658                                        __GFP_KSWAPD_RECLAIM);
 659
 660        /* Only do synchronous compaction if madvised */
 661        if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
 662                return GFP_TRANSHUGE_LIGHT |
 663                       (vma_madvised ? __GFP_DIRECT_RECLAIM : 0);
 664
 665        return GFP_TRANSHUGE_LIGHT;
 666}
 667
 668/* Caller must hold page table lock. */
 669static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
 670                struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
 671                struct page *zero_page)
 672{
 673        pmd_t entry;
 674        if (!pmd_none(*pmd))
 675                return false;
 676        entry = mk_pmd(zero_page, vma->vm_page_prot);
 677        entry = pmd_mkhuge(entry);
 678        if (pgtable)
 679                pgtable_trans_huge_deposit(mm, pmd, pgtable);
 680        set_pmd_at(mm, haddr, pmd, entry);
 681        mm_inc_nr_ptes(mm);
 682        return true;
 683}
 684
 685vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
 686{
 687        struct vm_area_struct *vma = vmf->vma;
 688        gfp_t gfp;
 689        struct page *page;
 690        unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
 691
 692        if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end)
 693                return VM_FAULT_FALLBACK;
 694        if (unlikely(anon_vma_prepare(vma)))
 695                return VM_FAULT_OOM;
 696        if (unlikely(khugepaged_enter(vma, vma->vm_flags)))
 697                return VM_FAULT_OOM;
 698        if (!(vmf->flags & FAULT_FLAG_WRITE) &&
 699                        !mm_forbids_zeropage(vma->vm_mm) &&
 700                        transparent_hugepage_use_zero_page()) {
 701                pgtable_t pgtable;
 702                struct page *zero_page;
 703                bool set;
 704                vm_fault_t ret;
 705                pgtable = pte_alloc_one(vma->vm_mm);
 706                if (unlikely(!pgtable))
 707                        return VM_FAULT_OOM;
 708                zero_page = mm_get_huge_zero_page(vma->vm_mm);
 709                if (unlikely(!zero_page)) {
 710                        pte_free(vma->vm_mm, pgtable);
 711                        count_vm_event(THP_FAULT_FALLBACK);
 712                        return VM_FAULT_FALLBACK;
 713                }
 714                vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
 715                ret = 0;
 716                set = false;
 717                if (pmd_none(*vmf->pmd)) {
 718                        ret = check_stable_address_space(vma->vm_mm);
 719                        if (ret) {
 720                                spin_unlock(vmf->ptl);
 721                        } else if (userfaultfd_missing(vma)) {
 722                                spin_unlock(vmf->ptl);
 723                                ret = handle_userfault(vmf, VM_UFFD_MISSING);
 724                                VM_BUG_ON(ret & VM_FAULT_FALLBACK);
 725                        } else {
 726                                set_huge_zero_page(pgtable, vma->vm_mm, vma,
 727                                                   haddr, vmf->pmd, zero_page);
 728                                spin_unlock(vmf->ptl);
 729                                set = true;
 730                        }
 731                } else
 732                        spin_unlock(vmf->ptl);
 733                if (!set)
 734                        pte_free(vma->vm_mm, pgtable);
 735                return ret;
 736        }
 737        gfp = alloc_hugepage_direct_gfpmask(vma);
 738        page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
 739        if (unlikely(!page)) {
 740                count_vm_event(THP_FAULT_FALLBACK);
 741                return VM_FAULT_FALLBACK;
 742        }
 743        prep_transhuge_page(page);
 744        return __do_huge_pmd_anonymous_page(vmf, page, gfp);
 745}
 746
 747static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
 748                pmd_t *pmd, pfn_t pfn, pgprot_t prot, bool write,
 749                pgtable_t pgtable)
 750{
 751        struct mm_struct *mm = vma->vm_mm;
 752        pmd_t entry;
 753        spinlock_t *ptl;
 754
 755        ptl = pmd_lock(mm, pmd);
 756        if (!pmd_none(*pmd)) {
 757                if (write) {
 758                        if (pmd_pfn(*pmd) != pfn_t_to_pfn(pfn)) {
 759                                WARN_ON_ONCE(!is_huge_zero_pmd(*pmd));
 760                                goto out_unlock;
 761                        }
 762                        entry = pmd_mkyoung(*pmd);
 763                        entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
 764                        if (pmdp_set_access_flags(vma, addr, pmd, entry, 1))
 765                                update_mmu_cache_pmd(vma, addr, pmd);
 766                }
 767
 768                goto out_unlock;
 769        }
 770
 771        entry = pmd_mkhuge(pfn_t_pmd(pfn, prot));
 772        if (pfn_t_devmap(pfn))
 773                entry = pmd_mkdevmap(entry);
 774        if (write) {
 775                entry = pmd_mkyoung(pmd_mkdirty(entry));
 776                entry = maybe_pmd_mkwrite(entry, vma);
 777        }
 778
 779        if (pgtable) {
 780                pgtable_trans_huge_deposit(mm, pmd, pgtable);
 781                mm_inc_nr_ptes(mm);
 782                pgtable = NULL;
 783        }
 784
 785        set_pmd_at(mm, addr, pmd, entry);
 786        update_mmu_cache_pmd(vma, addr, pmd);
 787
 788out_unlock:
 789        spin_unlock(ptl);
 790        if (pgtable)
 791                pte_free(mm, pgtable);
 792}
 793
 794vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write)
 795{
 796        unsigned long addr = vmf->address & PMD_MASK;
 797        struct vm_area_struct *vma = vmf->vma;
 798        pgprot_t pgprot = vma->vm_page_prot;
 799        pgtable_t pgtable = NULL;
 800
 801        /*
 802         * If we had pmd_special, we could avoid all these restrictions,
 803         * but we need to be consistent with PTEs and architectures that
 804         * can't support a 'special' bit.
 805         */
 806        BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
 807                        !pfn_t_devmap(pfn));
 808        BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
 809                                                (VM_PFNMAP|VM_MIXEDMAP));
 810        BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
 811
 812        if (addr < vma->vm_start || addr >= vma->vm_end)
 813                return VM_FAULT_SIGBUS;
 814
 815        if (arch_needs_pgtable_deposit()) {
 816                pgtable = pte_alloc_one(vma->vm_mm);
 817                if (!pgtable)
 818                        return VM_FAULT_OOM;
 819        }
 820
 821        track_pfn_insert(vma, &pgprot, pfn);
 822
 823        insert_pfn_pmd(vma, addr, vmf->pmd, pfn, pgprot, write, pgtable);
 824        return VM_FAULT_NOPAGE;
 825}
 826EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd);
 827
 828#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
 829static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma)
 830{
 831        if (likely(vma->vm_flags & VM_WRITE))
 832                pud = pud_mkwrite(pud);
 833        return pud;
 834}
 835
 836static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
 837                pud_t *pud, pfn_t pfn, pgprot_t prot, bool write)
 838{
 839        struct mm_struct *mm = vma->vm_mm;
 840        pud_t entry;
 841        spinlock_t *ptl;
 842
 843        ptl = pud_lock(mm, pud);
 844        if (!pud_none(*pud)) {
 845                if (write) {
 846                        if (pud_pfn(*pud) != pfn_t_to_pfn(pfn)) {
 847                                WARN_ON_ONCE(!is_huge_zero_pud(*pud));
 848                                goto out_unlock;
 849                        }
 850                        entry = pud_mkyoung(*pud);
 851                        entry = maybe_pud_mkwrite(pud_mkdirty(entry), vma);
 852                        if (pudp_set_access_flags(vma, addr, pud, entry, 1))
 853                                update_mmu_cache_pud(vma, addr, pud);
 854                }
 855                goto out_unlock;
 856        }
 857
 858        entry = pud_mkhuge(pfn_t_pud(pfn, prot));
 859        if (pfn_t_devmap(pfn))
 860                entry = pud_mkdevmap(entry);
 861        if (write) {
 862                entry = pud_mkyoung(pud_mkdirty(entry));
 863                entry = maybe_pud_mkwrite(entry, vma);
 864        }
 865        set_pud_at(mm, addr, pud, entry);
 866        update_mmu_cache_pud(vma, addr, pud);
 867
 868out_unlock:
 869        spin_unlock(ptl);
 870}
 871
 872vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write)
 873{
 874        unsigned long addr = vmf->address & PUD_MASK;
 875        struct vm_area_struct *vma = vmf->vma;
 876        pgprot_t pgprot = vma->vm_page_prot;
 877
 878        /*
 879         * If we had pud_special, we could avoid all these restrictions,
 880         * but we need to be consistent with PTEs and architectures that
 881         * can't support a 'special' bit.
 882         */
 883        BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
 884                        !pfn_t_devmap(pfn));
 885        BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
 886                                                (VM_PFNMAP|VM_MIXEDMAP));
 887        BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
 888
 889        if (addr < vma->vm_start || addr >= vma->vm_end)
 890                return VM_FAULT_SIGBUS;
 891
 892        track_pfn_insert(vma, &pgprot, pfn);
 893
 894        insert_pfn_pud(vma, addr, vmf->pud, pfn, pgprot, write);
 895        return VM_FAULT_NOPAGE;
 896}
 897EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud);
 898#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
 899
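/*
 * Mark a huge pmd young (and dirty, for write access) on a FOLL_TOUCH
 * lookup, mirroring what the pte-level follow_page path does for base pages.
 */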
 900static void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
 901                pmd_t *pmd, int flags)
 902{
 903        pmd_t _pmd;
 904
 905        _pmd = pmd_mkyoung(*pmd);
 906        if (flags & FOLL_WRITE)
 907                _pmd = pmd_mkdirty(_pmd);
 908        if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK,
 909                                pmd, _pmd, flags & FOLL_WRITE))
 910                update_mmu_cache_pmd(vma, addr, pmd);
 911}
 912
 913struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
 914                pmd_t *pmd, int flags, struct dev_pagemap **pgmap)
 915{
 916        unsigned long pfn = pmd_pfn(*pmd);
 917        struct mm_struct *mm = vma->vm_mm;
 918        struct page *page;
 919
 920        assert_spin_locked(pmd_lockptr(mm, pmd));
 921
 922        /*
 923         * When we COW a devmap PMD entry, we split it into PTEs, so we should
 924         * not be in this function with `flags & FOLL_COW` set.
 925         */
 926        WARN_ONCE(flags & FOLL_COW, "mm: In follow_devmap_pmd with FOLL_COW set");
 927
 928        if (flags & FOLL_WRITE && !pmd_write(*pmd))
 929                return NULL;
 930
 931        if (pmd_present(*pmd) && pmd_devmap(*pmd))
 932                /* pass */;
 933        else
 934                return NULL;
 935
 936        if (flags & FOLL_TOUCH)
 937                touch_pmd(vma, addr, pmd, flags);
 938
 939        /*
 940         * device mapped pages can only be returned if the
 941         * caller will manage the page reference count.
 942         */
 943        if (!(flags & FOLL_GET))
 944                return ERR_PTR(-EEXIST);
 945
 946        pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT;
 947        *pgmap = get_dev_pagemap(pfn, *pgmap);
 948        if (!*pgmap)
 949                return ERR_PTR(-EFAULT);
 950        page = pfn_to_page(pfn);
 951        get_page(page);
 952
 953        return page;
 954}
 955
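/*
 * Duplicate a huge pmd from the parent (src_mm) into the child (dst_mm) at
 * fork() time. Anonymous huge pages are shared copy-on-write: both pmds end
 * up write-protected and the compound page gains a reference and a mapcount.
 */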
 956int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 957                  pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
 958                  struct vm_area_struct *vma)
 959{
 960        spinlock_t *dst_ptl, *src_ptl;
 961        struct page *src_page;
 962        pmd_t pmd;
 963        pgtable_t pgtable = NULL;
 964        int ret = -ENOMEM;
 965
  966        /* Skip if it can be re-filled on fault */
 967        if (!vma_is_anonymous(vma))
 968                return 0;
 969
 970        pgtable = pte_alloc_one(dst_mm);
 971        if (unlikely(!pgtable))
 972                goto out;
 973
 974        dst_ptl = pmd_lock(dst_mm, dst_pmd);
 975        src_ptl = pmd_lockptr(src_mm, src_pmd);
 976        spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
 977
 978        ret = -EAGAIN;
 979        pmd = *src_pmd;
 980
 981#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
 982        if (unlikely(is_swap_pmd(pmd))) {
 983                swp_entry_t entry = pmd_to_swp_entry(pmd);
 984
 985                VM_BUG_ON(!is_pmd_migration_entry(pmd));
 986                if (is_write_migration_entry(entry)) {
 987                        make_migration_entry_read(&entry);
 988                        pmd = swp_entry_to_pmd(entry);
 989                        if (pmd_swp_soft_dirty(*src_pmd))
 990                                pmd = pmd_swp_mksoft_dirty(pmd);
 991                        set_pmd_at(src_mm, addr, src_pmd, pmd);
 992                }
 993                add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
 994                mm_inc_nr_ptes(dst_mm);
 995                pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
 996                set_pmd_at(dst_mm, addr, dst_pmd, pmd);
 997                ret = 0;
 998                goto out_unlock;
 999        }
1000#endif
1001
1002        if (unlikely(!pmd_trans_huge(pmd))) {
1003                pte_free(dst_mm, pgtable);
1004                goto out_unlock;
1005        }
1006        /*
 1007         * While the page table lock is held, the huge zero pmd should not be
 1008         * under splitting, since we only split the pmd into a page table,
 1009         * never the page itself.
1010         */
1011        if (is_huge_zero_pmd(pmd)) {
1012                struct page *zero_page;
1013                /*
1014                 * get_huge_zero_page() will never allocate a new page here,
1015                 * since we already have a zero page to copy. It just takes a
1016                 * reference.
1017                 */
1018                zero_page = mm_get_huge_zero_page(dst_mm);
1019                set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd,
1020                                zero_page);
1021                ret = 0;
1022                goto out_unlock;
1023        }
1024
1025        src_page = pmd_page(pmd);
1026        VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
1027        get_page(src_page);
1028        page_dup_rmap(src_page, true);
1029        add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
1030        mm_inc_nr_ptes(dst_mm);
1031        pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
1032
1033        pmdp_set_wrprotect(src_mm, addr, src_pmd);
1034        pmd = pmd_mkold(pmd_wrprotect(pmd));
1035        set_pmd_at(dst_mm, addr, dst_pmd, pmd);
1036
1037        ret = 0;
1038out_unlock:
1039        spin_unlock(src_ptl);
1040        spin_unlock(dst_ptl);
1041out:
1042        return ret;
1043}
1044
1045#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
1046static void touch_pud(struct vm_area_struct *vma, unsigned long addr,
1047                pud_t *pud, int flags)
1048{
1049        pud_t _pud;
1050
1051        _pud = pud_mkyoung(*pud);
1052        if (flags & FOLL_WRITE)
1053                _pud = pud_mkdirty(_pud);
1054        if (pudp_set_access_flags(vma, addr & HPAGE_PUD_MASK,
1055                                pud, _pud, flags & FOLL_WRITE))
1056                update_mmu_cache_pud(vma, addr, pud);
1057}
1058
1059struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
1060                pud_t *pud, int flags, struct dev_pagemap **pgmap)
1061{
1062        unsigned long pfn = pud_pfn(*pud);
1063        struct mm_struct *mm = vma->vm_mm;
1064        struct page *page;
1065
1066        assert_spin_locked(pud_lockptr(mm, pud));
1067
1068        if (flags & FOLL_WRITE && !pud_write(*pud))
1069                return NULL;
1070
1071        if (pud_present(*pud) && pud_devmap(*pud))
1072                /* pass */;
1073        else
1074                return NULL;
1075
1076        if (flags & FOLL_TOUCH)
1077                touch_pud(vma, addr, pud, flags);
1078
1079        /*
1080         * device mapped pages can only be returned if the
1081         * caller will manage the page reference count.
1082         */
1083        if (!(flags & FOLL_GET))
1084                return ERR_PTR(-EEXIST);
1085
1086        pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT;
1087        *pgmap = get_dev_pagemap(pfn, *pgmap);
1088        if (!*pgmap)
1089                return ERR_PTR(-EFAULT);
1090        page = pfn_to_page(pfn);
1091        get_page(page);
1092
1093        return page;
1094}
1095
1096int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1097                  pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
1098                  struct vm_area_struct *vma)
1099{
1100        spinlock_t *dst_ptl, *src_ptl;
1101        pud_t pud;
1102        int ret;
1103
1104        dst_ptl = pud_lock(dst_mm, dst_pud);
1105        src_ptl = pud_lockptr(src_mm, src_pud);
1106        spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
1107
1108        ret = -EAGAIN;
1109        pud = *src_pud;
1110        if (unlikely(!pud_trans_huge(pud) && !pud_devmap(pud)))
1111                goto out_unlock;
1112
1113        /*
 1114         * While the page table lock is held, the huge zero pud should not be
 1115         * under splitting, since we only split the pud into a page table,
 1116         * never the page itself.
1117         */
1118        if (is_huge_zero_pud(pud)) {
1119                /* No huge zero pud yet */
1120        }
1121
1122        pudp_set_wrprotect(src_mm, addr, src_pud);
1123        pud = pud_mkold(pud_wrprotect(pud));
1124        set_pud_at(dst_mm, addr, dst_pud, pud);
1125
1126        ret = 0;
1127out_unlock:
1128        spin_unlock(src_ptl);
1129        spin_unlock(dst_ptl);
1130        return ret;
1131}
1132
1133void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud)
1134{
1135        pud_t entry;
1136        unsigned long haddr;
1137        bool write = vmf->flags & FAULT_FLAG_WRITE;
1138
1139        vmf->ptl = pud_lock(vmf->vma->vm_mm, vmf->pud);
1140        if (unlikely(!pud_same(*vmf->pud, orig_pud)))
1141                goto unlock;
1142
1143        entry = pud_mkyoung(orig_pud);
1144        if (write)
1145                entry = pud_mkdirty(entry);
1146        haddr = vmf->address & HPAGE_PUD_MASK;
1147        if (pudp_set_access_flags(vmf->vma, haddr, vmf->pud, entry, write))
1148                update_mmu_cache_pud(vmf->vma, vmf->address, vmf->pud);
1149
1150unlock:
1151        spin_unlock(vmf->ptl);
1152}
1153#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
1154
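/*
 * Handle a fault on a huge pmd that is already present and needs no COW:
 * only the accessed (and, for write faults, dirty) bits are updated, which
 * matters on architectures without hardware-managed access bits.
 */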
1155void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd)
1156{
1157        pmd_t entry;
1158        unsigned long haddr;
1159        bool write = vmf->flags & FAULT_FLAG_WRITE;
1160
1161        vmf->ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
1162        if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
1163                goto unlock;
1164
1165        entry = pmd_mkyoung(orig_pmd);
1166        if (write)
1167                entry = pmd_mkdirty(entry);
1168        haddr = vmf->address & HPAGE_PMD_MASK;
1169        if (pmdp_set_access_flags(vmf->vma, haddr, vmf->pmd, entry, write))
1170                update_mmu_cache_pmd(vmf->vma, vmf->address, vmf->pmd);
1171
1172unlock:
1173        spin_unlock(vmf->ptl);
1174}
1175
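/*
 * COW fallback used when a replacement huge page cannot be allocated: copy
 * the old huge page into HPAGE_PMD_NR freshly allocated base pages and remap
 * the range with a regular page table built from the deposited pgtable.
 */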
1176static vm_fault_t do_huge_pmd_wp_page_fallback(struct vm_fault *vmf,
1177                        pmd_t orig_pmd, struct page *page)
1178{
1179        struct vm_area_struct *vma = vmf->vma;
1180        unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
1181        struct mem_cgroup *memcg;
1182        pgtable_t pgtable;
1183        pmd_t _pmd;
1184        int i;
1185        vm_fault_t ret = 0;
1186        struct page **pages;
1187        struct mmu_notifier_range range;
1188
1189        pages = kmalloc_array(HPAGE_PMD_NR, sizeof(struct page *),
1190                              GFP_KERNEL);
1191        if (unlikely(!pages)) {
1192                ret |= VM_FAULT_OOM;
1193                goto out;
1194        }
1195
1196        for (i = 0; i < HPAGE_PMD_NR; i++) {
1197                pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE, vma,
1198                                               vmf->address, page_to_nid(page));
1199                if (unlikely(!pages[i] ||
1200                             mem_cgroup_try_charge_delay(pages[i], vma->vm_mm,
1201                                     GFP_KERNEL, &memcg, false))) {
1202                        if (pages[i])
1203                                put_page(pages[i]);
1204                        while (--i >= 0) {
1205                                memcg = (void *)page_private(pages[i]);
1206                                set_page_private(pages[i], 0);
1207                                mem_cgroup_cancel_charge(pages[i], memcg,
1208                                                false);
1209                                put_page(pages[i]);
1210                        }
1211                        kfree(pages);
1212                        ret |= VM_FAULT_OOM;
1213                        goto out;
1214                }
1215                set_page_private(pages[i], (unsigned long)memcg);
1216        }
1217
1218        for (i = 0; i < HPAGE_PMD_NR; i++) {
1219                copy_user_highpage(pages[i], page + i,
1220                                   haddr + PAGE_SIZE * i, vma);
1221                __SetPageUptodate(pages[i]);
1222                cond_resched();
1223        }
1224
1225        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
1226                                haddr, haddr + HPAGE_PMD_SIZE);
1227        mmu_notifier_invalidate_range_start(&range);
1228
1229        vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
1230        if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
1231                goto out_free_pages;
1232        VM_BUG_ON_PAGE(!PageHead(page), page);
1233
1234        /*
 1235         * Leave the pmd empty until the ptes are filled. Note we must notify
 1236         * here, as a concurrent CPU thread might write to the new page before
 1237         * the call to mmu_notifier_invalidate_range_end() happens, which can
 1238         * lead to a device seeing memory writes in a different order than the CPU.
1239         *
1240         * See Documentation/vm/mmu_notifier.rst
1241         */
1242        pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd);
1243
1244        pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, vmf->pmd);
1245        pmd_populate(vma->vm_mm, &_pmd, pgtable);
1246
1247        for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
1248                pte_t entry;
1249                entry = mk_pte(pages[i], vma->vm_page_prot);
1250                entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1251                memcg = (void *)page_private(pages[i]);
1252                set_page_private(pages[i], 0);
1253                page_add_new_anon_rmap(pages[i], vmf->vma, haddr, false);
1254                mem_cgroup_commit_charge(pages[i], memcg, false, false);
1255                lru_cache_add_active_or_unevictable(pages[i], vma);
1256                vmf->pte = pte_offset_map(&_pmd, haddr);
1257                VM_BUG_ON(!pte_none(*vmf->pte));
1258                set_pte_at(vma->vm_mm, haddr, vmf->pte, entry);
1259                pte_unmap(vmf->pte);
1260        }
1261        kfree(pages);
1262
1263        smp_wmb(); /* make pte visible before pmd */
1264        pmd_populate(vma->vm_mm, vmf->pmd, pgtable);
1265        page_remove_rmap(page, true);
1266        spin_unlock(vmf->ptl);
1267
1268        /*
1269         * No need to double call mmu_notifier->invalidate_range() callback as
1270         * the above pmdp_huge_clear_flush_notify() did already call it.
1271         */
1272        mmu_notifier_invalidate_range_only_end(&range);
1273
1274        ret |= VM_FAULT_WRITE;
1275        put_page(page);
1276
1277out:
1278        return ret;
1279
1280out_free_pages:
1281        spin_unlock(vmf->ptl);
1282        mmu_notifier_invalidate_range_end(&range);
1283        for (i = 0; i < HPAGE_PMD_NR; i++) {
1284                memcg = (void *)page_private(pages[i]);
1285                set_page_private(pages[i], 0);
1286                mem_cgroup_cancel_charge(pages[i], memcg, false);
1287                put_page(pages[i]);
1288        }
1289        kfree(pages);
1290        goto out;
1291}
1292
1293vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
1294{
1295        struct vm_area_struct *vma = vmf->vma;
1296        struct page *page = NULL, *new_page;
1297        struct mem_cgroup *memcg;
1298        unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
1299        struct mmu_notifier_range range;
1300        gfp_t huge_gfp;                 /* for allocation and charge */
1301        vm_fault_t ret = 0;
1302
1303        vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
1304        VM_BUG_ON_VMA(!vma->anon_vma, vma);
1305        if (is_huge_zero_pmd(orig_pmd))
1306                goto alloc;
1307        spin_lock(vmf->ptl);
1308        if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
1309                goto out_unlock;
1310
1311        page = pmd_page(orig_pmd);
1312        VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page);
1313        /*
1314         * We can only reuse the page if nobody else maps the huge page or it's
1315         * part.
1316         */
1317        if (!trylock_page(page)) {
1318                get_page(page);
1319                spin_unlock(vmf->ptl);
1320                lock_page(page);
1321                spin_lock(vmf->ptl);
1322                if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
1323                        unlock_page(page);
1324                        put_page(page);
1325                        goto out_unlock;
1326                }
1327                put_page(page);
1328        }
1329        if (reuse_swap_page(page, NULL)) {
1330                pmd_t entry;
1331                entry = pmd_mkyoung(orig_pmd);
1332                entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
1333                if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry,  1))
1334                        update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
1335                ret |= VM_FAULT_WRITE;
1336                unlock_page(page);
1337                goto out_unlock;
1338        }
1339        unlock_page(page);
1340        get_page(page);
1341        spin_unlock(vmf->ptl);
1342alloc:
1343        if (__transparent_hugepage_enabled(vma) &&
1344            !transparent_hugepage_debug_cow()) {
1345                huge_gfp = alloc_hugepage_direct_gfpmask(vma);
1346                new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER);
1347        } else
1348                new_page = NULL;
1349
1350        if (likely(new_page)) {
1351                prep_transhuge_page(new_page);
1352        } else {
1353                if (!page) {
1354                        split_huge_pmd(vma, vmf->pmd, vmf->address);
1355                        ret |= VM_FAULT_FALLBACK;
1356                } else {
1357                        ret = do_huge_pmd_wp_page_fallback(vmf, orig_pmd, page);
1358                        if (ret & VM_FAULT_OOM) {
1359                                split_huge_pmd(vma, vmf->pmd, vmf->address);
1360                                ret |= VM_FAULT_FALLBACK;
1361                        }
1362                        put_page(page);
1363                }
1364                count_vm_event(THP_FAULT_FALLBACK);
1365                goto out;
1366        }
1367
1368        if (unlikely(mem_cgroup_try_charge_delay(new_page, vma->vm_mm,
1369                                        huge_gfp, &memcg, true))) {
1370                put_page(new_page);
1371                split_huge_pmd(vma, vmf->pmd, vmf->address);
1372                if (page)
1373                        put_page(page);
1374                ret |= VM_FAULT_FALLBACK;
1375                count_vm_event(THP_FAULT_FALLBACK);
1376                goto out;
1377        }
1378
1379        count_vm_event(THP_FAULT_ALLOC);
1380        count_memcg_events(memcg, THP_FAULT_ALLOC, 1);
1381
1382        if (!page)
1383                clear_huge_page(new_page, vmf->address, HPAGE_PMD_NR);
1384        else
1385                copy_user_huge_page(new_page, page, vmf->address,
1386                                    vma, HPAGE_PMD_NR);
1387        __SetPageUptodate(new_page);
1388
1389        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
1390                                haddr, haddr + HPAGE_PMD_SIZE);
1391        mmu_notifier_invalidate_range_start(&range);
1392
1393        spin_lock(vmf->ptl);
1394        if (page)
1395                put_page(page);
1396        if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
1397                spin_unlock(vmf->ptl);
1398                mem_cgroup_cancel_charge(new_page, memcg, true);
1399                put_page(new_page);
1400                goto out_mn;
1401        } else {
1402                pmd_t entry;
1403                entry = mk_huge_pmd(new_page, vma->vm_page_prot);
1404                entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
1405                pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd);
1406                page_add_new_anon_rmap(new_page, vma, haddr, true);
1407                mem_cgroup_commit_charge(new_page, memcg, false, true);
1408                lru_cache_add_active_or_unevictable(new_page, vma);
1409                set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
1410                update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
1411                if (!page) {
1412                        add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
1413                } else {
1414                        VM_BUG_ON_PAGE(!PageHead(page), page);
1415                        page_remove_rmap(page, true);
1416                        put_page(page);
1417                }
1418                ret |= VM_FAULT_WRITE;
1419        }
1420        spin_unlock(vmf->ptl);
1421out_mn:
1422        /*
1423         * No need to double call mmu_notifier->invalidate_range() callback as
1424         * the above pmdp_huge_clear_flush_notify() did already call it.
1425         */
1426        mmu_notifier_invalidate_range_only_end(&range);
1427out:
1428        return ret;
1429out_unlock:
1430        spin_unlock(vmf->ptl);
1431        return ret;
1432}
1433
1434/*
1435 * FOLL_FORCE can write to even unwritable pmd's, but only
1436 * after we've gone through a COW cycle and they are dirty.
1437 */
1438static inline bool can_follow_write_pmd(pmd_t pmd, unsigned int flags)
1439{
1440        return pmd_write(pmd) ||
1441               ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pmd_dirty(pmd));
1442}
1443
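    /*
     * Look up the page mapped by a trans huge pmd for a GUP-style walk.  The
     * caller must hold the pmd lock.  Returns the relevant subpage (with an
     * extra reference when FOLL_GET is set), an ERR_PTR for the FOLL_DUMP
     * huge zero page case, or NULL when the request cannot be satisfied.
     */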
1444struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
1445                                   unsigned long addr,
1446                                   pmd_t *pmd,
1447                                   unsigned int flags)
1448{
1449        struct mm_struct *mm = vma->vm_mm;
1450        struct page *page = NULL;
1451
1452        assert_spin_locked(pmd_lockptr(mm, pmd));
1453
1454        if (flags & FOLL_WRITE && !can_follow_write_pmd(*pmd, flags))
1455                goto out;
1456
1457        /* Avoid dumping huge zero page */
1458        if ((flags & FOLL_DUMP) && is_huge_zero_pmd(*pmd))
1459                return ERR_PTR(-EFAULT);
1460
1461        /* Full NUMA hinting faults to serialise migration in fault paths */
1462        if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
1463                goto out;
1464
1465        page = pmd_page(*pmd);
1466        VM_BUG_ON_PAGE(!PageHead(page) && !is_zone_device_page(page), page);
1467        if (flags & FOLL_TOUCH)
1468                touch_pmd(vma, addr, pmd, flags);
1469        if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
1470                /*
1471                 * We don't mlock() pte-mapped THPs. This way we can avoid
1472                 * leaking mlocked pages into non-VM_LOCKED VMAs.
1473                 *
1474                 * For anon THP:
1475                 *
1476                 * In most cases the pmd is the only mapping of the page as we
1477                 * break COW for the mlock() -- see gup_flags |= FOLL_WRITE for
1478                 * writable private mappings in populate_vma_page_range().
1479                 *
1480                 * The only scenario when we have the page shared here is if we
1481                 * mlocking read-only mapping shared over fork(). We skip
1482                 * are mlocking a read-only mapping shared over fork(). We skip
1483                 *
1484                 * For file THP:
1485                 *
1486                 * We can expect PageDoubleMap() to be stable under page lock:
1487                 * for file pages we set it in page_add_file_rmap(), which
1488                 * requires page to be locked.
1489                 */
1490
1491                if (PageAnon(page) && compound_mapcount(page) != 1)
1492                        goto skip_mlock;
1493                if (PageDoubleMap(page) || !page->mapping)
1494                        goto skip_mlock;
1495                if (!trylock_page(page))
1496                        goto skip_mlock;
1497                lru_add_drain();
1498                if (page->mapping && !PageDoubleMap(page))
1499                        mlock_vma_page(page);
1500                unlock_page(page);
1501        }
1502skip_mlock:
1503        page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
1504        VM_BUG_ON_PAGE(!PageCompound(page) && !is_zone_device_page(page), page);
1505        if (flags & FOLL_GET)
1506                get_page(page);
1507
1508out:
1509        return page;
1510}
1511
1512/* NUMA hinting page fault entry point for trans huge pmds */
1513vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
1514{
1515        struct vm_area_struct *vma = vmf->vma;
1516        struct anon_vma *anon_vma = NULL;
1517        struct page *page;
1518        unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
1519        int page_nid = NUMA_NO_NODE, this_nid = numa_node_id();
1520        int target_nid, last_cpupid = -1;
1521        bool page_locked;
1522        bool migrated = false;
1523        bool was_writable;
1524        int flags = 0;
1525
1526        vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
1527        if (unlikely(!pmd_same(pmd, *vmf->pmd)))
1528                goto out_unlock;
1529
1530        /*
1531         * If there are potential migrations, wait for completion and retry
1532         * without disrupting NUMA hinting information. Do not relock and
1533         * check_same as the page may no longer be mapped.
1534         */
1535        if (unlikely(pmd_trans_migrating(*vmf->pmd))) {
1536                page = pmd_page(*vmf->pmd);
1537                if (!get_page_unless_zero(page))
1538                        goto out_unlock;
1539                spin_unlock(vmf->ptl);
1540                put_and_wait_on_page_locked(page);
1541                goto out;
1542        }
1543
1544        page = pmd_page(pmd);
1545        BUG_ON(is_huge_zero_page(page));
1546        page_nid = page_to_nid(page);
1547        last_cpupid = page_cpupid_last(page);
1548        count_vm_numa_event(NUMA_HINT_FAULTS);
1549        if (page_nid == this_nid) {
1550                count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
1551                flags |= TNF_FAULT_LOCAL;
1552        }
1553
1554        /* See similar comment in do_numa_page for explanation */
1555        if (!pmd_savedwrite(pmd))
1556                flags |= TNF_NO_GROUP;
1557
1558        /*
1559         * Acquire the page lock to serialise THP migrations but avoid dropping
1560         * page_table_lock if at all possible
1561         */
1562        page_locked = trylock_page(page);
1563        target_nid = mpol_misplaced(page, vma, haddr);
1564        if (target_nid == NUMA_NO_NODE) {
1565                /* If the page was locked, there are no parallel migrations */
1566                if (page_locked)
1567                        goto clear_pmdnuma;
1568        }
1569
1570        /* Migration could have started since the pmd_trans_migrating check */
1571        if (!page_locked) {
1572                page_nid = NUMA_NO_NODE;
1573                if (!get_page_unless_zero(page))
1574                        goto out_unlock;
1575                spin_unlock(vmf->ptl);
1576                put_and_wait_on_page_locked(page);
1577                goto out;
1578        }
1579
1580        /*
1581         * Page is misplaced. Page lock serialises migrations. Acquire anon_vma
1582         * to serialise splits
1583         */
1584        get_page(page);
1585        spin_unlock(vmf->ptl);
1586        anon_vma = page_lock_anon_vma_read(page);
1587
1588        /* Confirm the PMD did not change while page_table_lock was released */
1589        spin_lock(vmf->ptl);
1590        if (unlikely(!pmd_same(pmd, *vmf->pmd))) {
1591                unlock_page(page);
1592                put_page(page);
1593                page_nid = NUMA_NO_NODE;
1594                goto out_unlock;
1595        }
1596
1597        /* Bail if we fail to protect against THP splits for any reason */
1598        if (unlikely(!anon_vma)) {
1599                put_page(page);
1600                page_nid = NUMA_NO_NODE;
1601                goto clear_pmdnuma;
1602        }
1603
1604        /*
1605         * Since we took the NUMA fault, we must have observed the !accessible
1606         * bit. Make sure all other CPUs agree with that, to avoid them
1607         * modifying the page we're about to migrate.
1608         *
1609         * Must be done under PTL such that we'll observe the relevant
1610         * inc_tlb_flush_pending().
1611         *
1612         * We are not sure a pending tlb flush here is for a huge page
1613         * mapping or not. Hence use the tlb range variant
1614         */
1615        if (mm_tlb_flush_pending(vma->vm_mm)) {
1616                flush_tlb_range(vma, haddr, haddr + HPAGE_PMD_SIZE);
1617                /*
1618                 * change_huge_pmd() released the pmd lock before
1619                 * invalidating the secondary MMUs sharing the primary
1620                 * MMU pagetables (with ->invalidate_range()). The
1621                 * mmu_notifier_invalidate_range_end() (which
1622                 * internally calls ->invalidate_range()) in
1623                 * change_pmd_range() will run after us, so we can't
1624                 * rely on it here and we need an explicit invalidate.
1625                 */
1626                mmu_notifier_invalidate_range(vma->vm_mm, haddr,
1627                                              haddr + HPAGE_PMD_SIZE);
1628        }
1629
1630        /*
1631         * Migrate the THP to the requested node, returns with page unlocked
1632         * and access rights restored.
1633         */
1634        spin_unlock(vmf->ptl);
1635
1636        migrated = migrate_misplaced_transhuge_page(vma->vm_mm, vma,
1637                                vmf->pmd, pmd, vmf->address, page, target_nid);
1638        if (migrated) {
1639                flags |= TNF_MIGRATED;
1640                page_nid = target_nid;
1641        } else
1642                flags |= TNF_MIGRATE_FAIL;
1643
1644        goto out;
1645clear_pmdnuma:
1646        BUG_ON(!PageLocked(page));
1647        was_writable = pmd_savedwrite(pmd);
1648        pmd = pmd_modify(pmd, vma->vm_page_prot);
1649        pmd = pmd_mkyoung(pmd);
1650        if (was_writable)
1651                pmd = pmd_mkwrite(pmd);
1652        set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd);
1653        update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
1654        unlock_page(page);
1655out_unlock:
1656        spin_unlock(vmf->ptl);
1657
1658out:
1659        if (anon_vma)
1660                page_unlock_anon_vma_read(anon_vma);
1661
1662        if (page_nid != NUMA_NO_NODE)
1663                task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR,
1664                                flags);
1665
1666        return 0;
1667}
1668
1669/*
1670 * Return true if we do MADV_FREE successfully on entire pmd page.
1671 * Otherwise, return false.
1672 */
1673bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
1674                pmd_t *pmd, unsigned long addr, unsigned long next)
1675{
1676        spinlock_t *ptl;
1677        pmd_t orig_pmd;
1678        struct page *page;
1679        struct mm_struct *mm = tlb->mm;
1680        bool ret = false;
1681
1682        tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
1683
1684        ptl = pmd_trans_huge_lock(pmd, vma);
1685        if (!ptl)
1686                goto out_unlocked;
1687
1688        orig_pmd = *pmd;
1689        if (is_huge_zero_pmd(orig_pmd))
1690                goto out;
1691
1692        if (unlikely(!pmd_present(orig_pmd))) {
1693                VM_BUG_ON(thp_migration_supported() &&
1694                                  !is_pmd_migration_entry(orig_pmd));
1695                goto out;
1696        }
1697
1698        page = pmd_page(orig_pmd);
1699        /*
1700         * If other processes are mapping this page, we can't discard
1701         * the page unless they all do MADV_FREE so let's skip the page.
1702         */
1703        if (page_mapcount(page) != 1)
1704                goto out;
1705
1706        if (!trylock_page(page))
1707                goto out;
1708
1709        /*
1710         * If the user wants to discard part of the THP's pages, split it so
1711         * MADV_FREE will deactivate only them.
1712         */
1713        if (next - addr != HPAGE_PMD_SIZE) {
1714                get_page(page);
1715                spin_unlock(ptl);
1716                split_huge_page(page);
1717                unlock_page(page);
1718                put_page(page);
1719                goto out_unlocked;
1720        }
1721
1722        if (PageDirty(page))
1723                ClearPageDirty(page);
1724        unlock_page(page);
1725
1726        if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) {
1727                pmdp_invalidate(vma, addr, pmd);
1728                orig_pmd = pmd_mkold(orig_pmd);
1729                orig_pmd = pmd_mkclean(orig_pmd);
1730
1731                set_pmd_at(mm, addr, pmd, orig_pmd);
1732                tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
1733        }
1734
1735        mark_page_lazyfree(page);
1736        ret = true;
1737out:
1738        spin_unlock(ptl);
1739out_unlocked:
1740        return ret;
1741}
1742
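    /*
     * Free the page table that was deposited for a huge pmd and drop the
     * mm's page table accounting for it.
     */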
1743static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd)
1744{
1745        pgtable_t pgtable;
1746
1747        pgtable = pgtable_trans_huge_withdraw(mm, pmd);
1748        pte_free(mm, pgtable);
1749        mm_dec_nr_ptes(mm);
1750}
1751
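    /*
     * Tear down a huge pmd, typically reached from zap_pmd_range().  Returns
     * 1 if a huge (or swap/devmap) pmd was cleared, 0 if the pmd was not
     * huge and the caller has to fall back to the pte level.
     */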
1752int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
1753                 pmd_t *pmd, unsigned long addr)
1754{
1755        pmd_t orig_pmd;
1756        spinlock_t *ptl;
1757
1758        tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
1759
1760        ptl = __pmd_trans_huge_lock(pmd, vma);
1761        if (!ptl)
1762                return 0;
1763        /*
1764         * For architectures like ppc64 we look at deposited pgtable
1765         * when calling pmdp_huge_get_and_clear. So do the
1766         * pgtable_trans_huge_withdraw after finishing pmdp related
1767         * operations.
1768         */
1769        orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd,
1770                        tlb->fullmm);
1771        tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
1772        if (vma_is_dax(vma)) {
1773                if (arch_needs_pgtable_deposit())
1774                        zap_deposited_table(tlb->mm, pmd);
1775                spin_unlock(ptl);
1776                if (is_huge_zero_pmd(orig_pmd))
1777                        tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE);
1778        } else if (is_huge_zero_pmd(orig_pmd)) {
1779                zap_deposited_table(tlb->mm, pmd);
1780                spin_unlock(ptl);
1781                tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE);
1782        } else {
1783                struct page *page = NULL;
1784                int flush_needed = 1;
1785
1786                if (pmd_present(orig_pmd)) {
1787                        page = pmd_page(orig_pmd);
1788                        page_remove_rmap(page, true);
1789                        VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
1790                        VM_BUG_ON_PAGE(!PageHead(page), page);
1791                } else if (thp_migration_supported()) {
1792                        swp_entry_t entry;
1793
1794                        VM_BUG_ON(!is_pmd_migration_entry(orig_pmd));
1795                        entry = pmd_to_swp_entry(orig_pmd);
1796                        page = pfn_to_page(swp_offset(entry));
1797                        flush_needed = 0;
1798                } else
1799                        WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!");
1800
1801                if (PageAnon(page)) {
1802                        zap_deposited_table(tlb->mm, pmd);
1803                        add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
1804                } else {
1805                        if (arch_needs_pgtable_deposit())
1806                                zap_deposited_table(tlb->mm, pmd);
1807                        add_mm_counter(tlb->mm, mm_counter_file(page), -HPAGE_PMD_NR);
1808                }
1809
1810                spin_unlock(ptl);
1811                if (flush_needed)
1812                        tlb_remove_page_size(tlb, page, HPAGE_PMD_SIZE);
1813        }
1814        return 1;
1815}
1816
1817#ifndef pmd_move_must_withdraw
1818static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl,
1819                                         spinlock_t *old_pmd_ptl,
1820                                         struct vm_area_struct *vma)
1821{
1822        /*
1823         * With split pmd locks we also need to move the preallocated
1824         * PTE page table if new_pmd is on a different PMD page table.
1825         *
1826         * We also don't deposit and withdraw tables for file pages.
1827         */
1828        return (new_pmd_ptl != old_pmd_ptl) && vma_is_anonymous(vma);
1829}
1830#endif
1831
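    /*
     * Preserve the soft-dirty bit when a pmd is moved, for both present pmds
     * and pmd migration entries.  A no-op without CONFIG_MEM_SOFT_DIRTY.
     */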
1832static pmd_t move_soft_dirty_pmd(pmd_t pmd)
1833{
1834#ifdef CONFIG_MEM_SOFT_DIRTY
1835        if (unlikely(is_pmd_migration_entry(pmd)))
1836                pmd = pmd_swp_mksoft_dirty(pmd);
1837        else if (pmd_present(pmd))
1838                pmd = pmd_mksoft_dirty(pmd);
1839#endif
1840        return pmd;
1841}
1842
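    /*
     * Move a huge pmd from old_addr to new_addr for mremap().  Returns true
     * if the pmd was moved as a whole; on false the caller falls back to
     * moving individual ptes.  The deposited page table is carried along
     * when pmd_move_must_withdraw() requires it.
     */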
1843bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
1844                  unsigned long new_addr, unsigned long old_end,
1845                  pmd_t *old_pmd, pmd_t *new_pmd)
1846{
1847        spinlock_t *old_ptl, *new_ptl;
1848        pmd_t pmd;
1849        struct mm_struct *mm = vma->vm_mm;
1850        bool force_flush = false;
1851
1852        if ((old_addr & ~HPAGE_PMD_MASK) ||
1853            (new_addr & ~HPAGE_PMD_MASK) ||
1854            old_end - old_addr < HPAGE_PMD_SIZE)
1855                return false;
1856
1857        /*
1858         * The destination pmd shouldn't be established, free_pgtables()
1859         * should have released it.
1860         */
1861        if (WARN_ON(!pmd_none(*new_pmd))) {
1862                VM_BUG_ON(pmd_trans_huge(*new_pmd));
1863                return false;
1864        }
1865
1866        /*
1867         * We don't have to worry about the ordering of src and dst
1868         * ptlocks because exclusive mmap_sem prevents deadlock.
1869         */
1870        old_ptl = __pmd_trans_huge_lock(old_pmd, vma);
1871        if (old_ptl) {
1872                new_ptl = pmd_lockptr(mm, new_pmd);
1873                if (new_ptl != old_ptl)
1874                        spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
1875                pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd);
1876                if (pmd_present(pmd))
1877                        force_flush = true;
1878                VM_BUG_ON(!pmd_none(*new_pmd));
1879
1880                if (pmd_move_must_withdraw(new_ptl, old_ptl, vma)) {
1881                        pgtable_t pgtable;
1882                        pgtable = pgtable_trans_huge_withdraw(mm, old_pmd);
1883                        pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
1884                }
1885                pmd = move_soft_dirty_pmd(pmd);
1886                set_pmd_at(mm, new_addr, new_pmd, pmd);
1887                if (force_flush)
1888                        flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
1889                if (new_ptl != old_ptl)
1890                        spin_unlock(new_ptl);
1891                spin_unlock(old_ptl);
1892                return true;
1893        }
1894        return false;
1895}
1896
1897/*
1898 * Returns
1899 *  - 0 if PMD could not be locked
1900 *  - 1 if PMD was locked but protections unchanged and TLB flush unnecessary
1901 *  - HPAGE_PMD_NR if protections changed and TLB flush necessary
1902 */
1903int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
1904                unsigned long addr, pgprot_t newprot, int prot_numa)
1905{
1906        struct mm_struct *mm = vma->vm_mm;
1907        spinlock_t *ptl;
1908        pmd_t entry;
1909        bool preserve_write;
1910        int ret;
1911
1912        ptl = __pmd_trans_huge_lock(pmd, vma);
1913        if (!ptl)
1914                return 0;
1915
1916        preserve_write = prot_numa && pmd_write(*pmd);
1917        ret = 1;
1918
1919#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
1920        if (is_swap_pmd(*pmd)) {
1921                swp_entry_t entry = pmd_to_swp_entry(*pmd);
1922
1923                VM_BUG_ON(!is_pmd_migration_entry(*pmd));
1924                if (is_write_migration_entry(entry)) {
1925                        pmd_t newpmd;
1926                        /*
1927                         * A protection check is difficult so
1928                         * just be safe and disable write
1929                         */
1930                        make_migration_entry_read(&entry);
1931                        newpmd = swp_entry_to_pmd(entry);
1932                        if (pmd_swp_soft_dirty(*pmd))
1933                                newpmd = pmd_swp_mksoft_dirty(newpmd);
1934                        set_pmd_at(mm, addr, pmd, newpmd);
1935                }
1936                goto unlock;
1937        }
1938#endif
1939
1940        /*
1941         * Avoid trapping faults against the zero page. The read-only
1942         * data is likely to be read-cached on the local CPU and
1943         * local/remote hits to the zero page are not interesting.
1944         */
1945        if (prot_numa && is_huge_zero_pmd(*pmd))
1946                goto unlock;
1947
1948        if (prot_numa && pmd_protnone(*pmd))
1949                goto unlock;
1950
1951        /*
1952         * In case prot_numa, we are under down_read(mmap_sem). It's critical
1953         * to not clear pmd intermittently to avoid race with MADV_DONTNEED
1954         * which is also under down_read(mmap_sem):
1955         *
1956         *      CPU0:                           CPU1:
1957         *                              change_huge_pmd(prot_numa=1)
1958         *                               pmdp_huge_get_and_clear_notify()
1959         * madvise_dontneed()
1960         *  zap_pmd_range()
1961         *   pmd_trans_huge(*pmd) == 0 (without ptl)
1962         *   // skip the pmd
1963         *                               set_pmd_at();
1964         *                               // pmd is re-established
1965         *
1966         * The race makes MADV_DONTNEED miss the huge pmd and not clear it
1967         * which may break userspace.
1968         *
1969         * pmdp_invalidate() is required to make sure we don't miss
1970         * dirty/young flags set by hardware.
1971         */
1972        entry = pmdp_invalidate(vma, addr, pmd);
1973
1974        entry = pmd_modify(entry, newprot);
1975        if (preserve_write)
1976                entry = pmd_mk_savedwrite(entry);
1977        ret = HPAGE_PMD_NR;
1978        set_pmd_at(mm, addr, pmd, entry);
1979        BUG_ON(vma_is_anonymous(vma) && !preserve_write && pmd_write(entry));
1980unlock:
1981        spin_unlock(ptl);
1982        return ret;
1983}
1984
1985/*
1986 * Returns page table lock pointer if a given pmd maps a thp, NULL otherwise.
1987 *
1988 * Note that if it returns a page table lock pointer, this routine returns
1989 * without unlocking the page table lock, so callers must unlock it.
1990 */
1991spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
1992{
1993        spinlock_t *ptl;
1994        ptl = pmd_lock(vma->vm_mm, pmd);
1995        if (likely(is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) ||
1996                        pmd_devmap(*pmd)))
1997                return ptl;
1998        spin_unlock(ptl);
1999        return NULL;
2000}
2001
2002/*
2003 * Returns page table lock pointer if a given pud maps a thp, NULL otherwise.
2004 *
2005 * Note that if it returns a page table lock pointer, this routine returns
2006 * without unlocking the page table lock, so callers must unlock it.
2007 */
2008spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma)
2009{
2010        spinlock_t *ptl;
2011
2012        ptl = pud_lock(vma->vm_mm, pud);
2013        if (likely(pud_trans_huge(*pud) || pud_devmap(*pud)))
2014                return ptl;
2015        spin_unlock(ptl);
2016        return NULL;
2017}
2018
2019#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
2020int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
2021                 pud_t *pud, unsigned long addr)
2022{
2023        spinlock_t *ptl;
2024
2025        ptl = __pud_trans_huge_lock(pud, vma);
2026        if (!ptl)
2027                return 0;
2028        /*
2029         * For architectures like ppc64 we look at deposited pgtable
2030         * when calling pudp_huge_get_and_clear. So do the
2031         * pgtable_trans_huge_withdraw after finishing pudp related
2032         * operations.
2033         */
2034        pudp_huge_get_and_clear_full(tlb->mm, addr, pud, tlb->fullmm);
2035        tlb_remove_pud_tlb_entry(tlb, pud, addr);
2036        if (vma_is_dax(vma)) {
2037                spin_unlock(ptl);
2038                /* No zero page support yet */
2039        } else {
2040                /* No support for anonymous PUD pages yet */
2041                BUG();
2042        }
2043        return 1;
2044}
2045
2046static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud,
2047                unsigned long haddr)
2048{
2049        VM_BUG_ON(haddr & ~HPAGE_PUD_MASK);
2050        VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
2051        VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PUD_SIZE, vma);
2052        VM_BUG_ON(!pud_trans_huge(*pud) && !pud_devmap(*pud));
2053
2054        count_vm_event(THP_SPLIT_PUD);
2055
2056        pudp_huge_clear_flush_notify(vma, haddr, pud);
2057}
2058
2059void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
2060                unsigned long address)
2061{
2062        spinlock_t *ptl;
2063        struct mmu_notifier_range range;
2064
2065        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
2066                                address & HPAGE_PUD_MASK,
2067                                (address & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE);
2068        mmu_notifier_invalidate_range_start(&range);
2069        ptl = pud_lock(vma->vm_mm, pud);
2070        if (unlikely(!pud_trans_huge(*pud) && !pud_devmap(*pud)))
2071                goto out;
2072        __split_huge_pud_locked(vma, pud, range.start);
2073
2074out:
2075        spin_unlock(ptl);
2076        /*
2077         * No need to double call mmu_notifier->invalidate_range() callback as
2078         * the above pudp_huge_clear_flush_notify() did already call it.
2079         */
2080        mmu_notifier_invalidate_range_only_end(&range);
2081}
2082#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
2083
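    /*
     * Split a pmd mapping of the huge zero page: repopulate the deposited
     * page table so that every pte points at the small zero page.
     */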
2084static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
2085                unsigned long haddr, pmd_t *pmd)
2086{
2087        struct mm_struct *mm = vma->vm_mm;
2088        pgtable_t pgtable;
2089        pmd_t _pmd;
2090        int i;
2091
2092        /*
2093         * Leave pmd empty until pte is filled. Note that it is fine to delay
2094         * notification until mmu_notifier_invalidate_range_end() as we are
2095         * replacing a zero pmd write protected page with a zero pte write
2096         * protected page.
2097         *
2098         * See Documentation/vm/mmu_notifier.rst
2099         */
2100        pmdp_huge_clear_flush(vma, haddr, pmd);
2101
2102        pgtable = pgtable_trans_huge_withdraw(mm, pmd);
2103        pmd_populate(mm, &_pmd, pgtable);
2104
2105        for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
2106                pte_t *pte, entry;
2107                entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot);
2108                entry = pte_mkspecial(entry);
2109                pte = pte_offset_map(&_pmd, haddr);
2110                VM_BUG_ON(!pte_none(*pte));
2111                set_pte_at(mm, haddr, pte, entry);
2112                pte_unmap(pte);
2113        }
2114        smp_wmb(); /* make pte visible before pmd */
2115        pmd_populate(mm, pmd, pgtable);
2116}
2117
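    /*
     * Do the actual split with the pmd lock held: file-backed and DAX
     * mappings are simply unmapped, the huge zero page is rewritten as zero
     * ptes, and anonymous THPs are remapped at the pte level (or turned into
     * migration entries when @freeze is set).
     */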
2118static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
2119                unsigned long haddr, bool freeze)
2120{
2121        struct mm_struct *mm = vma->vm_mm;
2122        struct page *page;
2123        pgtable_t pgtable;
2124        pmd_t old_pmd, _pmd;
2125        bool young, write, soft_dirty, pmd_migration = false;
2126        unsigned long addr;
2127        int i;
2128
2129        VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
2130        VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
2131        VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma);
2132        VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd)
2133                                && !pmd_devmap(*pmd));
2134
2135        count_vm_event(THP_SPLIT_PMD);
2136
2137        if (!vma_is_anonymous(vma)) {
2138                _pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
2139                /*
2140                 * We are going to unmap this huge page. So
2141                 * just go ahead and zap it
2142                 */
2143                if (arch_needs_pgtable_deposit())
2144                        zap_deposited_table(mm, pmd);
2145                if (vma_is_dax(vma))
2146                        return;
2147                page = pmd_page(_pmd);
2148                if (!PageDirty(page) && pmd_dirty(_pmd))
2149                        set_page_dirty(page);
2150                if (!PageReferenced(page) && pmd_young(_pmd))
2151                        SetPageReferenced(page);
2152                page_remove_rmap(page, true);
2153                put_page(page);
2154                add_mm_counter(mm, mm_counter_file(page), -HPAGE_PMD_NR);
2155                return;
2156        } else if (is_huge_zero_pmd(*pmd)) {
2157                /*
2158                 * FIXME: Do we want to invalidate the secondary mmu by calling
2159                 * mmu_notifier_invalidate_range()? See the comments below inside
2160                 * __split_huge_pmd().
2161                 *
2162                 * We are going from a zero huge page write protected to zero
2163                 * small page also write protected so it does not seem useful
2164                 * to invalidate secondary mmu at this time.
2165                 */
2166                return __split_huge_zero_page_pmd(vma, haddr, pmd);
2167        }
2168
2169        /*
2170         * Up to this point the pmd is present and huge and userland has the
2171         * whole access to the hugepage during the split (which happens in
2172         * place). If we overwrite the pmd with the not-huge version pointing
2173         * to the pte here (which of course we could if all CPUs were bug
2174         * free), userland could trigger a small page size TLB miss on the
2175         * small sized TLB while the hugepage TLB entry is still established in
2176         * the huge TLB. Some CPUs don't like that.
2177         * See http://support.amd.com/us/Processor_TechDocs/41322.pdf, Erratum
2178         * 383 on page 93. Intel should be safe but it also warns that it's
2179         * only safe if the permission and cache attributes of the two entries
2180         * loaded in the two TLBs are identical (which should be the case here).
2181         * But it is generally safer to never allow small and huge TLB entries
2182         * for the same virtual address to be loaded simultaneously. So instead
2183         * of doing "pmd_populate(); flush_pmd_tlb_range();" we first mark the
2184         * current pmd notpresent (atomically because here the pmd_trans_huge
2185         * must remain set at all times on the pmd until the split is complete
2186         * for this pmd), then we flush the SMP TLB and finally we write the
2187         * non-huge version of the pmd entry with pmd_populate.
2188         */
2189        old_pmd = pmdp_invalidate(vma, haddr, pmd);
2190
2191        pmd_migration = is_pmd_migration_entry(old_pmd);
2192        if (unlikely(pmd_migration)) {
2193                swp_entry_t entry;
2194
2195                entry = pmd_to_swp_entry(old_pmd);
2196                page = pfn_to_page(swp_offset(entry));
2197                write = is_write_migration_entry(entry);
2198                young = false;
2199                soft_dirty = pmd_swp_soft_dirty(old_pmd);
2200        } else {
2201                page = pmd_page(old_pmd);
2202                if (pmd_dirty(old_pmd))
2203                        SetPageDirty(page);
2204                write = pmd_write(old_pmd);
2205                young = pmd_young(old_pmd);
2206                soft_dirty = pmd_soft_dirty(old_pmd);
2207        }
2208        VM_BUG_ON_PAGE(!page_count(page), page);
2209        page_ref_add(page, HPAGE_PMD_NR - 1);
2210
2211        /*
2212         * Withdraw the table only after we mark the pmd entry invalid.
2213         * This is critical for some architectures (Power).
2214         */
2215        pgtable = pgtable_trans_huge_withdraw(mm, pmd);
2216        pmd_populate(mm, &_pmd, pgtable);
2217
2218        for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
2219                pte_t entry, *pte;
2220                /*
2221                 * Note that NUMA hinting access restrictions are not
2222                 * transferred to avoid any possibility of altering
2223                 * permissions across VMAs.
2224                 */
2225                if (freeze || pmd_migration) {
2226                        swp_entry_t swp_entry;
2227                        swp_entry = make_migration_entry(page + i, write);
2228                        entry = swp_entry_to_pte(swp_entry);
2229                        if (soft_dirty)
2230                                entry = pte_swp_mksoft_dirty(entry);
2231                } else {
2232                        entry = mk_pte(page + i, READ_ONCE(vma->vm_page_prot));
2233                        entry = maybe_mkwrite(entry, vma);
2234                        if (!write)
2235                                entry = pte_wrprotect(entry);
2236                        if (!young)
2237                                entry = pte_mkold(entry);
2238                        if (soft_dirty)
2239                                entry = pte_mksoft_dirty(entry);
2240                }
2241                pte = pte_offset_map(&_pmd, addr);
2242                BUG_ON(!pte_none(*pte));
2243                set_pte_at(mm, addr, pte, entry);
2244                atomic_inc(&page[i]._mapcount);
2245                pte_unmap(pte);
2246        }
2247
2248        /*
2249         * Set PG_double_map before dropping compound_mapcount to avoid
2250         * false-negative page_mapped().
2251         */
2252        if (compound_mapcount(page) > 1 && !TestSetPageDoubleMap(page)) {
2253                for (i = 0; i < HPAGE_PMD_NR; i++)
2254                        atomic_inc(&page[i]._mapcount);
2255        }
2256
2257        if (atomic_add_negative(-1, compound_mapcount_ptr(page))) {
2258                /* Last compound_mapcount is gone. */
2259                __dec_node_page_state(page, NR_ANON_THPS);
2260                if (TestClearPageDoubleMap(page)) {
2261                        /* No need in mapcount reference anymore */
2262                        for (i = 0; i < HPAGE_PMD_NR; i++)
2263                                atomic_dec(&page[i]._mapcount);
2264                }
2265        }
2266
2267        smp_wmb(); /* make pte visible before pmd */
2268        pmd_populate(mm, pmd, pgtable);
2269
2270        if (freeze) {
2271                for (i = 0; i < HPAGE_PMD_NR; i++) {
2272                        page_remove_rmap(page + i, false);
2273                        put_page(page + i);
2274                }
2275        }
2276}
2277
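    /*
     * Split the huge pmd covering @address, taking the pmd lock and an mmu
     * notifier range around __split_huge_pmd_locked().  When @page is
     * non-NULL it is used to confirm the pmd still maps the expected page
     * before splitting.
     */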
2278void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
2279                unsigned long address, bool freeze, struct page *page)
2280{
2281        spinlock_t *ptl;
2282        struct mmu_notifier_range range;
2283
2284        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
2285                                address & HPAGE_PMD_MASK,
2286                                (address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE);
2287        mmu_notifier_invalidate_range_start(&range);
2288        ptl = pmd_lock(vma->vm_mm, pmd);
2289
2290        /*
2291         * If the caller asks to set up migration entries, we need a page to
2292         * check the pmd against. Otherwise we can end up replacing the wrong page.
2293         */
2294        VM_BUG_ON(freeze && !page);
2295        if (page && page != pmd_page(*pmd))
2296                goto out;
2297
2298        if (pmd_trans_huge(*pmd)) {
2299                page = pmd_page(*pmd);
2300                if (PageMlocked(page))
2301                        clear_page_mlock(page);
2302        } else if (!(pmd_devmap(*pmd) || is_pmd_migration_entry(*pmd)))
2303                goto out;
2304        __split_huge_pmd_locked(vma, pmd, range.start, freeze);
2305out:
2306        spin_unlock(ptl);
2307        /*
2308         * No need to double call mmu_notifier->invalidate_range() callback.
2309         * There are 3 cases to consider inside __split_huge_pmd_locked():
2310         *  1) pmdp_huge_clear_flush_notify() calls invalidate_range(), obviously
2311         *  2) __split_huge_zero_page_pmd() leaves a read-only zero page mapped
2312         *    and any write fault will trigger a flush_notify before pointing
2313         *    to a new page (it is fine if the secondary mmu keeps pointing to
2314         *    the old zero page in the meantime)
2315         *  3) Split a huge pmd into ptes pointing to the same page. No need
2316         *     to invalidate the secondary tlb entries; they are all still valid.
2317         *     Any further changes to individual ptes will notify. So no need
2318         *     to call mmu_notifier->invalidate_range()
2319         */
2320        mmu_notifier_invalidate_range_only_end(&range);
2321}
2322
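    /*
     * Walk the page tables down to the pmd covering @address and split it if
     * one is mapped there; a missing level makes this a no-op.
     */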
2323void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
2324                bool freeze, struct page *page)
2325{
2326        pgd_t *pgd;
2327        p4d_t *p4d;
2328        pud_t *pud;
2329        pmd_t *pmd;
2330
2331        pgd = pgd_offset(vma->vm_mm, address);
2332        if (!pgd_present(*pgd))
2333                return;
2334
2335        p4d = p4d_offset(pgd, address);
2336        if (!p4d_present(*p4d))
2337                return;
2338
2339        pud = pud_offset(p4d, address);
2340        if (!pud_present(*pud))
2341                return;
2342
2343        pmd = pmd_offset(pud, address);
2344
2345        __split_huge_pmd(vma, pmd, address, freeze, page);
2346}
2347
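    /*
     * Called when a VMA is about to be adjusted: if the new start, end or
     * neighbouring vm_start is not huge-page aligned and might fall in the
     * middle of a THP mapping, split the corresponding huge pmd first.
     */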
2348void vma_adjust_trans_huge(struct vm_area_struct *vma,
2349                             unsigned long start,
2350                             unsigned long end,
2351                             long adjust_next)
2352{
2353        /*
2354         * If the new start address isn't hpage aligned and it could
2355         * previously contain a hugepage: check if we need to split
2356         * a huge pmd.
2357         */
2358        if (start & ~HPAGE_PMD_MASK &&
2359            (start & HPAGE_PMD_MASK) >= vma->vm_start &&
2360            (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
2361                split_huge_pmd_address(vma, start, false, NULL);
2362
2363        /*
2364         * If the new end address isn't hpage aligned and it could
2365         * previously contain a hugepage: check if we need to split
2366         * a huge pmd.
2367         */
2368        if (end & ~HPAGE_PMD_MASK &&
2369            (end & HPAGE_PMD_MASK) >= vma->vm_start &&
2370            (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
2371                split_huge_pmd_address(vma, end, false, NULL);
2372
2373        /*
2374         * If we're also updating vma->vm_next->vm_start, and the new
2375         * vm_next->vm_start isn't page aligned and it could previously
2376         * contain a hugepage: check if we need to split a huge pmd.
2377         */
2378        if (adjust_next > 0) {
2379                struct vm_area_struct *next = vma->vm_next;
2380                unsigned long nstart = next->vm_start;
2381                nstart += adjust_next << PAGE_SHIFT;
2382                if (nstart & ~HPAGE_PMD_MASK &&
2383                    (nstart & HPAGE_PMD_MASK) >= next->vm_start &&
2384                    (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end)
2385                        split_huge_pmd_address(next, nstart, false, NULL);
2386        }
2387}
2388
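    /*
     * Unmap every mapping of a THP before splitting it.  Anonymous pages are
     * frozen into migration entries (TTU_SPLIT_FREEZE) so the mappings can
     * be re-established by remap_page() afterwards.
     */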
2389static void unmap_page(struct page *page)
2390{
2391        enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS |
2392                TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD;
2393        bool unmap_success;
2394
2395        VM_BUG_ON_PAGE(!PageHead(page), page);
2396
2397        if (PageAnon(page))
2398                ttu_flags |= TTU_SPLIT_FREEZE;
2399
2400        unmap_success = try_to_unmap(page, ttu_flags);
2401        VM_BUG_ON_PAGE(!unmap_success, page);
2402}
2403
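    /*
     * Undo unmap_page(): restore the mappings recorded as migration entries,
     * either on the still-huge page or on each subpage after a split.
     */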
2404static void remap_page(struct page *page)
2405{
2406        int i;
2407        if (PageTransHuge(page)) {
2408                remove_migration_ptes(page, page, true);
2409        } else {
2410                for (i = 0; i < HPAGE_PMD_NR; i++)
2411                        remove_migration_ptes(page + i, page + i, true);
2412        }
2413}
2414
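    /*
     * Turn one tail page of a THP into an independent page: copy the
     * relevant page flags from the head, clear the compound linkage,
     * unfreeze the refcount and add the page to the LRU (or to @list).
     */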
2415static void __split_huge_page_tail(struct page *head, int tail,
2416                struct lruvec *lruvec, struct list_head *list)
2417{
2418        struct page *page_tail = head + tail;
2419
2420        VM_BUG_ON_PAGE(atomic_read(&page_tail->_mapcount) != -1, page_tail);
2421
2422        /*
2423         * Clone page flags before unfreezing refcount.
2424         *
2425         * A successful get_page_unless_zero() might be followed by a flags
2426         * change, for example lock_page(), which sets PG_waiters.
2427         */
2428        page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
2429        page_tail->flags |= (head->flags &
2430                        ((1L << PG_referenced) |
2431                         (1L << PG_swapbacked) |
2432                         (1L << PG_swapcache) |
2433                         (1L << PG_mlocked) |
2434                         (1L << PG_uptodate) |
2435                         (1L << PG_active) |
2436                         (1L << PG_workingset) |
2437                         (1L << PG_locked) |
2438                         (1L << PG_unevictable) |
2439                         (1L << PG_dirty)));
2440
2441        /* ->mapping in first tail page is compound_mapcount */
2442        VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
2443                        page_tail);
2444        page_tail->mapping = head->mapping;
2445        page_tail->index = head->index + tail;
2446
2447        /* Page flags must be visible before we make the page non-compound. */
2448        smp_wmb();
2449
2450        /*
2451         * Clear PageTail before unfreezing page refcount.
2452         *
2453         * A successful get_page_unless_zero() might be followed by put_page()
2454         * which needs correct compound_head().
2455         */
2456        clear_compound_head(page_tail);
2457
2458        /* Finally unfreeze refcount. Additional reference from page cache. */
2459        page_ref_unfreeze(page_tail, 1 + (!PageAnon(head) ||
2460                                          PageSwapCache(head)));
2461
2462        if (page_is_young(head))
2463                set_page_young(page_tail);
2464        if (page_is_idle(head))
2465                set_page_idle(page_tail);
2466
2467        page_cpupid_xchg_last(page_tail, page_cpupid_last(head));
2468
2469        /*
2470         * Always add to the tail because some iterators expect new
2471         * pages to show up after the currently processed elements - e.g.
2472         * migrate_pages
2473         */
2474        lru_add_page_tail(head, page_tail, lruvec, list);
2475}
2476
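    /*
     * Second half of the split, entered with the refcount frozen and the
     * lru_lock held: peel off the tail pages, drop any page-cache pages
     * beyond i_size, then remap and release the subpages.
     */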
2477static void __split_huge_page(struct page *page, struct list_head *list,
2478                pgoff_t end, unsigned long flags)
2479{
2480        struct page *head = compound_head(page);
2481        pg_data_t *pgdat = page_pgdat(head);
2482        struct lruvec *lruvec;
2483        int i;
2484
2485        lruvec = mem_cgroup_page_lruvec(head, pgdat);
2486
2487        /* complete memcg work before adding pages to the LRU */
2488        mem_cgroup_split_huge_fixup(head);
2489
2490        for (i = HPAGE_PMD_NR - 1; i >= 1; i--) {
2491                __split_huge_page_tail(head, i, lruvec, list);
2492                /* Some pages can be beyond i_size: drop them from page cache */
2493                if (head[i].index >= end) {
2494                        ClearPageDirty(head + i);
2495                        __delete_from_page_cache(head + i, NULL);
2496                        if (IS_ENABLED(CONFIG_SHMEM) && PageSwapBacked(head))
2497                                shmem_uncharge(head->mapping->host, 1);
2498                        put_page(head + i);
2499                }
2500        }
2501
2502        ClearPageCompound(head);
2503        /* See comment in __split_huge_page_tail() */
2504        if (PageAnon(head)) {
2505                /* Additional pin to swap cache */
2506                if (PageSwapCache(head))
2507                        page_ref_add(head, 2);
2508                else
2509                        page_ref_inc(head);
2510        } else {
2511                /* Additional pin to page cache */
2512                page_ref_add(head, 2);
2513                xa_unlock(&head->mapping->i_pages);
2514        }
2515
2516        spin_unlock_irqrestore(&pgdat->lru_lock, flags);
2517
2518        remap_page(head);
2519
2520        for (i = 0; i < HPAGE_PMD_NR; i++) {
2521                struct page *subpage = head + i;
2522                if (subpage == page)
2523                        continue;
2524                unlock_page(subpage);
2525
2526                /*
2527                 * Subpages may be freed if there wasn't any mapping left,
2528                 * e.g. if add_to_swap() is running on a lru page that
2529                 * had its mapping zapped. And freeing these pages
2530                 * requires taking the lru_lock so we do the put_page
2531                 * of the tail pages after the split is complete.
2532                 */
2533                put_page(subpage);
2534        }
2535}
2536
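    /*
     * Return how many times a compound page is mapped in total: the compound
     * (pmd-level) mapcount plus the pte-level mapcounts of the subpages,
     * minus any double counting when a page is mapped at both levels.
     */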
2537int total_mapcount(struct page *page)
2538{
2539        int i, compound, ret;
2540
2541        VM_BUG_ON_PAGE(PageTail(page), page);
2542
2543        if (likely(!PageCompound(page)))
2544                return atomic_read(&page->_mapcount) + 1;
2545
2546        compound = compound_mapcount(page);
2547        if (PageHuge(page))
2548                return compound;
2549        ret = compound;
2550        for (i = 0; i < HPAGE_PMD_NR; i++)
2551                ret += atomic_read(&page[i]._mapcount) + 1;
2552        /* File pages have compound_mapcount included in _mapcount */
2553        if (!PageAnon(page))
2554                return ret - compound * HPAGE_PMD_NR;
2555        if (PageDoubleMap(page))
2556                ret -= HPAGE_PMD_NR;
2557        return ret;
2558}
2559
2560/*
2561 * This calculates accurately how many mappings a transparent hugepage
2562 * has (unlike page_mapcount() which isn't fully accurate). This full
2563 * accuracy is primarily needed to know if copy-on-write faults can
2564 * reuse the page and change the mapping to read-write instead of
2565 * copying them. At the same time this returns the total_mapcount too.
2566 *
2567 * The function returns the highest mapcount any one of the subpages
2568 * has. If the return value is one, even if different processes are
2569 * mapping different subpages of the transparent hugepage, they can
2570 * all reuse it, because each process is reusing a different subpage.
2571 *
2572 * The total_mapcount is instead counting all virtual mappings of the
2573 * subpages. If the total_mapcount is equal to "one", it tells the
2574 * caller all mappings belong to the same "mm" and in turn the
2575 * anon_vma of the transparent hugepage can become the vma->anon_vma
2576 * local one as no other process may be mapping any of the subpages.
2577 *
2578 * It would be more accurate to replace page_mapcount() with
2579 * page_trans_huge_mapcount(), however we only use
2580 * page_trans_huge_mapcount() in the copy-on-write faults where we
2581 * need full accuracy to avoid breaking page pinning, because
2582 * page_trans_huge_mapcount() is slower than page_mapcount().
2583 */
2584int page_trans_huge_mapcount(struct page *page, int *total_mapcount)
2585{
2586        int i, ret, _total_mapcount, mapcount;
2587
2588        /* hugetlbfs shouldn't call it */
2589        VM_BUG_ON_PAGE(PageHuge(page), page);
2590
2591        if (likely(!PageTransCompound(page))) {
2592                mapcount = atomic_read(&page->_mapcount) + 1;
2593                if (total_mapcount)
2594                        *total_mapcount = mapcount;
2595                return mapcount;
2596        }
2597
2598        page = compound_head(page);
2599
2600        _total_mapcount = ret = 0;
2601        for (i = 0; i < HPAGE_PMD_NR; i++) {
2602                mapcount = atomic_read(&page[i]._mapcount) + 1;
2603                ret = max(ret, mapcount);
2604                _total_mapcount += mapcount;
2605        }
2606        if (PageDoubleMap(page)) {
2607                ret -= 1;
2608                _total_mapcount -= HPAGE_PMD_NR;
2609        }
2610        mapcount = compound_mapcount(page);
2611        ret += mapcount;
2612        _total_mapcount += mapcount;
2613        if (total_mapcount)
2614                *total_mapcount = _total_mapcount;
2615        return ret;
2616}
2617
2618/* Racy check whether the huge page can be split */
2619bool can_split_huge_page(struct page *page, int *pextra_pins)
2620{
2621        int extra_pins;
2622
2623        /* Additional pins from page cache */
2624        if (PageAnon(page))
2625                extra_pins = PageSwapCache(page) ? HPAGE_PMD_NR : 0;
2626        else
2627                extra_pins = HPAGE_PMD_NR;
2628        if (pextra_pins)
2629                *pextra_pins = extra_pins;
2630        return total_mapcount(page) == page_count(page) - extra_pins - 1;
2631}
2632
2633/*
2634 * This function splits a huge page into normal pages. @page can point to any
2635 * subpage of the huge page to split. The split doesn't change @page's position.
2636 *
2637 * Only the caller may hold a pin on @page, otherwise the split fails with -EBUSY.
2638 * The huge page must be locked.
2639 *
2640 * If @list is null, tail pages will be added to LRU list, otherwise, to @list.
2641 *
2642 * Both head page and tail pages will inherit mapping, flags, and so on from
2643 * the hugepage.
2644 *
2645 * The GUP pin and PG_locked are transferred to @page. The rest of the subpages
2646 * can be freed if they are not mapped.
2647 *
2648 * Returns 0 if the hugepage is split successfully.
2649 * Returns -EBUSY if the page is pinned or if anon_vma disappeared from under
2650 * us.
2651 */
2652int split_huge_page_to_list(struct page *page, struct list_head *list)
2653{
2654        struct page *head = compound_head(page);
2655        struct pglist_data *pgdata = NODE_DATA(page_to_nid(head));
2656        struct anon_vma *anon_vma = NULL;
2657        struct address_space *mapping = NULL;
2658        int count, mapcount, extra_pins, ret;
2659        bool mlocked;
2660        unsigned long flags;
2661        pgoff_t end;
2662
2663        VM_BUG_ON_PAGE(is_huge_zero_page(page), page);
2664        VM_BUG_ON_PAGE(!PageLocked(page), page);
2665        VM_BUG_ON_PAGE(!PageCompound(page), page);
2666
2667        if (PageWriteback(page))
2668                return -EBUSY;
2669
2670        if (PageAnon(head)) {
2671                /*
2672                 * The caller does not necessarily hold an mmap_sem that would
2673                 * prevent the anon_vma disappearing, so we first take a
2674                 * reference to it and then lock the anon_vma for write. This
2675                 * is similar to page_lock_anon_vma_read except the write lock
2676                 * is taken to serialise against parallel split or collapse
2677                 * operations.
2678                 */
2679                anon_vma = page_get_anon_vma(head);
2680                if (!anon_vma) {
2681                        ret = -EBUSY;
2682                        goto out;
2683                }
2684                end = -1;
2685                mapping = NULL;
2686                anon_vma_lock_write(anon_vma);
2687        } else {
2688                mapping = head->mapping;
2689
2690                /* Truncated ? */
2691                if (!mapping) {
2692                        ret = -EBUSY;
2693                        goto out;
2694                }
2695
2696                anon_vma = NULL;
2697                i_mmap_lock_read(mapping);
2698
2699                /*
2700                 * __split_huge_page() may need to trim off pages beyond EOF:
2701                 * but on 32-bit, i_size_read() takes an irq-unsafe seqlock,
2702                 * which cannot be nested inside the page tree lock. So note
2703                 * end now: i_size itself may be changed at any moment, but
2704                 * head page lock is good enough to serialize the trimming.
2705                 */
2706                end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
2707        }
2708
2709        /*
2710         * Racy check if we can split the page, before unmap_page() splits
2711         * the PMDs.
2712         */
2713        if (!can_split_huge_page(head, &extra_pins)) {
2714                ret = -EBUSY;
2715                goto out_unlock;
2716        }
2717
2718        mlocked = PageMlocked(page);
2719        unmap_page(head);
2720        VM_BUG_ON_PAGE(compound_mapcount(head), head);
2721
2722        /* Make sure the page is not on per-CPU pagevec as it takes pin */
2723        if (mlocked)
2724                lru_add_drain();
2725
2726        /* prevent PageLRU to go away from under us, and freeze lru stats */
2727        spin_lock_irqsave(&pgdata->lru_lock, flags);
2728
2729        if (mapping) {
2730                XA_STATE(xas, &mapping->i_pages, page_index(head));
2731
2732                /*
2733                 * Check if the head page is present in page cache.
2734                 * We assume all tail pages are present too, if the head is there.
2735                 */
2736                xa_lock(&mapping->i_pages);
2737                if (xas_load(&xas) != head)
2738                        goto fail;
2739        }
2740
2741        /* Prevent deferred_split_scan() touching ->_refcount */
2742        spin_lock(&pgdata->split_queue_lock);
2743        count = page_count(head);
2744        mapcount = total_mapcount(head);
2745        if (!mapcount && page_ref_freeze(head, 1 + extra_pins)) {
2746                if (!list_empty(page_deferred_list(head))) {
2747                        pgdata->split_queue_len--;
2748                        list_del(page_deferred_list(head));
2749                }
2750                if (mapping)
2751                        __dec_node_page_state(page, NR_SHMEM_THPS);
2752                spin_unlock(&pgdata->split_queue_lock);
2753                __split_huge_page(page, list, end, flags);
2754                if (PageSwapCache(head)) {
2755                        swp_entry_t entry = { .val = page_private(head) };
2756
2757                        ret = split_swap_cluster(entry);
2758                } else
2759                        ret = 0;
2760        } else {
2761                if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) {
2762                        pr_alert("total_mapcount: %u, page_count(): %u\n",
2763                                        mapcount, count);
2764                        if (PageTail(page))
2765                                dump_page(head, NULL);
2766                        dump_page(page, "total_mapcount(head) > 0");
2767                        BUG();
2768                }
2769                spin_unlock(&pgdata->split_queue_lock);
2770fail:           if (mapping)
2771                        xa_unlock(&mapping->i_pages);
2772                spin_unlock_irqrestore(&pgdata->lru_lock, flags);
2773                remap_page(head);
2774                ret = -EBUSY;
2775        }
2776
2777out_unlock:
2778        if (anon_vma) {
2779                anon_vma_unlock_write(anon_vma);
2780                put_anon_vma(anon_vma);
2781        }
2782        if (mapping)
2783                i_mmap_unlock_read(mapping);
2784out:
2785        count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
2786        return ret;
2787}
2788
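    /*
     * Compound page destructor for THPs: drop the page from the deferred
     * split queue, if queued, before freeing it.
     */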
2789void free_transhuge_page(struct page *page)
2790{
2791        struct pglist_data *pgdata = NODE_DATA(page_to_nid(page));
2792        unsigned long flags;
2793
2794        spin_lock_irqsave(&pgdata->split_queue_lock, flags);
2795        if (!list_empty(page_deferred_list(page))) {
2796                pgdata->split_queue_len--;
2797                list_del(page_deferred_list(page));
2798        }
2799        spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
2800        free_compound_page(page);
2801}
2802
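    /*
     * Queue a THP for deferred splitting.  Splitting at the moment a huge
     * page becomes partially unmapped would be costly on hot paths, so the
     * page is parked on the per-node split_queue instead and split later by
     * the deferred_split shrinker below, under memory pressure.
     */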
2803void deferred_split_huge_page(struct page *page)
2804{
2805        struct pglist_data *pgdata = NODE_DATA(page_to_nid(page));
2806        unsigned long flags;
2807
2808        VM_BUG_ON_PAGE(!PageTransHuge(page), page);
2809
2810        spin_lock_irqsave(&pgdata->split_queue_lock, flags);
2811        if (list_empty(page_deferred_list(page))) {
2812                count_vm_event(THP_DEFERRED_SPLIT_PAGE);
2813                list_add_tail(page_deferred_list(page), &pgdata->split_queue);
2814                pgdata->split_queue_len++;
2815        }
2816        spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
2817}
2818
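    /*
     * Shrinker ->count_objects callback: report how many THPs are queued for
     * deferred splitting on this node.  The length is read without the lock;
     * an approximate value is good enough for the shrinker heuristics.
     */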
2819static unsigned long deferred_split_count(struct shrinker *shrink,
2820                struct shrink_control *sc)
2821{
2822        struct pglist_data *pgdata = NODE_DATA(sc->nid);
2823        return READ_ONCE(pgdata->split_queue_len);
2824}
2825
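    /*
     * Shrinker ->scan_objects callback: move up to nr_to_scan entries from
     * the per-node queue onto a local list, taking a reference on each head
     * page (and dropping entries whose page is already on its way to being
     * freed), try to split each page under the page lock, then splice
     * whatever could not be split back onto the queue.
     */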
2826static unsigned long deferred_split_scan(struct shrinker *shrink,
2827                struct shrink_control *sc)
2828{
2829        struct pglist_data *pgdata = NODE_DATA(sc->nid);
2830        unsigned long flags;
2831        LIST_HEAD(list), *pos, *next;
2832        struct page *page;
2833        int split = 0;
2834
2835        spin_lock_irqsave(&pgdata->split_queue_lock, flags);
2836        /* Take a pin on all head pages to avoid having them freed under us */
2837        list_for_each_safe(pos, next, &pgdata->split_queue) {
2838                page = list_entry((void *)pos, struct page, mapping);
2839                page = compound_head(page);
2840                if (get_page_unless_zero(page)) {
2841                        list_move(page_deferred_list(page), &list);
2842                } else {
2843                        /* We lost the race with put_compound_page() */
2844                        list_del_init(page_deferred_list(page));
2845                        pgdata->split_queue_len--;
2846                }
2847                if (!--sc->nr_to_scan)
2848                        break;
2849        }
2850        spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
2851
2852        list_for_each_safe(pos, next, &list) {
2853                page = list_entry((void *)pos, struct page, mapping);
2854                if (!trylock_page(page))
2855                        goto next;
2856                /* split_huge_page() removes the page from the list on success */
2857                if (!split_huge_page(page))
2858                        split++;
2859                unlock_page(page);
2860next:
2861                put_page(page);
2862        }
2863
2864        spin_lock_irqsave(&pgdata->split_queue_lock, flags);
2865        list_splice_tail(&list, &pgdata->split_queue);
2866        spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
2867
2868        /*
2869         * Stop the shrinker if we didn't split any page and yet the queue is empty.
2870         * This can happen if pages were freed under us.
2871         */
2872        if (!split && list_empty(&pgdata->split_queue))
2873                return SHRINK_STOP;
2874        return split;
2875}
2876
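    /*
     * The shrinker is NUMA aware: sc->nid selects which node's split_queue
     * is counted and scanned by the callbacks above.
     */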
2877static struct shrinker deferred_split_shrinker = {
2878        .count_objects = deferred_split_count,
2879        .scan_objects = deferred_split_scan,
2880        .seeks = DEFAULT_SEEKS,
2881        .flags = SHRINKER_NUMA_AWARE,
2882};
2883
2884#ifdef CONFIG_DEBUG_FS
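    /*
     * Debugfs test knob: writing 1 to the "split_huge_pages" file walks every
     * populated zone, takes a reference on each candidate head page and tries
     * to split every THP found on the LRU (hugetlb pages are skipped).  Any
     * other value is rejected with -EINVAL.
     */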
2885static int split_huge_pages_set(void *data, u64 val)
2886{
2887        struct zone *zone;
2888        struct page *page;
2889        unsigned long pfn, max_zone_pfn;
2890        unsigned long total = 0, split = 0;
2891
2892        if (val != 1)
2893                return -EINVAL;
2894
2895        for_each_populated_zone(zone) {
2896                max_zone_pfn = zone_end_pfn(zone);
2897                for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) {
2898                        if (!pfn_valid(pfn))
2899                                continue;
2900
2901                        page = pfn_to_page(pfn);
2902                        if (!get_page_unless_zero(page))
2903                                continue;
2904
2905                        if (zone != page_zone(page))
2906                                goto next;
2907
2908                        if (!PageHead(page) || PageHuge(page) || !PageLRU(page))
2909                                goto next;
2910
2911                        total++;
2912                        lock_page(page);
2913                        if (!split_huge_page(page))
2914                                split++;
2915                        unlock_page(page);
2916next:
2917                        put_page(page);
2918                }
2919        }
2920
2921        pr_info("%lu of %lu THP split\n", split, total);
2922
2923        return 0;
2924}
2925DEFINE_SIMPLE_ATTRIBUTE(split_huge_pages_fops, NULL, split_huge_pages_set,
2926                "%llu\n");
2927
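    /*
     * A usage sketch, assuming debugfs is mounted at the conventional
     * /sys/kernel/debug:
     *
     *   echo 1 > /sys/kernel/debug/split_huge_pages
     *
     * The file is write-only (mode 0200); the outcome is reported by the
     * "%lu of %lu THP split" pr_info() above.
     */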
2928static int __init split_huge_pages_debugfs(void)
2929{
2930        debugfs_create_file("split_huge_pages", 0200, NULL, NULL,
2931                            &split_huge_pages_fops);
2932        return 0;
2933}
2934late_initcall(split_huge_pages_debugfs);
2935#endif
2936
2937#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
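    /*
     * Replace a mapped huge PMD with a PMD migration entry.  The dirty bit is
     * transferred to the page, write permission and soft-dirty state are
     * encoded in the swap-style entry, and the rmap and page reference held
     * by the old mapping are dropped.  remove_migration_pmd() below undoes
     * this once migration has finished.
     */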
2938void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
2939                struct page *page)
2940{
2941        struct vm_area_struct *vma = pvmw->vma;
2942        struct mm_struct *mm = vma->vm_mm;
2943        unsigned long address = pvmw->address;
2944        pmd_t pmdval;
2945        swp_entry_t entry;
2946        pmd_t pmdswp;
2947
2948        if (!(pvmw->pmd && !pvmw->pte))
2949                return;
2950
2951        flush_cache_range(vma, address, address + HPAGE_PMD_SIZE);
2952        pmdval = *pvmw->pmd;
2953        pmdp_invalidate(vma, address, pvmw->pmd);
2954        if (pmd_dirty(pmdval))
2955                set_page_dirty(page);
2956        entry = make_migration_entry(page, pmd_write(pmdval));
2957        pmdswp = swp_entry_to_pmd(entry);
2958        if (pmd_soft_dirty(pmdval))
2959                pmdswp = pmd_swp_mksoft_dirty(pmdswp);
2960        set_pmd_at(mm, address, pvmw->pmd, pmdswp);
2961        page_remove_rmap(page, true);
2962        put_page(page);
2963}
2964
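    /*
     * Counterpart of set_pmd_migration_entry(): when migration completes,
     * rebuild a huge PMD for the new page from the migration entry, restoring
     * write permission and soft-dirty state, re-adding the rmap and
     * re-mlocking the page for VM_LOCKED VMAs.
     */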
2965void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
2966{
2967        struct vm_area_struct *vma = pvmw->vma;
2968        struct mm_struct *mm = vma->vm_mm;
2969        unsigned long address = pvmw->address;
2970        unsigned long mmun_start = address & HPAGE_PMD_MASK;
2971        pmd_t pmde;
2972        swp_entry_t entry;
2973
2974        if (!(pvmw->pmd && !pvmw->pte))
2975                return;
2976
2977        entry = pmd_to_swp_entry(*pvmw->pmd);
2978        get_page(new);
2979        pmde = pmd_mkold(mk_huge_pmd(new, vma->vm_page_prot));
2980        if (pmd_swp_soft_dirty(*pvmw->pmd))
2981                pmde = pmd_mksoft_dirty(pmde);
2982        if (is_write_migration_entry(entry))
2983                pmde = maybe_pmd_mkwrite(pmde, vma);
2984
2985        flush_cache_range(vma, mmun_start, mmun_start + HPAGE_PMD_SIZE);
2986        if (PageAnon(new))
2987                page_add_anon_rmap(new, vma, mmun_start, true);
2988        else
2989                page_add_file_rmap(new, true);
2990        set_pmd_at(mm, mmun_start, pvmw->pmd, pmde);
2991        if ((vma->vm_flags & VM_LOCKED) && !PageDoubleMap(new))
2992                mlock_vma_page(new);
2993        update_mmu_cache_pmd(vma, address, pvmw->pmd);
2994}
2995#endif
2996